Example #1: ElasticManager._match() with a fixed trainer count (fault tolerance)
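This and the following examples are test methods excerpted from a single unittest.TestCase, so they rely on names that never appear inside the snippets themselves: os, ElasticManager, ELASTIC_AUTO_PARALLEL_EXIT_CODE, MockEtcdClient, and the shared self.etcd_client fixture. A minimal scaffold they could be pasted into is sketched below; the import path and the MockEtcdClient stub are assumptions for illustration, not part of the original test suite.

import os
import unittest

# Assumed import path: in the Paddle source tree, ElasticManager and
# ELASTIC_AUTO_PARALLEL_EXIT_CODE live under paddle.distributed.fleet.elastic.
from paddle.distributed.fleet.elastic.manager import (
    ElasticManager, ELASTIC_AUTO_PARALLEL_EXIT_CODE)


class _Lease:
    def refresh(self):
        pass


class MockEtcdClient:
    # Hypothetical stand-in for an etcd3 client; the real test suite defines
    # its own mock, and the exact set of methods ElasticManager touches is an
    # assumption here. Method names follow the python-etcd3 client API.
    def __init__(self, lease=None):
        self._lease = lease or _Lease()

    def lease(self, ttl):
        return self._lease

    def put(self, key, value, lease=None):
        pass

    def get(self, key):
        return b"0", None

    def get_prefix(self, key_prefix):
        return []

    def delete(self, key):
        pass

    def delete_prefix(self, key_prefix):
        pass

    def add_watch_prefix_callback(self, key_prefix, callback, **kwargs):
        return "watch_id"

    def cancel_watch(self, watch_id):
        pass


class TestElasticManager(unittest.TestCase):
    def setUp(self):
        # Shared fixture referenced as self.etcd_client in every example.
        self.etcd_client = MockEtcdClient()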
    def test_match_faulttolerance(self):
        class Argument:
            elastic_server = "127.0.0.1:2379"
            job_id = "test_job_id_123"
            np = "2"
            gpus = "0"
            nproc_per_node = 1
            host = None
            curr_host = None
            ips = None
            scale = None
            force = None
            backend = 'gloo'

        args = Argument()
        args.ips = "10.10.10.1,10.10.10.2"
        elastic = ElasticManager(args, self.etcd_client)
        os.environ['FLAGS_START_PORT'] = "6001"

        hosts = ["10.10.10.1:6001", "10.10.10.2:6001"]
        os.environ[
            'PADDLE_TRAINER_ENDPOINTS'] = "10.10.10.1:6001,10.10.10.2:6001"

        self.assertEqual(elastic._match(hosts), True)

        hosts = ["10.10.10.1:6001"]
        args.ips = "10.10.10.1"
        os.environ['PADDLE_TRAINER_ENDPOINTS'] = "10.10.10.1:6001"
        self.assertEqual(elastic._match(hosts), False)
Example #2: constructing an ElasticManager and calling exit()
    def test_exit(self):
        class Argument:
            elastic_server = "127.0.0.1:2379"
            job_id = "test_job_id_123"
            np = "2"
            gpus = "0"
            nproc_per_node = 1
            host = None
            curr_host = None
            ips = None
            scale = None
            force = None
            backend = 'gloo'

        args = Argument()
        elastic = ElasticManager(args, self.etcd_client)
        elastic.exit()
Example #3: ElasticManager._match() with an elastic trainer range (np = "2:4")
    def test_match_elastic(self):
        class Argument:
            elastic_server = "127.0.0.1:2379"
            job_id = "test_job_id_123"
            np = "2:4"
            gpus = "0"
            nproc_per_node = 1
            host = None
            curr_host = None
            ips = None
            scale = None
            force = None
            backend = 'gloo'

        os.environ['PADDLE_ELASTIC_TIMEOUT'] = "60"
        args = Argument()
        args.ips = "10.10.10.1,10.10.10.2,10.10.10.3,10.10.10.4"
        os.environ['FLAGS_START_PORT'] = "6001"
        os.environ[
            'DISTRIBUTED_TRAINER_ENDPOINTS'] = "10.10.10.1:6001,10.10.10.2:6001,10.10.10.3:6001,10.10.10.4:6001"
        os.environ[
            'PADDLE_TRAINER_ENDPOINTS'] = "10.10.10.1:6001,10.10.10.2:6001,10.10.10.3:6001,10.10.10.4:6001"
        elastic = ElasticManager(args, self.etcd_client)
        hosts = ["10.10.10.1:6001", "10.10.10.2:6001"]
        self.assertEqual(elastic._match(hosts), False)

        hosts = [
            "10.10.10.1:6001", "10.10.10.2:6001", "10.10.10.3:6001",
            "10.10.10.4:6001"
        ]
        self.assertEqual(elastic._match(hosts), True)

        hosts = ["10.10.10.1:6001", "10.10.10.2:6001", "10.10.10.3:6001"]
        self.assertEqual(elastic._match(hosts), False)

        hosts = ["10.10.10.1:6001"]
        self.assertEqual(elastic._match(hosts), False)

        args.ips = "10.10.10.1,10.10.10.2"
        os.environ[
            'DISTRIBUTED_TRAINER_ENDPOINTS'] = "10.10.10.1:6001,10.10.10.2:6001"
        os.environ[
            'PADDLE_TRAINER_ENDPOINTS'] = "10.10.10.1:6001,10.10.10.2:6001"
        elastic = ElasticManager(args, self.etcd_client)
        hosts = ["10.10.10.1:6001", "10.10.10.2:6001"]
        self.assertEqual(elastic._match(hosts), True)
Example #4: ElasticManager.watch() with a stub launcher
    def test_watch(self):
        class Argument:
            elastic_server = "127.0.0.1:2379"
            job_id = "test_job_id_123"
            np = "2"
            gpus = "0"
            nproc_per_node = 1
            host = None
            curr_host = None
            ips = None
            scale = None
            force = None
            backend = 'gloo'
            elastic_pre_hook = None

        # stub launcher whose watch() immediately reports the auto-parallel exit code
        class ElasticLauncher:
            def watch(self):
                return ELASTIC_AUTO_PARALLEL_EXIT_CODE

            def stop(self):
                pass

        args = Argument()
        elastic = ElasticManager(args, self.etcd_client)
        elastic.stopped = False
        elastic.launcher = ElasticLauncher()
        elastic.watch()
Example #5: ElasticManager initialization when the etcd lease refresh raises
    def test_elastic_manager_init(self):
        class Argument:
            elastic_server = "127.0.0.1:2379"
            job_id = "test_job_id_123"
            np = "2"
            gpus = "0"
            nproc_per_node = 1
            host = None
            curr_host = None
            ips = None
            scale = None
            force = None
            backend = 'gloo'

        args = Argument()

        class _MockLease:
            def refresh(self):
                raise ValueError("valid error, this is only for unittest")

        etcd_client = MockEtcdClient(lease=_MockLease())
        # initialization is exercised with an etcd lease whose refresh() raises
        elastic = ElasticManager(args, etcd_client=etcd_client)
Example #6: ElasticManager._update_hosts() for elastic scale-up and scale-in
    def test_update_hosts_for_elastic(self):
        #######################
        #  elastic, scale up  #
        #######################
        class Argument:
            elastic_server = "127.0.0.1:2379"
            job_id = "test_job_id_123"
            np = "2:4"
            gpus = "0"
            nproc_per_node = 1
            host = None
            curr_host = None
            ips = None
            scale = None
            force = None
            backend = 'gloo'

        args = Argument()

        os.environ['FLAGS_START_PORT'] = "6001"
        os.environ['PADDLE_TRAINERS'] = "10.10.10.1,10.10.10.2"
        os.environ[
            'DISTRIBUTED_TRAINER_ENDPOINTS'] = "10.10.10.1:6001,10.10.10.2:6001"
        os.environ[
            'PADDLE_TRAINER_ENDPOINTS'] = "10.10.10.1:6001,10.10.10.2:6001"
        elastic = ElasticManager(args, self.etcd_client)
        # add 10.10.10.3:6001
        elastic.curr_host = "10.10.10.1:6001"
        elastic.hosts = [
            "10.10.10.1:6001", "10.10.10.2:6001", "10.10.10.3:6001"
        ]
        elastic._update_hosts()
        #self.assertEqual(elastic.all_host_endpoints,
        #                 ["10.10.10.1:6001", "10.10.10.2:6001", "10.10.10.3:6001"])
        self.assertEqual(os.getenv('PADDLE_TRAINERS'),
                         "10.10.10.1,10.10.10.2,10.10.10.3")

        #######################
        #  elastic, scale in  #
        #######################
        os.environ[
            'PADDLE_TRAINERS'] = "10.10.10.0,10.10.10.1,10.10.10.2,10.10.10.3"
        os.environ[
            'DISTRIBUTED_TRAINER_ENDPOINTS'] = "10.10.10.0:6000,10.10.10.1:6001,10.10.10.2:6001,10.10.10.3:6001"
        os.environ[
            'PADDLE_TRAINER_ENDPOINTS'] = "10.10.10.0:6000,10.10.10.1:6001,10.10.10.2:6001,10.10.10.3:6001"
        os.environ['POD_IP'] = "10.10.10.1"
        os.environ['TRAINER_PORTS_NUM'] = "4"
        os.environ['PADDLE_TRAINER_ID'] = "1"
        os.environ['PADDLE_PORT'] = "6001"
        args = Argument()
        elastic = ElasticManager(args, self.etcd_client)
        # remove 10.10.10.0:6000
        elastic.curr_host = "10.10.10.1:6001"
        elastic.hosts = [
            "10.10.10.1:6001", "10.10.10.2:6001", "10.10.10.3:6001"
        ]
        elastic._update_hosts()
        #self.assertEqual(elastic.all_host_endpoints,
        #                 ["10.10.10.3:6001", "10.10.10.1:6001", "10.10.10.2:6001"])
        self.assertEqual(os.getenv('PADDLE_TRAINERS'),
                         "10.10.10.3,10.10.10.1,10.10.10.2")
        self.assertEqual(os.getenv('DISTRIBUTED_TRAINER_ENDPOINTS'),
                         "10.10.10.3:6001,10.10.10.1:6001,10.10.10.2:6001")

        #####################################
        #  elastic, scale in (single node)  #
        #####################################
        os.environ[
            'PADDLE_TRAINERS'] = "10.10.10.1,10.10.10.1,10.10.10.1,10.10.10.1"
        os.environ[
            'DISTRIBUTED_TRAINER_ENDPOINTS'] = "10.10.10.1:6001,10.10.10.1:6002,10.10.10.1:6003,10.10.10.1:6004"
        os.environ[
            'PADDLE_TRAINER_ENDPOINTS'] = "10.10.10.1:6001,10.10.10.1:6002,10.10.10.1:6003,10.10.10.1:6004"
        os.environ['POD_IP'] = "10.10.10.1"
        os.environ['TRAINER_PORTS_NUM'] = "4"
        os.environ['PADDLE_PORT'] = "6001"
        args = Argument()
        elastic = ElasticManager(args, self.etcd_client)
        # remove 10.10.10.1:6002 and 10.10.10.1:6004
        elastic.curr_host = "10.10.10.1:6001"
        os.environ['PADDLE_TRAINER_ID'] = "-1"
        elastic.hosts = ["10.10.10.1:6001", "10.10.10.1:6003"]
        elastic._update_hosts()
        #self.assertEqual(elastic.all_host_endpoints,
        #                 ["10.10.10.1:6001", "10.10.10.1:6001"])
        self.assertEqual(os.getenv('PADDLE_TRAINERS'), "10.10.10.1,10.10.10.1")
        self.assertEqual(os.getenv('DISTRIBUTED_TRAINER_ENDPOINTS'),
                         "10.10.10.1:6001,10.10.10.1:6003")
Example #7: ElasticManager._update_hosts() in fault-tolerance mode
    def test_update_hosts_for_faulttolerance(self):
        class Argument:
            elastic_server = "127.0.0.1:2379"
            job_id = "test_job_id_123"
            np = "0"
            gpus = "0"
            nproc_per_node = 1
            host = None
            curr_host = None
            ips = None
            scale = None
            force = None
            backend = 'gloo'

        args = Argument()
        os.environ['FLAGS_START_PORT'] = "6001"
        os.environ['PADDLE_ELASTIC_NP'] = "2"
        os.environ['PADDLE_TRAINERS'] = "10.10.10.1,10.10.10.2"
        os.environ[
            'DISTRIBUTED_TRAINER_ENDPOINTS'] = "10.10.10.1:6001,10.10.10.2:6001"
        os.environ[
            'PADDLE_TRAINER_ENDPOINTS'] = "10.10.10.1:6001,10.10.10.2:6001"
        elastic = ElasticManager(args, self.etcd_client)
        # hosts unchanged
        os.environ['PADDLE_TRAINER_ID'] = "0"
        elastic.curr_host = "10.10.10.1:6001"
        elastic.hosts = ["10.10.10.1:6001", "10.10.10.2:6001"]
        elastic._update_hosts()
        self.assertEqual(os.getenv('PADDLE_TRAINERS'), "10.10.10.1,10.10.10.2")

        # replace 10.10.10.2:6001 with 10.10.10.3:6001
        elastic.curr_host = "10.10.10.3:6001"
        elastic.hosts = ["10.10.10.1:6001", "10.10.10.3:6001"]
        os.environ['PADDLE_TRAINER_ID'] = "1"
        elastic._update_hosts()
        self.assertEqual(os.getenv('PADDLE_TRAINERS'), "10.10.10.1,10.10.10.3")

        elastic.curr_host = "10.10.10.3:6001"
        elastic.hosts = ["10.10.10.1:6001", "10.10.10.3:6001"]
        os.environ['PADDLE_TRAINER_ID'] = "-1"
        elastic._update_hosts()
        self.assertEqual(os.getenv('PADDLE_TRAINERS'), "10.10.10.1,10.10.10.3")
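
The tests above mutate the process-wide os.environ in place, so values set by one example (FLAGS_START_PORT, PADDLE_TRAINERS, and so on) can leak into the next when the methods run in the same process. When adapting them, unittest.mock.patch.dict keeps each case isolated; a minimal sketch of the pattern, not part of the original suite:

import os
import unittest
from unittest import mock


class IsolatedEnvExample(unittest.TestCase):
    def test_env_is_restored(self):
        # Values written inside the with-block are rolled back on exit, so the
        # PADDLE_* / FLAGS_* settings of one case cannot leak into the next.
        with mock.patch.dict(os.environ, {
                'FLAGS_START_PORT': "6001",
                'PADDLE_TRAINER_ENDPOINTS': "10.10.10.1:6001,10.10.10.2:6001",
        }):
            self.assertEqual(os.environ['FLAGS_START_PORT'], "6001")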