Example #1
def distribute_train(args):
    # Determine, from environment variables, the role this machine/process plays in distributed training,
    # then initialize the node with the fleet API's init() method
    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)

    # The distributed run mode can be configured further via DistributeTranspilerConfig.
    # Below we set the run mode to asynchronous (async) and split the parameters so they can be distributed across nodes
    strategy = DistributeTranspilerConfig()
    strategy.sync_mode = False
    strategy.runtime_split_send_recv = True

    ctr_model = CTR()
    inputs = ctr_model.input_data(args)
    avg_cost, auc_var = ctr_model.net(inputs, args)

    # Configure the distributed optimizer: pass in the strategy above and build the program
    optimizer = fluid.optimizer.Adam(args.learning_rate)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)

    # Run different logic depending on the node's role
    if fleet.is_server():
        # Initialize and run the parameter server node
        fleet.init_server()
        fleet.run_server()

    elif fleet.is_worker():
        # Initialize the worker node
        fleet.init_worker()

        exe = fluid.Executor(fluid.CPUPlace())
        # Run fleet.startup_program, which contains the distributed initialization flow
        exe.run(fleet.startup_program)
        dataset, file_list = get_dataset(inputs, args)
        for epoch in range(args.epochs):
            # Shuffle at file granularity
            random.shuffle(file_list)
            dataset.set_filelist(file_list)

            # The trainer runs fleet.main_program, which has been pruned for distributed training
            start_time = time.time()
            exe.train_from_dataset(program=fleet.main_program,
                                   dataset=dataset,
                                   fetch_list=[auc_var],
                                   fetch_info=["Epoch {} auc ".format(epoch)],
                                   print_period=100,
                                   debug=False)
            end_time = time.time()
            logger.info("epoch %d finished, use time=%d\n" %
                        (epoch, end_time - start_time))

            # By default, worker 0 saves the model
            if args.save_model and fleet.is_first_worker():
                model_path = os.path.join(str(args.model_path),
                                          "epoch_" + str(epoch))
                fleet.save_persistables(executor=exe, dirname=model_path)

        fleet.stop_worker()
        logger.info("Distribute Train Success!")
Example #2
    def test(self):
        endpoints = [
            "127.0.0.1:36004", "127.0.0.1:36005", "127.0.0.1:36006",
            "127.0.0.1:36007"
        ]

        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.SERVER,
            worker_num=2,
            server_endpoints=endpoints)

        fleet.init(role)
        loss, acc, _ = self.net()

        optimizer = fluid.optimizer.Adagrad(
            learning_rate=fluid.layers.exponential_decay(
                learning_rate=base_lr,
                decay_steps=500,
                decay_rate=0.969,
                staircase=True))

        strategy = StrategyFactory.create_async_strategy()
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(loss)
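Example #2 builds its strategy through StrategyFactory. The other run modes that appear later on this page can presumably be created from the same factory; a minimal sketch (the numeric argument to the geo variant mirrors geo_sgd_need_push_nums from Examples #3 and #4):

# Sketch of the StrategyFactory variants used elsewhere on this page.
sync_strategy = StrategyFactory.create_sync_strategy()    # synchronous, as in Example #28
async_strategy = StrategyFactory.create_async_strategy()  # asynchronous, as above
geo_strategy = StrategyFactory.create_geo_strategy(20)    # geo-SGD, push after 20 local steps (Example #17)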
Example #3
    def run_pserver(self, args):
        if args.role.upper() != "PSERVER":
            raise ValueError("args role must be PSERVER")

        role = role_maker.UserDefinedRoleMaker(
            current_id=args.current_id,
            role=role_maker.Role.SERVER,
            worker_num=args.trainers,
            server_endpoints=args.endpoints.split(","))

        fleet.init(role)

        strategy = DistributeTranspilerConfig()
        strategy.sync_mode = args.sync_mode
        strategy.geo_sgd_mode = args.geo_sgd_mode
        strategy.geo_sgd_need_push_nums = args.geo_sgd_need_push_nums

        avg_cost = self.net()

        optimizer = fluid.optimizer.SGD(LEARNING_RATE)
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)

        fleet.init_server()
        fleet.run_server()
Example #4
    def test_pserver(self):
        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.SERVER,
            worker_num=2,
            server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])

        fleet.init(role)

        batch_size = 128
        is_sparse = True
        is_distribute = False

        strategy = DistributeTranspilerConfig()
        strategy.sync_mode = False
        strategy.geo_sgd_mode = True
        strategy.geo_sgd_need_push_nums = 5

        avg_cost, _, _ = train_network(batch_size, is_distribute, is_sparse)

        optimizer = fluid.optimizer.SGD(0.1)
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)

        pserver_startup_program = fleet.startup_program
        pserver_main_program = fleet.main_program
Example #5
 def instance(self, context):
     from paddle.fluid.incubate.fleet.collective import fleet
     from paddle.fluid.incubate.fleet.base.role_maker import PaddleCloudRoleMaker
     role = PaddleCloudRoleMaker(is_collective=True)
     fleet.init(role)
     context['fleet'] = fleet
     context['status'] = 'network_pass'
Example #6
def init_role():
    # set the role (and device place) according to whether this process is a parameter server
    training_role = os.getenv("TRAINING_ROLE", "TRAINER")
    paddle_role = role_maker.Role.WORKER
    place = F.CPUPlace()
    if training_role == "PSERVER":
        paddle_role = role_maker.Role.SERVER

    # set the fleet runtime environment according to the configuration
    ports = os.getenv("PADDLE_PORT", "6174").split(",")
    pserver_ips = os.getenv("PADDLE_PSERVERS").split(",")  # ip,ip...
    eplist = []
    if len(ports) > 1:
        # local debug mode, multi port
        for port in ports:
            eplist.append(':'.join([pserver_ips[0], port]))
    else:
        # distributed mode, multi ip
        for ip in pserver_ips:
            eplist.append(':'.join([ip, ports[0]]))

    pserver_endpoints = eplist  # ip:port,ip:port...
    worker_num = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    role = role_maker.UserDefinedRoleMaker(current_id=trainer_id,
                                           role=paddle_role,
                                           worker_num=worker_num,
                                           server_endpoints=pserver_endpoints)
    fleet.init(role)
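A quick way to exercise init_role() outside a real cluster is to export the variables it reads before calling it. A minimal sketch using the local debug path (several ports on a single IP); the values are placeholders:

# Hypothetical local usage of init_role(): emulate the scheduler by setting
# the environment variables the function reads.
import os

os.environ["TRAINING_ROLE"] = "PSERVER"       # or "TRAINER"
os.environ["PADDLE_PORT"] = "6174,6175"       # more than one port => local debug mode
os.environ["PADDLE_PSERVERS"] = "127.0.0.1"
os.environ["PADDLE_TRAINERS_NUM"] = "2"
os.environ["PADDLE_TRAINER_ID"] = "0"
init_role()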
Example #7
 def run_trainer(self, args):
     """
     run the trainer process; you don't need to implement it.
     Args:
         args (ArgumentParser): run args used to configure the distributed fleet.
     """
     from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
     if args.role.upper() != "TRAINER":
         raise ValueError("args role must be TRAINER")
     role = role_maker.UserDefinedRoleMaker(
         current_id=args.current_id,
         role=role_maker.Role.WORKER,
         worker_num=args.trainers,
         server_endpoints=args.endpoints.split(","))
     fleet.init(role)
     self._set_strategy(args)
     avg_cost = self.net(args)
     optimizer = fluid.optimizer.SGD(LEARNING_RATE)
     optimizer = fleet.distributed_optimizer(optimizer, self.strategy)
     optimizer.minimize(avg_cost)
     if args.run_params.get("run_from_dataset", False):
         losses = self.do_training_from_dataset(fleet, args)
     else:
         losses = self.do_training(fleet, args)
     losses = "" if not losses else losses
     print(losses)
Example #8
 def test_fleet_barrier(self):
     role = role_maker.UserDefinedRoleMaker(current_id=0,
                                            role=role_maker.Role.WORKER,
                                            worker_num=1,
                                            server_endpoints=['127.0.0.1'])
     fleet.init(role)
     check_all_trainers_ready("/ready_path/", 0)
Example #9
 def instance(self, context):
     from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
     from paddle.fluid.incubate.fleet.base.role_maker import PaddleCloudRoleMaker
     role = PaddleCloudRoleMaker()
     fleet.init(role)
     context['fleet'] = fleet
     context['status'] = 'network_pass'
Example #10
    def run_pserver(self, args):
        fleet.init(self.build_role(args))
        strategy = self.build_strategy(args)
        avg_cost = self.net(args)
        self.build_optimizer(avg_cost, strategy)

        fleet.init_server()
        fleet.run_server()
Example #11
    def append_additional_args(self, FLAGS):
        """
        append additional args derived from the existing args
        """
        # dataset_dir and train_dir are defined by paddlecloud and cannot be set by the user
        role = role_maker.PaddleCloudRoleMaker()
        fleet.init(role)

        return super(PaddleCloudFleetTrainer, self).append_additional_args(FLAGS)
Example #12
    def run_pserver(self, role, strategy):
        fleet.init(role)
        avg_cost, x, y = self.net()
        optimizer = fluid.optimizer.SGD(0.01)
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)

        fleet.init_server()
        fleet.run_server()
Example #13
    def test_default_strategy(self):
        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.WORKER,
            worker_num=2,
            server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
        fleet.init(role)

        optimizer = fluid.optimizer.SGD(0.0001)
        optimizer = fleet.distributed_optimizer(optimizer)
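Example #13 passes no strategy to distributed_optimizer. Presumably this is equivalent to handing in a freshly constructed DistributeTranspilerConfig with its defaults; a minimal sketch of that explicit form (an assumption, not taken from this example):

# Assumed explicit equivalent of the default-strategy call above.
strategy = DistributeTranspilerConfig()   # default transpiler settings
optimizer = fluid.optimizer.SGD(0.0001)
optimizer = fleet.distributed_optimizer(optimizer, strategy)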
Example #14
def main(args):
    log.info("start")

    worker_num = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
    num_devices = int(os.getenv("CPU_NUM", 10))

    model = Metapath2vecModel(config=args)
    pyreader = model.pyreader
    loss = model.forward()

    # init fleet
    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)

    train_steps = math.ceil(args.num_nodes * args.epochs / args.batch_size /
                            num_devices / worker_num)
    log.info("Train step: %s" % train_steps)

    real_batch_size = args.batch_size * args.walk_len * args.win_size
    if args.optimizer == "sgd":
        args.lr *= real_batch_size
    optimization(args.lr, loss, train_steps, args.optimizer)

    # init and run server or worker
    if fleet.is_server():
        fleet.init_server(args.warm_start_from_dir)
        fleet.run_server()

    if fleet.is_worker():
        log.info("start init worker done")
        fleet.init_worker()
        #just the worker, load the sample
        log.info("init worker done")

        exe = F.Executor(F.CPUPlace())
        exe.run(fleet.startup_program)
        log.info("Startup done")

        dataset = m2vGraph(args)
        log.info("Build graph done.")

        data_generator = multiprocess_data_generator(args, dataset)

        cur_time = time.time()
        for idx, _ in enumerate(data_generator()):
            log.info("iter %s: %s s" % (idx, time.time() - cur_time))
            cur_time = time.time()
            if idx == 100:
                break

        pyreader.decorate_tensor_provider(data_generator)
        pyreader.start()

        compiled_prog = build_complied_prog(fleet.main_program, loss)
        train_prog(exe, compiled_prog, loss, pyreader, args, train_steps)
Example #15
    def test_dist_geo_server_transpiler(self):
        num_voc = 128
        embed_dim = 64
        x_shape, x_lod = [16, 10], [[3, 5, 2, 6]]
        x = fluid.data(name='x', shape=x_shape, dtype='int32', lod_level=1)
        hash_embd = fluid.contrib.layers.search_pyramid_hash(
            input=x,
            num_emb=embed_dim,
            space_len=num_voc * embed_dim,
            pyramid_layer=4,
            rand_len=16,
            drop_out_percent=0.5,
            is_training=True,
            use_filter=False,
            white_list_len=6400,
            black_list_len=2800,
            seed=3,
            lr=0.002,
            param_attr=fluid.ParamAttr(
                name="PyramidHash_emb_0",
                learning_rate=0,
            ),
            param_attr_wl=fluid.ParamAttr(
                name="Filter",
                learning_rate=0,
            ),
            param_attr_bl=None,
            distribute_update_vars=["PyramidHash_emb_0"],
            name=None)

        cost = fluid.layers.reduce_sum(hash_embd)

        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.SERVER,
            worker_num=2,
            server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])

        fleet.init(role)

        strategy = DistributeTranspilerConfig()
        strategy.sync_mode = False
        strategy.geo_sgd_mode = True
        strategy.geo_sgd_need_push_nums = 5

        optimizer = fluid.optimizer.SGD(0.1)
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(cost)

        pserver_startup_program = fleet.startup_program
        pserver_main_program = fleet.main_program
Example #16
    def test_half_async_strategy(self):
        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.WORKER,
            worker_num=2,
            server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
        fleet.init(role)

        half_async_config = DistributeTranspilerConfig()

        half_async_config.sync_mode = False
        half_async_config.geo_sgd_mode = False
        half_async_config.runtime_split_send_recv = False

        optimizer = fluid.optimizer.SGD(0.0001)
        optimizer = fleet.distributed_optimizer(optimizer, half_async_config)
Example #17
    def test(self):
        endpoints = [
            "127.0.0.1:36004", "127.0.0.1:36005", "127.0.0.1:36006",
            "127.0.0.1:36007"
        ]

        role = role_maker.UserDefinedRoleMaker(current_id=0,
                                               role=role_maker.Role.SERVER,
                                               worker_num=2,
                                               server_endpoints=endpoints)

        fleet.init(role)
        loss, acc, _ = self.net()
        optimizer = fluid.optimizer.SGD(base_lr)
        strategy = StrategyFactory.create_geo_strategy(20)
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(loss)
Example #18
    def processor_register(self):
        role = PaddleCloudRoleMaker()
        fleet.init(role)

        if fleet.is_server():
            self.regist_context_processor('uninit', self.instance)
            self.regist_context_processor('init_pass', self.init)
            self.regist_context_processor('server_pass', self.server)
        else:
            self.regist_context_processor('uninit', self.instance)
            self.regist_context_processor('init_pass', self.init)

            if envs.get_platform() == "LINUX":
                self.regist_context_processor('train_pass', self.dataset_train)
            else:
                self.regist_context_processor('train_pass', self.dataloader_train)
            self.regist_context_processor('terminal_pass', self.terminal)
Example #19
    def test_communicator_async(self):
        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.WORKER,
            worker_num=2,
            server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])

        fleet.init(role)
        avg_cost = self.net()

        optimizer = fluid.optimizer.SGD(0.01)
        strategy = StrategyFactory.create_async_strategy()
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)

        fleet.init_worker()
        time.sleep(10)
        fleet.stop_worker()
Example #20
    def processor_register(self):
        role = PaddleCloudRoleMaker()
        fleet.init(role)

        if fleet.is_server():
            self.regist_context_processor('uninit', self.instance)
            self.regist_context_processor('init_pass', self.init)
            self.regist_context_processor('server_pass', self.server)
        else:
            self.regist_context_processor('uninit', self.instance)
            self.regist_context_processor('init_pass', self.init)
            self.regist_context_processor('startup_pass', self.startup)
            if envs.get_platform() == "LINUX" and envs.get_global_env(
                    "dataset_class", None, "train.reader") != "DataLoader":
                self.regist_context_processor('train_pass', self.dataset_train)
            else:
                self.regist_context_processor('train_pass',
                                              self.dataloader_train)
            self.regist_context_processor('infer_pass', self.infer)
            self.regist_context_processor('terminal_pass', self.terminal)
Example #21
    def test_transpile(self):
        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.SERVER,
            worker_num=2,
            server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])
        # for test optimizer without init(role)
        fleet.init(role)
        batch_size = 128
        is_sparse = True
        is_distribute = False

        strategy = DistributeTranspilerConfig()
        strategy.sync_mode = False
        strategy.runtime_split_send_recv = True
        avg_cost, _, _ = train_network(batch_size, is_distribute, is_sparse)

        self.set_program(avg_cost, strategy)
        strategy.runtime_split_send_recv = False
        self.set_program(avg_cost, strategy)
Example #22
    def run_trainer(self, role, strategy):
        place = fluid.core.CPUPlace()
        exe = fluid.Executor(place)

        fleet.init(role)
        avg_cost, x, y = self.net()
        optimizer = fluid.optimizer.SGD(0.01)
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)

        exe.run(fleet.startup_program)
        fleet.init_worker()

        train_reader = paddle.batch(self.fake_reader(), batch_size=24)
        feeder = fluid.DataFeeder(place=place, feed_list=[x, y])

        for batch_id, data in enumerate(train_reader()):
            exe.run(fleet.main_program, feed=feeder.feed(data), fetch_list=[])

        fleet.stop_worker()
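self.fake_reader() in Example #22 is not shown here; any reader that yields (x, y) samples matching the feed list works. A hypothetical sketch, assuming the network takes a single float feature and a single float label (as the one in Example #28 below does):

# Hypothetical stand-in for self.fake_reader(): a reader creator whose samples
# line up with fluid.DataFeeder(feed_list=[x, y]).
import random

def fake_reader():
    def reader():
        for _ in range(100):
            x = [random.random()]       # one float feature
            y = [2.0 * x[0] + 0.3]      # one float label
            yield x, y
    return reader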
Example #23
 def run_nccl_trainer(self, args):
     """run fleet api"""
     assert args.update_method == "nccl"
     import paddle.fluid as fluid
     import six
     from paddle.fluid.incubate.fleet.collective import fleet
     exec_strategy = fluid.ExecutionStrategy()
     exec_strategy.num_threads = args.run_params['num_threads']
     #dist_strategy = DistributedStrategy()
     #dist_strategy.exec_strategy = exec_strategy
     #dist_strategy.fuse_memory_size = 1  # MB
     #dist_strategy.fuse_laryer_size = 1
     if args.role.upper() != "TRAINER":
         raise ValueError("args role must be TRAINER")
     role = role_maker.PaddleCloudRoleMaker(is_collective=True)
     fleet.init(role)
     strategy = DistributeTranspilerConfig()
     avg_cost = self.net(args)
     losses = self.do_training(fleet, args)
     losses = "" if not losses else losses
     print(losses)
Example #24
    def test_communicator_async(self):
        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.WORKER,
            worker_num=2,
            server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])

        fleet.init(role)
        avg_cost = self.net()

        optimizer = fluid.optimizer.SGD(0.01)

        strategy = DistributeTranspilerConfig()
        strategy.sync_mode = False
        strategy.runtime_split_send_recv = True
        strategy.wait_port = False
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)

        fleet.init_worker()
        time.sleep(10)
        fleet.stop_worker()
Example #25
 def run_nccl_trainer(self, args):
     """
     run the NCCL trainer, used for the GPU case.
     Args:
         args (ArgumentParser): run args used to configure the distributed fleet.
     """
     assert args.update_method == "nccl"
     from paddle.fluid.incubate.fleet.collective import fleet
     exec_strategy = fluid.ExecutionStrategy()
     exec_strategy.num_threads = args.run_params['num_threads']
     #dist_strategy = DistributedStrategy()
     #dist_strategy.exec_strategy = exec_strategy
     #dist_strategy.fuse_memory_size = 1  # MB
     #dist_strategy.fuse_laryer_size = 1
     if args.role.upper() != "TRAINER":
         raise ValueError("args role must be TRAINER")
     role = role_maker.PaddleCloudRoleMaker(is_collective=True)
     fleet.init(role)
     avg_cost = self.net(args)
     losses = self.do_training(fleet, args)
     losses = "" if not losses else losses
     print(losses)
Example #26
 def run_pserver(self, args):
     """
     run the pserver process; you don't need to implement it.
     Args:
         args (ArgumentParser): run args used to configure the distributed fleet.
     """
     from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
     if args.role.upper() != "PSERVER":
         raise ValueError("args role must be PSERVER")
     role = role_maker.UserDefinedRoleMaker(
         current_id=args.current_id,
         role=role_maker.Role.SERVER,
         worker_num=args.trainers,
         server_endpoints=args.endpoints.split(","))
     fleet.init(role)
     self._set_strategy(args)
     avg_cost = self.net(args)
     optimizer = fluid.optimizer.SGD(LEARNING_RATE)
     optimizer = fleet.distributed_optimizer(optimizer, self.strategy)
     optimizer.minimize(avg_cost)
     fleet.init_server(model_dir=args.run_params.get("model_dir", ""))
     fleet.run_server()
Example #27
 def run_trainer(self, args):
     """run trainer"""
     from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
     import paddle.fluid as fluid
     from paddle.fluid.transpiler.ps_dispatcher import RoundRobin
     from paddle.fluid.transpiler.ps_dispatcher import HashName
     fluid.default_startup_program().random_seed = 1
     fluid.default_main_program().random_seed = 1
     if args.role.upper() != "TRAINER":
         raise ValueError("args role must be TRAINER")
     role = role_maker.UserDefinedRoleMaker(
         current_id=args.current_id,
         role=role_maker.Role.WORKER,
         worker_num=args.trainers,
         server_endpoints=args.endpoints.split(","))
     fleet.init(role)
     strategy = DistributeTranspilerConfig()
     strategy.sync_mode = args.run_params["sync_mode"]
     strategy.async_mode = args.run_params["async_mode"]
     strategy.mode = "pserver"
     strategy.slice_var_up = args.run_params['slice_var_up']
     strategy.enable_dc_asgd = args.run_params['enable_dc_asgd']
     if args.run_params['split_method']:
         strategy.split_method = HashName
     else:
         strategy.split_method = RoundRobin
     strategy.wait_port = args.run_params['wait_port']
     strategy.runtime_split_send_recv = args.run_params['runtime_split_send_recv']
     strategy.use_hierarchical_allreduce = args.run_params['use_hierarchical_allreduce']
     # strategy.hierarchical_allreduce_exter_nranks = args.run_params['hierarchical_allreduce_exter_nranks']
     # strategy.hierarchical_allreduce_inter_nranks = args.run_params['hierarchical_allreduce_inter_nranks']
     strategy.geo_sgd_mode = args.run_params['geo_sgd']
     strategy.geo_sgd_need_push_nums = args.run_params['push_nums']
     avg_cost = self.net()
     optimizer = fluid.optimizer.SGD(LEARNING_RATE)
     optimizer = fleet.distributed_optimizer(optimizer, strategy)
     optimizer.minimize(avg_cost)
     losses = self.do_training(fleet, args)
     losses = "" if not losses else losses
     print(losses)
Example #28
    def test_debug_info(self):
        x = fluid.layers.data(name='x', shape=[1], dtype='float32')
        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
        y_predict = fluid.layers.fc(input=x, size=1, act=None)
        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
        avg_cost = fluid.layers.mean(cost)

        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.WORKER,
            worker_num=2,
            server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
        fleet.init(role)

        optimizer = fluid.optimizer.SGD(0.0001)
        strategy = StrategyFactory.create_sync_strategy()
        strategy.set_debug_opt({
            "dump_param": ["fc_0.tmp_0"],
            "dump_fields": ["fc_0.tmp_0", "fc_0.tmp_0@GRAD"],
            "dump_fields_path": "dump_text/"
        })
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
Example #29
    def test_communicator_init_and_start(self):
        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.WORKER,
            worker_num=2,
            server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])

        fleet.init(role)
        avg_cost = self.net()

        optimizer = fluid.optimizer.SGD(0.01)

        strategy = DistributeTranspilerConfig()
        strategy.sync_mode = True
        strategy.wait_port = False
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)

        comm = Communicator(fleet.main_program)
        comm.start()
        time.sleep(10)
        comm.stop()
Example #30
    def __init__(self):
        training_role = os.getenv("TRAINING_ROLE", "TRAINER")
        paddle_role = role_maker.Role.WORKER
        place = F.CPUPlace()
        if training_role == "PSERVER":
            paddle_role = role_maker.Role.SERVER

        # set the fleet runtime environment according to the configuration
        port = os.getenv("PADDLE_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_PSERVERS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = eplist  # ip:port,ip:port...
        worker_num = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        role = role_maker.UserDefinedRoleMaker(
            current_id=trainer_id,
            role=paddle_role,
            worker_num=worker_num,
            server_endpoints=pserver_endpoints)
        tfleet.init(role)
        tfleet.save_on_pserver = True