Example No. 1
    def test_pserver(self):
        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.SERVER,
            worker_num=2,
            server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])

        fleet.init(role)

        batch_size = 128
        is_sparse = True
        is_distribute = False

        strategy = DistributeTranspilerConfig()
        strategy.sync_mode = False
        strategy.geo_sgd_mode = True
        strategy.geo_sgd_need_push_nums = 5

        avg_cost, _, _ = train_network(batch_size, is_distribute, is_sparse)

        optimizer = fluid.optimizer.SGD(0.1)
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)

        pserver_startup_program = fleet.startup_program
        pserver_main_program = fleet.main_program
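
For contrast, the worker-side counterpart of this GEO-SGD pserver test differs only in the role it declares; the following is a minimal sketch (not part of the original test) that reuses the same train_network helper, endpoints, and strategy:

    def test_worker(self):
        # hypothetical worker-side counterpart of the pserver test above
        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.WORKER,
            worker_num=2,
            server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])
        fleet.init(role)

        strategy = DistributeTranspilerConfig()
        strategy.sync_mode = False
        strategy.geo_sgd_mode = True
        strategy.geo_sgd_need_push_nums = 5

        avg_cost, _, _ = train_network(128, False, True)

        optimizer = fluid.optimizer.SGD(0.1)
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)

        # on the worker side, fleet.main_program is the pruned trainer program
        trainer_main_program = fleet.main_program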
Example No. 2
def distribute_train(args):
    # Determine, from environment variables, which role this machine/process plays
    # in distributed training, then initialize the node with the fleet API's init() method
    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)

    # We can further specify the distributed run mode via DistributeTranspilerConfig.
    # Below we set the mode to asynchronous (async) and split the parameters so they can be placed on different nodes
    strategy = DistributeTranspilerConfig()
    strategy.sync_mode = False
    strategy.runtime_split_send_recv = True

    ctr_model = CTR()
    inputs = ctr_model.input_data(args)
    avg_cost, auc_var = ctr_model.net(inputs, args)

    # Configure the distributed optimizer with the specified strategy and build the program
    optimizer = fluid.optimizer.Adam(args.learning_rate)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)

    # Run different logic depending on the node's role
    if fleet.is_server():
        # Initialize and run the parameter server node
        fleet.init_server()
        fleet.run_server()

    elif fleet.is_worker():
        # Initialize the worker node
        fleet.init_worker()

        exe = fluid.Executor(fluid.CPUPlace())
        # Run fleet.startup_program, which contains the distributed initialization steps
        exe.run(fleet.startup_program)
        dataset, file_list = get_dataset(inputs, args)
        for epoch in range(args.epochs):
            # Shuffle at file granularity
            random.shuffle(file_list)
            dataset.set_filelist(file_list)

            # Workers run fleet.main_program, which has been pruned for distributed training
            start_time = time.time()
            exe.train_from_dataset(program=fleet.main_program,
                                   dataset=dataset,
                                   fetch_list=[auc_var],
                                   fetch_info=["Epoch {} auc ".format(epoch)],
                                   print_period=100,
                                   debug=False)
            end_time = time.time()
            logger.info("epoch %d finished, use time=%d\n" %
                        ((epoch), end_time - start_time))

            # By default, worker 0 saves the model
            if args.save_model and fleet.is_first_worker():
                model_path = os.path.join(str(args.model_path),
                                          "epoch_" + str(epoch))
                fleet.save_persistables(executor=exe, dirname=model_path)

        fleet.stop_worker()
        logger.info("Distribute Train Success!")
Example No. 3
    def test_sync_strategy(self):
        os.environ['CPU_NUM'] = "2"
        strategy = StrategyFactory.create_sync_strategy()
        self.assertEqual(strategy._program_config.sync_mode, False)
        self.assertEqual(strategy._program_config.runtime_split_send_recv,
                         True)
        self.assertEqual(strategy._build_strategy.async_mode, True)
        self.assertEqual(strategy._execute_strategy.num_threads, 2)

        # test set_program_config using DistributeTranspilerConfig()
        program_config_class = DistributeTranspilerConfig()
        program_config_class.min_block_size = 81920
        strategy.set_program_config(program_config_class)
        program_config = strategy.get_program_config()
        self.assertEqual(program_config.min_block_size, 81920)

        # test set_program_config using dict
        program_config_dict = dict()
        program_config_dict['min_block_size'] = 8192
        strategy.set_program_config(program_config_dict)
        program_config = strategy.get_program_config()
        self.assertEqual(program_config.min_block_size, 8192)

        # test set_program_config exception
        program_config_dict['unknown'] = None
        self.assertRaises(Exception, strategy.set_program_config,
                          program_config_dict)
        program_config_illegal = None
        self.assertRaises(Exception, strategy.set_program_config,
                          program_config_illegal)
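
The same StrategyFactory also provides the half-async, async, and GEO variants exercised elsewhere on this page (see Examples No. 24 and 25). A minimal sketch of selecting one by name (the helper itself is illustrative, not from the original tests):

def pick_strategy(mode):
    # hedged sketch: map a mode string to a DistributedStrategy via the factory
    if mode == "half_async":
        return StrategyFactory.create_half_async_strategy()
    if mode == "async":
        return StrategyFactory.create_async_strategy()
    if mode == "geo_async":
        # GEO-SGD pushes parameters every N steps (400 here, as in Example No. 25)
        return StrategyFactory.create_geo_strategy(400)
    return StrategyFactory.create_sync_strategy()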
Example No. 4
    def run_pserver(self, args):
        if args.role.upper() != "PSERVER":
            raise ValueError("args role must be PSERVER")

        role = role_maker.UserDefinedRoleMaker(
            current_id=args.current_id,
            role=role_maker.Role.SERVER,
            worker_num=args.trainers,
            server_endpoints=args.endpoints.split(","))

        fleet.init(role)

        strategy = DistributeTranspilerConfig()
        strategy.sync_mode = args.sync_mode
        strategy.geo_sgd_mode = args.geo_sgd_mode
        strategy.geo_sgd_need_push_nums = args.geo_sgd_need_push_nums

        avg_cost = self.net()

        optimizer = fluid.optimizer.SGD(LEARNING_RATE)
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)

        fleet.init_server()
        fleet.run_server()
Example No. 5
    def set_optimizer(self, FLAGS, net_output):
        """
        set optimizer
        """
        optimizer = net_output['optimizer']
        strategy = DistributeTranspilerConfig()
        strategy.sync_mode = (FLAGS.data_reader != "dataset")
        #pslib, strategy = {"use_cvm": True}
 
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        return optimizer.minimize(net_output['loss'])
Example No. 6
    def test_dist_geo_server_transpiler(self):
        num_voc = 128
        embed_dim = 64
        x_shape, x_lod = [16, 10], [[3, 5, 2, 6]]
        x = fluid.data(name='x', shape=x_shape, dtype='int32', lod_level=1)
        hash_embd = fluid.contrib.layers.search_pyramid_hash(
            input=x,
            num_emb=embed_dim,
            space_len=num_voc * embed_dim,
            pyramid_layer=4,
            rand_len=16,
            drop_out_percent=0.5,
            is_training=True,
            use_filter=False,
            white_list_len=6400,
            black_list_len=2800,
            seed=3,
            lr=0.002,
            param_attr=fluid.ParamAttr(
                name="PyramidHash_emb_0",
                learning_rate=0,
            ),
            param_attr_wl=fluid.ParamAttr(
                name="Filter",
                learning_rate=0,
            ),
            param_attr_bl=None,
            distribute_update_vars=["PyramidHash_emb_0"],
            name=None)

        cost = fluid.layers.reduce_sum(hash_embd)

        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.SERVER,
            worker_num=2,
            server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])

        fleet.init(role)

        strategy = DistributeTranspilerConfig()
        strategy.sync_mode = False
        strategy.geo_sgd_mode = True
        strategy.geo_sgd_need_push_nums = 5

        optimizer = fluid.optimizer.SGD(0.1)
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(cost)

        pserver_startup_program = fleet.startup_program
        pserver_main_program = fleet.main_program
Example No. 7
    def __init__(self, config=None):
        if config is not None:
            self.config = config
        else:
            self.config = DistributeTranspilerConfig()

        if self.config.split_method is None:
            self.config.split_method = RoundRobin

        global PRINT_LOG
        if self.config.print_log:
            PRINT_LOG = True
        assert (self.config.min_block_size >= 8192)
        assert (self.config.split_method.__bases__[0] == PSDispatcher)
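
Given the assertions above, any config handed to this constructor must keep min_block_size at or above 8192 and use a PSDispatcher subclass as split_method. A minimal usage sketch (the FLDistributeTranspiler class name is taken from Example No. 29; the values are illustrative):

config = DistributeTranspilerConfig()
config.min_block_size = 8192        # must satisfy the >= 8192 assertion
config.split_method = RoundRobin    # must inherit from PSDispatcher
config.print_log = True             # turns on PRINT_LOG inside the transpiler
transpiler = FLDistributeTranspiler(config=config)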
Example No. 8
 def optimize(self, loss, optimizer_type, lr):
     strategy = DistributeTranspilerConfig()
     strategy.sync_mode = False
     log.info('learning rate:%f' % lr)
     if optimizer_type == "sgd":
         optimizer = F.optimizer.SGD(learning_rate=lr)
     elif optimizer_type == "adam":
         # Don't slice tensor ensure convergence
         optimizer = F.optimizer.Adam(learning_rate=lr, lazy_mode=True)
     else:
         raise ValueError("Unknown Optimizer %s" % optimizer_type)
     #create the DistributeTranspiler configure
     optimizer = tfleet.distributed_optimizer(optimizer, strategy)
     optimizer.minimize(loss)
Example No. 9
 def __init__(self, dist_config: list):
     print(dist_config)
     from paddle.fluid import ExecutionStrategy
     from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig
     self._strategy = DistributeTranspilerConfig()
     for conf in dist_config:
         conf.setup(self._strategy)
Example No. 10
    def test_half_async_strategy(self):
        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.WORKER,
            worker_num=2,
            server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
        fleet.init(role)

        half_async_config = DistributeTranspilerConfig()

        half_async_config.sync_mode = False
        half_async_config.geo_sgd_mode = False
        half_async_config.runtime_split_send_recv = False

        optimizer = fluid.optimizer.SGD(0.0001)
        optimizer = fleet.distributed_optimizer(optimizer, half_async_config)
Example No. 11
def optimization(base_lr, loss, optimizer='adam'):
    if optimizer == 'sgd':
        optimizer = F.optimizer.SGD(base_lr)
    elif optimizer == 'adam':
        optimizer = F.optimizer.Adam(base_lr, lazy_mode=True)
    else:
        raise ValueError

    log.info('learning rate:%f' % (base_lr))
    #create the DistributeTranspiler configure
    config = DistributeTranspilerConfig()
    config.sync_mode = False
    #config.runtime_split_send_recv = False

    config.slice_var_up = False
    #create the distributed optimizer
    optimizer = fleet.distributed_optimizer(optimizer, config)
    optimizer.minimize(loss)
Example No. 12
    def test_init_role(self):
        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.SERVER,
            worker_num=2,
            server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])
        # for test optimizer without init(role)
        # fleet.init(role)
        batch_size = 128
        is_sparse = True
        is_distribute = False
        strategy = DistributeTranspilerConfig()
        strategy.sync_mode = False
        strategy.geo_sgd_mode = True
        strategy.geo_sgd_need_push_nums = 5
        avg_cost, _, _ = train_network(batch_size, is_distribute, is_sparse)

        self.assertRaises(Exception, self.set_program, avg_cost, strategy)
Example No. 13
    def test_transpile(self):
        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.SERVER,
            worker_num=2,
            server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])
        # init(role) is called here before transpiling
        fleet.init(role)
        batch_size = 128
        is_sparse = True
        is_distribute = False

        strategy = DistributeTranspilerConfig()
        strategy.sync_mode = False
        strategy.runtime_split_send_recv = True
        avg_cost, _, _ = train_network(batch_size, is_distribute, is_sparse)

        self.set_program(avg_cost, strategy)
        strategy.runtime_split_send_recv = False
        self.set_program(avg_cost, strategy)
Example No. 14
 def testConfig(self):
     config = DistributeTranspilerConfig()
     self.assertRaises(Exception, self.set_sync_mode, config, None)
     self.assertRaises(Exception, self.set_runtime_split_send_recv, config,
                       None)
     self.assertRaises(Exception, self.set_runtime_split_send_recv, config,
                       True)
     self.set_sync_mode(config, False)
     self.assertFalse(config.sync_mode)
     self.set_runtime_split_send_recv(config, True)
     self.assertRaises(Exception, self.set_sync_mode, config, True)
Example No. 15
    def __init__(self, optimizer, strategy=None):
        super(TranspilerOptimizer, self).__init__(optimizer, strategy)

        if strategy:
            if not isinstance(strategy, DistributeTranspilerConfig):
                raise TypeError(
                    "In {} mode, strategy must be an instance of DistributeTranspilerConfig"
                    .format(fleet._mode))
            else:
                self._strategy = strategy
        else:
            self._strategy = DistributeTranspilerConfig()
Example No. 16
    def __init__(self):
        self._program_config = DistributeTranspilerConfig()
        self._trainer_runtime_config = TrainerRuntimeConfig()
        self._server_runtime_config = ServerRuntimeConfig()
        num_threads = int(os.getenv("CPU_NUM", "1"))

        self._execute_strategy = fluid.ExecutionStrategy()
        self._build_strategy = fluid.BuildStrategy()

        self._execute_strategy.num_threads = num_threads
        if num_threads > 1:
            self._build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
        self.debug_opt = None
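
Since num_threads is read from CPU_NUM and a multi-threaded run switches the build strategy to Reduce, the environment variable directly shapes the resulting strategy. A small sketch of that behavior, consistent with the assertions in Example No. 3 (illustrative, not from the original tests):

os.environ['CPU_NUM'] = '4'
strategy = StrategyFactory.create_sync_strategy()
# per the constructor above, num_threads follows CPU_NUM and, being > 1,
# the build strategy's reduce_strategy is set to Reduce
assert strategy._execute_strategy.num_threads == 4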
Example No. 17
    def test_sync_strategy(self):
        os.environ['CPU_NUM'] = "2"
        strategy = StrategyFactory.create_sync_strategy()
        self.assertEqual(strategy._program_config.sync_mode, False)
        self.assertEqual(strategy._program_config.runtime_split_send_recv,
                         True)
        self.assertEqual(strategy._build_strategy.async_mode, True)
        self.assertEqual(strategy._execute_strategy.num_threads, 2)

        # test set_program_config using DistributeTranspilerConfig()
        program_config_class = DistributeTranspilerConfig()
        program_config_class.min_block_size = 81920
        strategy.set_program_config(program_config_class)
        program_config = strategy.get_program_config()
        self.assertEqual(program_config.min_block_size, 81920)

        # test set_program_config using dict
        program_config_dict = dict()
        program_config_dict['min_block_size'] = 8192
        strategy.set_program_config(program_config_dict)
        program_config = strategy.get_program_config()
        self.assertEqual(program_config.min_block_size, 8192)

        # test set_program_config exception
        program_config_dict['unknown'] = None
        self.assertRaises(Exception, strategy.set_program_config,
                          program_config_dict)
        program_config_illegal = None
        self.assertRaises(Exception, strategy.set_program_config,
                          program_config_illegal)

        trainer_runtime_config = strategy.get_trainer_runtime_config()
        trainer_runtime_config.runtime_configs[
            'communicator_send_queue_size'] = '50'
        runtime_configs = trainer_runtime_config.get_communicator_flags()
        self.assertIn('communicator_send_queue_size', runtime_configs)
        self.assertNotIn('communicator_independent_recv_thread',
                         runtime_configs)
        self.assertEqual(runtime_configs['communicator_send_queue_size'], '2')
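
The trainer runtime config read here can also be written back onto the strategy; a minimal sketch, assuming the set_trainer_runtime_config setter that appears (commented out) in Example No. 25:

trainer_runtime_config = strategy.get_trainer_runtime_config()
trainer_runtime_config.runtime_configs['communicator_send_queue_size'] = '16'
strategy.set_trainer_runtime_config(trainer_runtime_config)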
Example No. 18
    def test_communicator_async(self):
        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.WORKER,
            worker_num=2,
            server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])

        fleet.init(role)
        avg_cost = self.net()

        optimizer = fluid.optimizer.SGD(0.01)

        strategy = DistributeTranspilerConfig()
        strategy.sync_mode = False
        strategy.runtime_split_send_recv = True
        strategy.wait_port = False
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)

        fleet.init_worker()
        time.sleep(10)
        fleet.stop_worker()
Example No. 19
    def test_communicator_init_and_start(self):
        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.WORKER,
            worker_num=2,
            server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])

        fleet.init(role)
        avg_cost = self.net()

        optimizer = fluid.optimizer.SGD(0.01)

        strategy = DistributeTranspilerConfig()
        strategy.sync_mode = True
        strategy.wait_port = False
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)

        comm = Communicator(fleet.main_program)
        comm.start()
        time.sleep(10)
        comm.stop()
Example No. 20
    def run_trainer(self, args):
        if args.role.upper() != "TRAINER":
            raise ValueError("args role must be TRAINER")

        role = role_maker.UserDefinedRoleMaker(
            current_id=args.current_id,
            role=role_maker.Role.WORKER,
            worker_num=args.trainers,
            server_endpoints=args.endpoints.split(","))

        fleet.init(role)

        strategy = DistributeTranspilerConfig()
        strategy.sync_mode = args.sync_mode

        avg_cost = self.net()

        optimizer = fluid.optimizer.SGD(LEARNING_RATE)
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)

        out = self.do_training(fleet)
Example No. 21
def optimization(base_lr, loss, train_steps, optimizer='sgd'):
    decayed_lr = L.learning_rate_scheduler.polynomial_decay(
        learning_rate=base_lr,
        decay_steps=train_steps,
        end_learning_rate=0.0001 * base_lr,
        power=1.0,
        cycle=False)
    if optimizer == 'sgd':
        optimizer = F.optimizer.SGD(decayed_lr)
    elif optimizer == 'adam':
        optimizer = F.optimizer.Adam(decayed_lr, lazy_mode=True)
    else:
        raise ValueError

    log.info('learning rate:%f' % (base_lr))
    #create the DistributeTranspiler configure
    config = DistributeTranspilerConfig()
    config.sync_mode = False
    #config.runtime_split_send_recv = False

    config.slice_var_up = False
    #create the distributed optimizer
    optimizer = fleet.distributed_optimizer(optimizer, config)
    optimizer.minimize(loss)
Example No. 22
 def run_nccl_trainer(self, args):
     """run fleet api"""
     assert args.update_method == "nccl"
     import paddle.fluid as fluid
     import six
     from paddle.fluid.incubate.fleet.collective import fleet
     exec_strategy = fluid.ExecutionStrategy()
     exec_strategy.num_threads = args.run_params['num_threads']
     #dist_strategy = DistributedStrategy()
     #dist_strategy.exec_strategy = exec_strategy
     #dist_strategy.fuse_memory_size = 1  # MB
     #dist_strategy.fuse_laryer_size = 1
     if args.role.upper() != "TRAINER":
         raise ValueError("args role must be TRAINER")
     role = role_maker.PaddleCloudRoleMaker(is_collective=True)
     fleet.init(role)
     strategy = DistributeTranspilerConfig()
     avg_cost = self.net(args)
     losses = self.do_training(fleet,args)
     losses = "" if not losses else losses
     print(losses)
Example No. 23
 def run_trainer(self, args):
     """run trainer"""
     from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
     import paddle.fluid as fluid
     from paddle.fluid.transpiler.ps_dispatcher import RoundRobin
     from paddle.fluid.transpiler.ps_dispatcher import HashName
     fluid.default_startup_program().random_seed = 1
     fluid.default_main_program().random_seed = 1
     if args.role.upper() != "TRAINER":
         raise ValueError("args role must be TRAINER")
     role = role_maker.UserDefinedRoleMaker(
         current_id=args.current_id,
         role=role_maker.Role.WORKER,
         worker_num=args.trainers,
         server_endpoints=args.endpoints.split(","))
     fleet.init(role)
     strategy = DistributeTranspilerConfig()
     strategy.sync_mode = args.run_params["sync_mode"]
     strategy.async_mode = args.run_params["async_mode"]
     strategy.mode = "pserver"
     strategy.slice_var_up = args.run_params['slice_var_up']
     strategy.enable_dc_asgd = args.run_params['enable_dc_asgd']
      if args.run_params['split_method']:
          strategy.split_method = HashName
      else:
          strategy.split_method = RoundRobin
     strategy.wait_port = args.run_params['wait_port']
     strategy.runtime_split_send_recv = args.run_params['runtime_split_send_recv']
     strategy.use_hierarchical_allreduce = args.run_params['use_hierarchical_allreduce']
    # strategy.hierarchical_allreduce_exter_nranks = args.run_params['hierarchical_allreduce_exter_nranks']
    # strategy.hierarchical_allreduce_inter_nranks = args.run_params['hierarchical_allreduce_inter_nranks']
     strategy.geo_sgd_mode = args.run_params['geo_sgd']
     strategy.geo_sgd_need_push_nums = args.run_params['push_nums']
     avg_cost = self.net()
     optimizer = fluid.optimizer.SGD(LEARNING_RATE)
     optimizer = fleet.distributed_optimizer(optimizer, strategy)
     optimizer.minimize(avg_cost)
     losses = self.do_training(fleet, args)
     losses = "" if not losses else losses
     print(losses)
Example No. 24
 def _set_strategy(self, args):
     """配置运行的distributed_strategy, 
        build_strategy 配置在do_training中"""
     if int(os.getenv("PADDLE_COMPATIBILITY_CHECK", '0')):
         self.strategy = DistributeTranspilerConfig()
         if args.run_params["sync_mode"] == "sync":
             self.strategy.sync_mode = True
             self.strategy.runtime_split_send_recv = False
             self.async_mode = False
         elif args.run_params["sync_mode"] == "half_async":
             self.strategy.sync_mode = False
             self.async_mode = False
         elif args.run_params["sync_mode"] == "async":
             self.strategy.sync_mode = False
             self.async_mode = True
         elif args.run_params["sync_mode"] == "geo_async":
             self.strategy.sync_mode = False
             self.async_mode = True
             self.strategy.geo_sgd_mode = True
             self.strategy.geo_sgd_need_push_nums = 400
         self.strategy.mode = "pserver"
         self.strategy.slice_var_up = args.run_params['slice_var_up']
         self.strategy.enable_dc_asgd = args.run_params['enable_dc_asgd']
         #TODO: split_method=HashName, it will cause a bug, this option can open after repair
         # if args.run_params['split_method']:
         #    self.strategy.split_method = HashName
         # else:
         #    self.strategy.split_method = RoundRobin
         self.strategy.wait_port = args.run_params['wait_port']
         self.strategy.runtime_split_send_recv = args.run_params[
             'runtime_split_send_recv']
         self.strategy.use_hierarchical_allreduce = args.run_params[
             'use_hierarchical_allreduce']
         self.strategy.geo_sgd_need_push_nums = args.run_params['push_nums']
     else:
         self.strategy = StrategyFactory.create_sync_strategy()
         # trainer_runtime_config = TrainerRuntimeConfig()
         # trainer_runtime_config.send_queue_size = "16"
         # trainer_runtime_config.thread_pool_size="32"
         # trainer_runtime_config.max_merge_var_num="16"
         # trainer_runtime_config.is_sgd_communicator="0"
         if args.run_params["sync_mode"] == "sync":
             self.strategy = StrategyFactory.create_sync_strategy()
         elif args.run_params["sync_mode"] == "half_async":
             self.strategy = StrategyFactory.create_half_async_strategy()
         elif args.run_params["sync_mode"] == "async":
             self.strategy = StrategyFactory.create_async_strategy()
             build_strategy = self.strategy.get_build_strategy()
             build_strategy.memory_optimize = False
             self.strategy.set_build_strategy(build_strategy)
         elif args.run_params["sync_mode"] == "geo_async":
             self.strategy = StrategyFactory.create_geo_strategy(400)
         program_config = self.strategy.get_program_config()
         program_config.slice_var_up = args.run_params['slice_var_up']
         program_config.enable_dc_asgd = args.run_params['enable_dc_asgd']
         #TODO: split_method=HashName, it will cause a bug, this option can open after repair
         # if args.run_params['split_method']:
         #    program_config.split_method = HashName
         # else:
         #    program_config.split_method = RoundRobin
         program_config.wait_port = args.run_params['wait_port']
         program_config.runtime_split_send_recv = args.run_params[
             'runtime_split_send_recv']
         program_config.use_hierarchical_allreduce = args.run_params[
             'use_hierarchical_allreduce']
         program_config.geo_sgd_need_push_nums = args.run_params[
             'push_nums']
Example No. 25
class FleetDistRunnerBase(object):
    """dist fleet case runner base."""
    def __init__(self, batch_num=5, batch_size=32):
        self.batch_num = batch_num
        self.batch_size = batch_size
        self.async_mode = False  # used to build the BuildStrategy for Paddle 1.6; may change or be removed as Paddle evolves

    def _set_strategy(self, args):
        """配置运行的distributed_strategy, 
           build_strategy 配置在do_training中"""
        if int(os.getenv("PADDLE_COMPATIBILITY_CHECK", '0')):
            self.strategy = DistributeTranspilerConfig()
            if args.run_params["sync_mode"] == "sync":
                self.strategy.sync_mode = True
                self.strategy.runtime_split_send_recv = False
                self.async_mode = False
            elif args.run_params["sync_mode"] == "half_async":
                self.strategy.sync_mode = False
                self.async_mode = False
            elif args.run_params["sync_mode"] == "async":
                self.strategy.sync_mode = False
                self.async_mode = True
            elif args.run_params["sync_mode"] == "geo_async":
                self.strategy.sync_mode = False
                self.async_mode = True
                self.strategy.geo_sgd_mode = True
                self.strategy.geo_sgd_need_push_nums = 400
            self.strategy.mode = "pserver"
            self.strategy.slice_var_up = args.run_params['slice_var_up']
            self.strategy.enable_dc_asgd = args.run_params['enable_dc_asgd']
            #TODO: split_method=HashName, it will cause a bug, this option can open after repair
            # if args.run_params['split_method']:
            #    self.strategy.split_method = HashName
            # else:
            #    self.strategy.split_method = RoundRobin
            self.strategy.wait_port = args.run_params['wait_port']
            self.strategy.runtime_split_send_recv = args.run_params[
                'runtime_split_send_recv']
            self.strategy.use_hierarchical_allreduce = args.run_params[
                'use_hierarchical_allreduce']
            self.strategy.geo_sgd_need_push_nums = args.run_params['push_nums']
        else:
            self.strategy = StrategyFactory.create_sync_strategy()
            # trainer_runtime_config = TrainerRuntimeConfig()
            # trainer_runtime_config.send_queue_size = "16"
            # trainer_runtime_config.thread_pool_size="32"
            # trainer_runtime_config.max_merge_var_num="16"
            # trainer_runtime_config.is_sgd_communicator="0"
            if args.run_params["sync_mode"] == "sync":
                self.strategy = StrategyFactory.create_sync_strategy()
            elif args.run_params["sync_mode"] == "half_async":
                self.strategy = StrategyFactory.create_half_async_strategy()
            elif args.run_params["sync_mode"] == "async":
                self.strategy = StrategyFactory.create_async_strategy()
                build_strategy = self.strategy.get_build_strategy()
                build_strategy.memory_optimize = False
                self.strategy.set_build_strategy(build_strategy)
            elif args.run_params["sync_mode"] == "geo_async":
                self.strategy = StrategyFactory.create_geo_strategy(400)
            program_config = self.strategy.get_program_config()
            program_config.slice_var_up = args.run_params['slice_var_up']
            program_config.enable_dc_asgd = args.run_params['enable_dc_asgd']
            #TODO: split_method=HashName, it will cause a bug, this option can open after repair
            # if args.run_params['split_method']:
            #    program_config.split_method = HashName
            # else:
            #    program_config.split_method = RoundRobin
            program_config.wait_port = args.run_params['wait_port']
            program_config.runtime_split_send_recv = args.run_params[
                'runtime_split_send_recv']
            program_config.use_hierarchical_allreduce = args.run_params[
                'use_hierarchical_allreduce']
            program_config.geo_sgd_need_push_nums = args.run_params[
                'push_nums']
            # self.strategy.set_trainer_runtime_config(trainer_runtime_config)

    def run_pserver(self, args):
        """
        run pserver process, you don't need to implement it.
        Args:
            args (ArgumentParser): run args to config dist fleet.
        """
        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
        if args.role.upper() != "PSERVER":
            raise ValueError("args role must be PSERVER")
        role = role_maker.UserDefinedRoleMaker(
            current_id=args.current_id,
            role=role_maker.Role.SERVER,
            worker_num=args.trainers,
            server_endpoints=args.endpoints.split(","))
        fleet.init(role)
        self._set_strategy(args)
        avg_cost = self.net(args)
        optimizer = fluid.optimizer.SGD(LEARNING_RATE)
        optimizer = fleet.distributed_optimizer(optimizer, self.strategy)
        optimizer.minimize(avg_cost)
        fleet.init_server(model_dir=args.run_params.get("model_dir", ""))
        fleet.run_server()

    def run_trainer(self, args):
        """
        run trainer process, you don't need to implement it.
        Args:
            args (ArgumentParser): run args to config dist fleet.
        """
        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
        if args.role.upper() != "TRAINER":
            raise ValueError("args role must be TRAINER")
        role = role_maker.UserDefinedRoleMaker(
            current_id=args.current_id,
            role=role_maker.Role.WORKER,
            worker_num=args.trainers,
            server_endpoints=args.endpoints.split(","))
        fleet.init(role)
        self._set_strategy(args)
        avg_cost = self.net(args)
        optimizer = fluid.optimizer.SGD(LEARNING_RATE)
        optimizer = fleet.distributed_optimizer(optimizer, self.strategy)
        optimizer.minimize(avg_cost)
        if args.run_params.get("run_from_dataset", False):
            losses = self.do_training_from_dataset(fleet, args)
        else:
            losses = self.do_training(fleet, args)
        losses = "" if not losses else losses
        print(losses)

    def run_nccl_trainer(self, args):
        """
        run nccl trainer, used for gpu case.
        Args:
            args (ArgumentParser): run args to config dist fleet.
        """
        assert args.update_method == "nccl"
        from paddle.fluid.incubate.fleet.collective import fleet
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = args.run_params['num_threads']
        #dist_strategy = DistributedStrategy()
        #dist_strategy.exec_strategy = exec_strategy
        #dist_strategy.fuse_memory_size = 1  # MB
        #dist_strategy.fuse_laryer_size = 1
        if args.role.upper() != "TRAINER":
            raise ValueError("args role must be TRAINER")
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        avg_cost = self.net(args)
        losses = self.do_training(fleet, args)
        losses = "" if not losses else losses
        print(losses)

    def net(self, args=None):
        """
        construct model's net. Each model has its own unique network.
        Args:
            args (ArgumentParser): run args to config dist fleet.
        """
        raise NotImplementedError(
            "get_model should be implemented by child classes.")

    def do_training(self, fleet, args=None):
        """
        training from pyreader.
        Args:
            fleet:
            args (ArgumentParser): run args to config dist fleet.
        """
        raise NotImplementedError(
            "do_training should be implemented by child classes.")

    def do_training_from_dataset(self, fleet, args=None):
        """
        training from dataset.
        Args:
            fleet:
            args (ArgumentParser): run args to config dist fleet.
        """
        raise NotImplementedError(
            "do_training should be implemented by child classes.")

    def py_reader(self):
        """use py_reader."""
        raise NotImplementedError(
            "py_reader should be implemented by child classes.")

    def dataset_reader(self):
        """use dataset_reader."""
        raise NotImplementedError(
            "dataset_reader should be implemented by child classes.")
Example No. 26
def train(use_cuda, save_dirname, is_local, is_increment):
    """
    train
    """
    # predict, avg_cost, feed_order, auc_var, auc_batch, auc_states = model()
    old_model = None
    model_args = model()
    predict = model_args['predict']
    avg_cost = model_args['avg_cost']
    feed_order = model_args['feed_order']
    loader = model_args['loader']
    auc_batch = model_args['auc'][1]

    # Wrap with fleet distributed_optimizer to apply the distributed strategy and multi-node optimization
    sgd_optimizer = AdamOptimizer(learning_rate=2e-4)
    # sgd_optimizer = fluid.optimizer.Adam(learning_rate=2e-5)

    if is_local:
        sgd_optimizer.minimize(avg_cost)
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

        exe = Executor(place)
        readers = []
        for i in range(16):
            readers.append(data_reader(cluster_train_dir))
        multi_readers = paddle.reader.multiprocess_reader(readers)
        loader.set_sample_generator(
            multi_readers, batch_size=BATCH_SIZE, places=fluid.cpu_places(CPU_NUM))
            # data_reader(cluster_train_dir), batch_size=BATCH_SIZE, places=fluid.cpu_places(CPU_NUM))
        # feeder = fluid.DataFeeder(feed_order, place)
        # train_reader = feeder.decorate_reader(
        #     paddle.batch(paddle.reader.shuffle(
        #         data_reader(cluster_train_dir), buf_size=8192), batch_size=BATCH_SIZE),
        #          multi_devices=False, drop_last=True)

        start_program = fluid.default_startup_program()
        exe.run(start_program)
        main_prog = fluid.default_main_program()

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = CPU_NUM * 2
        build_strategy = fluid.BuildStrategy()
        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce # cpu reduce faster
        build_strategy.fuse_broadcast_ops = True
        # build_strategy.async_mode = True
        main_program = fluid.CompiledProgram(main_prog).with_data_parallel(
            loss_name=avg_cost.name, exec_strategy=exec_strategy, build_strategy=build_strategy)
            #loss_name=avg_cost.name, exec_strategy=exec_strategy, build_strategy=build_strategy, places=fluid.cpu_places(CPU_NUM))

        if is_increment:  # load model to fine-tune
            fluid.io.load_params(exe, old_model, main_program)
            for auc_state in model_args['auc'][2]:
                set_zero(place, fluid.global_scope(), auc_state.name)

        # Parallel training for faster speed
        # train_pe = fluid.ParallelExecutor(use_cuda=use_cuda,
        #                                   main_program=main_program, loss_name=avg_cost.name,
        #                                   exec_strategy=exec_strategy, build_strategy=build_strategy)

        cost_list = []
        auc_list = []
        import time
        pass_s_time = time.time()
        for pass_id in range(PASS_NUM):
            s_time = time.time()
            for batch_id, data in enumerate(loader()):
                r_time = time.time() - s_time
                st_time = time.time()
                cost_value, auc_value = exe.run(
                    program=main_program,
                    feed=data,
                    fetch_list=[avg_cost.name, auc_batch.name])
                t_time = time.time() - st_time
                cost_list.append(np.array(cost_value))
                auc_list.append(np.array(auc_value))

                if batch_id % 10 == 0 and batch_id != 0:
                    print "Pass %d, batch %d, cost %s auc %s readtime %f triantime %f" % \
                          (pass_id, batch_id, np.array(cost_list).mean(),
                           np.array(auc_list).mean(), r_time, t_time)
                    cost_list = []
                    auc_list = []
                if batch_id % 1000 == 0:
                    if save_dirname is not None:
                        fluid.io.save_inference_model(
                            save_dirname,
                            feed_order,
                            [predict, avg_cost, auc_batch], exe
                        )
                        fluid.io.save_persistables(exe, save_dirname)
                        infer(cluster_test_dir, save_dirname, feed_order)
                s_time = time.time()
        pass_time = time.time() - pass_s_time
        print("Pass train time: %f" % pass_time)

    else:
        role = role_maker.PaddleCloudRoleMaker()
        # Fully asynchronous training
        config = DistributeTranspilerConfig()
        config.sync_mode = False
        config.runtime_split_send_recv = True
        # Initialize the distributed environment with fleet.init
        fleet.init(role)

        optimizer = fleet.distributed_optimizer(sgd_optimizer, config)
        optimizer.minimize(avg_cost)

        if fleet.is_server():
            fleet.init_server()
            fleet.run_server()
        # Start the worker
        if fleet.is_worker():
            # Initialize the worker configuration
            fleet.init_worker()

            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
            exe = Executor(place)

            feeder = fluid.DataFeeder(feed_order, place)
            train_reader = feeder.decorate_reader(
                paddle.batch(paddle.reader.shuffle(
                    data_reader(cluster_train_dir), buf_size=8192), batch_size=BATCH_SIZE),
                multi_devices=False, drop_last=True)

            exe.run(fleet.startup_program)

            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.num_threads = CPU_NUM
            build_strategy = fluid.BuildStrategy()
            build_strategy.async_mode = True

            if CPU_NUM > 1:
                build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce

            compiled_prog = fluid.compiler.CompiledProgram(
                fleet.main_program).with_data_parallel(
                loss_name=avg_cost.name, build_strategy=build_strategy, exec_strategy=exec_strategy)

            for pass_id in range(PASS_NUM):
                cost_list = []
                auc_list = []
                import time
                s_time = time.time()
                for batch_id, data in enumerate(train_reader()):
                    r_time = time.time() - s_time
                    st_time = time.time()
                    cost_value, auc_value = exe.run(
                        program=compiled_prog, feed=data,
                        fetch_list=[avg_cost.name, auc_batch.name])
                    t_time = time.time() - st_time
                    cost_list.append(np.array(cost_value))
                    auc_list.append(np.array(auc_value))

                    if batch_id % 10 == 0 and batch_id != 0:
                        print "Pass %d, batch %d, cost %s auc %s readtime %f traintime %f" % \
                              (pass_id, batch_id, np.array(cost_list).mean(),
                               np.array(auc_list).mean(), r_time, t_time)
                        cost_list = []
                        auc_list = []
                    if batch_id % 1000 == 0 and fleet.is_first_worker():
                        if save_dirname is not None:
                            fleet.save_inference_model(
                                exe,
                                save_dirname,
                                feed_order,
                                [predict, avg_cost, auc_batch]
                            )
                            fleet.save_persistables(exe, save_dirname)
                            infer(cluster_test_dir, save_dirname, feed_order)
                    s_time = time.time()
        fleet.stop_worker()
Example No. 27
def train(args):
    datas, avg_cost, predict, train_file_path = model()

    endpoints = args.endpoints.split(",")
    if args.role.upper() == "PSERVER":
        current_id = endpoints.index(args.current_endpoint)
    else:
        current_id = 0
    role = role_maker.UserDefinedRoleMaker(
        current_id=current_id,
        role=role_maker.Role.WORKER
        if args.role.upper() == "TRAINER" else role_maker.Role.SERVER,
        worker_num=args.trainers,
        server_endpoints=endpoints)

    exe = fluid.Executor(fluid.CPUPlace())
    fleet.init(role)

    strategy = DistributeTranspilerConfig()
    strategy.sync_mode = False

    optimizer = fluid.optimizer.SGD(learning_rate=0.0001)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)

    if fleet.is_server():
        logger.info("run pserver")

        fleet.init_server()
        fleet.run_server()
    elif fleet.is_worker():
        logger.info("run trainer")

        fleet.init_worker()
        exe.run(fleet.startup_program)

        thread_num = 2
        filelist = []
        for _ in range(thread_num):
            filelist.append(train_file_path)

        # config dataset
        dataset = fluid.DatasetFactory().create_dataset()
        dataset.set_batch_size(128)
        dataset.set_use_var(datas)
        pipe_command = 'python ctr_dataset_reader.py'
        dataset.set_pipe_command(pipe_command)

        dataset.set_filelist(filelist)
        dataset.set_thread(thread_num)

        for epoch_id in range(10):
            logger.info("epoch {} start".format(epoch_id))
            pass_start = time.time()
            dataset.set_filelist(filelist)
            exe.train_from_dataset(
                program=fleet.main_program,
                dataset=dataset,
                fetch_list=[avg_cost],
                fetch_info=["cost"],
                print_period=100,
                debug=False)
            pass_time = time.time() - pass_start
            logger.info("epoch {} finished, pass_time {}".format(epoch_id,
                                                                 pass_time))
        fleet.stop_worker()
Example No. 28
dataset.set_hdfs_config("hdfs://192.168.48.87:9000", "root,")
optimizer = fluid.optimizer.SGD(0.0001)
#optimizer.minimize(avg_cost)
exe = fluid.Executor(fluid.CPUPlace())

input_folder = "hdfs:"
output = sp.check_output(
    "hdfs dfs -ls /train_data | awk '{if(NR>1) print $8}'", shell=True)
train_filelist = [
    "{}{}".format(input_folder, f)
    for f in output.decode('ascii').strip().split('\n')
]
role = role_maker.PaddleCloudRoleMaker()
fleet.init(role)

config = DistributeTranspilerConfig()
config.sync_mode = False

optimizer = fleet.distributed_optimizer(optimizer, config)
optimizer.minimize(avg_cost)

if fleet.is_server():
    fleet.init_server()
    fleet.run_server()
elif fleet.is_worker():
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    fleet.init_worker()
    exe.run(fluid.default_startup_program())
    print("startup program done.")
    fleet_filelist = fleet.split_files(train_filelist)
Example No. 29
class FLDistributeTranspiler(object):
    """
    **FLDistributeTranspiler**

    Convert the fluid program to distributed data-parallelism programs.

    In pserver mode, the trainers' main program does the forward, backward and optimization;
    the pserver's main_program will sum and scale.


    Examples:
        .. code-block:: python

            x = fluid.layers.data(name='x', shape=[13], dtype='float32')
            y = fluid.layers.data(name='y', shape=[1], dtype='float32')
            y_predict = fluid.layers.fc(input=x, size=1, act=None)

            cost = fluid.layers.square_error_cost(input=y_predict, label=y)
            avg_loss = fluid.layers.mean(cost)

            sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
            sgd_optimizer.minimize(avg_loss)

            # for pserver mode
            pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174"
            trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174"
            current_endpoint = "192.168.0.1:6174"
            trainer_id = 0
            trainers = 4
            role = "PSERVER"
            t = fluid.FlDistributeTranspiler()
            t.transpile(
                 trainer_id, pservers=pserver_endpoints, trainers=trainers)
            if role == "PSERVER":
                 pserver_program = t.get_pserver_program(current_endpoint)
                 pserver_startup_program = t.get_startup_program(current_endpoint,
                                                                pserver_program)
            elif role == "TRAINER":
                 trainer_program = t.get_trainer_program()

    """
    def __init__(self, config=None):
        if config is not None:
            self.config = config
        else:
            self.config = DistributeTranspilerConfig()

        if self.config.split_method is None:
            self.config.split_method = RoundRobin

        global PRINT_LOG
        if self.config.print_log:
            PRINT_LOG = True
        assert (self.config.min_block_size >= 8192)
        assert (self.config.split_method.__bases__[0] == PSDispatcher)

    def _get_all_remote_sparse_update_op(self, main_program):
        sparse_update_ops = []
        sparse_update_op_types = [
            "lookup_table", "nce", "hierarchical_sigmoid"
        ]
        for op in main_program.global_block().ops:
            if op.type in sparse_update_op_types and op.attr(
                    'remote_prefetch') is True:
                sparse_update_ops.append(op)
        return sparse_update_ops

    def transpile(self,
                  trainer_id,
                  program=None,
                  pservers="127.0.0.1:6174",
                  trainers=1,
                  sync_mode=True,
                  startup_program=None,
                  current_endpoint="127.0.0.1:6174"):
        """
        Run the transpiler. Transpile the input program.

        Args:
            trainer_id (int): id for current trainer worker, if you have
                n workers, the id may range from 0 ~ n-1
            program (Program|None): program to transpile,
                default is fluid.default_main_program().
            startup_program (Program|None): startup_program to transpile,
                default is fluid.default_startup_program().
            pservers (str): comma separated ip:port string for the pserver
                list.
            trainers (int|str): in pserver mode this is the number of
                trainers.
            sync_mode (bool): Do sync training or not, default is True.
            current_endpoint (str): In pserver mode
                this argument is not used.

        Examples:
            .. code-block:: python

                t = fluid.DistributeTranspiler()
                t.transpile(
                    trainer_id=0,
                    pservers="127.0.0.1:7000,127.0.0.1:7001",
                    trainers=2,
                    sync_mode=False,
                    current_endpoint="127.0.0.1:7000")
        """
        if program is None:
            program = default_main_program()
        if startup_program is None:
            startup_program = default_startup_program()
        self.origin_program = program
        self.startup_program = startup_program
        self.origin_startup_program = self.startup_program.clone()

        self.trainer_num = trainers
        self.sync_mode = sync_mode
        self.trainer_id = trainer_id
        pserver_endpoints = pservers.split(",")
        self.pserver_endpoints = pserver_endpoints
        self.vars_overview = VarsDistributed()
        self.optimize_ops, self.params_grads = self._get_optimize_pass()

        ps_dispatcher = self.config.split_method(self.pserver_endpoints)
        self.table_name = find_distributed_lookup_table(self.origin_program)
        self.has_distributed_lookup_table = self.table_name != None
        self.param_name_to_grad_name = dict()
        self.grad_name_to_param_name = dict()
        for param_var, grad_var in self.params_grads:
            self.param_name_to_grad_name[param_var.name] = grad_var.name
            self.grad_name_to_param_name[grad_var.name] = param_var.name

        # get all sparse update ops
        self.sparse_update_ops = self._get_all_remote_sparse_update_op(
            self.origin_program)
        # use_sparse_update_param_name -> split_height_section
        self.sparse_param_to_height_sections = dict()

        # add distributed attrs to program
        self.origin_program._is_distributed = True
        self.origin_program._endpoints = self.pserver_endpoints
        self.origin_program._ps_endpoint = current_endpoint
        self.origin_program._is_chief = self.trainer_id == 0
        self.origin_program._distributed_lookup_table = self.table_name if self.table_name else None

        # step 1: split and create vars, then put the split vars in dicts for later use.
        self._init_splited_vars()

        # step 2: insert send op to send gradient vars to parameter servers
        ps_dispatcher.reset()
        send_vars = []

        # in general cases, the number of pservers is a multiple of 2, and this
        # can lead to an uneven distribution between weights and biases:
        #       fc_w@GRAD_trainer_0, fc_w@GRAD_trainer_1 --> pserver1
        #       fc_b@GRAD_trainer_0, fc_b@GRAD_trainer_1 --> pserver2
        # shuffling the map avoids the uneven distribution above

        self.opti_name_to_send_dummy_out = dict()
        self.recv_program = self.origin_program.clone()
        all_ops = []
        for op in self.recv_program.global_block().ops:
            all_ops.append(op)
        delete_ops(self.recv_program.global_block(), all_ops)

        self.split_num = len(program.global_block().ops)
        for opti_varname in self._opti_var_list:
            opti_var = program.global_block().var(opti_varname)
            eplist = ps_dispatcher.dispatch([opti_var])

            dummy_output = program.global_block().create_var(
                name=framework.generate_control_dev_var_name())
            self.opti_name_to_send_dummy_out[opti_varname] = dummy_output

            program.global_block().append_op(
                type="send",
                inputs={"X": [opti_var]},
                outputs={"Out": dummy_output},
                attrs={
                    "epmap":
                    eplist,
                    RPC_OP_ROLE_ATTR_NAME:
                    RPC_OP_ROLE_ATTR_VALUE,
                    OP_ROLE_VAR_ATTR_NAME:
                    [self._opti_to_param[opti_varname], opti_varname],
                    "sync_mode":
                    not self.sync_mode,
                })
            send_vars.append(opti_var)

        if self.sync_mode:
            send_barrier_out = program.global_block().create_var(
                name=framework.generate_control_dev_var_name())
            input_deps = list(self.opti_name_to_send_dummy_out.values())

            program.global_block().append_op(type="send_barrier",
                                             inputs={"X": list(input_deps)},
                                             outputs={"Out": send_barrier_out},
                                             attrs={
                                                 "endpoints":
                                                 pserver_endpoints,
                                                 "sync_mode":
                                                 self.sync_mode,
                                                 "trainer_id":
                                                 self.trainer_id,
                                                 RPC_OP_ROLE_ATTR_NAME:
                                                 RPC_OP_ROLE_ATTR_VALUE
                                             })

        # step 3: insert recv op to receive parameters from parameter server
        recv_vars = []
        for _, var in enumerate(send_vars):
            recv_vars.append(program.global_block().var(
                self._opti_to_param[var.name]))
        ps_dispatcher.reset()
        eplist = ps_dispatcher.dispatch(recv_vars)
        for i, ep in enumerate(eplist):
            self.param_grad_ep_mapping[ep]["params"].append(recv_vars[i])
            self.param_grad_ep_mapping[ep]["opti"].append(send_vars[i])

            distributed_var = self.vars_overview.get_distributed_var_by_slice(
                recv_vars[i].name)
            distributed_var.endpoint = ep

        # step4: Concat the parameters splits together after recv.
        all_recv_outputs = []
        for opti_varname in self._opti_var_list:
            opti_var = program.global_block().var(opti_varname)
            param_varname = self._opti_to_param[opti_varname]
            param_var = program.global_block().var(param_varname)
            eps = []
            table_names = []
            index = [v.name for v in recv_vars].index(param_varname)
            eps.append(eplist[index])
            table_names.append(var.name)
            if self.sync_mode:
                recv_dep_in = send_barrier_out
            # get recv op_role_var; if not split, the grad should have the .trainer suffix,
            # if split, grad should be the original grad var name. ParallelExecutor
            # will use op_role_var to get the expected device place to run this op.

            all_recv_outputs.extend([param_var])
            self.recv_program.global_block().append_op(
                type="recv",
                inputs={"X": []},
                outputs={"Out": [param_var]},
                attrs={
                    "epmap": eps,
                    "trainer_id": self.trainer_id,
                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE,
                    OP_ROLE_VAR_ATTR_NAME: [param_varname, opti_varname],
                    "sync_mode": not self.sync_mode
                })

        if self.sync_mode:
            # form a WAW dependency
            self.recv_program.global_block()._insert_op(
                index=len(self._opti_var_list),
                type="fetch_barrier",
                inputs={},
                outputs={"Out": all_recv_outputs},
                attrs={
                    "endpoints": pserver_endpoints,
                    "trainer_id": self.trainer_id,
                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
                })

        self._get_trainer_startup_program(recv_vars=recv_vars, eplist=eplist)

        self._get_distributed_optimizer_vars()
        self.origin_program._parameters_on_pservers = self.vars_overview

    def get_trainer_program(self, wait_port=True):
        """
        Get transpiled trainer side program.

        Returns:
            tuple: (recv_program, origin_program, send_program) for the trainer side.
        """
        # remove optimize ops and add a send op to main_program
        # FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay?

        lr_ops = self._get_lr_ops()

        self.origin_program.__str__()

        self.send_program = self.origin_program.clone()
        compute_ops = self.send_program.global_block().ops[0:self.split_num]
        delete_ops(self.send_program.global_block(), compute_ops)
        send_ops = self.origin_program.global_block().ops[self.split_num:]
        delete_ops(self.origin_program.global_block(), send_ops)

        return self.recv_program, self.origin_program, self.send_program

    def _get_trainer_startup_program(self, recv_vars, eplist):
        """
        Get transpiled trainer side startup program.

        Args:
            recv_vars (list): Variable list to recv for current trainer_id
            eplist (list): A list of endpoint strings, one per recv var, indicating
                which pserver each variable is received from.

        Returns:
            Program: trainer side startup program.
        """
        startup_program = self.startup_program

        # FIXME(gongwb): delete ops that are not needed.
        # Note that some parameters are not trainable, so the ops for them can't be deleted.
        for opti_varname in self._opti_var_list:
            opti_var = self.origin_program.global_block().var(opti_varname)
            param_varname = self._opti_to_param[opti_varname]
            var = self.origin_program.global_block().var(param_varname)

            # Get the eplist of recv vars
            eps = []
            table_names = []
            index = [v.name for v in recv_vars].index(param_varname)
            eps.append(eplist[index])

        return startup_program

    def get_pserver_program(self, endpoint):
        """
        Get parameter server side program.

        Args:
            endpoint (str): current parameter server endpoint.

        Returns:
            Program: the program for current parameter server to run.
        """
        # TODO(panyx0718): Revisit this assumption. what if #blocks > #pservers.
        # NOTE: assume blocks of the same variable are not distributed
        # on the same pserver; only change param/grad varnames for
        # trainers to fetch.
        sys.stderr.write(
            "get_pserver_program() is deprecated, call get_pserver_programs() to get pserver main and startup in a single call.\n"
        )
        # step1
        pserver_program = Program()
        pserver_program.random_seed = self.origin_program.random_seed
        pserver_program._copy_dist_param_info_from(self.origin_program)

        # step2: Create vars to receive vars at parameter servers.
        recv_inputs = []
        for v in self.param_grad_ep_mapping[endpoint]["params"]:
            self._clone_var(pserver_program.global_block(), v)
        for v in self.param_grad_ep_mapping[endpoint]["opti"]:
            # create vars for each trainer in the global scope, so
            # we don't need to create them when the update arrives.
            # change the client side var name back to the origin name by
            # removing the ".opti.trainer_%d" suffix
            suff_idx = v.name.find(".opti.trainer_")
            if suff_idx >= 0:
                orig_var_name = v.name[:suff_idx]
            else:
                orig_var_name = v.name
            # NOTE: single_trainer_var must be created for multi-trainer
            # case to merge grads from multiple trainers
            single_trainer_var = pserver_program.global_block().var(
                orig_var_name)

            if self.sync_mode and self.trainer_num > 1:
                for trainer_id in range(self.trainer_num):
                    var = pserver_program.global_block().create_var(
                        name="%s.opti.trainer_%d" %
                        (orig_var_name, trainer_id),
                        persistable=False,
                        type=v.type,
                        dtype=v.dtype,
                        shape=v.shape)
                    recv_inputs.append(var)

        # step 3
        # Create a union-find data structure from optimize ops,
        # If two ops are connected, we could add these two ops
        # into one set.
        ufind = self._create_ufind(self.optimize_ops)
        # step 3.2
        # Iterate through the ops and collect the optimize ops that are
        # located on the current pserver
        opt_op_on_pserver = []
        for _, op in enumerate(self.optimize_ops):
            if self._is_optimizer_op(op) and self._is_opt_op_on_pserver(
                    endpoint, op):
                opt_op_on_pserver.append(op)

        # step 3.4
        # Iterate through the ops; if an op is in the same set as the
        # optimize ops located on the current pserver, append it to
        # the sub program.

        global_ops = []

        # sparse grad name to param name
        sparse_grad_to_param = []

        # append lr decay ops to a child block if any exist
        lr_ops = self._get_lr_ops()
        # record optimize blocks so they can be run on the pserver in parallel
        opti_blocks = []

        # append op to the current block
        grad_to_block_id = []
        pre_block_idx = pserver_program.num_blocks - 1
        for idx, opt_op in enumerate(self._opti_var_list):
            per_opt_block = pserver_program._create_block(pre_block_idx)
            opti_blocks.append(per_opt_block)
            optimize_target_param_name = self._opti_to_param[opt_op]
            pserver_block = per_opt_block.program.global_block()
            # append grad merging ops before clip and weight decay
            # e.g. merge grad -> L2Decay op -> clip op -> optimize
            merged_var = pserver_block.vars[optimize_target_param_name]
            if self.sync_mode and self.trainer_num > 1:
                vars2merge = []
                for i in range(self.trainer_num):
                    per_trainer_name = "%s.opti.trainer_%d" % \
                                       (optimize_target_param_name, i)
                    vars2merge.append(pserver_block.vars[per_trainer_name])
                per_opt_block.append_op(type="sum",
                                        inputs={"X": vars2merge},
                                        outputs={"Out": merged_var},
                                        attrs={"use_mkldnn": False})
                per_opt_block.append_op(
                    type="scale",
                    inputs={"X": merged_var},
                    outputs={"Out": merged_var},
                    attrs={"scale": 1.0 / float(self.trainer_num)})

        # In some cases a parameter server will have no parameters to optimize,
        # so we give it an empty optimize block.
        attrs = {
            "optimize_blocks": opti_blocks,
            "endpoint": endpoint,
            "Fanin": self.trainer_num,
            "sync_mode": self.sync_mode,
        }

        # step 5: append the listen_and_serv op
        pserver_program.global_block().append_op(type="fl_listen_and_serv",
                                                 inputs={'X': recv_inputs},
                                                 outputs={},
                                                 attrs=attrs)

        pserver_program._sync_with_cpp()
        # save the pserver program so the pserver side startup program can be generated from it later.
        self.pserver_program = pserver_program
        return pserver_program
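    # A minimal pserver-side usage sketch (hypothetical `t`, `exe` names and
    # endpoint value), pairing this method with get_startup_program() below:
    #
    #   pserver_prog = t.get_pserver_program("127.0.0.1:36011")
    #   pserver_startup = t.get_startup_program("127.0.0.1:36011", pserver_prog)
    #   exe.run(pserver_startup)
    #   exe.run(pserver_prog)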

    def get_startup_program(self,
                            endpoint,
                            pserver_program=None,
                            startup_program=None):
        """
        **Deprecated**

        Get startup program for current parameter server.
        Modify operator input variables if there are variables that
        were split into several blocks.

        Args:
            endpoint (str): current pserver endpoint.
            pserver_program (Program): deprecated, call get_pserver_program first.
            startup_program (Program): deprecated, should pass startup_program
                when initializing

        Returns:
            Program: parameter server side startup program.
        """
        s_prog = Program()
        orig_s_prog = self.startup_program
        s_prog.random_seed = orig_s_prog.random_seed
        params = self.param_grad_ep_mapping[endpoint]["params"]

        def _get_splited_name_and_shape(varname):
            for idx, splited_param in enumerate(params):
                pname = splited_param.name
                if same_or_split_var(pname, varname) and varname != pname:
                    return pname, splited_param.shape
            return "", []

        # 1. clone vars from the pserver program into the startup program
        pserver_vars = pserver_program.global_block().vars
        created_var_map = collections.OrderedDict()
        for _, var in six.iteritems(pserver_vars):
            tmpvar = s_prog.global_block()._clone_variable(var)
            created_var_map[var.name] = tmpvar

        # 2. rename op outputs
        for op in orig_s_prog.global_block().ops:
            new_outputs = collections.OrderedDict()
            # do not append startup op if var is not on this pserver
            op_on_pserver = False
            # TODO(gongwb): remove this line.
            if op.type not in ["recv", "fetch_barrier", "concat"]:
                for key in op.output_names:
                    newname, _ = _get_splited_name_and_shape(op.output(key)[0])
                    if newname:
                        op_on_pserver = True
                        new_outputs[key] = created_var_map[newname]
                    elif op.output(key)[0] in pserver_vars:
                        op_on_pserver = True
                        new_outputs[key] = pserver_vars[op.output(key)[0]]

            if op_on_pserver:
                # most startup program ops have no inputs
                new_inputs = self._get_input_map_from_op(pserver_vars, op)

                if op.type in [
                        "gaussian_random", "fill_constant", "uniform_random",
                        "truncated_gaussian_random"
                ]:
                    op._set_attr("shape", list(new_outputs["Out"].shape))
                s_prog.global_block().append_op(type=op.type,
                                                inputs=new_inputs,
                                                outputs=new_outputs,
                                                attrs=op.all_attrs())

        return s_prog

    # ====================== private transpiler functions =====================
    def _get_slice_var_info(self, slice_var):
        block_suffix = "block"
        block_idx = 0
        offset = 0
        is_slice = False

        orig_var_name, block_name, _ = self._get_varname_parts(slice_var.name)

        if not block_name:
            return is_slice, block_idx, offset

        # the var is a slice like "xxx.block3": parse the block index and
        # compute the flattened offset of this block inside the original var
        block_idx = int(block_name.split(block_suffix)[1])
        slice_vars = self.param_var_mapping[orig_var_name]
        orig_dim1_flatten = 1
        for dim in slice_vars[0].shape[1:]:
            orig_dim1_flatten *= dim
        offset = sum(v.shape[0] for v in slice_vars[:block_idx]) * orig_dim1_flatten
        is_slice = True
        return is_slice, block_idx, offset

    def _get_distributed_optimizer_vars(self):
        def _get_distributed_optimizer_var(endpoint):
            opt_op_on_pserver = []
            for _, op in enumerate(self.optimize_ops):
                if self._is_optimizer_op(op) and self._is_opt_op_on_pserver(
                        endpoint, op):
                    opt_op_on_pserver.append(op)

            for opt_op in opt_op_on_pserver:
                dist_var = None
                for key in opt_op.input_names:
                    if key == "Param":
                        param_name = opt_op.input(key)[0]
                        dist_var = self.vars_overview.get_distributed_var_by_origin_and_ep(
                            param_name, endpoint)
                        break
                for key in opt_op.input_names:
                    if key in ["Param", "Grad", "LearningRate"]:
                        continue

        for ep in self.pserver_endpoints:
            _get_distributed_optimizer_var(ep)

    def _update_dist_lookup_table_vars(self, param_list, grad_list,
                                       params_grads):
        # TODO(wuyi): find a way to put dist lookup table stuff all together.
        # update self.table_param_grad and self.trainer_side_table_grad_list
        program = self.origin_program
        return param_list, grad_list

    def _init_splited_vars(self):
        # update these mappings for further transpile:
        # 1. param_var_mapping: param var name -> [splited params vars]
        # 2. grad_var_mapping: grad var name -> [splited grads vars]
        # 3. grad_param_mapping: grad.blockx -> param.blockx
        # 4. param_grad_ep_mapping: ep -> {"params": [], "opti": []}

        param_list = []
        grad_list = []
        param_grad_set = set()
        for p, g in self.params_grads:
            # skip parameter marked not trainable
            if type(p) == Parameter and p.trainable == False:
                continue
            if p.name not in param_grad_set:
                param_list.append(p)
                param_grad_set.add(p.name)
            if g.name not in param_grad_set:
                grad_list.append(g)
                param_grad_set.add(g.name)

        # TODO: consider the lookup table later
        param_list, grad_list = self._update_dist_lookup_table_vars(
            param_list, grad_list, self.params_grads)

        if self.config.slice_var_up:
            # when we slice var up into blocks, we will slice the var according to
            # pserver services' count. A pserver may have two or more listening ports.
            grad_blocks = slice_variable(grad_list,
                                         len(self.pserver_endpoints),
                                         self.config.min_block_size)
            param_blocks = slice_variable(param_list,
                                          len(self.pserver_endpoints),
                                          self.config.min_block_size)
        else:
            # when we do NOT slice var up into blocks, each var is kept in a
            # single block so it lands on exactly one pserver.
            grad_blocks = slice_variable(grad_list, 1,
                                         self.config.min_block_size)
            param_blocks = slice_variable(param_list, 1,
                                          self.config.min_block_size)
        assert (len(grad_blocks) == len(param_blocks))

        # origin_param_name -> [splited_param_vars]
        self.param_var_mapping = self._create_vars_from_blocklist(
            self.origin_program, param_blocks)

        for orig_name, splited_vars in self.param_var_mapping.items():
            orig_var = self.origin_program.global_block().var(orig_name)
            for splited_var in splited_vars:
                is_slice, block_id, offset = self._get_slice_var_info(
                    splited_var)

                self.vars_overview.add_distributed_var(origin_var=orig_var,
                                                       slice_var=splited_var,
                                                       block_id=block_id,
                                                       offset=offset,
                                                       is_slice=is_slice,
                                                       vtype="Param")

        # origin_grad_name -> [splited_grad_vars]
        self.grad_var_mapping = self._create_vars_from_blocklist(
            self.origin_program, grad_blocks)
        #add_trainer_suffix=self.trainer_num > 1)
        # dict(grad_splited_var -> param_splited_var)
        self.grad_param_mapping = collections.OrderedDict()
        for g, p in zip(grad_blocks, param_blocks):
            g_name, g_bid, _ = g.split(":")
            p_name, p_bid, _ = p.split(":")
            self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] = \
                self.param_var_mapping[p_name][int(p_bid)]

        # create mapping of endpoint -> split var to create pserver side program
        self.param_grad_ep_mapping = collections.OrderedDict()
        for ep in self.pserver_endpoints:
            self.param_grad_ep_mapping[ep] = {"params": [], "opti": []}
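        # e.g. with two pserver endpoints this produces one empty entry per
        # endpoint ({"params": [], "opti": []}); the entries are filled by the
        # recv-var dispatch in step 3 of the transpile pass above.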

        opti_list = []
        opti_to_param = dict()
        param_to_opti = dict()
        for op in self.optimize_ops:
            if (op.type == "sgd") or (op.type == "adam") or (op.type
                                                             == "momentum"):
                origin_name = op.output("ParamOut")
                var = self.origin_program.global_block().var(origin_name[0])
                new_var_name = "%s.opti.trainer_%d" % (origin_name[0],
                                                       self.trainer_id)
                self.origin_program.global_block().create_var(
                    name=new_var_name,
                    persistable=True,
                    shape=var.shape,
                    dtype=var.dtype,
                    type=var.type,
                    lod_level=var.lod_level)
                new_var = self.origin_program.global_block().var(new_var_name)
                opti_list.append(new_var.name)
                opti_to_param[new_var.name] = var.name
                param_to_opti[var.name] = new_var.name
                self.origin_program.global_block().append_op(
                    type="scale",
                    inputs={"X": var},
                    outputs={"Out": new_var},
                    attrs={"scale": 1.0})
        self._param_to_opti = param_to_opti
        self._opti_to_param = opti_to_param
        self._opti_var_list = opti_list

    def _create_vars_from_blocklist(self,
                                    program,
                                    block_list,
                                    add_trainer_suffix=False):
        """
        Create vars for each split.
        NOTE: only grads need to be named for different trainers, use
              add_trainer_suffix to rename the grad vars.
        Args:
            program (ProgramDesc): ProgramDesc which the gradients belong to.
            block_list (list[(varname, block_id, block_size)]): List of gradient blocks.
            add_trainer_suffix (Bool): Add trainer suffix to new variable's name if set True.
        Returns:
            var_mapping (collections.OrderedDict(varname->[new_varname_variable])):A dict mapping
                from original var name to each var split.
        """

        # varname->[(block_id, current_block_size)]
        block_map = collections.OrderedDict()

        var_mapping = collections.OrderedDict()
        for block_str in block_list:
            varname, offset, size = block_str.split(":")
            if varname not in block_map:
                block_map[varname] = []
            block_map[varname].append((int(offset), int(size)))
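        # e.g. a block string such as "fc_0.w_0:0:4096" (hypothetical name)
        # denotes block 0 of "fc_0.w_0" with 4096 elements; variables with a
        # single block are mapped back to the original var below.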

        for varname, splited in six.iteritems(block_map):
            orig_var = program.global_block().var(varname)
            if len(splited) == 1:
                var_mapping[varname] = \
                        [program.global_block().var(orig_var.name)]
                continue
        return var_mapping

    def _clone_var(self, block, var, persistable=True):
        return block.create_var(name=var.name,
                                shape=var.shape,
                                dtype=var.dtype,
                                type=var.type,
                                lod_level=var.lod_level,
                                persistable=persistable)

    def _get_varname_parts(self, varname):
        # returns origin, blockid, trainerid
        orig_var_name = ""
        trainer_part = ""
        block_part = ""
        trainer_idx = varname.find(".trainer_")
        if trainer_idx >= 0:
            trainer_part = varname[trainer_idx + 1:]
        else:
            trainer_idx = len(varname)
        block_index = varname.find(".block")
        if block_index >= 0:
            block_part = varname[block_index + 1:trainer_idx]
        else:
            block_index = len(varname)
        orig_var_name = varname[0:min(block_index, trainer_idx)]
        return orig_var_name, block_part, trainer_part
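    # For example (hypothetical variable name):
    #   _get_varname_parts("fc_0.w_0.block0.trainer_1")
    #   returns ("fc_0.w_0", "block0", "trainer_1")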

    def _is_op_connected(self, op1, op2):
        # If one op's input is another op's output or
        # one op's output is another op's input, we say
        # the two operators are connected.
        if set(op1.desc.output_arg_names()) & set(op2.desc.input_arg_names()) or \
                set(op1.desc.input_arg_names()) & set(op2.desc.output_arg_names()):
            return True
        return False

    def _create_ufind(self, optimize_ops):
        # Create a union-find data structure from the optimize ops
        ufind = UnionFind(optimize_ops)
        for i in range(len(optimize_ops)):
            for j in range(i, len(optimize_ops)):
                op1 = optimize_ops[i]
                op2 = optimize_ops[j]
                if self._is_op_connected(op1, op2):
                    ufind.union(op1, op2)
        return ufind
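    # e.g. two optimize ops that share a variable (one's output is the other's
    # input) are unioned into the same set, so related ops can later be placed
    # on the same pserver.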

    def _is_optimizer_op(self, op):
        if "Param" in op.input_names and \
                "LearningRate" in op.input_names:
            return True
        return False

    def _is_opt_op_on_pserver(self, endpoint, op):
        param_names = [
            p.name for p in self.param_grad_ep_mapping[endpoint]["params"]
        ]
        if op.input("Param")[0] in param_names:
            return True

    def _get_input_map_from_op(self, varmap, op):
        """Returns a dict from op input name to the vars in varmap."""
        iomap = collections.OrderedDict()
        for key in op.input_names:
            in_vars = [varmap[varname] for varname in op.input(key)]
            iomap[key] = in_vars[0] if len(in_vars) == 1 else in_vars
        return iomap

    def _get_lr_ops(self):
        lr_ops = []
        block = self.origin_program.global_block()
        for op in block.ops:
            role_id = int(op.attr(RPC_OP_ROLE_ATTR_NAME))
        return lr_ops

    def _is_opt_role_op(self, op):
        # NOTE: depend on the op role attribute to find out whether this op
        # is an optimize op
        op_maker = core.op_proto_and_checker_maker
        optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize
        if op_maker.kOpRoleAttrName() in op.attr_names and \
                int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(optimize_role):
            return True
        return False

    def _get_optimize_pass(self):
        """
        Get optimizer operators, parameters and gradients from origin_program
        Returns:
            opt_ops (list): optimize operators.
            params_grads (list): a list of [parameter, gradient] pairs.
        """
        block = self.origin_program.global_block()
        opt_ops = []
        params_grads = []
        # tmp set to dedup
        optimize_params = set()
        origin_var_dict = self.origin_program.global_block().vars
        for op in block.ops:
            if self._is_opt_role_op(op):
                opt_ops.append(op)
                if op.attr(OP_ROLE_VAR_ATTR_NAME):
                    param_name = op.attr(OP_ROLE_VAR_ATTR_NAME)[0]
                    grad_name = op.attr(OP_ROLE_VAR_ATTR_NAME)[1]
                    if param_name not in optimize_params:
                        optimize_params.add(param_name)
                        log("adding param_grad pair: ", param_name, grad_name)
                        params_grads.append([
                            origin_var_dict[param_name],
                            origin_var_dict[grad_name]
                        ])
        return opt_ops, params_grads
Exemplo n.º 30
0
def fit():
    role = role_maker.UserDefinedRoleMaker(
        current_id=current_id,
        role=role_maker.Role.WORKER if int(roles) == 1 else role_maker.Role.SERVER,
        worker_num=2,
        server_endpoints=["127.0.0.1:36011"])
    fleet.init(role)
    BATCH_SIZE = 128
    type_size = createDataList(in_file_path, in_file_path + '.data' + "/")
    # data providers for training and testing
    train_reader = paddle.batch(
        reader=paddle.reader.shuffle(reader=dataReader(in_file_path + ".data/trainer.list"),
                                     buf_size=BATCH_SIZE * 100),
        batch_size=BATCH_SIZE)
    test_reader = paddle.batch(
        reader=paddle.reader.shuffle(reader=dataReader(in_file_path + ".data/test.list"),
                                     buf_size=BATCH_SIZE * 100),
        batch_size=BATCH_SIZE)
    data_shape = [3, 32, 32]
    images = fluid.layers.data(name='images', shape=data_shape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    # build the classifier
    predict = networkConfiguration(images,type_size)

    # define the loss function and accuracy
    cost = fluid.layers.cross_entropy(input=predict, label=label)   # cross entropy
    avg_cost = fluid.layers.mean(cost)                              # mean over all elements of cost
    acc = fluid.layers.accuracy(input=predict, label=label)         # accuracy from predictions and labels

    # clone the test program before the optimizer modifies the main program
    test_program = fluid.default_main_program().clone(for_test=True)
    # define the optimizer and wrap it with fleet for distributed (sync) training
    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
    strategy = DistributeTranspilerConfig()
    strategy.sync_mode = True
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)

    if fleet.is_server():
        print("启动server")
        fleet.init_server()
        fleet.run_server()

    elif fleet.is_worker():
        print("启动worker")
        fleet.init_worker()
        print(fleet.worker_endpoints())
        ########## 模型训练&模型评估 ##########
        # 创建Executor
        use_cuda = False # 定义使用CPU还是GPU,使用CPU时use_cuda=False
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        print("cpu")
        # 定义数据映射器
        feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
        print("数据映射")
        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
        for pass_id in range(EPOCH_NUM):
            print(pass_id)
            # start training
            for batch_id, data in enumerate(train_reader()):                            # iterate over train_reader
                train_cost, train_acc = exe.run(program=fluid.default_main_program(),   # run the main program
                                                feed=feeder.feed(data),                 # feed one batch of data
                                                fetch_list=[avg_cost, acc])             # fetch the cost and accuracy
                # print training cost and accuracy every 20 batches
                if batch_id % 20 == 0:
                    print('Pass:%d, Batch:%d, Cost:%0.5f, Accuracy:%0.5f' % (pass_id, batch_id, train_cost[0], train_acc[0]))
            # start testing
            test_costs = []  # cost of each test batch
            test_accs = []   # accuracy of each test batch
            for batch_id, data in enumerate(test_reader()):
                test_cost, test_acc = exe.run(program=test_program,          # run the test program
                                              feed=feeder.feed(data),        # feed data
                                              fetch_list=[avg_cost, acc])    # fetch the cost and accuracy
                test_costs.append(test_cost[0])                              # record the cost of each batch
                test_accs.append(test_acc[0])                                # record the accuracy of each batch

            test_cost = sum(test_costs) / len(test_costs)  # average test cost
            test_acc = sum(test_accs) / len(test_accs)     # average test accuracy
            print('Test:%d, Cost:%0.5f, ACC:%0.5f' % (pass_id, test_cost, test_acc))
        save(predict, model_file_path, exe)
        fleet.stop_worker()
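
# A minimal launch sketch (hypothetical values): `current_id`, `roles`,
# `in_file_path`, `model_file_path` and `EPOCH_NUM` are expected to be defined
# elsewhere in the script (e.g. parsed from the command line) before fit() is
# called, one process per role:
#
#   current_id, roles = 0, "0"   # roles == "0" makes this process the parameter server
#   fit()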