Exemplo n.º 1
0
 def test_pslib_1(self):
     """Test cases for pslib.

     Builds a tiny click-prediction network, initializes the PSLib fleet
     runtime on localhost, then exercises MPISymetricRoleMaker._all_reduce
     error paths on a role maker that was never initialized.  Skips
     (returns early) when netifaces or pslib support is unavailable.
     """
     import paddle.fluid as fluid
     from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
     from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
     from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
     try:
         import netifaces
     except ImportError:
         # GeneralRoleMaker needs netifaces; skip rather than fail.
         print("warning: no netifaces, skip test_pslib_1")
         return
     # Single-process "cluster": trainer and pserver both on localhost.
     os.environ["POD_IP"] = "127.0.0.1"
     os.environ["PADDLE_PORT"] = "36001"
     os.environ["TRAINING_ROLE"] = "TRAINER"
     os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
     os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
     os.environ["PADDLE_TRAINER_ID"] = "0"
     role_maker = GeneralRoleMaker()
     role_maker.generate_role()
     place = fluid.CPUPlace()
     exe = fluid.Executor(place)
     fleet.init(role_maker)
     train_program = fluid.Program()
     startup_program = fluid.Program()
     scope = fluid.Scope()
     with fluid.program_guard(train_program, startup_program):
         # show -> fc -> log_loss(click): minimal trainable graph.
         show = fluid.layers.data(name="show", shape=[-1, 1], \
             dtype="float32", lod_level=1, append_batch_size=False)
         fc = fluid.layers.fc(input=show, size=1, act=None)
         label = fluid.layers.data(name="click", shape=[-1, 1], \
             dtype="int64", lod_level=1, append_batch_size=False)
         label_cast = fluid.layers.cast(label, dtype='float32')
         cost = fluid.layers.log_loss(fc, label_cast)
     try:
         adam = fluid.optimizer.Adam(learning_rate=0.000005)
         adam = fleet.distributed_optimizer(adam)
         adam.minimize([cost], [scope])
         fleet.run_server()
     except Exception:
         # Environments without pslib support raise here; treat as skip.
         print("do not support pslib test, skip")
         return
     fleet.clear_one_table(0)
     from paddle.fluid.incubate.fleet.base.role_maker import \
         MPISymetricRoleMaker
     # _all_reduce must raise on a role maker that was never initialized,
     # and "unknown" must raise for an unsupported reduction type.  The
     # original repeated this try/except four times; fold it into one
     # data-driven loop (same calls, same printed messages, same order).
     cases = [
         ((), "catch expected error of not inited"),
         (("min",), "catch expected error of not inited"),
         (("max",), "catch expected error of not inited"),
         (("unknown",), "catch expected error of unknown type"),
     ]
     for extra_args, msg in cases:
         try:
             role = MPISymetricRoleMaker()
             role._all_reduce([1], [2], *extra_args)
         except Exception:
             print(msg)
Exemplo n.º 2
0
    def do_training(self, args=None):
        """Run one distributed PSLib training pass and return the loss trace.

        Builds the network from ``self.net()``, wraps Adam with fleet's
        distributed strategy, then acts as parameter server and/or worker
        depending on the role fleet assigns.  Workers stream the training
        files through an InMemoryDataset and collect "loss" values via a
        periodic FetchHandler.

        Args:
            args: unused placeholder, kept for interface compatibility.

        Returns:
            list: loss values observed by the fetch handler (empty on a
            pure-server process).
        """
        avg_cost = self.net()
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        fleet.init()
        # Wrap the optimizer with the distributed strategy / multi-machine
        # optimization config from the fleet descriptor file.
        optimizer = fluid.optimizer.Adam(learning_rate=0.001)
        optimizer = fleet.distributed_optimizer(
            optimizer,
            strategy={
                "fleet_desc_file": "./thirdparty/pslib/fleet_desc.prototxt"
            })
        optimizer.minimize(avg_cost)
        train_info = []
        # Start the parameter server when this process holds that role.
        if fleet.is_server():
            fleet.init_server()
            fleet.run_server()
        # Start the worker when this process holds that role.
        if fleet.is_worker():
            train_data_path = 'thirdparty/data/dist_data/pslib/train_data'
            train_data_files = [
                os.path.join(train_data_path, filename)
                for filename in os.listdir(train_data_path)
            ]
            # fleet dataset: declare the slots fed by the pipe command.
            label = fluid.layers.data(
                name="click", shape=[-1, 1], dtype="int64", lod_level=1,
                append_batch_size=False)
            data = fluid.layers.data(
                name="1", shape=[1], dtype="int64", lod_level=1)
            dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
            dataset.set_use_var([label, data])
            dataset.set_pipe_command(
                "./python/bin/python ./thirdparty/pslib/dataset_generator.py")
            dataset.set_batch_size(32)
            dataset.set_thread(3)
            dataset.set_filelist(train_data_files)
            # Read the data into memory, then shuffle locally.
            dataset.load_into_memory()
            dataset.local_shuffle()
            # Initialize worker-side configuration.
            fleet.init_worker()
            exe.run(fluid.default_startup_program())
            PASS_NUM = 1
            for pass_id in range(PASS_NUM):
                var_dict = {"loss": avg_cost}
                # NOTE(fix): the original declared ``global var_dict`` AFTER
                # the local assignment above — a compile-time SyntaxError.
                # Nothing reads a global; the handler only needs the
                # enclosing-scope train_info, so the declaration is removed.

                class FetchVars(fluid.executor.FetchHandler):
                    def __init__(self, var_dict=None, period_secs=2):
                        # Fix: forward period_secs instead of hard-coding 2,
                        # so the parameter actually takes effect.
                        super(FetchVars, self).__init__(
                            var_dict, period_secs=period_secs)

                    def handler(self, res_dict):
                        # Accumulate each periodically fetched loss batch.
                        train_info.extend(res_dict["loss"])
                        print(train_info)

                exe.train_from_dataset(
                    program=fluid.default_main_program(),
                    dataset=dataset,
                    fetch_handler=FetchVars(var_dict))
            dataset.release_memory()
            # Post-training table maintenance and cleanup.
            fleet.shrink_sparse_table()
            fleet.shrink_dense_table(0.01, 11)
            fleet.print_table_stat(0)
            fleet.clear_one_table(0)
            fleet.clear_model()
        fleet.stop_worker()
        return train_info