def test_pslib_1(self):
    """Test cases for pslib.

    Exercises fleet.distributed_optimizer with the DownpourUnitAccessor
    sparse accessor class and each supported embed_sparse_optimizer
    setting ("naive", "adagrad", "adam"), calling minimize() once per
    variant. Skips silently when netifaces is missing or when the local
    build does not support pslib.
    """
    import paddle.fluid as fluid
    from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
    from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
    from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
    try:
        # GeneralRoleMaker needs netifaces to inspect local interfaces.
        import netifaces
    except:
        print("warning: no netifaces, skip test_pslib_1")
        return
    # Fake a single-trainer / single-pserver environment for the role maker.
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    role_maker = GeneralRoleMaker()
    role_maker.generate_role()
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    fleet.init(role_maker)
    train_program = fluid.Program()
    startup_program = fluid.Program()
    scope = fluid.Scope()
    with fluid.program_guard(train_program, startup_program):
        # Minimal net: distributed sparse embedding -> fc -> log_loss.
        show = fluid.layers.data(name="show", shape=[-1, 1],
                                 dtype="int64", lod_level=1,
                                 append_batch_size=False)
        emb = fluid.layers.embedding(
            input=show, size=[1, 1],
            is_sparse=True, is_distributed=True,
            param_attr=fluid.ParamAttr(name="embedding"))
        fc = fluid.layers.fc(input=emb, size=1, act=None)
        label = fluid.layers.data(name="click", shape=[-1, 1],
                                  dtype="int64", lod_level=1,
                                  append_batch_size=False)
        label_cast = fluid.layers.cast(label, dtype='float32')
        cost = fluid.layers.log_loss(fc, label_cast)
    strategy = {}
    strategy["embedding"] = {}
    strategy["embedding"]["sparse_accessor_class"] = "DownpourUnitAccessor"
    strategy["embedding"]["embed_sparse_optimizer"] = "naive"
    try:
        # One minimize() per embed_sparse_optimizer variant; any failure
        # means pslib is not supported in this build.
        adam1 = fluid.optimizer.Adam(learning_rate=0.000005)
        adam1 = fleet.distributed_optimizer(adam1, strategy=strategy)
        adam1.minimize([cost], [scope])
        strategy["embedding"]["embed_sparse_optimizer"] = "adagrad"
        adam2 = fluid.optimizer.Adam(learning_rate=0.000005)
        adam2 = fleet.distributed_optimizer(adam2, strategy=strategy)
        adam2.minimize([cost], [scope])
        strategy["embedding"]["embed_sparse_optimizer"] = "adam"
        adam3 = fluid.optimizer.Adam(learning_rate=0.000005)
        adam3 = fleet.distributed_optimizer(adam3, strategy=strategy)
        adam3.minimize([cost], [scope])
    except:
        print("do not support pslib test, skip")
        return
def processor_register(self):
    """Register the context processors that drive each training status.

    Servers run the server pass chain; workers run the train/terminal
    chain. Both roles share the 'uninit' and 'init_pass' processors.
    """
    role = MPISymetricRoleMaker()
    fleet.init(role)
    if fleet.is_server():
        passes = [
            ('uninit', self.instance),
            ('init_pass', self.init),
            ('server_pass', self.server),
        ]
    else:
        passes = [
            ('uninit', self.instance),
            ('init_pass', self.init),
            ('train_pass', self.train),
            ('terminal_pass', self.terminal),
        ]
    # Register in declaration order, same as the original explicit calls.
    for status, handler in passes:
        self.regist_context_processor(status, handler)
def test_pslib_1(self):
    """Test cases for pslib.

    Builds a GeneralRoleMaker with explicit timeouts and an HTTP
    endpoint, trains a trivial fc/log_loss net, then exercises the role
    maker's private KV-server entry point. Skips when netifaces is
    missing or pslib is unsupported.
    """
    import paddle.fluid as fluid
    from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
    from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
    from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
    try:
        # GeneralRoleMaker needs netifaces to inspect local interfaces.
        import netifaces
    except:
        print("warning: no netifaces, skip test_pslib_1")
        return
    # Fake a single-trainer / single-pserver environment.
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    role_maker = GeneralRoleMaker(
        init_timeout_seconds=100,
        run_timeout_seconds=100,
        http_ip_port="127.0.0.1:36003")
    role_maker.generate_role()
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    fleet.init(role_maker)
    train_program = fluid.Program()
    startup_program = fluid.Program()
    scope = fluid.Scope()
    with fluid.program_guard(train_program, startup_program):
        # Dense-only net (no embedding): fc over the raw float input.
        show = fluid.layers.data(name="show", shape=[-1, 1],
                                 dtype="float32", lod_level=1,
                                 append_batch_size=False)
        fc = fluid.layers.fc(input=show, size=1, act=None)
        label = fluid.layers.data(name="click", shape=[-1, 1],
                                  dtype="int64", lod_level=1,
                                  append_batch_size=False)
        label_cast = fluid.layers.cast(label, dtype='float32')
        cost = fluid.layers.log_loss(fc, label_cast)
    try:
        adam = fluid.optimizer.Adam(learning_rate=0.000005)
        adam = fleet.distributed_optimizer(adam)
        adam.minimize([cost], [scope])
        fleet.run_server()
        # Drive the name-mangled private KV-server helper directly;
        # "running": False makes it exit immediately.
        http_server_d = {}
        http_server_d["running"] = False
        size_d = {}
        role_maker._GeneralRoleMaker__start_kv_server(http_server_d, size_d)
    except:
        print("do not support pslib test, skip")
        return
def test_dataset_fleet2(self):
    """
    Testcase for InMemoryDataset from create to run.

    Writes two small text files as input, builds a fake cost from data
    slots, wires up the fleet distributed optimizer with HDFS-style
    strategy keys, runs the dataset pipeline (load + global_shuffle),
    and cleans up fleet globals and the temp files afterwards.
    """
    with open("test_in_memory_dataset2_run2_a.txt", "w") as f:
        data = "1 1 2 3 3 4 5 5 5 5 1 1\n"
        data += "1 2 2 3 4 4 6 6 6 6 1 2\n"
        data += "1 3 2 3 5 4 7 7 7 7 1 3\n"
        f.write(data)
    with open("test_in_memory_dataset2_run2_b.txt", "w") as f:
        data = "1 4 2 3 3 4 5 5 5 5 1 4\n"
        data += "1 5 2 3 4 4 6 6 6 6 1 5\n"
        data += "1 6 2 3 5 4 7 7 7 7 1 6\n"
        data += "1 7 2 3 6 4 8 8 8 8 1 7\n"
        f.write(data)
    train_program = fluid.Program()
    startup_program = fluid.Program()
    scope = fluid.Scope()
    from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
    with fluid.program_guard(train_program, startup_program):
        slots = ["slot1_ff", "slot2_ff", "slot3_ff", "slot4_ff"]
        slots_vars = []
        for slot in slots:
            var = fluid.layers.data(
                name=slot, shape=[1], dtype="float32", lod_level=1)
            slots_vars.append(var)
        # Fake cost: difference of first and last slot, reduced to a mean.
        fake_cost = \
            fluid.layers.elementwise_sub(slots_vars[0], slots_vars[-1])
        fake_cost = fluid.layers.mean(fake_cost)
    with fluid.scope_guard(scope):
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        try:
            fleet.init()
        except ImportError as e:
            print("warning: no mpi4py")
        adam = fluid.optimizer.Adam(learning_rate=0.000005)
        try:
            # Strategy carries filesystem config only; values are dummies.
            adam = fleet.distributed_optimizer(
                adam,
                strategy={
                    "fs_uri": "fs_uri_xxx",
                    "fs_user": "******",
                    "fs_passwd": "fs_passwd_xxx",
                    "fs_hadoop_bin": "fs_hadoop_bin_xxx"
                })
            adam.minimize([fake_cost], [scope])
        except AttributeError as e:
            print("warning: no mpi")
        except ImportError as e:
            print("warning: no mpi4py")
        exe.run(startup_program)
        dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
        dataset.set_batch_size(32)
        dataset.set_thread(3)
        dataset.set_filelist([
            "test_in_memory_dataset2_run2_a.txt",
            "test_in_memory_dataset2_run2_b.txt"
        ])
        dataset.set_pipe_command("cat")
        dataset.set_use_var(slots_vars)
        dataset.load_into_memory()
        try:
            # global_shuffle needs a live fleet; without pslib it raises.
            dataset.global_shuffle(fleet)
        except:
            print("warning: catch expected error")
        # Reset fleet globals so later tests start from a clean slate.
        fleet._opt_info = None
        fleet._fleet_ptr = None
    os.remove("./test_in_memory_dataset2_run2_a.txt")
    os.remove("./test_in_memory_dataset2_run2_b.txt")
def test_pslib_1(self):
    """Test cases for pslib.

    Trains a trivial net under the fleet distributed optimizer, clears
    a pslib table, then checks that MPISymetricRoleMaker._all_reduce
    raises for every reduce type ("sum" default, "min", "max",
    "unknown") when the role maker has not been initialized.
    """
    import paddle.fluid as fluid
    from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
    from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
    from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
    try:
        # GeneralRoleMaker needs netifaces to inspect local interfaces.
        import netifaces
    except:
        print("warning: no netifaces, skip test_pslib_1")
        return
    # Fake a single-trainer / single-pserver environment.
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    role_maker = GeneralRoleMaker()
    role_maker.generate_role()
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    fleet.init(role_maker)
    train_program = fluid.Program()
    startup_program = fluid.Program()
    scope = fluid.Scope()
    with fluid.program_guard(train_program, startup_program):
        # Dense-only net: fc over the raw float input, log_loss cost.
        show = fluid.layers.data(name="show", shape=[-1, 1],
                                 dtype="float32", lod_level=1,
                                 append_batch_size=False)
        fc = fluid.layers.fc(input=show, size=1, act=None)
        label = fluid.layers.data(name="click", shape=[-1, 1],
                                  dtype="int64", lod_level=1,
                                  append_batch_size=False)
        label_cast = fluid.layers.cast(label, dtype='float32')
        cost = fluid.layers.log_loss(fc, label_cast)
    try:
        adam = fluid.optimizer.Adam(learning_rate=0.000005)
        adam = fleet.distributed_optimizer(adam)
        adam.minimize([cost], [scope])
        fleet.run_server()
    except:
        print("do not support pslib test, skip")
        return
    fleet.clear_one_table(0)
    from paddle.fluid.incubate.fleet.base.role_maker import \
        MPISymetricRoleMaker
    # Each _all_reduce below is expected to raise because the role maker
    # was never generate_role()'d; the test only verifies the error path.
    try:
        role = MPISymetricRoleMaker()
        role._all_reduce([1], [2])
    except:
        print("catch expected error of not inited")
    try:
        role = MPISymetricRoleMaker()
        role._all_reduce([1], [2], "min")
    except:
        print("catch expected error of not inited")
    try:
        role = MPISymetricRoleMaker()
        role._all_reduce([1], [2], "max")
    except:
        print("catch expected error of not inited")
    try:
        role = MPISymetricRoleMaker()
        role._all_reduce([1], [2], "unknown")
    except:
        print("catch expected error of unknown type")
def test_pslib_2(self):
    """Test cases for pslib.

    Broad coverage pass over GeneralRoleMaker: builds one role maker per
    gloo path and calls each public/private API once or twice, exercises
    the wrong-TRAINING_ROLE error path, runs a tiny InMemoryDataset
    through fleet size queries, and finally drives dummy Fleet /
    RoleMakerBase / MPISymetricRoleMaker stand-ins through their no-op
    collective methods.
    """
    import paddle.fluid as fluid
    from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
    from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
    from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
    from paddle.fluid.incubate.fleet.base.role_maker import RoleMakerBase
    try:
        # GeneralRoleMaker needs netifaces to inspect local interfaces.
        import netifaces
    except:
        print("warning: no netifaces, skip test_pslib_2")
        return
    # Fake a single-trainer / single-pserver environment.
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    os.environ["PADDLE_TRAINERS_NUM"] = "1"
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    try:
        fleet.init(None)
    except:
        print("no mpi4py, skip test_pslib_2")
        return
    train_program = fluid.Program()
    startup_program = fluid.Program()
    scope = fluid.Scope()
    with fluid.program_guard(train_program, startup_program):
        # Dense-only net: fc over the raw float input, log_loss cost.
        show = fluid.layers.data(name="show", shape=[-1, 1],
                                 dtype="float32", lod_level=1,
                                 append_batch_size=False)
        fc = fluid.layers.fc(input=show, size=1, act=None)
        label = fluid.layers.data(name="click", shape=[-1, 1],
                                  dtype="int64", lod_level=1,
                                  append_batch_size=False)
        label_cast = fluid.layers.cast(label, dtype='float32')
        cost = fluid.layers.log_loss(fc, label_cast)
    try:
        adam = fluid.optimizer.Adam(learning_rate=0.000005)
        adam = fleet.distributed_optimizer(adam)
        adam.minimize([cost], [scope])
        fleet.run_server()
    except:
        print("do not support pslib test, skip")
        return
    # An unrecognized TRAINING_ROLE must make generate_role() raise.
    os.environ["TRAINING_ROLE"] = "wrong"
    try:
        role1 = GeneralRoleMaker(path="./test_gloo_1")
        role1.generate_role()
    except:
        print("catch expected error of wrong TRAINING_ROLE")
    # From here on, one role maker per gloo path; most methods are called
    # twice to hit both the uninitialized and the cached code paths.
    os.environ["TRAINING_ROLE"] = "PSERVER"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
    role2 = GeneralRoleMaker(path="./test_gloo_2")
    role2._finalize()
    role2._all_gather(1)
    role2._all_gather(1)
    role2._barrier_server()
    role2.all_gather(1)
    role3 = GeneralRoleMaker(path="./test_gloo_3")
    role3._worker_gather(1)
    role3._worker_gather(1)
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
    role4 = GeneralRoleMaker(path="./test_gloo_4")
    role4._worker_gather(1)
    role4._get_rank()
    role4._get_size()
    role4._all_comm.init()
    role5 = GeneralRoleMaker(path="./test_gloo_5")
    role5.get_local_endpoint()
    role5.get_local_endpoint()
    role6 = GeneralRoleMaker(path="./test_gloo_6")
    role6.get_trainer_endpoints()
    role6.get_trainer_endpoints()
    role7 = GeneralRoleMaker(path="./test_gloo_7")
    role7.get_pserver_endpoints()
    role7.get_pserver_endpoints()
    role8 = GeneralRoleMaker(path="./test_gloo_8")
    role8.is_worker()
    role8.is_worker()
    role9 = GeneralRoleMaker(path="./test_gloo_9")
    role9.is_server()
    role9.is_server()
    role10 = GeneralRoleMaker(path="./test_gloo_10")
    role10.is_first_worker()
    role10.is_first_worker()
    role11 = GeneralRoleMaker(path="./test_gloo_11")
    role11.worker_index()
    role11.worker_index()
    role12 = GeneralRoleMaker(path="./test_gloo_12")
    role12.server_index()
    role12.server_index()
    role13 = GeneralRoleMaker(path="./test_gloo_13")
    role13.worker_num()
    role13.worker_num()
    role14 = GeneralRoleMaker(path="./test_gloo_14")
    role14.server_num()
    role14.server_num()
    role15 = GeneralRoleMaker(path="./test_gloo_15")
    role15._barrier_worker()
    role15._barrier_worker()
    role16 = GeneralRoleMaker(path="./test_gloo_16")
    role16._barrier_all()
    role16._barrier_all()
    role17 = GeneralRoleMaker(path="./test_gloo_17")
    role17._barrier_server()
    role17._barrier_server()
    role18 = GeneralRoleMaker(path="./test_gloo_18")
    role18._worker_num()
    role18._worker_num()
    role19 = GeneralRoleMaker(path="./test_gloo_19")
    role19._server_num()
    role19._server_num()
    role20 = GeneralRoleMaker(path="./test_gloo_20")
    a = [1]
    b = [0]
    role20._all_reduce(a, b)
    role21 = GeneralRoleMaker(path="./test_gloo_21")
    role21.all_reduce_worker([], [])
    role21.all_reduce_worker([], [])
    role21.barrier_worker()
    role21.barrier_all()
    role22 = GeneralRoleMaker(path="./test_gloo_22")
    role22._get_rank()
    role22._get_rank()
    os.environ["PADDLE_PSERVER_ID"] = "0"
    role23 = GeneralRoleMaker(path="./test_gloo_23")
    role23._get_size()
    role23._get_size()
    # Tiny dataset to exercise the fleet-aware size queries.
    with open("test_fleet_gloo_role_maker_1.txt", "w") as f:
        data = "1 1 1 1\n"
        f.write(data)
    dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
    dataset.set_filelist(["test_fleet_gloo_role_maker_1.txt"])
    dataset.set_use_var([show, label])
    dataset.load_into_memory()
    dataset.get_memory_data_size(fleet)
    dataset.get_shuffle_data_size(fleet)
    os.remove("./test_fleet_gloo_role_maker_1.txt")

    class TmpClass():
        """
        Dummy role-maker stand-in exposing the two collective methods
        Fleet forwards to.
        """

        def __init__(self):
            pass

        def all_reduce_worker(self, input, output):
            """
            Dummy all-reduce across workers.

            Args:
                input(None): fake input
                output(None): fake output
            """
            pass

        def barrier_worker(self):
            """
            Dummy worker barrier.
            """
            pass

    from paddle.fluid.incubate.fleet.base.fleet_base import Fleet

    class TmpFleet(Fleet):
        """
        Minimal concrete Fleet with every abstract method stubbed out.
        """

        def __init__(self):
            # NOTE(review): super(Fleet, self) skips Fleet.__init__
            # (it dispatches to Fleet's *base* class); looks like it was
            # meant to be super(TmpFleet, self) — harmless here only
            # because _role_maker is assigned manually. Verify upstream.
            super(Fleet, self).__init__()
            self._role_maker = None

        def init_worker(self):
            """
            Dummy init_worker.
            """
            pass

        def init_server(self, model_dir=None):
            """
            Dummy init_server.

            Args:
                model_dir(None): fake model_dir
            """
            pass

        def run_server(self):
            """
            Dummy run_server.
            """
            pass

        def stop_worker(self):
            """
            Dummy stop_worker.
            """
            pass

        def distributed_optimizer(self, optimizer, strategy=None):
            """
            Dummy distributed_optimizer.

            Args:
                optimizer(None): fake optimizer
                strategy(None): fake strategy
            """
            pass

        def save_inference_model(self):
            """
            Dummy save_inference_model.
            """
            pass

        def save_persistables(self):
            """
            Dummy save_persistables.
            """
            pass

    # Fleet should forward the collectives to whatever role maker it has.
    os.environ["TRAINING_ROLE"] = "TRAINER"
    tmp = TmpFleet()
    tmp._role_maker = TmpClass()
    tmp.all_reduce_worker([], [])
    tmp.barrier_worker()
    from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
    # RoleMakerBase's collectives are no-op defaults; call each once.
    tmp = RoleMakerBase()
    tmp.all_gather(1)
    tmp.all_reduce_worker([], [])
    tmp.barrier_worker()
    tmp.barrier_all()
    from paddle.fluid.incubate.fleet.base.role_maker import \
        MPISymetricRoleMaker
    tmp1 = MPISymetricRoleMaker()
    tmp1.all_gather(1)
    tmp1.all_gather(1)
    tmp2 = MPISymetricRoleMaker()
    tmp2.all_reduce_worker([], [])
    tmp3 = MPISymetricRoleMaker()
    tmp3.barrier_worker()
    tmp3.barrier_worker()
    tmp4 = MPISymetricRoleMaker()
    tmp4.barrier_all()
    tmp4.barrier_all()
def test_pslib_1(self):
    """Test cases for pslib.

    Covers the fleet_embedding context manager and the private
    _prepare_params / _fleet_embedding / _fleet_embedding_v2 helpers:
    the happy path with DownpourSparseValueAccessor, then one try/except
    per invalid-argument error path (missing param_attr, missing name,
    bad size, wrong sparse/distributed flags, bad dtype, unknown
    accessor, broken emb_to_table, bad embedx_dim).
    """
    import paddle.fluid as fluid
    from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
    from paddle.fluid.incubate.fleet.parameter_server.pslib import \
        fleet_embedding, _prepare_params, _fleet_embedding, \
        _fleet_embedding_v2, FLEET_GLOBAL_DICT
    from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
    try:
        # GeneralRoleMaker needs netifaces to inspect local interfaces.
        import netifaces
    except:
        print("warning: no netifaces, skip test_pslib_1")
        return
    # Fake a single-trainer / single-pserver environment.
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    role_maker = GeneralRoleMaker()
    role_maker.generate_role()
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    fleet.init(role_maker)
    train_program = fluid.Program()
    startup_program = fluid.Program()
    scope = fluid.Scope()
    # The test mutates this module-level registry throughout.
    global FLEET_GLOBAL_DICT
    with fluid.program_guard(train_program, startup_program):
        show = fluid.layers.data(name="show", shape=[-1, 1],
                                 dtype="int64", lod_level=1,
                                 append_batch_size=False)
        click = fluid.layers.data(name="click", shape=[-1, 1],
                                  dtype="int64", lod_level=1,
                                  append_batch_size=False)
        # fleet_embedding rewrites embedding creation inside the context.
        with fleet_embedding(click_name=click.name):
            emb = fluid.layers.embedding(
                input=show, size=[1, 1],
                is_sparse=True, is_distributed=True,
                param_attr=fluid.ParamAttr(name="embedding"))
        emb = fluid.layers.data_norm(
            input=emb,
            name="a",
            epsilon=1e-4,
            param_attr={
                "batch_size": 1e4,
                "batch_sum_default": 0.0,
                "batch_square": 1e4
            })
        fc = fluid.layers.fc(input=emb, size=1, act=None)
        # NOTE(review): this re-declares name "click" as a second data
        # layer; presumably intentional for the test, verify upstream.
        label = fluid.layers.data(name="click", shape=[-1, 1],
                                  dtype="int64", lod_level=1,
                                  append_batch_size=False)
        label_cast = fluid.layers.cast(label, dtype='float32')
        cost = fluid.layers.log_loss(fc, label_cast)
    try:
        adam = fluid.optimizer.Adam(learning_rate=0.000005)
        adam = fleet.distributed_optimizer(
            adam,
            strategy={
                "embedding": {
                    "sparse_accessor_class": "DownpourSparseValueAccessor"
                }
            })
        adam.minimize([cost], [scope])
    except:
        print("do not support pslib test, skip")
        return
    # ---- _prepare_params argument-validation error paths ----
    FLEET_GLOBAL_DICT["cur_accessor"] = "DownpourCtrAccessor"
    try:
        _prepare_params(input=show, size=[1, 1])
    except:
        print("catch expected exception of param_attr=None")
    try:
        _prepare_params(input=show, size=[1, 1],
                        param_attr=fluid.ParamAttr())
    except:
        print("catch expected exception of name=None")
    try:
        tmp = fluid.ParamAttr(name="embedding")
        _prepare_params(input=show, size=1, param_attr=tmp)
    except:
        print("catch expected exception of size not list")
    try:
        tmp = fluid.ParamAttr(name="embedding")
        _prepare_params(input=show, size=[-1, 12], param_attr=tmp)
    except:
        print("catch expected exception of size not equal")
    try:
        tmp = fluid.ParamAttr(name="embedding")
        _prepare_params(input=show, size=[-1, 1],
                        param_attr=tmp, is_sparse=False)
    except:
        print("catch expected exception of is_sparse=False")
    try:
        tmp = fluid.ParamAttr(name="embedding")
        _prepare_params(input=show, size=[-1, 1], param_attr=tmp,
                        is_sparse=True, is_distributed=False)
    except:
        print("catch expected exception of is_distributed=False")
    try:
        _prepare_params(input=show, size=[-1, 1],
                        param_attr=fluid.ParamAttr(name="embedding"),
                        is_sparse=True, is_distributed=True, dtype="abc")
    except:
        print("catch expected exception of unknown dtype")
    try:
        # Poison the accessor registry so lookup fails.
        FLEET_GLOBAL_DICT["emb_to_accessor"]["embedding"] = "unknown"
        tmp = fluid.ParamAttr(name="embedding")
        _prepare_params(input=show, size=[-1, 1], param_attr=tmp)
    except:
        print("catch expected exception of unknown accessor")
    # ---- _fleet_embedding / _fleet_embedding_v2 error paths ----
    FLEET_GLOBAL_DICT["cur_accessor"] = "DownpourCtrAccessor"
    try:
        _fleet_embedding(input=show, size=[-1, 1], is_sparse=True,
                         is_distributed=True, dtype="float32",
                         param_attr=fluid.ParamAttr(name="embedding"))
    except:
        print("catch expected exception of unknown accessor")
    try:
        _fleet_embedding_v2(input=show, size=[-1, 1], is_sparse=True,
                            is_distributed=True, dtype="float32",
                            param_attr=fluid.ParamAttr(name="embedding"))
    except:
        print("catch expected exception of unknown accessor")
    # ---- minimize() failures from a broken emb_to_table registry ----
    adam1 = fluid.optimizer.Adam(learning_rate=0.000005)
    adam1 = fleet.distributed_optimizer(
        adam1,
        strategy={
            "embedding": {
                "sparse_accessor_class": "DownpourSparseValueAccessor"
            }
        })
    try:
        pre = FLEET_GLOBAL_DICT["emb_to_table"]
        FLEET_GLOBAL_DICT["emb_to_table"] = {}
        adam1.minimize([cost], [scope])
    except:
        # Restore the registry before other tests use it.
        FLEET_GLOBAL_DICT["emb_to_table"] = pre
        print("catch expected exception of empty emb_to_table")
    try:
        pre = FLEET_GLOBAL_DICT["emb_to_table"]
        FLEET_GLOBAL_DICT["emb_to_table"] = {}
        FLEET_GLOBAL_DICT["emb_to_table"]["emb1"] = 0
        adam1.minimize([cost], [scope])
    except:
        FLEET_GLOBAL_DICT["emb_to_table"] = pre
        print("catch expected exception of error emb_to_table")
    try:
        # No supported embedding types -> minimize must fail.
        adam2 = fluid.optimizer.Adam(learning_rate=0.000005)
        adam2 = fleet.distributed_optimizer(adam2)
        adam2.supported_embedding_types = []
        adam2.minimize([cost], [scope])
    except:
        print("catch expected exception of embedding_types")
    try:
        # embedx_dim far larger than the embedding width must fail.
        adam3 = fluid.optimizer.Adam(learning_rate=0.000005)
        adam3 = fleet.distributed_optimizer(
            adam3,
            strategy={
                "embedding": {
                    "sparse_accessor_class": "DownpourSparseValueAccessor",
                    "sparse_embedx_dim": 999
                }
            })
        adam3.minimize([cost], [scope])
    except:
        print("catch expected exception of embedx_dim error")
    try:
        adam4 = fluid.optimizer.Adam(learning_rate=0.000005)
        adam4 = fleet.distributed_optimizer(
            adam4,
            strategy={
                "embedding": {
                    "sparse_accessor_class": "DownpourCtrAccessor",
                    "sparse_embedx_dim": 999
                }
            })
        adam4.minimize([cost], [scope])
    except:
        print("catch expected exception of embedx_dim error")
    # ---- fleet_embedding on a fresh program pair, both APIs ----
    train_program1 = fluid.Program()
    startup_program1 = fluid.Program()
    FLEET_GLOBAL_DICT["emb_to_accessor"] = {}
    with fluid.program_guard(train_program1, startup_program1):
        show = fluid.layers.data(name="show", shape=[-1, 1],
                                 dtype="int64", lod_level=1,
                                 append_batch_size=False)
        with fleet_embedding(click_name=click.name):
            emb = fluid.layers.embedding(
                input=show, size=[1, 1],
                is_sparse=True, is_distributed=True,
                param_attr=fluid.ParamAttr(name="embedding"))
        with fleet_embedding(click_name=click.name):
            emb1 = fluid.embedding(
                input=show, size=[1, 1],
                is_sparse=True, is_distributed=True,
                param_attr=fluid.ParamAttr(name="embedding"))
key_num = fleet_util.save_cache_model(config.output_path, day, pass_index) fleet_util.write_cache_donefile(config.output_path, day, pass_index, key_num, config.fs_name, config.fs_ugi) end = time.time() fleet_util.rank0_print("end save cache cost %s min, key_num=%s" % ((end - begin) / 60.0, key_num)) fleet_util.write_xbox_donefile(config.output_path, day, pass_index, xbox_base_key, ",".join(cur_path), config.fs_name, config.fs_ugi) if __name__ == "__main__": place = fluid.CPUPlace() exe = fluid.Executor(place) fleet.init(exe) slot_file = "slot/slot" slot_common_file = "slot/slot_common" all_slot_file = "all_slot.dict" join_common_model, update_model = create_model(slot_file, slot_common_file, all_slot_file) scope2 = fluid.Scope() scope3 = fluid.Scope() adjust_ins_weight = { "need_adjust": True, "nid_slot": "6002", "nid_adjw_threshold": 1000, "nid_adjw_ratio": 20, "ins_weight_slot": update_model.ins_weight.name }
def init(self, context):
    """Build models, optimizer, and datasets for every configured executor.

    Initializes fleet (with a gloo-backed GeneralRoleMaker when
    process_mode is 'brilliant_cpu'), creates one scope+model per
    executor entry, runs a joint minimize over all cost ops, runs each
    startup program, and — on workers only — constructs the time-split
    datasets and calls fleet.init_worker().

    Args:
        context(dict): shared trainer context; 'status' is set to
            'startup' here.

    Returns:
        0 when this process is a server (servers stop after startup);
        None otherwise.
    """
    role_maker = None
    if self.global_config.get('process_mode', 'mpi') == 'brilliant_cpu':
        # Non-MPI mode: coordinate through gloo files on AFS/HDFS.
        afs_config = self.global_config['io']['afs']
        role_maker = GeneralRoleMaker(
            hdfs_name=afs_config['fs_name'],
            hdfs_ugi=afs_config['fs_ugi'],
            path=self.global_config['output_path'] + "/gloo",
            init_timeout_seconds=1200,
            run_timeout_seconds=1200)
    fleet.init(role_maker)
    data_var_list = []
    data_var_name_dict = {}
    runnnable_scope = []
    runnnable_cost_op = []
    context['status'] = 'startup'
    # One scope + model per configured executor; collect metrics, cost
    # ops, and the union of data vars (deduplicated by name).
    for executor in self.global_config['executor']:
        scope = fluid.Scope()
        self._exector_context[executor['name']] = {}
        self._exector_context[executor['name']]['scope'] = scope
        self._exector_context[
            executor['name']]['model'] = model_basic.create(executor)
        model = self._exector_context[executor['name']]['model']
        self._metrics.update(model.get_metrics())
        runnnable_scope.append(scope)
        runnnable_cost_op.append(model.get_cost_op())
        for var in model._data_var:
            if var.name in data_var_name_dict:
                continue
            data_var_list.append(var)
            data_var_name_dict[var.name] = var
    # Single optimizer minimizes all cost ops, one per scope.
    optimizer = model_basic.YamlModel.build_optimizer({
        'metrics': self._metrics,
        'optimizer_conf': self.global_config['optimizer']
    })
    optimizer.minimize(runnnable_cost_op, runnnable_scope)
    for executor in self.global_config['executor']:
        scope = self._exector_context[executor['name']]['scope']
        model = self._exector_context[executor['name']]['model']
        program = model._build_param['model']['train_program']
        if not executor['is_update_sparse']:
            # Disable sparse pushes for executors that must not update
            # the sparse tables.
            program._fleet_opt["program_configs"][str(
                id(model.get_cost_op().block.program))]["push_sparse"] = []
        if 'train_thread_num' not in executor:
            # Fall back to the global thread count.
            executor['train_thread_num'] = self.global_config[
                'train_thread_num']
        with fluid.scope_guard(scope):
            self._exe.run(model._build_param['model']['startup_program'])
        model.dump_model_program('./')
    # server init done
    if fleet.is_server():
        return 0
    # Worker-only: build the time-split dataset for each data_list entry.
    self._dataset = {}
    for dataset_item in self.global_config['dataset']['data_list']:
        dataset_item['data_vars'] = data_var_list
        dataset_item.update(self.global_config['io']['afs'])
        dataset_item["batch_size"] = self.global_config['batch_size']
        self._dataset[dataset_item[
            'name']] = dataset.FluidTimeSplitDataset(dataset_item)
    # if config.need_reqi_changeslot and config.reqi_dnn_plugin_day >= last_day and config.reqi_dnn_plugin_pass >= last_pass:
    #     util.reqi_changeslot(config.hdfs_dnn_plugin_path, join_save_params, common_save_params, update_save_params, scope2, scope3)
    fleet.init_worker()
    pass
def test_pslib_1(self):
    """Test cases for pslib.

    Trains a sparse-embedding net under the DownpourCtrAccessor
    strategy, then stubs out the role-maker barrier/first-worker checks
    and drives every model save/load/confirm/revert entry point once.
    """
    import paddle.fluid as fluid
    from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
    from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
    from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
    try:
        # GeneralRoleMaker needs netifaces to inspect local interfaces.
        import netifaces
    except:
        print("warning: no netifaces, skip test_pslib_1")
        return
    # Fake a single-trainer / single-pserver environment.
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    role_maker = GeneralRoleMaker()
    role_maker.generate_role()
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    fleet.init(role_maker)
    train_program = fluid.Program()
    startup_program = fluid.Program()
    scope = fluid.Scope()
    with fluid.program_guard(train_program, startup_program):
        # Distributed sparse embedding -> fc -> log_loss.
        show = fluid.layers.data(name="show", shape=[-1, 1],
                                 dtype="int64", lod_level=1,
                                 append_batch_size=False)
        emb = fluid.layers.embedding(
            input=show, size=[1, 1],
            is_sparse=True, is_distributed=True,
            param_attr=fluid.ParamAttr(name="embedding"))
        fc = fluid.layers.fc(input=emb, size=1, act=None)
        label = fluid.layers.data(name="click", shape=[-1, 1],
                                  dtype="int64", lod_level=1,
                                  append_batch_size=False)
        label_cast = fluid.layers.cast(label, dtype='float32')
        cost = fluid.layers.log_loss(fc, label_cast)
    try:
        adam = fluid.optimizer.Adam(learning_rate=0.000005)
        adam = fleet.distributed_optimizer(
            adam,
            strategy={
                "embedding": {
                    "sparse_accessor_class": "DownpourCtrAccessor"
                }
            })
        adam.minimize([cost], [scope])
        fleet.run_server()
    except:
        print("do not support pslib test, skip")
        return
    try:
        # worker should call these methods instead of server
        # the following is only for test when with_pslib=off
        def test_func():
            """
            it is only a test function
            """
            return True

        # Stub barrier/first-worker so save/load paths run unguarded.
        fleet._role_maker.is_first_worker = test_func
        fleet._role_maker._barrier_worker = test_func
        fleet.save_model("./model_000")
        fleet.save_one_table(0, "./model_001")
        fleet.save_one_table(0, "./model_002", prefix="hahaha")
        fleet.load_model("./model_0003")
        fleet.load_one_table(0, "./model_004")
        fleet.confirm()
        fleet.revert()
    except:
        print("do not support pslib test, skip")
        return
def do_training(self, args=None):
    """Run one pass of pslib parameter-server training.

    Builds the net via self.net(), wraps Adam with the fleet
    distributed optimizer, starts the server role if this process is a
    server, and on workers feeds an InMemoryDataset through
    train_from_dataset while a FetchHandler collects the loss.

    Args:
        args: unused; kept for interface compatibility.

    Returns:
        list: loss values collected by the fetch handler (empty on
        server processes).
    """
    # FIX: ``global`` must precede any binding of the name inside the
    # function; the original declared it *after* ``var_dict = {...}``,
    # which is a SyntaxError in Python 3 ("name is assigned to before
    # global declaration").
    global var_dict
    avg_cost = self.net()
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    fleet.init()
    # optimizer
    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
    # Wrap with fleet distributed_optimizer to apply the distributed
    # strategy and multi-node optimization from the fleet desc file.
    optimizer = fleet.distributed_optimizer(
        optimizer,
        strategy={
            "fleet_desc_file": "./thirdparty/pslib/fleet_desc.prototxt"
        })
    optimizer.minimize(avg_cost)
    train_info = []
    # Start the parameter server (blocks in run_server on server role).
    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()
    # Start the worker.
    if fleet.is_worker():
        train_data_path = 'thirdparty/data/dist_data/pslib/train_data'
        train_data_files = []
        for filename in os.listdir(train_data_path):
            train_data_files.append(os.path.join(train_data_path, filename))
        # fleet dataset
        label = fluid.layers.data(name="click", shape=[-1, 1],
                                  dtype="int64", lod_level=1,
                                  append_batch_size=False)
        data = fluid.layers.data(name="1", shape=[1],
                                 dtype="int64", lod_level=1)
        dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
        dataset.set_use_var([label, data])
        dataset.set_pipe_command(
            "./python/bin/python ./thirdparty/pslib/dataset_generator.py")
        dataset.set_batch_size(32)
        dataset.set_thread(3)
        dataset.set_filelist(train_data_files)
        # Load the data into memory.
        dataset.load_into_memory()
        # Shuffle locally (single node).
        dataset.local_shuffle()
        # Initialize the worker configuration.
        fleet.init_worker()
        exe.run(fluid.default_startup_program())
        PASS_NUM = 1
        for pass_id in range(PASS_NUM):
            var_dict = {"loss": avg_cost}

            class FetchVars(fluid.executor.FetchHandler):
                def __init__(self, var_dict=None, period_secs=2):
                    # FIX: forward period_secs instead of hard-coding 2
                    # (the original ignored the parameter).
                    super(FetchVars, self).__init__(
                        var_dict, period_secs=period_secs)

                def handler(self, res_dict):
                    # Accumulate the periodic loss fetches.
                    train_info.extend(res_dict["loss"])
                    print(train_info)

            exe.train_from_dataset(
                program=fluid.default_main_program(),
                dataset=dataset,
                fetch_handler=FetchVars(var_dict))
        dataset.release_memory()
        # Post-pass table maintenance, then tear the worker down.
        fleet.shrink_sparse_table()
        fleet.shrink_dense_table(0.01, 11)
        fleet.print_table_stat(0)
        fleet.clear_one_table(0)
        fleet.clear_model()
        fleet.stop_worker()
    return train_info
def test_in_memory_dataset_run_fleet(self):
    """
    Testcase for InMemoryDataset from create to run.

    Builds a click + 4-slot net where two slots go through
    fluid.layers.embedding and two through fluid.embedding (both under
    fleet_embedding), runs train_from_dataset and infer_from_dataset
    over two small text files, and cleans up fleet globals and the
    files afterwards.
    """
    with open("test_in_memory_dataset_run_fleet_a.txt", "w") as f:
        data = "1 1 1 2 2 3 3 4 5 5 5 5 1 1\n"
        data += "1 0 1 3 2 3 4 4 6 6 6 6 1 2\n"
        data += "1 1 1 4 2 3 5 4 7 7 7 7 1 3\n"
        f.write(data)
    with open("test_in_memory_dataset_run_fleet_b.txt", "w") as f:
        data = "1 0 1 5 2 3 3 4 5 5 5 5 1 4\n"
        data += "1 1 1 6 2 3 4 4 6 6 6 6 1 5\n"
        data += "1 0 1 7 2 3 5 4 7 7 7 7 1 6\n"
        data += "1 1 1 8 2 3 6 4 8 8 8 8 1 7\n"
        f.write(data)
    slots = ["click", "slot1", "slot2", "slot3", "slot4"]
    slots_vars = []
    for slot in slots:
        var = fluid.layers.data(name=slot, shape=[1],
                                dtype="int64", lod_level=1)
        slots_vars.append(var)
    click = slots_vars[0]
    embs = []
    # slot1/slot2 via the layers API...
    for slot in slots_vars[1:3]:
        with fleet_embedding(click_name=click.name):
            emb = fluid.layers.embedding(
                input=slot, size=[-1, 11],
                is_sparse=True, is_distributed=True,
                param_attr=fluid.ParamAttr(name="embedding"))
        embs.append(emb)
    # ...slot3/slot4 via the v2 API (needs an explicit reshape).
    for slot in slots_vars[3:5]:
        with fleet_embedding(click_name=click.name):
            emb = fluid.embedding(
                input=slot, size=[-1, 11],
                is_sparse=True, is_distributed=True,
                param_attr=fluid.ParamAttr(name="embedding"))
            emb = fluid.layers.reshape(emb, [-1, 11])
        embs.append(emb)
    concat = fluid.layers.concat([embs[0], embs[3]], axis=1)
    fc = fluid.layers.fc(input=concat, size=1, act=None)
    # NOTE(review): the label fed to log_loss is slots_vars[1]
    # ("slot1"), not the click slot — presumably deliberate for this
    # coverage test; verify against the original intent.
    label_cast = fluid.layers.cast(slots_vars[1], dtype='float32')
    cost = fluid.layers.log_loss(fc, label_cast)
    cost = fluid.layers.mean(cost)
    try:
        fleet.init()
        adam = fluid.optimizer.Adam(learning_rate=0.000005)
        adam = fleet.distributed_optimizer(adam)
        scope = fluid.Scope()
        adam.minimize([cost], [scope])
    except:
        print("do not support pslib test, skip")
        return
    dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
    dataset.set_batch_size(1)
    dataset.set_thread(2)
    dataset.set_filelist([
        "test_in_memory_dataset_run_fleet_a.txt",
        "test_in_memory_dataset_run_fleet_b.txt"
    ])
    dataset.set_pipe_command("cat")
    dataset.set_use_var(slots_vars)
    dataset.load_into_memory()
    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())
    exe.train_from_dataset(fluid.default_main_program(), dataset)
    # infer_from_dataset reads stat_var_names from the fleet opt info.
    fleet._opt_info["stat_var_names"] = ["233"]
    exe.infer_from_dataset(fluid.default_main_program(), dataset)
    # Reset fleet globals so later tests start from a clean slate.
    fleet._opt_info = None
    fleet._fleet_ptr = None
    os.remove("./test_in_memory_dataset_run_fleet_a.txt")
    os.remove("./test_in_memory_dataset_run_fleet_b.txt")