def test_pslib_1(self):
    """Test cases for pslib."""
    import paddle.fluid as fluid
    from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
    from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
    from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
    try:
        import netifaces
    except:
        print("warning: no netifaces, skip test_pslib_1")
        return
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    role_maker = GeneralRoleMaker()
    role_maker.generate_role()
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    fleet.init(role_maker)
    train_program = fluid.Program()
    startup_program = fluid.Program()
    scope = fluid.Scope()
    with fluid.program_guard(train_program, startup_program):
        show = fluid.layers.data(name="show", shape=[-1, 1], \
            dtype="int64", lod_level=1, append_batch_size=False)
        emb = fluid.layers.embedding(input=show, size=[1, 1], \
            is_sparse=True, is_distributed=True, \
            param_attr=fluid.ParamAttr(name="embedding"))
        fc = fluid.layers.fc(input=emb, size=1, act=None)
        label = fluid.layers.data(name="click", shape=[-1, 1], \
            dtype="int64", lod_level=1, append_batch_size=False)
        label_cast = fluid.layers.cast(label, dtype='float32')
        cost = fluid.layers.log_loss(fc, label_cast)
    strategy = {}
    strategy["embedding"] = {}
    strategy["embedding"]["sparse_accessor_class"] = "DownpourUnitAccessor"
    strategy["embedding"]["embed_sparse_optimizer"] = "naive"
    try:
        adam1 = fluid.optimizer.Adam(learning_rate=0.000005)
        adam1 = fleet.distributed_optimizer(adam1, strategy=strategy)
        adam1.minimize([cost], [scope])

        strategy["embedding"]["embed_sparse_optimizer"] = "adagrad"
        adam2 = fluid.optimizer.Adam(learning_rate=0.000005)
        adam2 = fleet.distributed_optimizer(adam2, strategy=strategy)
        adam2.minimize([cost], [scope])

        strategy["embedding"]["embed_sparse_optimizer"] = "adam"
        adam3 = fluid.optimizer.Adam(learning_rate=0.000005)
        adam3 = fleet.distributed_optimizer(adam3, strategy=strategy)
        adam3.minimize([cost], [scope])
    except:
        print("do not support pslib test, skip")
        return
def test_pslib_1(self):
    """Test cases for pslib."""
    import paddle.fluid as fluid
    from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
    from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
    from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    role_maker = GeneralRoleMaker(
        init_timeout_seconds=100,
        run_timeout_seconds=100,
        http_ip_port="127.0.0.1:36003")
    #role_maker.generate_role()
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    #fleet.init(role_maker)
    train_program = fluid.Program()
    startup_program = fluid.Program()
    scope = fluid.Scope()
    with fluid.program_guard(train_program, startup_program):
        show = fluid.layers.data(name="show", shape=[-1, 1], \
            dtype="float32", lod_level=1, append_batch_size=False)
        fc = fluid.layers.fc(input=show, size=1, act=None)
        label = fluid.layers.data(name="click", shape=[-1, 1], \
            dtype="int64", lod_level=1, append_batch_size=False)
        label_cast = fluid.layers.cast(label, dtype='float32')
        cost = fluid.layers.log_loss(fc, label_cast)
    try:
        adam = fluid.optimizer.Adam(learning_rate=0.000005)
        adam = fleet.distributed_optimizer(adam)
        adam.minimize([cost], [scope])
        fleet.run_server()
        http_server_d = {}
        http_server_d["running"] = False
        size_d = {}
        role_maker._GeneralRoleMaker__start_kv_server(http_server_d, size_d)
    except:
        print("do not support pslib test, skip")
        return
    from paddle.fluid.incubate.fleet.base.role_maker import MockBarrier
    mb = MockBarrier()
    mb.barrier()
    mb.barrier_all()
    mb.all_reduce(1)
    mb.all_gather(1)
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36005"
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36005"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36006"
    os.environ["PADDLE_IS_BARRIER_ALL_ROLE"] = "0"
    role_maker = GeneralRoleMaker(path="test_mock1")
    role_maker.generate_role()
def build_optimizer(self, params):
    """Build a fleet distributed optimizer from params['optimizer_conf']."""
    optimizer_conf = params['optimizer_conf']
    strategy = None
    if 'strategy' in optimizer_conf:
        strategy = optimizer_conf['strategy']
        # collect metric stat var names into the strategy; only valid when
        # a strategy dict is actually configured
        stat_var_names = []
        metrics = params['metrics']
        for name in metrics:
            model_metrics = metrics[name]
            stat_var_names += [
                model_metrics[metric]['var'].name for metric in model_metrics
            ]
        strategy['stat_var_names'] = list(set(stat_var_names))
    # look up the optimizer class by name instead of exec'ing a generated
    # code string, which cannot bind a function-local variable in Python 3
    optimizer_class = getattr(fluid.optimizer, optimizer_conf['class'])
    optimizer = optimizer_class(learning_rate=optimizer_conf['learning_rate'])
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    return optimizer
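# For reference, a minimal sketch of the `params` dict shape that
# build_optimizer above appears to expect, inferred from its lookups;
# the concrete names here (Adam, model_a, auc_var) are illustrative
# assumptions, not an official schema.
import paddle.fluid as fluid

with fluid.program_guard(fluid.Program(), fluid.Program()):
    auc_var = fluid.layers.data(name="auc", shape=[1], dtype="float32")
    example_params = {
        'optimizer_conf': {
            'class': 'Adam',                # any class under fluid.optimizer
            'learning_rate': 0.000005,
            'strategy': {'use_cvm': True},  # optional fleet strategy dict
        },
        'metrics': {
            'model_a': {
                'auc': {'var': auc_var},    # each metric carries its Variable
            },
        },
    }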
def init(self, context):
    """Build the train net, wrap its optimizer with fleet, and set the pass status."""
    self.model.train_net()
    optimizer = self.model.optimizer()
    optimizer = fleet.distributed_optimizer(
        optimizer, strategy={"use_cvm": False})
    optimizer.minimize(self.model.get_avg_cost())
    if fleet.is_server():
        context['status'] = 'server_pass'
    else:
        self.fetch_vars = []
        self.fetch_alias = []
        self.fetch_period = self.model.get_fetch_period()
        metrics = self.model.get_metrics()
        if metrics:
            # wrap in list() so the fetch targets are indexable on Python 3,
            # where dict.values()/keys() return views
            self.fetch_vars = list(metrics.values())
            self.fetch_alias = list(metrics.keys())
        context['status'] = 'train_pass'
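# A minimal sketch of the alias -> Variable mapping that init above assumes
# self.model.get_metrics() returns; fetch_vars/fetch_alias are then fetched
# every fetch_period batches. The names below are hypothetical.
import paddle.fluid as fluid

with fluid.program_guard(fluid.Program(), fluid.Program()):
    loss_var = fluid.layers.data(name="loss", shape=[1], dtype="float32")
    auc_var = fluid.layers.data(name="auc", shape=[1], dtype="float32")
    example_metrics = {"LOSS": loss_var, "AUC": auc_var}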
def test_dataset_fleet2(self):
    """
    Testcase for InMemoryDataset from create to run.
    """
    with open("test_in_memory_dataset2_run2_a.txt", "w") as f:
        data = "1 1 2 3 3 4 5 5 5 5 1 1\n"
        data += "1 2 2 3 4 4 6 6 6 6 1 2\n"
        data += "1 3 2 3 5 4 7 7 7 7 1 3\n"
        f.write(data)
    with open("test_in_memory_dataset2_run2_b.txt", "w") as f:
        data = "1 4 2 3 3 4 5 5 5 5 1 4\n"
        data += "1 5 2 3 4 4 6 6 6 6 1 5\n"
        data += "1 6 2 3 5 4 7 7 7 7 1 6\n"
        data += "1 7 2 3 6 4 8 8 8 8 1 7\n"
        f.write(data)
    train_program = fluid.Program()
    startup_program = fluid.Program()
    scope = fluid.Scope()
    from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
    with fluid.program_guard(train_program, startup_program):
        slots = ["slot1_ff", "slot2_ff", "slot3_ff", "slot4_ff"]
        slots_vars = []
        for slot in slots:
            var = fluid.layers.data(\
                name=slot, shape=[1], dtype="float32", lod_level=1)
            slots_vars.append(var)
        fake_cost = \
            fluid.layers.elementwise_sub(slots_vars[0], slots_vars[-1])
        fake_cost = fluid.layers.mean(fake_cost)
    with fluid.scope_guard(scope):
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        try:
            fleet.init()
        except ImportError as e:
            print("warning: no mpi4py")
        adam = fluid.optimizer.Adam(learning_rate=0.000005)
        try:
            adam = fleet.distributed_optimizer(
                adam,
                strategy={
                    "fs_uri": "fs_uri_xxx",
                    "fs_user": "******",
                    "fs_passwd": "fs_passwd_xxx",
                    "fs_hadoop_bin": "fs_hadoop_bin_xxx"
                })
            adam.minimize([fake_cost], [scope])
        except AttributeError as e:
            print("warning: no mpi")
        except ImportError as e:
            print("warning: no mpi4py")
        exe.run(startup_program)
        dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
        dataset.set_batch_size(32)
        dataset.set_thread(3)
        dataset.set_filelist([
            "test_in_memory_dataset2_run2_a.txt",
            "test_in_memory_dataset2_run2_b.txt"
        ])
        dataset.set_pipe_command("cat")
        dataset.set_use_var(slots_vars)
        dataset.load_into_memory()
        try:
            dataset.global_shuffle(fleet)
        except:
            print("warning: catch expected error")
        fleet._opt_info = None
        fleet._fleet_ptr = None
    os.remove("./test_in_memory_dataset2_run2_a.txt")
    os.remove("./test_in_memory_dataset2_run2_b.txt")
def test_pslib_1(self):
    """Test cases for pslib."""
    import paddle.fluid as fluid
    from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
    from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
    from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    role_maker = GeneralRoleMaker()
    #role_maker.generate_role()
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    #fleet.init(role_maker)
    train_program = fluid.Program()
    startup_program = fluid.Program()
    scope = fluid.Scope()
    with fluid.program_guard(train_program, startup_program):
        show = fluid.layers.data(name="show", shape=[-1, 1], \
            dtype="int64", lod_level=1, append_batch_size=False)
        emb = fluid.layers.embedding(input=show, size=[1, 1], \
            is_sparse=True, is_distributed=True, \
            param_attr=fluid.ParamAttr(name="embedding"))
        bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
        bow = fluid.layers.data_norm(input=bow, epsilon=1e-4, name="norm")
        fc = fluid.layers.fc(input=bow, size=1, act=None)
        label = fluid.layers.data(name="click", shape=[-1, 1], \
            dtype="int64", lod_level=1, append_batch_size=False)
        label_cast = fluid.layers.cast(label, dtype='float32')
        cost = fluid.layers.log_loss(fc, label_cast)
    try:
        adam = fluid.optimizer.Adam(learning_rate=0.000005)
        adam = fleet.distributed_optimizer(
            adam,
            strategy={
                "embedding": {
                    "sparse_accessor_class": "DownpourSparseValueAccessor"
                }
            })
        adam.minimize([cost], [scope])
        fleet.run_server()
    except:
        print("do not support pslib test, skip")
        return
    try:
        # worker should call these methods instead of server
        # the following is only for test when with_pslib=off
        def test_func():
            """
            it is only a test function
            """
            return True

        fleet._role_maker.is_first_worker = test_func
        fleet._role_maker._barrier_worker = test_func
        fleet.save_model("./model_000")
        fleet.save_one_table(0, "./model_001")
        fleet.save_one_table(0, "./model_002", prefix="hahaha")
        fleet.load_model("./model_0003")
        fleet.load_one_table(0, "./model_004")
    except:
        print("do not support pslib test, skip")
        return
def test_pslib_1(self):
    """Test cases for pslib."""
    import paddle.fluid as fluid
    from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
    from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
    from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
    try:
        import netifaces
    except:
        print("warning: no netifaces, skip test_pslib_1")
        return
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    role_maker = GeneralRoleMaker()
    role_maker.generate_role()
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    fleet.init(role_maker)
    train_program = fluid.Program()
    startup_program = fluid.Program()
    scope = fluid.Scope()
    with fluid.program_guard(train_program, startup_program):
        show = fluid.layers.data(name="show", shape=[-1, 1], \
            dtype="float32", lod_level=1, append_batch_size=False)
        fc = fluid.layers.fc(input=show, size=1, act=None)
        label = fluid.layers.data(name="click", shape=[-1, 1], \
            dtype="int64", lod_level=1, append_batch_size=False)
        label_cast = fluid.layers.cast(label, dtype='float32')
        cost = fluid.layers.log_loss(fc, label_cast)
    try:
        adam = fluid.optimizer.Adam(learning_rate=0.000005)
        adam = fleet.distributed_optimizer(adam)
        adam.minimize([cost], [scope])
        fleet.run_server()
    except:
        print("do not support pslib test, skip")
        return
    fleet.clear_one_table(0)
    from paddle.fluid.incubate.fleet.base.role_maker import \
        MPISymetricRoleMaker
    try:
        role = MPISymetricRoleMaker()
        role._all_reduce([1], [2])
    except:
        print("catch expected error of not inited")
    try:
        role = MPISymetricRoleMaker()
        role._all_reduce([1], [2], "min")
    except:
        print("catch expected error of not inited")
    try:
        role = MPISymetricRoleMaker()
        role._all_reduce([1], [2], "max")
    except:
        print("catch expected error of not inited")
    try:
        role = MPISymetricRoleMaker()
        role._all_reduce([1], [2], "unknown")
    except:
        print("catch expected error of unknown type")
def test_pslib_2(self):
    """Test cases for pslib."""
    import paddle.fluid as fluid
    from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
    from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
    from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
    from paddle.fluid.incubate.fleet.base.role_maker import RoleMakerBase
    try:
        import netifaces
    except:
        print("warning: no netifaces, skip test_pslib_2")
        return
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    os.environ["PADDLE_TRAINERS_NUM"] = "1"
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    try:
        fleet.init(None)
    except:
        print("no mpi4py, skip test_pslib_2")
        return
    train_program = fluid.Program()
    startup_program = fluid.Program()
    scope = fluid.Scope()
    with fluid.program_guard(train_program, startup_program):
        show = fluid.layers.data(name="show", shape=[-1, 1], \
            dtype="float32", lod_level=1, append_batch_size=False)
        fc = fluid.layers.fc(input=show, size=1, act=None)
        label = fluid.layers.data(name="click", shape=[-1, 1], \
            dtype="int64", lod_level=1, append_batch_size=False)
        label_cast = fluid.layers.cast(label, dtype='float32')
        cost = fluid.layers.log_loss(fc, label_cast)
    try:
        adam = fluid.optimizer.Adam(learning_rate=0.000005)
        adam = fleet.distributed_optimizer(adam)
        adam.minimize([cost], [scope])
        fleet.run_server()
    except:
        print("do not support pslib test, skip")
        return
    os.environ["TRAINING_ROLE"] = "wrong"
    try:
        role1 = GeneralRoleMaker(path="./test_gloo_1")
        role1.generate_role()
    except:
        print("catch expected error of wrong TRAINING_ROLE")
    os.environ["TRAINING_ROLE"] = "PSERVER"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
    role2 = GeneralRoleMaker(path="./test_gloo_2")
    role2._finalize()
    role2._all_gather(1)
    role2._all_gather(1)
    role2._barrier_server()
    role2.all_gather(1)
    role3 = GeneralRoleMaker(path="./test_gloo_3")
    role3._worker_gather(1)
    role3._worker_gather(1)
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
    role4 = GeneralRoleMaker(path="./test_gloo_4")
    role4._worker_gather(1)
    role4._get_rank()
    role4._get_size()
    role4._all_comm.init()
    role5 = GeneralRoleMaker(path="./test_gloo_5")
    role5.get_local_endpoint()
    role5.get_local_endpoint()
    role6 = GeneralRoleMaker(path="./test_gloo_6")
    role6.get_trainer_endpoints()
    role6.get_trainer_endpoints()
    role7 = GeneralRoleMaker(path="./test_gloo_7")
    role7.get_pserver_endpoints()
    role7.get_pserver_endpoints()
    role8 = GeneralRoleMaker(path="./test_gloo_8")
    role8.is_worker()
    role8.is_worker()
    role9 = GeneralRoleMaker(path="./test_gloo_9")
    role9.is_server()
    role9.is_server()
    role10 = GeneralRoleMaker(path="./test_gloo_10")
    role10.is_first_worker()
    role10.is_first_worker()
    role11 = GeneralRoleMaker(path="./test_gloo_11")
    role11.worker_index()
    role11.worker_index()
    role12 = GeneralRoleMaker(path="./test_gloo_12")
    role12.server_index()
    role12.server_index()
    role13 = GeneralRoleMaker(path="./test_gloo_13")
    role13.worker_num()
    role13.worker_num()
    role14 = GeneralRoleMaker(path="./test_gloo_14")
    role14.server_num()
    role14.server_num()
    role15 = GeneralRoleMaker(path="./test_gloo_15")
    role15._barrier_worker()
    role15._barrier_worker()
    role16 = GeneralRoleMaker(path="./test_gloo_16")
    role16._barrier_all()
    role16._barrier_all()
    role17 = GeneralRoleMaker(path="./test_gloo_17")
    role17._barrier_server()
    role17._barrier_server()
    role18 = GeneralRoleMaker(path="./test_gloo_18")
    role18._worker_num()
    role18._worker_num()
    role19 = GeneralRoleMaker(path="./test_gloo_19")
    role19._server_num()
    role19._server_num()
    role20 = GeneralRoleMaker(path="./test_gloo_20")
    a = [1]
    b = [0]
    role20._all_reduce(a, b)
    role21 = GeneralRoleMaker(path="./test_gloo_21")
    role21.all_reduce_worker([], [])
    role21.all_reduce_worker([], [])
    role21.barrier_worker()
    role21.barrier_all()
    role22 = GeneralRoleMaker(path="./test_gloo_22")
    role22._get_rank()
    role22._get_rank()
    os.environ["PADDLE_PSERVER_ID"] = "0"
    role23 = GeneralRoleMaker(path="./test_gloo_23")
    role23._get_size()
    role23._get_size()
    with open("test_fleet_gloo_role_maker_1.txt", "w") as f:
        data = "1 1 1 1\n"
        f.write(data)
    dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
    dataset.set_filelist(["test_fleet_gloo_role_maker_1.txt"])
    dataset.set_use_var([show, label])
    dataset.load_into_memory()
    dataset.get_memory_data_size(fleet)
    dataset.get_shuffle_data_size(fleet)
    os.remove("./test_fleet_gloo_role_maker_1.txt")

    class TmpClass():
        """
        dummy tmp class
        """

        def __init__(self):
            pass

        def all_reduce_worker(self, input, output):
            """
            dummy all reduce worker

            Args:
                input(None): fake input
                output(None): fake output
            """
            pass

        def barrier_worker(self):
            """
            dummy barrier worker
            """
            pass

    from paddle.fluid.incubate.fleet.base.fleet_base import Fleet

    class TmpFleet(Fleet):
        """
        dummy tmp fleet
        """

        def __init__(self):
            super(Fleet, self).__init__()
            self._role_maker = None

        def init_worker(self):
            """
            dummy init worker
            """
            pass

        def init_server(self, model_dir=None):
            """
            dummy init server

            Args:
                model_dir(None): fake model_dir
            """
            pass

        def run_server(self):
            """
            dummy run server
            """
            pass

        def stop_worker(self):
            """
            dummy stop worker
            """
            pass

        def distributed_optimizer(self, optimizer, strategy=None):
            """
            dummy distributed optimizer

            Args:
                optimizer(None): fake optimizer
                strategy(None): fake strategy
            """
            pass

        def save_inference_model(self):
            """
            dummy save inference model
            """
            pass

        def save_persistables(self):
            """
            dummy save persistables
            """
            pass

    os.environ["TRAINING_ROLE"] = "TRAINER"
    tmp = TmpFleet()
    tmp._role_maker = TmpClass()
    tmp.all_reduce_worker([], [])
    tmp.barrier_worker()
    from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
    tmp = RoleMakerBase()
    tmp.all_gather(1)
    tmp.all_reduce_worker([], [])
    tmp.barrier_worker()
    tmp.barrier_all()
    from paddle.fluid.incubate.fleet.base.role_maker import \
        MPISymetricRoleMaker
    tmp1 = MPISymetricRoleMaker()
    tmp1.all_gather(1)
    tmp1.all_gather(1)
    tmp2 = MPISymetricRoleMaker()
    tmp2.all_reduce_worker([], [])
    tmp3 = MPISymetricRoleMaker()
    tmp3.barrier_worker()
    tmp3.barrier_worker()
    tmp4 = MPISymetricRoleMaker()
    tmp4.barrier_all()
    tmp4.barrier_all()
def test_pslib_1(self):
    """Test cases for pslib."""
    import paddle.fluid as fluid
    from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
    from paddle.fluid.incubate.fleet.parameter_server.pslib import \
        fleet_embedding, _prepare_params, _fleet_embedding, \
        _fleet_embedding_v2, FLEET_GLOBAL_DICT
    from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
    try:
        import netifaces
    except:
        print("warning: no netifaces, skip test_pslib_1")
        return
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    role_maker = GeneralRoleMaker()
    role_maker.generate_role()
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    fleet.init(role_maker)
    train_program = fluid.Program()
    startup_program = fluid.Program()
    scope = fluid.Scope()
    global FLEET_GLOBAL_DICT
    with fluid.program_guard(train_program, startup_program):
        show = fluid.layers.data(name="show", shape=[-1, 1], \
            dtype="int64", lod_level=1, append_batch_size=False)
        click = fluid.layers.data(name="click", shape=[-1, 1], \
            dtype="int64", lod_level=1, append_batch_size=False)
        with fleet_embedding(click_name=click.name):
            emb = fluid.layers.embedding(input=show, size=[1, 1], \
                is_sparse=True, is_distributed=True, \
                param_attr=fluid.ParamAttr(name="embedding"))
        emb = fluid.layers.data_norm(
            input=emb,
            name="a",
            epsilon=1e-4,
            param_attr={
                "batch_size": 1e4,
                "batch_sum_default": 0.0,
                "batch_square": 1e4
            })
        fc = fluid.layers.fc(input=emb, size=1, act=None)
        label = fluid.layers.data(name="click", shape=[-1, 1], \
            dtype="int64", lod_level=1, append_batch_size=False)
        label_cast = fluid.layers.cast(label, dtype='float32')
        cost = fluid.layers.log_loss(fc, label_cast)
    try:
        adam = fluid.optimizer.Adam(learning_rate=0.000005)
        adam = fleet.distributed_optimizer(
            adam,
            strategy={
                "embedding": {
                    "sparse_accessor_class": "DownpourSparseValueAccessor"
                }
            })
        adam.minimize([cost], [scope])
    except:
        print("do not support pslib test, skip")
        return
    FLEET_GLOBAL_DICT["cur_accessor"] = "DownpourCtrAccessor"
    try:
        _prepare_params(input=show, size=[1, 1])
    except:
        print("catch expected exception of param_attr=None")
    try:
        _prepare_params(input=show, size=[1, 1], param_attr=fluid.ParamAttr())
    except:
        print("catch expected exception of name=None")
    try:
        tmp = fluid.ParamAttr(name="embedding")
        _prepare_params(input=show, size=1, param_attr=tmp)
    except:
        print("catch expected exception of size not list")
    try:
        tmp = fluid.ParamAttr(name="embedding")
        _prepare_params(input=show, size=[-1, 12], param_attr=tmp)
    except:
        print("catch expected exception of size not equal")
    try:
        tmp = fluid.ParamAttr(name="embedding")
        _prepare_params(input=show, size=[-1, 1], param_attr=tmp,
                        is_sparse=False)
    except:
        print("catch expected exception of is_sparse=False")
    try:
        tmp = fluid.ParamAttr(name="embedding")
        _prepare_params(input=show, size=[-1, 1], param_attr=tmp, \
            is_sparse=True, is_distributed=False)
    except:
        print("catch expected exception of is_distributed=False")
    try:
        _prepare_params(input=show, size=[-1, 1], \
            param_attr=fluid.ParamAttr(name="embedding"), \
            is_sparse=True, is_distributed=True, dtype="abc")
    except:
        print("catch expected exception of unknown dtype")
    try:
        FLEET_GLOBAL_DICT["emb_to_accessor"]["embedding"] = "unknown"
        tmp = fluid.ParamAttr(name="embedding")
        _prepare_params(input=show, size=[-1, 1], param_attr=tmp)
    except:
        print("catch expected exception of unknown accessor")
accessor") FLEET_GLOBAL_DICT["cur_accessor"] = "DownpourCtrAccessor" try: _fleet_embedding(input=show, size=[-1, 1], is_sparse=True, \ is_distributed=True, dtype="float32", \ param_attr=fluid.ParamAttr(name="embedding")) except: print("catch expected exception of unknown accessor") try: _fleet_embedding_v2(input=show, size=[-1, 1], is_sparse=True, \ is_distributed=True, dtype="float32", \ param_attr=fluid.ParamAttr(name="embedding")) except: print("catch expected exception of unknown accessor") adam1 = fluid.optimizer.Adam(learning_rate=0.000005) adam1 = fleet.distributed_optimizer( adam1, strategy={ "embedding": { "sparse_accessor_class": "DownpourSparseValueAccessor" } }) try: pre = FLEET_GLOBAL_DICT["emb_to_table"] FLEET_GLOBAL_DICT["emb_to_table"] = {} adam1.minimize([cost], [scope]) except: FLEET_GLOBAL_DICT["emb_to_table"] = pre print("catch expected exception of empty emb_to_table") try: pre = FLEET_GLOBAL_DICT["emb_to_table"] FLEET_GLOBAL_DICT["emb_to_table"] = {} FLEET_GLOBAL_DICT["emb_to_table"]["emb1"] = 0 adam1.minimize([cost], [scope]) except: FLEET_GLOBAL_DICT["emb_to_table"] = pre print("catch expected exception of error emb_to_table") try: adam2 = fluid.optimizer.Adam(learning_rate=0.000005) adam2 = fleet.distributed_optimizer(adam2) adam2.supported_embedding_types = [] adam2.minimize([cost], [scope]) except: print("catch expected exception of embedding_types") try: adam3 = fluid.optimizer.Adam(learning_rate=0.000005) adam3 = fleet.distributed_optimizer( adam3, strategy={ "embedding": { "sparse_accessor_class": "DownpourSparseValueAccessor", "sparse_embedx_dim": 999 } }) adam3.minimize([cost], [scope]) except: print("catch expected exception of embedx_dim error") try: adam4 = fluid.optimizer.Adam(learning_rate=0.000005) adam4 = fleet.distributed_optimizer( adam4, strategy={ "embedding": { "sparse_accessor_class": "DownpourCtrAccessor", "sparse_embedx_dim": 999 } }) adam4.minimize([cost], [scope]) except: print("catch expected exception of embedx_dim error") train_program1 = fluid.Program() startup_program1 = fluid.Program() FLEET_GLOBAL_DICT["emb_to_accessor"] = {} with fluid.program_guard(train_program1, startup_program1): show = fluid.layers.data(name="show", shape=[-1, 1], \ dtype="int64", lod_level=1, append_batch_size=False) with fleet_embedding(click_name=click.name): emb = fluid.layers.embedding(input=show, size=[1, 1], \ is_sparse=True, is_distributed=True, \ param_attr=fluid.ParamAttr(name="embedding")) with fleet_embedding(click_name=click.name): emb1 = fluid.embedding(input=show, size=[1, 1], \ is_sparse=True, is_distributed=True, \ param_attr=fluid.ParamAttr(name="embedding"))
thread_stat_var_names += [
    join_common_model.common_auc_stat_list[2].name,
    join_common_model.common_auc_stat_list[3].name
]
thread_stat_var_names += [
    update_model.auc_stat_list[2].name,
    update_model.auc_stat_list[3].name
]
thread_stat_var_names += [
    i.name for i in join_common_model.join_metric_list +
    join_common_model.common_metric_list + update_model.metric_list
]
thread_stat_var_names = list(set(thread_stat_var_names))
config_fleet.config['stat_var_names'] = thread_stat_var_names

adam = fluid.optimizer.Adam(learning_rate=0.000005)
adam = fleet.distributed_optimizer(adam, strategy=config_fleet.config)
# adam = fleet.distributed_optimizer(adam, strategy={"use_cvm" : True, "adjust_ins_weight" : adjust_ins_weight, "scale_datanorm" : 1e-4, "dump_slot": True, "stat_var_names": thread_stat_var_names})
# adam = fleet.distributed_optimizer(adam, strategy={"use_cvm" : True, "adjust_ins_weight" : adjust_ins_weight, "scale_datanorm" : 1e-4, "dump_slot": True, "stat_var_names": thread_stat_var_names, "fleet_desc_file": "reqi_fleet_desc"})
adam.minimize([join_common_model.joint_cost, update_model.avg_cost],
              [scope2, scope3])
join_common_model._train_program._fleet_opt["program_configs"][str(
    id(join_common_model.joint_cost.block.program))]["push_sparse"] = []

join_save_params = [
    "join.batch_size", "join.batch_sum", "join.batch_square_sum",
    "join_0.w_0", "join_0.b_0", "join_1.w_0", "join_1.b_0",
    "join_2.w_0", "join_2.b_0", "join_3.w_0", "join_3.b_0",
    "join_4.w_0", "join_4.b_0", "join_5.w_0", "join_5.b_0",
    "join_6.w_0", "join_6.b_0", "join_7.w_0", "join_7.b_0"
]
def do_training(self, args=None):
    """do training"""
    avg_cost = self.net()
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    fleet.init()
    # optimizer: wrap with fleet.distributed_optimizer to attach the
    # distributed strategy and multi-node optimization
    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
    optimizer = fleet.distributed_optimizer(
        optimizer,
        strategy={"fleet_desc_file": "./thirdparty/pslib/fleet_desc.prototxt"})
    optimizer.minimize(avg_cost)
    train_info = []
    # start the server
    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()
    # start the worker
    if fleet.is_worker():
        train_data_path = 'thirdparty/data/dist_data/pslib/train_data'
        train_data_files = []
        for filename in os.listdir(train_data_path):
            train_data_files.append(os.path.join(train_data_path, filename))
        # fleet dataset
        label = fluid.layers.data(name="click", shape=[-1, 1],
                                  dtype="int64", lod_level=1,
                                  append_batch_size=False)
        data = fluid.layers.data(name="1", shape=[1],
                                 dtype="int64", lod_level=1)
        dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
        dataset.set_use_var([label, data])
        dataset.set_pipe_command(
            "./python/bin/python ./thirdparty/pslib/dataset_generator.py")
        dataset.set_batch_size(32)
        dataset.set_thread(3)
        dataset.set_filelist(train_data_files)
        # load the data into memory
        dataset.load_into_memory()
        # local shuffle
        dataset.local_shuffle()
        # initialize the worker
        fleet.init_worker()
        exe.run(fluid.default_startup_program())
        PASS_NUM = 1
        for pass_id in range(PASS_NUM):
            var_dict = {"loss": avg_cost}

            class FetchVars(fluid.executor.FetchHandler):
                def __init__(self, var_dict=None, period_secs=2):
                    super(FetchVars, self).__init__(var_dict, period_secs=2)

                def handler(self, res_dict):
                    train_info.extend(res_dict["loss"])
                    print(train_info)

            exe.train_from_dataset(
                program=fluid.default_main_program(),
                dataset=dataset,
                fetch_handler=FetchVars(var_dict))
        dataset.release_memory()
        fleet.shrink_sparse_table()
        fleet.shrink_dense_table(0.01, 11)
        fleet.print_table_stat(0)
        fleet.clear_one_table(0)
        fleet.clear_model()
        fleet.stop_worker()
    return train_info
def test_in_memory_dataset_run_fleet(self):
    """
    Testcase for InMemoryDataset from create to run.
    """
    # this test relies on the pslib fleet and the fleet_embedding guard
    from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
    from paddle.fluid.incubate.fleet.parameter_server.pslib import \
        fleet_embedding
    with open("test_in_memory_dataset_run_fleet_a.txt", "w") as f:
        data = "1 1 1 2 2 3 3 4 5 5 5 5 1 1\n"
        data += "1 0 1 3 2 3 4 4 6 6 6 6 1 2\n"
        data += "1 1 1 4 2 3 5 4 7 7 7 7 1 3\n"
        f.write(data)
    with open("test_in_memory_dataset_run_fleet_b.txt", "w") as f:
        data = "1 0 1 5 2 3 3 4 5 5 5 5 1 4\n"
        data += "1 1 1 6 2 3 4 4 6 6 6 6 1 5\n"
        data += "1 0 1 7 2 3 5 4 7 7 7 7 1 6\n"
        data += "1 1 1 8 2 3 6 4 8 8 8 8 1 7\n"
        f.write(data)
    slots = ["click", "slot1", "slot2", "slot3", "slot4"]
    slots_vars = []
    for slot in slots:
        var = fluid.layers.data(
            name=slot, shape=[1], dtype="int64", lod_level=1)
        slots_vars.append(var)
    click = slots_vars[0]
    embs = []
    for slot in slots_vars[1:3]:
        with fleet_embedding(click_name=click.name):
            emb = fluid.layers.embedding(input=slot, size=[-1, 11], \
                is_sparse=True, is_distributed=True, \
                param_attr=fluid.ParamAttr(name="embedding"))
        embs.append(emb)
    for slot in slots_vars[3:5]:
        with fleet_embedding(click_name=click.name):
            emb = fluid.embedding(input=slot, size=[-1, 11], \
                is_sparse=True, is_distributed=True, \
                param_attr=fluid.ParamAttr(name="embedding"))
            emb = fluid.layers.reshape(emb, [-1, 11])
        embs.append(emb)
    concat = fluid.layers.concat([embs[0], embs[3]], axis=1)
    fc = fluid.layers.fc(input=concat, size=1, act=None)
    label_cast = fluid.layers.cast(slots_vars[1], dtype='float32')
    cost = fluid.layers.log_loss(fc, label_cast)
    cost = fluid.layers.mean(cost)
    try:
        fleet.init()
        adam = fluid.optimizer.Adam(learning_rate=0.000005)
        adam = fleet.distributed_optimizer(adam)
        scope = fluid.Scope()
        adam.minimize([cost], [scope])
    except:
        print("do not support pslib test, skip")
        return
    dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
    dataset.set_batch_size(1)
    dataset.set_thread(2)
    dataset.set_filelist([
        "test_in_memory_dataset_run_fleet_a.txt",
        "test_in_memory_dataset_run_fleet_b.txt"
    ])
    dataset.set_pipe_command("cat")
    dataset.set_use_var(slots_vars)
    dataset.load_into_memory()
    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())
    exe.train_from_dataset(fluid.default_main_program(), dataset)
    fleet._opt_info["stat_var_names"] = ["233"]
    exe.infer_from_dataset(fluid.default_main_program(), dataset)
    fleet._opt_info = None
    fleet._fleet_ptr = None
    os.remove("./test_in_memory_dataset_run_fleet_a.txt")
    os.remove("./test_in_memory_dataset_run_fleet_b.txt")