def _get_dataset(self):
    namespace = "train.reader"

    inputs = self.model.get_inputs()
    threads = int(envs.get_runtime_environ("train.trainer.threads"))
    batch_size = envs.get_global_env("batch_size", None, namespace)
    reader_class = envs.get_global_env("class", None, namespace)
    abs_dir = os.path.dirname(os.path.abspath(__file__))
    reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py')
    pipe_cmd = "python {} {} {} {}".format(reader, reader_class, "TRAIN",
                                           self._config_yaml)
    train_data_path = envs.get_global_env("train_data_path", None, namespace)

    if train_data_path.startswith("fleetrec::"):
        package_base = envs.get_runtime_environ("PACKAGE_BASE")
        assert package_base is not None
        train_data_path = os.path.join(package_base,
                                       train_data_path.split("::")[1])

    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_use_var(inputs)
    dataset.set_pipe_command(pipe_cmd)
    dataset.set_batch_size(batch_size)
    dataset.set_thread(threads)
    file_list = [
        os.path.join(train_data_path, x)
        for x in os.listdir(train_data_path)
    ]
    dataset.set_filelist(file_list)
    return dataset

def optimizer(self): learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace) optimizer = envs.get_global_env("hyper_parameters.optimizer", None, self._namespace) return self._build_optimizer(optimizer, learning_rate)
def save_inference_model():
    save_interval = envs.get_global_env("save.inference.epoch_interval",
                                        -1, namespace)
    if not need_save(epoch_id, save_interval, False):
        return

    # inference-model export is currently stubbed out; the early return
    # below makes the remainder of this function unreachable
    print("save inference model is not supported now.")
    return

    feed_varnames = envs.get_global_env("save.inference.feed_varnames",
                                        None, namespace)
    fetch_varnames = envs.get_global_env("save.inference.fetch_varnames",
                                         None, namespace)
    fetch_vars = [
        fluid.global_scope().vars[varname] for varname in fetch_varnames
    ]
    dirname = envs.get_global_env("save.inference.dirname", None, namespace)

    assert dirname is not None
    dirname = os.path.join(dirname, str(epoch_id))

    if is_fleet:
        fleet.save_inference_model(dirname, feed_varnames, fetch_vars)
    else:
        fluid.io.save_inference_model(dirname, feed_varnames, fetch_vars,
                                      self._exe)
    self.inference_models.append((epoch_id, dirname))

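# need_save() is called above (and again in save_persistables below) but is
# not included in this excerpt. A minimal sketch of what it is assumed to
# check (save every `epoch_interval` epochs, always save on the last pass,
# and treat an interval of -1 as never) is:
def need_save(epoch_id, epoch_interval, is_last=False):
    if is_last:
        return True
    if epoch_id == -1 or epoch_interval == -1:
        return False
    return epoch_id % epoch_interval == 0
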
def _get_dataloader(self):
    namespace = "train.reader"

    dataloader = self.model._data_loader
    batch_size = envs.get_global_env("batch_size", None, namespace)
    reader_class = envs.get_global_env("class", None, namespace)
    reader = dataloader_instance.dataloader(reader_class, "TRAIN",
                                            self._config_yaml)
    dataloader.set_sample_generator(reader, batch_size)
    return dataloader

def net(self):
    is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
    sparse_feature_number = envs.get_global_env(
        "hyper_parameters.sparse_feature_number", None, self._namespace)
    sparse_feature_dim = envs.get_global_env(
        "hyper_parameters.sparse_feature_dim", None, self._namespace)

    def embedding_layer(input):
        emb = fluid.layers.embedding(
            input=input,
            is_sparse=True,
            is_distributed=is_distributed,
            size=[sparse_feature_number, sparse_feature_dim],
            param_attr=fluid.ParamAttr(
                name="SparseFeatFactors",
                initializer=fluid.initializer.Uniform()))
        emb_sum = fluid.layers.sequence_pool(input=emb, pool_type='sum')
        return emb_sum

    def fc(input, output_size):
        output = fluid.layers.fc(
            input=input,
            size=output_size,
            act='relu',
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Normal(
                    scale=1.0 / math.sqrt(input.shape[1]))))
        return output

    sparse_embed_seq = list(map(embedding_layer, self.sparse_inputs))
    concated = fluid.layers.concat(
        sparse_embed_seq + [self.dense_input], axis=1)

    fcs = [concated]
    hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None,
                                        self._namespace)
    for size in hidden_layers:
        fcs.append(fc(fcs[-1], size))

    predict = fluid.layers.fc(
        input=fcs[-1],
        size=2,
        act="softmax",
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Normal(
                scale=1 / math.sqrt(fcs[-1].shape[1]))))

    self.predict = predict

def dense_input():
    dim = envs.get_global_env("hyper_parameters.dense_input_dim", None,
                              self._namespace)
    dense_input_var = fluid.layers.data(
        name="D", shape=[dim], dtype="float32")
    return dense_input_var

def save_persistables():
    save_interval = envs.get_global_env("save.increment.epoch_interval",
                                        -1, namespace)
    if not need_save(epoch_id, save_interval, False):
        return

    dirname = envs.get_global_env("save.increment.dirname", None, namespace)

    assert dirname is not None
    dirname = os.path.join(dirname, str(epoch_id))

    if is_fleet:
        fleet.save_persistables(self._exe, dirname)
    else:
        fluid.io.save_persistables(self._exe, dirname)
    self.increment_models.append((epoch_id, dirname))

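# save_persistables() and save_inference_model() above take no arguments and
# read epoch_id, namespace, is_fleet and need_save from an enclosing scope,
# which suggests they are closures nested inside a save() method. The call
# sites in the training loops below (self.save(i, "train", is_fleet=...))
# imply a wrapper roughly like this sketch (an assumption, not part of this
# excerpt):
def save(self, epoch_id, namespace, is_fleet=False):
    # nested here: need_save, save_persistables, save_inference_model
    save_persistables()
    save_inference_model()
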
def sparse_inputs():
    ids = envs.get_global_env("hyper_parameters.sparse_inputs_slots", None,
                              self._namespace)
    sparse_input_ids = [
        fluid.layers.data(
            name="S" + str(i), shape=[1], lod_level=1, dtype="int64")
        for i in range(1, ids)
    ]
    return sparse_input_ids

def init(self):
    self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    self.cont_max_ = [
        20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50
    ]
    self.cont_diff_ = [
        20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50
    ]
    self.hash_dim_ = envs.get_global_env(
        "hyper_parameters.sparse_feature_number", None, "train.model")
    self.continuous_range_ = range(1, 14)
    self.categorical_range_ = range(14, 40)

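# The statistics above follow the usual Criteo CTR preprocessing: the 13
# continuous slots are (value - min) / diff normalized and the 26 categorical
# slots are hashed into hash_dim_ buckets. A sketch of how a generate_sample()
# method is assumed to use them (the label slot name is illustrative; "D" and
# "S1".."S26" mirror the input names defined elsewhere in this file):
def generate_sample(self, line):
    def reader():
        features = line.rstrip('\n').split('\t')
        dense_feature = []
        for idx in self.continuous_range_:
            value = 0.0 if features[idx] == "" else float(features[idx])
            dense_feature.append(
                (value - self.cont_min_[idx - 1]) / self.cont_diff_[idx - 1])
        sparse_feature = [
            [hash(str(idx) + features[idx]) % self.hash_dim_]
            for idx in self.categorical_range_
        ]
        label = [int(features[0])]
        # each sample is a list of (slot_name, values) pairs; the dataloader
        # below keeps only the values (the second element of each pair)
        yield [("D", dense_feature)] + [
            ("S" + str(i), sparse_feature[i - 1])
            for i in range(1, len(sparse_feature) + 1)
        ] + [("label", label)]

    return reader
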
def user_define_engine(engine_yaml):
    with open(engine_yaml, 'r') as rb:
        _config = yaml.load(rb.read(), Loader=yaml.FullLoader)
    assert _config is not None

    envs.set_runtime_environs(_config)

    train_location = envs.get_global_env("engine.file")
    train_dirname = os.path.dirname(train_location)
    base_name = os.path.splitext(os.path.basename(train_location))[0]
    sys.path.append(train_dirname)
    trainer_class = envs.lazy_instance_by_fliename(base_name,
                                                   "UserDefineTraining")
    return trainer_class

def dataset_train(self, context):
    # run startup program at once
    self._exe.run(fluid.default_startup_program())

    dataset = self._get_dataset()
    epochs = envs.get_global_env("train.epochs")

    for i in range(epochs):
        self._exe.train_from_dataset(
            program=fluid.default_main_program(),
            dataset=dataset,
            fetch_list=self.fetch_vars,
            fetch_info=self.fetch_alias,
            print_period=self.fetch_period)
        self.save(i, "train", is_fleet=False)
    context['status'] = 'infer_pass'

def dataloader(readerclass, train, yaml_file):
    namespace = "train.reader"

    if train == "TRAIN":
        reader_name = "TrainReader"
        data_path = get_global_env("train_data_path", None, namespace)
    else:
        reader_name = "EvaluateReader"
        data_path = get_global_env("test_data_path", None, namespace)

    if data_path.startswith("fleetrec::"):
        package_base = get_runtime_environ("PACKAGE_BASE")
        assert package_base is not None
        data_path = os.path.join(package_base, data_path.split("::")[1])

    files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)]

    reader_class = lazy_instance_by_fliename(readerclass, reader_name)
    reader = reader_class(yaml_file)
    reader.init()

    def gen_reader():
        for file in files:
            with open(file, 'r') as f:
                for line in f:
                    line = line.rstrip('\n')
                    iter = reader.generate_sample(line)
                    for parsed_line in iter():
                        if parsed_line is None:
                            continue
                        else:
                            values = []
                            for pased in parsed_line:
                                values.append(pased[1])
                            yield values

    return gen_reader

def train(self, context):
    # run startup program at once
    self._exe.run(fluid.default_startup_program())

    dataset = self._get_dataset()
    epochs = envs.get_global_env("train.epochs")

    for i in range(epochs):
        self._exe.train_from_dataset(
            program=fluid.default_main_program(),
            dataset=dataset,
            fetch_list=self.fetch_vars,
            fetch_info=self.fetch_alias,
            print_period=self.fetch_period)
    context['is_exit'] = True

def dataset_train(self, context):
    self._exe.run(fleet.startup_program)
    fleet.init_worker()

    dataset = self._get_dataset()
    epochs = envs.get_global_env("train.epochs")

    for i in range(epochs):
        self._exe.train_from_dataset(
            program=fluid.default_main_program(),
            dataset=dataset,
            fetch_list=self.fetch_vars,
            fetch_info=self.fetch_alias,
            print_period=self.fetch_period)
        self.save(i, "train", is_fleet=True)

    fleet.stop_worker()
    context['status'] = 'terminal_pass'

def dataloader_train(self, context):
    self._exe.run(fleet.startup_program)
    fleet.init_worker()

    reader = self._get_dataloader()
    epochs = envs.get_global_env("train.epochs")

    program = fluid.compiler.CompiledProgram(
        fleet.main_program).with_data_parallel(
            loss_name=self.model.get_cost_op().name,
            build_strategy=self.strategy.get_build_strategy(),
            exec_strategy=self.strategy.get_execute_strategy())

    metrics_varnames = []
    metrics_format = []

    metrics_format.append("{}: {{}}".format("epoch"))
    metrics_format.append("{}: {{}}".format("batch"))

    for name, var in self.model.get_metrics().items():
        metrics_varnames.append(var.name)
        metrics_format.append("{}: {{}}".format(name))

    metrics_format = ", ".join(metrics_format)

    for epoch in range(epochs):
        reader.start()
        batch_id = 0
        try:
            while True:
                metrics_rets = self._exe.run(
                    program=program, fetch_list=metrics_varnames)

                metrics = [epoch, batch_id]
                metrics.extend(metrics_rets)

                if batch_id % 10 == 0 and batch_id != 0:
                    print(metrics_format.format(*metrics))
                batch_id += 1
        except fluid.core.EOFException:
            reader.reset()

    fleet.stop_worker()
    context['status'] = 'terminal_pass'

def build_strategy(self):
    mode = envs.get_runtime_environ("train.trainer.strategy")
    assert mode in ["async", "geo", "sync", "half_async"]

    strategy = None

    if mode == "async":
        strategy = StrategyFactory.create_async_strategy()
    elif mode == "geo":
        push_num = envs.get_global_env("train.strategy.mode.push_num", 100)
        strategy = StrategyFactory.create_geo_strategy(push_num)
    elif mode == "sync":
        strategy = StrategyFactory.create_sync_strategy()
    elif mode == "half_async":
        strategy = StrategyFactory.create_half_async_strategy()

    assert strategy is not None

    self.strategy = strategy
    return strategy

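# A sketch of how the strategy built above is assumed to be consumed: with the
# transpiler-based fleet API it is handed to distributed_optimizer() before
# minimizing the model cost. The method name below is illustrative;
# optimizer(), get_cost_op() and self.strategy are the names used elsewhere
# in this file.
def build_distributed_optimizer(self):
    optimizer = self.model.optimizer()
    optimizer = fleet.distributed_optimizer(optimizer, self.strategy)
    optimizer.minimize(self.model.get_cost_op())
    return optimizer
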
def instance(self, context):
    models = envs.get_global_env("train.model.models")
    model_class = envs.lazy_instance_by_fliename(models, "Model")
    self.model = model_class(None)
    context['status'] = 'init_pass'

def optimizer(self): learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace) optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True) return optimizer