# Shared imports for the snippets in this section. Project-level helpers
# (parse_args, logger, train_loop, ctr_dnn_model, reader, NOW_DATETIME,
# SequenceFileWriter, write_donefile) are assumed to come from the
# surrounding repository.
import os
import struct

import numpy as np
import paddle
import paddle.fluid as fluid


def train():
    args = parse_args()

    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    # Build the CTR-DNN network and attach the Adam optimizer.
    loss, data_list, auc_var, batch_auc_var = ctr_dnn_model(
        args.embedding_size, args.sparse_feature_dim)
    optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
    optimizer.minimize(loss)

    if args.is_local:
        logger.info("run local training")
        main_program = fluid.default_main_program()
        train_loop(args, main_program, data_list, loss, auc_var,
                   batch_auc_var, 1, 0)
    else:
        logger.info("run dist training")
        # Split the single-node program into pserver/trainer programs.
        t = fluid.DistributeTranspiler()
        t.transpile(
            args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
        if args.role == "pserver":
            logger.info("run pserver")
            prog = t.get_pserver_program(args.current_endpoint)
            startup = t.get_startup_program(
                args.current_endpoint, pserver_program=prog)
            exe = fluid.Executor(fluid.CPUPlace())
            exe.run(startup)
            exe.run(prog)
        elif args.role == "trainer":
            logger.info("run trainer")
            train_prog = t.get_trainer_program()
            train_loop(args, train_prog, data_list, loss, auc_var,
                       batch_auc_var, args.trainers, args.trainer_id)
# Variant of train() that feeds data through a py_reader and supports cloud
# deployments configured via environment variables.
def train():
    args = parse_args()

    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    loss, auc_var, batch_auc_var, py_reader, _ = ctr_dnn_model(
        args.embedding_size, args.sparse_feature_dim)
    optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
    optimizer.minimize(loss)

    if args.cloud_train:
        # the port of all pservers, needed by both trainer and pserver
        port = os.getenv("PADDLE_PORT", "6174")
        # comma separated ips of all pservers, needed by trainer and pserver
        pserver_ips = os.getenv("PADDLE_PSERVERS", "")
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        args.endpoints = ",".join(eplist)
        args.trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
        args.current_endpoint = os.getenv("POD_IP", "localhost") + ":" + port
        args.role = os.getenv("TRAINING_ROLE", "TRAINER")
        args.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        args.is_local = bool(int(os.getenv("PADDLE_IS_LOCAL", 0)))

    if args.is_local:
        logger.info("run local training")
        main_program = fluid.default_main_program()
        train_loop(args, main_program, py_reader, loss, auc_var,
                   batch_auc_var, 1, 0)
    else:
        logger.info("run dist training")
        t = fluid.DistributeTranspiler()
        t.transpile(
            args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
        if args.role == "pserver" or args.role == "PSERVER":
            logger.info("run pserver")
            prog = t.get_pserver_program(args.current_endpoint)
            startup = t.get_startup_program(
                args.current_endpoint, pserver_program=prog)
            exe = fluid.Executor(fluid.CPUPlace())
            exe.run(startup)
            exe.run(prog)
        elif args.role == "trainer" or args.role == "TRAINER":
            logger.info("run trainer")
            train_prog = t.get_trainer_program()
            train_loop(args, train_prog, py_reader, loss, auc_var,
                       batch_auc_var, args.trainers, args.trainer_id)
        else:
            raise ValueError(
                'TRAINING_ROLE environment variable must be either '
                'TRAINER or PSERVER')
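
# A minimal sketch (not the project's deployment config) of how a cloud
# process could be configured before calling the train() variant above.
# It sets exactly the environment variables the cloud_train branch reads;
# the addresses and counts are hypothetical placeholders, and enabling
# args.cloud_train is assumed to be a parse_args flag.
def _cloud_env_sketch():
    # Hypothetical cluster layout: two pservers, this process is trainer 0.
    os.environ["PADDLE_IS_LOCAL"] = "0"
    os.environ["PADDLE_PORT"] = "6174"
    os.environ["PADDLE_PSERVERS"] = "192.168.0.2,192.168.0.3"  # pserver ips
    os.environ["PADDLE_TRAINERS_NUM"] = "2"
    os.environ["POD_IP"] = "192.168.0.10"  # this node's own ip
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    # train() would then be invoked with cloud_train enabled.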
def dump():
    args = parse_args()

    output_data_path = os.path.abspath(args.output_data_path)
    base_datafile = output_data_path + "/" + NOW_DATETIME + "/base/feature"
    base_donefile = output_data_path + "/" + "donefile/" + "base.txt"
    patch_datafile = output_data_path + "/" + NOW_DATETIME + "/patch/feature"
    patch_donefile = output_data_path + "/" + "donefile/" + "patch.txt"

    place = fluid.CPUPlace()
    inference_scope = fluid.Scope()

    startup_program = fluid.framework.Program()
    test_program = fluid.framework.Program()
    with fluid.framework.program_guard(test_program, startup_program):
        loss, auc_var, batch_auc_var, _, data_list = ctr_dnn_model(
            args.embedding_size, args.sparse_feature_dim, False)

        exe = fluid.Executor(place)
        feeder = fluid.DataFeeder(feed_list=data_list, place=place)

        fluid.io.load_persistables(
            executor=exe,
            dirname=args.model_path,
            main_program=fluid.default_main_program())

        # Dump the embedding table as <uint64 key, float32 row> records.
        t = np.array(
            fluid.global_scope().find_var('SparseFeatFactors').get_tensor())
        if not os.access(os.path.dirname(base_datafile), os.F_OK):
            os.makedirs(os.path.dirname(base_datafile))
        with open(base_datafile, "wb") as f:
            writer = SequenceFileWriter(f)
            for i in range(0, t.shape[0]):
                key_bytes = struct.pack('Q', i)
                row_bytes = struct.pack('%sf' % t.shape[1], *t[i])
                writer.write(key_bytes, row_bytes)

    write_donefile(base_datafile, base_donefile)
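
# write_donefile() is not defined in this section. The sketch below is a
# hypothetical guess at what such a helper typically does: append the path
# of a completed dump to a plain-text donefile that downstream loaders
# poll. The real helper's record format may differ.
def write_donefile(datafile, donefile):
    # Assumed format: one finished datafile path per line.
    if not os.access(os.path.dirname(donefile), os.F_OK):
        os.makedirs(os.path.dirname(donefile))
    with open(donefile, "a") as f:
        f.write(datafile + "\n")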
def infer():
    args = parse_args()

    place = fluid.CPUPlace()
    inference_scope = fluid.Scope()

    dataset = reader.CriteoDataset(args.sparse_feature_dim)
    test_reader = paddle.batch(
        dataset.test([args.data_path]), batch_size=args.batch_size)

    startup_program = fluid.framework.Program()
    test_program = fluid.framework.Program()
    with fluid.scope_guard(inference_scope):
        with fluid.framework.program_guard(test_program, startup_program):
            loss, auc_var, batch_auc_var, _, data_list, auc_states = ctr_dnn_model(
                args.embedding_size, args.sparse_feature_dim, False)

            exe = fluid.Executor(place)
            feeder = fluid.DataFeeder(feed_list=data_list, place=place)

            fluid.io.load_persistables(
                executor=exe,
                dirname=args.model_path,
                main_program=fluid.default_main_program())

            def set_zero(var_name):
                param = inference_scope.var(var_name).get_tensor()
                param_array = np.zeros(param._get_dims()).astype("int64")
                param.set(param_array, place)

            for var in auc_states:
                set_zero(var.name)

            for batch_id, data in enumerate(test_reader()):
                loss_val, auc_val = exe.run(test_program,
                                            feed=feeder.feed(data),
                                            fetch_list=[loss, auc_var])
                if batch_id % 100 == 0:
                    logger.info("TEST --> batch: {} loss: {} auc: {}".format(
                        batch_id, loss_val / args.batch_size, auc_val))
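
# set_zero() above wipes the int64 accumulator tensors behind the AUC op,
# so the test AUC is not polluted by statistics carried in the loaded
# checkpoint. The sketch below is a hedged guess at where auc_states
# originates, assuming ctr_dnn_model builds its metric with
# fluid.layers.auc in the usual way; the input shapes, threshold count,
# and window size are illustrative, not taken from this section.
def _auc_states_sketch():
    predict = fluid.layers.data(name="predict", shape=[2], dtype="float32")
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    # The third return value is the list of stateful accumulator variables
    # that infer() zeroes above.
    auc_var, batch_auc_var, auc_states = fluid.layers.auc(
        input=predict, label=label, num_thresholds=2 ** 12, slide_steps=20)
    return auc_states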
# Variant of infer() that loads a saved inference program instead of raw
# persistables, and zeroes the AUC state variables by their generated names.
def infer():
    args = parse_args()

    place = fluid.CPUPlace()
    inference_scope = fluid.core.Scope()

    dataset = reader.CriteoDataset(args.sparse_feature_dim)
    test_reader = paddle.batch(
        dataset.test([args.data_path]), batch_size=args.batch_size)

    startup_program = fluid.framework.Program()
    test_program = fluid.framework.Program()
    with fluid.framework.program_guard(test_program, startup_program):
        loss, data_list, auc_var, batch_auc_var = ctr_dnn_model(
            args.embedding_size, args.sparse_feature_dim)

        exe = fluid.Executor(place)
        feeder = fluid.DataFeeder(feed_list=data_list, place=place)

        with fluid.scope_guard(inference_scope):
            [inference_program, _, fetch_targets] = \
                fluid.io.load_inference_model(args.model_path, exe)

            def set_zero(var_name):
                param = inference_scope.var(var_name).get_tensor()
                param_array = np.zeros(param._get_dims()).astype("int64")
                param.set(param_array, place)

            auc_states_names = ['_generated_var_2', '_generated_var_3']
            for name in auc_states_names:
                set_zero(name)

            for batch_id, data in enumerate(test_reader()):
                loss_val, auc_val = exe.run(inference_program,
                                            feed=feeder.feed(data),
                                            fetch_list=fetch_targets)
                if batch_id % 100 == 0:
                    logger.info("TEST --> batch: {} loss: {} auc: {}".format(
                        batch_id, loss_val / args.batch_size, auc_val))
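
# This infer() variant expects args.model_path to contain a model saved
# with fluid.io.save_inference_model rather than raw persistables. Below is
# a minimal sketch of the saving side; the feed names and fetch targets are
# assumptions (the training code's actual call is not shown in this
# section).
def _save_for_inference_sketch(exe, data_list, loss, auc_var, model_path):
    # Produces the directory that
    # fluid.io.load_inference_model(args.model_path, exe) reads back.
    fluid.io.save_inference_model(
        dirname=model_path,
        feeded_var_names=[var.name for var in data_list],
        target_vars=[loss, auc_var],
        executor=exe)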