import logging
import os
import time
from multiprocessing import cpu_count

import numpy as np
import paddle
import paddle.fluid as fluid

import reader

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)


def train_loop(args, train_program, feed_vars, loss, auc_var, batch_auc_var,
               trainer_num, trainer_id):
    # Smoke-test variant: pushes exactly one minibatch through the default
    # main program and logs the fetched metrics. The full feed-based
    # training loop is the variant below.
    dataset = reader.CriteoDataset(args.sparse_feature_dim)
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            dataset.train([args.train_data_path], trainer_num, trainer_id),
            buf_size=args.batch_size * 100),
        batch_size=args.batch_size)

    feed_var_names = [var.name for var in feed_vars]
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    pass_id = 0
    batch_id = 0
    feeder = fluid.DataFeeder(feed_var_names, place)

    for data in train_reader():
        loss_val, auc_val, batch_auc_val = exe.run(
            fluid.default_main_program(),
            feed=feeder.feed(data),
            fetch_list=[loss.name, auc_var.name, batch_auc_var.name])
        break  # one batch only

    loss_val = np.mean(loss_val)
    auc_val = np.mean(auc_val)
    batch_auc_val = np.mean(batch_auc_val)
    logger.info(
        "TRAIN --> pass: {} batch: {} loss: {} auc: {}, batch_auc: {}".format(
            pass_id, batch_id, loss_val / args.batch_size, auc_val,
            batch_auc_val))
def train_loop(args, train_program, data_list, loss, auc_var, batch_auc_var,
               trainer_num, trainer_id):
    # Feed-based training loop: iterates num_passes over the sharded Criteo
    # reader, logs every batch, and checkpoints an inference model every
    # 1000 batches and at the end of each pass (trainer 0 only).
    dataset = reader.CriteoDataset(args.sparse_feature_dim)
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            dataset.train([args.train_data_path], trainer_num, trainer_id),
            buf_size=args.batch_size * 100),
        batch_size=args.batch_size)

    place = fluid.CPUPlace()
    feeder = fluid.DataFeeder(feed_list=data_list, place=place)
    data_name_list = [var.name for var in data_list]

    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    for pass_id in range(args.num_passes):
        for batch_id, data in enumerate(train_reader()):
            loss_val, auc_val, batch_auc_val = exe.run(
                train_program,
                feed=feeder.feed(data),
                fetch_list=[loss, auc_var, batch_auc_var])
            logger.info(
                "TRAIN --> pass: {} batch: {} loss: {} auc: {}, batch_auc: {}"
                .format(pass_id, batch_id, loss_val / args.batch_size,
                        auc_val, batch_auc_val))
            if batch_id % 1000 == 0 and batch_id != 0:
                model_dir = args.model_output_dir + '/batch-' + str(batch_id)
                # Use the trainer_id parameter passed in (the original read
                # args.trainer_id, bypassing the explicit argument).
                if trainer_id == 0:
                    fluid.io.save_inference_model(model_dir, data_name_list,
                                                  [loss, auc_var], exe)
        model_dir = args.model_output_dir + '/pass-' + str(pass_id)
        if trainer_id == 0:
            fluid.io.save_inference_model(model_dir, data_name_list,
                                          [loss, auc_var], exe)
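# How the feed-based train_loop above is presumably wired up. parse_args()
# and ctr_dnn_model() live elsewhere in the repo, and the variants in this
# file call ctr_dnn_model() with differing signatures, so the return tuple
# assumed here is illustrative only -- a sketch, not the canonical entry
# point. The Adam learning rate is likewise a placeholder.
def local_train_sketch():
    args = parse_args()
    # Build the network inside the default programs so that
    # default_startup_program() initializes its parameters.
    loss, auc_var, batch_auc_var, data_list = ctr_dnn_model(
        args.embedding_size, args.sparse_feature_dim)  # assumed signature
    optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
    optimizer.minimize(loss)
    # Single-machine run: one trainer with id 0.
    train_loop(args, fluid.default_main_program(), data_list, loss,
               auc_var, batch_auc_var, 1, 0)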
def infer():
    # Evaluation variant that rebuilds the network in a fresh program and
    # loads raw persistable parameters (a checkpoint written by
    # save_persistables) rather than a packaged inference model.
    args = parse_args()

    place = fluid.CPUPlace()
    inference_scope = fluid.Scope()

    dataset = reader.CriteoDataset(args.sparse_feature_dim)
    test_reader = paddle.batch(
        dataset.test([args.data_path]), batch_size=args.batch_size)

    startup_program = fluid.framework.Program()
    test_program = fluid.framework.Program()
    with fluid.scope_guard(inference_scope):
        with fluid.framework.program_guard(test_program, startup_program):
            loss, auc_var, batch_auc_var, _, data_list, auc_states = \
                ctr_dnn_model(args.embedding_size, args.sparse_feature_dim,
                              False)

            exe = fluid.Executor(place)
            feeder = fluid.DataFeeder(feed_list=data_list, place=place)

            fluid.io.load_persistables(
                executor=exe,
                dirname=args.model_path,
                main_program=fluid.default_main_program())

            def set_zero(var_name):
                # Reset an accumulator tensor to zeros so AUC state loaded
                # from the training checkpoint does not leak into evaluation.
                param = inference_scope.var(var_name).get_tensor()
                param_array = np.zeros(param._get_dims()).astype("int64")
                param.set(param_array, place)

            for var in auc_states:
                set_zero(var.name)

            for batch_id, data in enumerate(test_reader()):
                loss_val, auc_val = exe.run(test_program,
                                            feed=feeder.feed(data),
                                            fetch_list=[loss, auc_var])
                if batch_id % 100 == 0:
                    logger.info("TEST --> batch: {} loss: {} auc: {}".format(
                        batch_id, loss_val / args.batch_size, auc_val))
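# Interface that both the train and infer loops assume from
# reader.CriteoDataset. This is a sketch of the contract only -- the real
# parsing of Criteo TSV lines lives in reader.py; _parse() below is a
# hypothetical placeholder. Sharding works by skipping lines so each
# trainer sees a disjoint 1/trainer_num slice of the data.
class CriteoDatasetSketch(object):
    def __init__(self, sparse_feature_dim):
        self.sparse_feature_dim = sparse_feature_dim

    def train(self, file_list, trainer_num, trainer_id):
        def reader():
            for path in file_list:
                with open(path) as f:
                    for line_idx, line in enumerate(f):
                        # Round-robin sharding across trainers.
                        if line_idx % trainer_num != trainer_id:
                            continue
                        yield self._parse(line)
        return reader

    def test(self, file_list):
        # A single "trainer" sees every line, i.e. no sharding.
        return self.train(file_list, 1, 0)

    def _parse(self, line):
        # Placeholder: the real reader turns one Criteo line into
        # [dense_features] + [sparse_id_lists...] + [label].
        raise NotImplementedError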
def infer():
    # Evaluation variant that loads a packaged inference model (written by
    # save_inference_model). The network is still rebuilt locally so the
    # DataFeeder knows the input layout.
    args = parse_args()

    place = fluid.CPUPlace()
    inference_scope = fluid.core.Scope()

    dataset = reader.CriteoDataset(args.sparse_feature_dim)
    test_reader = paddle.batch(
        dataset.test([args.data_path]), batch_size=args.batch_size)

    startup_program = fluid.framework.Program()
    test_program = fluid.framework.Program()
    with fluid.framework.program_guard(test_program, startup_program):
        loss, data_list, auc_var, batch_auc_var = ctr_dnn_model(
            args.embedding_size, args.sparse_feature_dim)

        exe = fluid.Executor(place)
        feeder = fluid.DataFeeder(feed_list=data_list, place=place)

        with fluid.scope_guard(inference_scope):
            [inference_program, _,
             fetch_targets] = fluid.io.load_inference_model(args.model_path,
                                                            exe)

            def set_zero(var_name):
                # Zero out an int64 accumulator tensor in the inference
                # scope so evaluation starts from a clean AUC state.
                param = inference_scope.var(var_name).get_tensor()
                param_array = np.zeros(param._get_dims()).astype("int64")
                param.set(param_array, place)

            # The AUC op keeps its state in auto-generated variables; the
            # hard-coded names below depend on variable creation order (see
            # the lookup sketch after this function).
            auc_states_names = ['_generated_var_2', '_generated_var_3']
            for name in auc_states_names:
                set_zero(name)

            for batch_id, data in enumerate(test_reader()):
                loss_val, auc_val = exe.run(inference_program,
                                            feed=feeder.feed(data),
                                            fetch_list=fetch_targets)
                if batch_id % 100 == 0:
                    logger.info("TEST --> batch: {} loss: {} auc: {}".format(
                        batch_id, loss_val / args.batch_size, auc_val))
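# The hard-coded '_generated_var_2'/'_generated_var_3' names above are
# brittle: they depend on the order in which fluid.layers.auc created its
# state variables. A hedged alternative -- not from the original repo --
# that looks the state variables up in the loaded program instead,
# assuming the AUC accumulators are the only persistable int64 LoDTensors:
def find_auc_state_names(program):
    names = []
    for var in program.list_vars():
        # Skip feed/fetch plumbing and anything that is not a LoDTensor.
        if var.type != fluid.core.VarDesc.VarType.LOD_TENSOR:
            continue
        if var.persistable and var.dtype == fluid.core.VarDesc.VarType.INT64:
            names.append(var.name)
    return names

# Usage inside infer(), replacing the hard-coded list:
#     for name in find_auc_state_names(inference_program):
#         set_zero(name)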
def train_loop(args, train_program, py_reader, loss, auc_var, batch_auc_var,
               trainer_num, trainer_id):
    # py_reader + ParallelExecutor variant with CE (continuous evaluation)
    # hooks: fixes random seeds, times each pass, and checkpoints raw
    # persistables instead of a packaged inference model.
    if args.enable_ce:
        SEED = 102
        train_program.random_seed = SEED
        fluid.default_startup_program().random_seed = SEED

    dataset = reader.CriteoDataset(args.sparse_feature_dim)
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            dataset.train([args.train_data_path], trainer_num, trainer_id),
            buf_size=args.batch_size * 100),
        batch_size=args.batch_size)
    py_reader.decorate_paddle_reader(train_reader)

    place = fluid.CPUPlace()
    exe = fluid.Executor(place)

    exec_strategy = fluid.ExecutionStrategy()
    build_strategy = fluid.BuildStrategy()

    if os.getenv("NUM_THREADS", ""):
        exec_strategy.num_threads = int(os.getenv("NUM_THREADS"))

    cpu_num = int(os.environ.get('CPU_NUM', cpu_count()))
    build_strategy.reduce_strategy = \
        fluid.BuildStrategy.ReduceStrategy.Reduce if cpu_num > 1 \
        else fluid.BuildStrategy.ReduceStrategy.AllReduce

    exe.run(fluid.default_startup_program())
    pe = fluid.ParallelExecutor(use_cuda=False,
                                loss_name=loss.name,
                                main_program=train_program,
                                build_strategy=build_strategy,
                                exec_strategy=exec_strategy)

    total_time = 0
    for pass_id in range(args.num_passes):
        pass_start = time.time()
        batch_id = 0
        py_reader.start()
        try:
            while True:
                loss_val, auc_val, batch_auc_val = pe.run(
                    fetch_list=[loss.name, auc_var.name, batch_auc_var.name])
                # ParallelExecutor returns per-device results; average them.
                loss_val = np.mean(loss_val)
                auc_val = np.mean(auc_val)
                batch_auc_val = np.mean(batch_auc_val)
                logger.info(
                    "TRAIN --> pass: {} batch: {} loss: {} auc: {}, batch_auc: {}"
                    .format(pass_id, batch_id, loss_val / args.batch_size,
                            auc_val, batch_auc_val))
                if batch_id % 1000 == 0 and batch_id != 0:
                    model_dir = args.model_output_dir + '/batch-' + str(
                        batch_id)
                    if trainer_id == 0:
                        fluid.io.save_persistables(
                            executor=exe,
                            dirname=model_dir,
                            main_program=fluid.default_main_program())
                batch_id += 1
        except fluid.core.EOFException:
            # The py_reader queue is exhausted; reset it for the next pass.
            py_reader.reset()
            print("pass_id: %d, pass_time_cost: %f" %
                  (pass_id, time.time() - pass_start))
            total_time += time.time() - pass_start

        model_dir = args.model_output_dir + '/pass-' + str(pass_id)
        if trainer_id == 0:
            fluid.io.save_persistables(
                executor=exe,
                dirname=model_dir,
                main_program=fluid.default_main_program())

    # only for ce
    if args.enable_ce:
        threads_num, cpu_num = get_cards(args)
        epoch_idx = args.num_passes
        print("kpis\teach_pass_duration_cpu%s_thread%s\t%s" %
              (cpu_num, threads_num, total_time / epoch_idx))
        print("kpis\ttrain_loss_cpu%s_thread%s\t%s" %
              (cpu_num, threads_num, loss_val / args.batch_size))
        print("kpis\ttrain_auc_val_cpu%s_thread%s\t%s" %
              (cpu_num, threads_num, auc_val))
        print("kpis\ttrain_batch_auc_val_cpu%s_thread%s\t%s" %
              (cpu_num, threads_num, batch_auc_val))
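# get_cards() is referenced by the CE branch above but not defined in this
# file. A minimal sketch, assuming it simply reports the same environment
# variables the loop itself reads for thread and core counts:
def get_cards(args):
    threads_num = int(os.environ.get('NUM_THREADS', 1))
    cpu_num = int(os.environ.get('CPU_NUM', 1))
    return threads_num, cpu_num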
def train_loop(args, train_program, py_reader, loss, auc_var, batch_auc_var,
               trainer_num, trainer_id):
    # py_reader + ParallelExecutor variant without the CE hooks; checkpoints
    # a packaged inference model instead of raw persistables.
    dataset = reader.CriteoDataset(args.sparse_feature_dim)
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            dataset.train([args.train_data_path], trainer_num, trainer_id),
            buf_size=args.batch_size * 100),
        batch_size=args.batch_size)
    py_reader.decorate_paddle_reader(train_reader)

    # Inputs come from py_reader, not from feeding, so the list of feed
    # variable names saved with the inference model is empty.
    data_name_list = []

    place = fluid.CPUPlace()
    exe = fluid.Executor(place)

    exec_strategy = fluid.ExecutionStrategy()
    build_strategy = fluid.BuildStrategy()

    if os.getenv("NUM_THREADS", ""):
        exec_strategy.num_threads = int(os.getenv("NUM_THREADS"))

    cpu_num = int(os.environ.get('CPU_NUM', cpu_count()))
    build_strategy.reduce_strategy = \
        fluid.BuildStrategy.ReduceStrategy.Reduce if cpu_num > 1 \
        else fluid.BuildStrategy.ReduceStrategy.AllReduce

    # Initialize parameters before ParallelExecutor copies them to the
    # devices (the original built the ParallelExecutor first).
    exe.run(fluid.default_startup_program())
    pe = fluid.ParallelExecutor(use_cuda=False,
                                loss_name=loss.name,
                                main_program=train_program,
                                build_strategy=build_strategy,
                                exec_strategy=exec_strategy)

    for pass_id in range(args.num_passes):
        pass_start = time.time()
        batch_id = 0
        py_reader.start()
        try:
            while True:
                loss_val, auc_val, batch_auc_val = pe.run(
                    fetch_list=[loss.name, auc_var.name, batch_auc_var.name])
                loss_val = np.mean(loss_val)
                auc_val = np.mean(auc_val)
                batch_auc_val = np.mean(batch_auc_val)
                logger.info(
                    "TRAIN --> pass: {} batch: {} loss: {} auc: {}, batch_auc: {}"
                    .format(pass_id, batch_id, loss_val / args.batch_size,
                            auc_val, batch_auc_val))
                if batch_id % 1000 == 0 and batch_id != 0:
                    model_dir = args.model_output_dir + '/batch-' + str(
                        batch_id)
                    if trainer_id == 0:
                        fluid.io.save_inference_model(model_dir,
                                                      data_name_list,
                                                      [loss, auc_var], exe)
                batch_id += 1
        except fluid.core.EOFException:
            py_reader.reset()
            print("pass_id: %d, pass_time_cost: %f" %
                  (pass_id, time.time() - pass_start))

        model_dir = args.model_output_dir + '/pass-' + str(pass_id)
        if trainer_id == 0:
            fluid.io.save_inference_model(model_dir, data_name_list,
                                          [loss, auc_var], exe)
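# All variants above call parse_args(), defined elsewhere in the repo. A
# sketch covering only the flags actually read in this file; defaults are
# illustrative guesses, not the repo's real values. Thread and device
# counts come from the NUM_THREADS / CPU_NUM environment variables rather
# than flags.
import argparse


def parse_args_sketch():
    parser = argparse.ArgumentParser(description="PaddlePaddle CTR-DNN")
    parser.add_argument('--train_data_path', type=str,
                        default='./data/raw/train.txt')
    parser.add_argument('--data_path', type=str,
                        default='./data/raw/valid.txt')  # used by infer()
    parser.add_argument('--model_path', type=str, default=None,
                        help='checkpoint or inference model to load')
    parser.add_argument('--model_output_dir', type=str, default='models')
    parser.add_argument('--batch_size', type=int, default=1000)
    parser.add_argument('--num_passes', type=int, default=10)
    parser.add_argument('--embedding_size', type=int, default=10)
    parser.add_argument('--sparse_feature_dim', type=int, default=1000001)
    parser.add_argument('--trainer_id', type=int, default=0)
    parser.add_argument('--enable_ce', action='store_true')
    return parser.parse_args()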