def save_inference_model():
    """Export an inference model for the current epoch when configured.

    Reads the ``save.inference.*`` settings through ``envs.get_global_env``.
    Nothing is written when ``need_save`` rejects the epoch, or when either
    the feed or the fetch variable names are not configured. Otherwise the
    model goes to ``<save.inference.dirname>/<epoch_id>`` and the
    ``(epoch_id, dirname)`` pair is appended to ``self.inference_models``.

    ``epoch_id``, ``namespace``, ``is_fleet`` and ``self`` are not
    parameters; they are resolved from the surrounding scope.

    Raises:
        AssertionError: if ``save.inference.dirname`` is not configured.
    """
    interval = envs.get_global_env(
        "save.inference.epoch_interval", -1, namespace)
    if not need_save(epoch_id, interval, False):
        return

    feed_names = envs.get_global_env(
        "save.inference.feed_varnames", None, namespace)
    fetch_names = envs.get_global_env(
        "save.inference.fetch_varnames", None, namespace)
    # Both name lists are required; a missing one silently disables export.
    if feed_names is None:
        return
    if fetch_names is None:
        return

    # Resolve every fetch name to its variable in the main program's
    # global block.
    targets = []
    for name in fetch_names:
        block_vars = fluid.default_main_program().global_block().vars
        targets.append(block_vars[name])

    base_dir = envs.get_global_env("save.inference.dirname", None, namespace)
    assert base_dir is not None
    epoch_dir = os.path.join(base_dir, str(epoch_id))

    # The fleet and the plain fluid savers take the executor in different
    # positions.
    if is_fleet:
        fleet.save_inference_model(self._exe, epoch_dir, feed_names, targets)
    else:
        fluid.io.save_inference_model(epoch_dir, feed_names, targets,
                                      self._exe)

    self.inference_models.append((epoch_id, epoch_dir))
def save_inference_model():
    """Report that inference-model export is unsupported for a due epoch.

    Reads ``save.inference.epoch_interval`` (default ``-1``) through
    ``envs.get_global_env`` and asks ``need_save`` whether the current epoch
    is due. When it is not, the function returns silently; when it is, a
    notice is printed and nothing is written.

    ``epoch_id`` and ``namespace`` are not parameters; they are resolved from
    the surrounding scope.
    """
    save_interval = envs.get_global_env(
        "save.inference.epoch_interval", -1, namespace)

    if not need_save(epoch_id, save_interval, False):
        return

    # The export code that used to follow this notice sat behind an
    # unconditional return and could never run, so it has been removed.
    # NOTE(review): that draft looked variables up via
    # fluid.global_scope().vars and called fleet.save_inference_model without
    # an executor -- re-check both against the Paddle API before reviving it.
    print("save inference model is not supported now.")
def save_model(self, FLAGS, net_output, global_step):
    """
    Save a checkpoint for ``global_step`` from the first worker.

    Returns without doing anything unless ``fleet.is_first_worker()`` is true
    and ``global_step`` is either the string ``"final"`` or a multiple of
    ``FLAGS.save_model_steps``.

    Args:
        FLAGS: read for ``save_model_steps`` and ``train_dir``; also passed
            on to ``self.record_checkpoint``.
        net_output: mapping whose ``['model_output']`` entry supplies
            ``'feeded_var_names'`` and ``'fetch_targets'`` for the export.
        global_step: step number, or the string ``"final"``.
    """
    # The "final" comparison comes first so the modulo is never applied to
    # the string.
    if (global_step != "final" and global_step % FLAGS.save_model_steps != 0) \
            or not fleet.is_first_worker():
        return

    # Checkpoints are written to <train_dir>/checkpoint_<global_step>.
    path = "%s/checkpoint_%s" % (FLAGS.train_dir, global_step)
    fleet.save_inference_model(self.paddle_env['exe'], path,
                               net_output['model_output']['feeded_var_names'],
                               net_output['model_output']['fetch_targets'])
    #or
    # NOTE(review): the source marks the following call with "#or"; it is
    # kept as a live call here -- confirm that both the inference model and
    # the persistables are meant to be written to the same path.
    fleet.save_persistables(self.paddle_env['exe'], path)
    self.record_checkpoint(FLAGS, global_step)
exe.train_from_dataset( program=fluid.default_main_program(), dataset=dataset, fetch_handler=FH([auc_var.name], 10, True), # fetch_list=[auc_var], # fetch_info=["auc"], debug=False) path = "./saved_models/" + current_date_hr.strftime( DATE_TIME_STRING_FORMAT) + "_model/" logger.info("save inference program: " + path) if len(y_auc) <= 1: logger.info("Current AUC: " + str(y_auc[-1])) else: logger.info("Dataset is too small, cannot get AUC.") fetch_list = fleet.save_inference_model( exe, path, [x.name for x in sparse_input_ids] + [label.name], [auc_var]) os.system("hadoop fs -D hadoop.job.ugi=" + hdfs_ugi + " -D fs.defaultFS=" + hdfs_address + " -put -f " + path + " " + os.path.join( dataset_prefix, current_date_hr.strftime( DATE_TIME_STRING_FORMAT).split("/")[0]) + " >/dev/null 2>&1") os.system('touch donefile') os.system( "hadoop fs -D hadoop.job.ugi=" + hdfs_ugi + " -D fs.defaultFS=" + hdfs_address + " -put -f donefile" + " " + os.path.join( dataset_prefix, current_date_hr.strftime(DATE_TIME_STRING_FORMAT) +
def train(use_cuda, save_dirname, is_local, is_increment):
    """
    Train the network built by ``model()`` either locally or through fleet.

    Args:
        use_cuda: place the executor on ``fluid.CUDAPlace(0)`` when true,
            otherwise on ``fluid.CPUPlace()``.
        save_dirname: directory that receives the inference model and the
            persistables every 1000 batches (batch 0 included); when
            ``None`` nothing is saved and ``infer`` is not called.
        is_local: true to train in-process with a data-parallel compiled
            program; false to act as a fleet server and/or worker.
        is_increment: only read in local mode; when true, parameters are
            loaded before training and the AUC state variables are zeroed.
    """
    import time

    # NOTE(review): old_model is never given a real path, so the
    # is_increment branch below calls load_params with None -- confirm where
    # the fine-tune checkpoint is supposed to come from.
    old_model = None
    model_args = model()
    predict = model_args['predict']
    avg_cost = model_args['avg_cost']
    feed_order = model_args['feed_order']
    loader = model_args['loader']
    auc_batch = model_args['auc'][1]

    sgd_optimizer = AdamOptimizer(learning_rate=2e-4)

    if is_local:
        sgd_optimizer.minimize(avg_cost)
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = Executor(place)

        # Feed the loader from 16 reader instances over the training
        # directory, combined by multiprocess_reader.
        readers = [data_reader(cluster_train_dir) for _ in range(16)]
        multi_readers = paddle.reader.multiprocess_reader(readers)
        loader.set_sample_generator(
            multi_readers,
            batch_size=BATCH_SIZE,
            places=fluid.cpu_places(CPU_NUM))

        start_program = fluid.default_startup_program()
        exe.run(start_program)
        main_prog = fluid.default_main_program()

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = CPU_NUM * 2
        build_strategy = fluid.BuildStrategy()
        # cpu reduce faster
        build_strategy.reduce_strategy = \
            fluid.BuildStrategy.ReduceStrategy.Reduce
        build_strategy.fuse_broadcast_ops = True
        main_program = fluid.CompiledProgram(main_prog).with_data_parallel(
            loss_name=avg_cost.name,
            exec_strategy=exec_strategy,
            build_strategy=build_strategy)

        if is_increment:
            # Load the previous model to fine-tune, then reset AUC state.
            # NOTE(review): main_program is the compiled program here, not
            # main_prog -- confirm load_params accepts it.
            fluid.io.load_params(exe, old_model, main_program)
            for auc_state in model_args['auc'][2]:
                set_zero(place, fluid.global_scope(), auc_state.name)

        cost_list = []
        auc_list = []
        pass_s_time = time.time()
        for pass_id in range(PASS_NUM):
            s_time = time.time()
            for batch_id, data in enumerate(loader()):
                # r_time: time spent waiting for this batch to be read.
                r_time = time.time() - s_time
                st_time = time.time()
                cost_value, auc_value = exe.run(
                    program=main_program,
                    feed=data,
                    fetch_list=[avg_cost.name, auc_batch.name])
                # t_time: time spent inside exe.run for this batch.
                t_time = time.time() - st_time
                cost_list.append(np.array(cost_value))
                auc_list.append(np.array(auc_value))

                if batch_id % 10 == 0 and batch_id != 0:
                    # Single-argument print works on Python 2 and 3 alike.
                    print("Pass %d, batch %d, cost %s auc %s readtime %f triantime %f" %
                          (pass_id, batch_id, np.array(cost_list).mean(),
                           np.array(auc_list).mean(), r_time, t_time))
                    cost_list = []
                    auc_list = []

                if batch_id % 1000 == 0:
                    if save_dirname is not None:
                        fluid.io.save_inference_model(
                            save_dirname,
                            feed_order,
                            [predict, avg_cost, auc_batch],
                            exe)
                        fluid.io.save_persistables(exe, save_dirname)
                        infer(cluster_test_dir, save_dirname, feed_order)

                # Restart the read timer for the next batch.
                s_time = time.time()

        # Wall-clock time measured from before the first pass.
        pass_time = time.time() - pass_s_time
        print("Pass train time: %f" % pass_time)

    else:
        role = role_maker.PaddleCloudRoleMaker()
        # Fully asynchronous training.
        config = DistributeTranspilerConfig()
        config.sync_mode = False
        config.runtime_split_send_recv = True
        # Initialise the fleet environment.
        fleet.init(role)

        # Wrap the optimizer with fleet's distributed optimizer so the
        # distributed strategy config is applied.
        optimizer = fleet.distributed_optimizer(sgd_optimizer, config)
        optimizer.minimize(avg_cost)

        if fleet.is_server():
            fleet.init_server()
            fleet.run_server()

        # Start the worker.
        if fleet.is_worker():
            # Initialise the worker configuration.
            fleet.init_worker()

            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
            exe = Executor(place)

            feeder = fluid.DataFeeder(feed_order, place)
            train_reader = feeder.decorate_reader(
                paddle.batch(
                    paddle.reader.shuffle(
                        data_reader(cluster_train_dir), buf_size=8192),
                    batch_size=BATCH_SIZE),
                multi_devices=False,
                drop_last=True)

            exe.run(fleet.startup_program)

            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.num_threads = CPU_NUM
            build_strategy = fluid.BuildStrategy()
            build_strategy.async_mode = True
            if CPU_NUM > 1:
                build_strategy.reduce_strategy = \
                    fluid.BuildStrategy.ReduceStrategy.Reduce

            compiled_prog = fluid.compiler.CompiledProgram(
                fleet.main_program).with_data_parallel(
                    loss_name=avg_cost.name,
                    build_strategy=build_strategy,
                    exec_strategy=exec_strategy)

            for pass_id in range(PASS_NUM):
                cost_list = []
                auc_list = []
                s_time = time.time()
                for batch_id, data in enumerate(train_reader()):
                    r_time = time.time() - s_time
                    # Time only the exe.run call. The previous code computed
                    # time.time() - r_time, i.e. a timestamp minus a
                    # duration, which made the logged train time meaningless.
                    st_time = time.time()
                    cost_value, auc_value = exe.run(
                        program=compiled_prog,
                        feed=data,
                        fetch_list=[avg_cost.name, auc_batch.name])
                    t_time = time.time() - st_time
                    cost_list.append(np.array(cost_value))
                    auc_list.append(np.array(auc_value))

                    if batch_id % 10 == 0 and batch_id != 0:
                        print("Pass %d, batch %d, cost %s auc %s readtime %f traintime %f" %
                              (pass_id, batch_id, np.array(cost_list).mean(),
                               np.array(auc_list).mean(), r_time, t_time))
                        cost_list = []
                        auc_list = []

                    # Only the first worker writes snapshots.
                    if batch_id % 1000 == 0 and fleet.is_first_worker():
                        if save_dirname is not None:
                            fleet.save_inference_model(
                                exe,
                                save_dirname,
                                feed_order,
                                [predict, avg_cost, auc_batch])
                            fleet.save_persistables(exe, save_dirname)
                            infer(cluster_test_dir, save_dirname, feed_order)

                    # Restart the read timer for the next batch.
                    s_time = time.time()

            fleet.stop_worker()
def save(predict, savaPath, exe):
    """Export ``predict`` as a fleet inference model under ``savaPath``.

    The directory is created first when it does not exist yet. The exported
    model is declared with a single feed variable named ``'images'`` and
    ``predict`` as its only target.
    """
    directory_missing = not os.path.exists(savaPath)
    if directory_missing:
        os.makedirs(savaPath)

    print('save models to %s' % (savaPath))

    fleet.save_inference_model(
        executor=exe,
        dirname=savaPath,
        feeded_var_names=['images'],
        target_vars=[predict],
    )
def train(use_cuda, train_sample_dir, test_sample_dir, old_model, output_model, is_local, is_increment):
    """
    Train the three-output network built by ``model()``, locally or via fleet.

    Args:
        use_cuda: place the executor on ``fluid.CUDAPlace(0)`` when true,
            otherwise on ``fluid.CPUPlace()``.
        train_sample_dir: passed to ``data_reader`` in distributed mode; the
            local branch reads from ``streaming_data_reader()`` instead.
        test_sample_dir: passed to ``infer`` after each snapshot.
        old_model: location handed to ``fluid.io.load_params`` (local) or
            ``fleet.init_server`` (distributed) when ``is_increment`` is true.
        output_model: directory for the inference model and persistables;
            when ``None`` nothing is saved and ``infer`` is not called.
        is_local: true to train in-process with a ``ParallelExecutor``;
            false to act as a fleet server and/or worker.
        is_increment: when true, start from ``old_model``.
    """
    # predict, avg_cost, feed_order, auc_var, auc_batch, auc_states = model()
    model_args = model()
    navi_predict = model_args['predict'][0]
    voice_navi_predict = model_args['predict'][1]
    speed_navi_predict = model_args['predict'][2]
    avg_cost = model_args['avg_cost']
    feed_order = model_args['feed_order']

    # role and config are built in both modes but only read in the
    # distributed branch.
    role = role_maker.PaddleCloudRoleMaker()
    # Fully asynchronous training.
    config = DistributeTranspilerConfig()
    config.sync_mode = False
    config.runtime_split_send_recv = True

    sgd_optimizer = AdamOptimizer(learning_rate=2e-4)

    if is_local:
        sgd_optimizer.minimize(avg_cost)
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

        exe = Executor(place)
        # train_reader = paddle.batch(
        #     paddle.reader.shuffle(
        #         streaming_data_reader(), buf_size=8192), batch_size=BATCH_SIZE)

        feeder = fluid.DataFeeder(feed_order, place)
        train_reader = feeder.decorate_reader(paddle.batch(
            paddle.reader.shuffle(streaming_data_reader(), buf_size=8192),
            batch_size=BATCH_SIZE), multi_devices=False, drop_last=True)

        start_program = fluid.default_startup_program()
        exe.run(start_program)
        main_program = fluid.default_main_program()
        if is_increment:  # load model to fine-tune
            fluid.io.load_params(exe, old_model, main_program)
            # for auc_state in model_args['auc'][2]:
            #     set_zero(place, fluid.global_scope(), auc_state.name)
        # NOTE(review): exec_strategy and build_strategy are configured here
        # but are not passed to the ParallelExecutor below -- confirm whether
        # they were meant to be.
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = CPU_NUM
        main_program.num_threads = CPU_NUM
        build_strategy = fluid.BuildStrategy()
        build_strategy.async_mode = True
        # Parallel training, which is faster.
        train_pe = fluid.ParallelExecutor(use_cuda=use_cuda,
                                          main_program=main_program, loss_name=avg_cost.name)

        # cost_list accumulates across passes; it is only cleared after each
        # progress print.
        cost_list = []
        for pass_id in range(PASS_NUM):
            for batch_id, data in enumerate(train_reader()):
                cost_value = train_pe.run(feed=data, fetch_list=[avg_cost.name])
                cost_list.append(np.array(cost_value))

                # Report the mean cost since the last report every 100
                # batches, skipping batch 0.
                if batch_id % 100 == 0 and batch_id != 0:
                    print "Pass %d, batch %d, cost %s" % \
                          (pass_id, batch_id, np.array(cost_list).mean())
                    cost_list = []
                # Snapshot and evaluate every 2000 batches, batch 0 included.
                if batch_id % 2000 == 0:
                    if output_model is not None:
                        fluid.io.save_inference_model(
                            output_model,
                            feed_order,
                            [
                                navi_predict, voice_navi_predict,
                                speed_navi_predict, avg_cost
                            ], exe)
                        fluid.io.save_persistables(exe, output_model)
                        infer(test_sample_dir, output_model, feed_order)
    else:
        # Initialise the fleet environment.
        fleet.init(role)
        # Wrap the optimizer with fleet's distributed optimizer so the
        # distributed strategy config is applied.
        optimizer = fleet.distributed_optimizer(sgd_optimizer, config)
        optimizer.minimize(avg_cost)

        if fleet.is_server():
            # For incremental training the server is initialised from
            # old_model.
            if is_increment:
                fleet.init_server(old_model)
            else:
                fleet.init_server()
            fleet.run_server()
        # Start the worker.
        if fleet.is_worker():
            # Initialise the worker configuration.
            fleet.init_worker()

            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
            exe = Executor(place)
            # train_reader = paddle.batch(
            #     paddle.reader.shuffle(
            #         data_reader(train_sample_dir), buf_size=8192), batch_size=BATCH_SIZE)

            feeder = fluid.DataFeeder(feed_order, place)
            train_reader = feeder.decorate_reader(paddle.batch(
                paddle.reader.shuffle(data_reader(train_sample_dir), buf_size=8192),
                batch_size=BATCH_SIZE), multi_devices=False, drop_last=True)

            exe.run(fleet.startup_program)

            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.num_threads = CPU_NUM
            build_strategy = fluid.BuildStrategy()
            build_strategy.async_mode = True

            if CPU_NUM > 1:
                build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce

            compiled_prog = fluid.compiler.CompiledProgram(
                fleet.main_program).with_data_parallel(
                loss_name=avg_cost.name, build_strategy=build_strategy, exec_strategy=exec_strategy)

            cost_list = []
            for pass_id in range(PASS_NUM):
                for batch_id, data in enumerate(train_reader()):
                    cost_value = exe.run(program=compiled_prog, feed=data,
                                         fetch_list=[avg_cost.name])
                    cost_list.append(np.array(cost_value))

                    if batch_id % 100 == 0 and batch_id != 0:
                        print "Pass %d, batch %d, cost %s" % \
                              (pass_id, batch_id, np.array(cost_list).mean())
                        cost_list = []
                    # Only the first worker writes snapshots; note the
                    # interval is 1000 batches here versus 2000 in local mode.
                    if batch_id % 1000 == 0 and fleet.is_first_worker():
                        if output_model is not None:
                            fleet.save_inference_model(
                                exe,
                                output_model,
                                feed_order,
                                [
                                    navi_predict, voice_navi_predict,
                                    speed_navi_predict, avg_cost
                                ])
                            fleet.save_persistables(exe, output_model)
                            infer(test_sample_dir, output_model, feed_order)
            fleet.stop_worker()