def train(): """ do training """ args = parse_args() if args.enable_ce: fluid.default_startup_program().random_seed = SEED fluid.default_main_program().random_seed = SEED train_dir = args.train_dir vocab_text_path = args.vocab_text_path vocab_tag_path = args.vocab_tag_path use_cuda = True if args.use_cuda else False parallel = True if args.parallel else False batch_size = args.batch_size neg_size = args.neg_size print("use_cuda: {}, parallel: {}, batch_size: {}, neg_size: {} ".format( use_cuda, parallel, batch_size, neg_size)) vocab_text_size, vocab_tag_size, train_reader = utils.prepare_data( file_dir=train_dir, vocab_text_path=vocab_text_path, vocab_tag_path=vocab_tag_path, neg_size=neg_size, batch_size=batch_size * get_cards(args), buffer_size=batch_size * 100, is_train=True) """ train network """ # Train program avg_cost, correct, cos_pos = net.network(vocab_text_size, vocab_tag_size, neg_size=neg_size) # Optimization to minimize lost sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=args.base_lr) sgd_optimizer.minimize(avg_cost) # Initialize executor place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) if parallel: train_exe = fluid.ParallelExecutor(use_cuda=use_cuda, loss_name=avg_cost.name) else: train_exe = exe pass_num = args.pass_num model_dir = args.model_dir fetch_list = [avg_cost.name] total_time = 0.0 ce_info = [] for pass_idx in range(pass_num): epoch_idx = pass_idx + 1 print("epoch_%d start" % epoch_idx) t0 = time.time() for batch_id, data in enumerate(train_reader()): lod_text_seq = utils.to_lodtensor([dat[0] for dat in data], place) lod_pos_tag = utils.to_lodtensor([dat[1] for dat in data], place) lod_neg_tag = utils.to_lodtensor([dat[2] for dat in data], place) loss_val, correct_val = train_exe.run( feed={ "text": lod_text_seq, "pos_tag": lod_pos_tag, "neg_tag": lod_neg_tag }, fetch_list=[avg_cost.name, correct.name]) ce_info.append( float(np.sum(correct_val)) / (args.num_devices * batch_size)) if batch_id % args.print_batch == 0: print("TRAIN --> pass: {} batch_num: {} avg_cost: {}, acc: {}". format( pass_idx, (batch_id + 10) * batch_size, np.mean(loss_val), float(np.sum(correct_val)) / (args.num_devices * batch_size))) t1 = time.time() total_time += t1 - t0 print("epoch:%d num_steps:%d time_cost(s):%f" % (epoch_idx, batch_id, total_time / epoch_idx)) save_dir = "%s/epoch_%d" % (model_dir, epoch_idx) feed_var_names = ["text", "pos_tag"] fetch_vars = [cos_pos] fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe) # only for ce if args.enable_ce: ce_acc = 0 try: ce_acc = ce_info[-2] except: logger.error("ce info error") epoch_idx = args.pass_num device = get_device(args) if args.use_cuda: gpu_num = device[1] print("kpis\teach_pass_duration_gpu%s\t%s" % (gpu_num, total_time / epoch_idx)) print("kpis\ttrain_acc_gpu%s\t%s" % (gpu_num, ce_acc)) else: cpu_num = device[1] threads_num = device[2] print("kpis\teach_pass_duration_cpu%s_thread%s\t%s" % (cpu_num, threads_num, total_time / epoch_idx)) print("kpis\ttrain_acc_cpu%s_thread%s\t%s" % (cpu_num, threads_num, ce_acc)) print("finish training")
def train(model_dir=model_path, pretrained_model=pretrained_model):
    image = fluid.layers.data(name='image', shape=[3, 112, 96], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    out1 = sphere20_net(image, embedding_dim)
    out2, cost = total_loss(out1, label, lmbda=0.01, alpha=0.1)
    acc_top1 = fluid.layers.accuracy(input=out2, label=label, k=1)
    acc_top5 = fluid.layers.accuracy(input=out2, label=label, k=5)
    test_program = fluid.default_main_program().clone(for_test=True)

    step = int(total_images / Batch_size + 1)
    bd = [step * e for e in Epochs]
    lr = [base_lr * (base_lr_decay**i) for i in range(len(bd) + 1)]
    optimizer = fluid.optimizer.Momentum(
        learning_rate=fluid.layers.piecewise_decay(boundaries=bd, values=lr),
        momentum=0.9,
        regularization=fluid.regularizer.L2Decay(6e-3))
    opts = optimizer.minimize(cost)
    fluid.memory_optimize(fluid.default_main_program())

    exe = fluid.Executor(fluid.CUDAPlace(0))
    exe.run(program=fluid.default_startup_program())
    if pretrained_model:
        print('load model from: \n', pretrained_model)

        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)

    train_batch_size = Batch_size
    val_batch_size = Batch_size
    train_reader = paddle.batch(reader_train(), batch_size=train_batch_size)
    test_reader = paddle.batch(reader_val(), batch_size=val_batch_size)
    feeder = fluid.DataFeeder(place=fluid.CUDAPlace(0), feed_list=[image, label])
    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=cost.name)

    min_loss = 1.2
    fetch_list = [cost.name, acc_top1.name, acc_top5.name]
    for pass_id in range(num_epochs):
        train_info = [[], [], []]
        test_info = [[], [], []]
        for batch_id, data in enumerate(train_reader()):
            loss, acc1, acc5 = train_exe.run(fetch_list, feed=feeder.feed(data))
            loss = np.mean(np.array(loss))
            acc1 = np.mean(np.array(acc1))
            acc5 = np.mean(np.array(acc5))
            train_info[0].append(loss)
            train_info[1].append(acc1)
            train_info[2].append(acc5)
            if batch_id % 100 == 0:
                print("pass {0}, trainbatch {1}, loss {2}, acc1 {3}, acc5 {4}".format(
                    pass_id, batch_id, loss, acc1, acc5))
                sys.stdout.flush()
        train_loss = np.array(train_info[0]).mean()
        train_acc1 = np.array(train_info[1]).mean()
        train_acc5 = np.array(train_info[2]).mean()

        cnt = 0
        for test_batch_id, data in enumerate(test_reader()):
            loss, acc1, acc5 = exe.run(test_program,
                                       fetch_list=fetch_list,
                                       feed=feeder.feed(data))
            loss = np.mean(loss)
            acc1 = np.mean(acc1)
            acc5 = np.mean(acc5)
            test_info[0].append(loss * len(data))
            test_info[1].append(acc1 * len(data))
            test_info[2].append(acc5 * len(data))
            cnt += len(data)
            # if test_batch_id % 10 == 0:
            #     print("Pass {0}, testbatch {1}, loss {2}, acc1 {3}, acc5 {4}".format(
            #         pass_id, test_batch_id, loss, acc1, acc5))
            #     sys.stdout.flush()
        test_loss = np.sum(test_info[0]) / cnt
        test_acc1 = np.sum(test_info[1]) / cnt
        test_acc5 = np.sum(test_info[2]) / cnt
        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3},\n"
              "\t\t test_loss {4}, test_acc1 {5}, test_acc5 {6}".format(
                  pass_id, train_loss, train_acc1, train_acc5,
                  test_loss, test_acc1, test_acc5))
        sys.stdout.flush()

        if min_loss > test_loss:
            min_loss = test_loss
            # Save the inference model and the persistable variables.
            model_dir1 = model_dir + 'infer/'
            model_dir2 = model_dir + 'persist/'
            model_path_save1 = os.path.join(model_dir1, str(pass_id))
            model_path_save2 = os.path.join(model_dir2, str(pass_id))
            if not os.path.isdir(model_path_save1):
                os.makedirs(model_path_save1)
            if not os.path.isdir(model_path_save2):
                os.makedirs(model_path_save2)
            fluid.io.save_inference_model(model_path_save1, ['image'], [out1], exe)
            fluid.io.save_persistables(exe, model_path_save2)
            print("Save pass {0}, min_loss {1}".format(pass_id, min_loss))

        if pass_id == 5 or pass_id == 10 or pass_id == 15 or pass_id == 20:
            model_dir2 = model_dir + 'persist/'
            model_path_save2 = os.path.join(model_dir2, str(pass_id))
            if not os.path.isdir(model_path_save2):
                os.makedirs(model_path_save2)
            fluid.io.save_persistables(exe, model_path_save2)
            print("Save pass {0}, test_loss {1}".format(pass_id, test_loss))
def run_trainer(self, args): test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ self.get_model(batch_size=2) if args.mem_opt: fluid.memory_optimize(fluid.default_main_program(), skip_grads=True) if args.update_method == "pserver": t = self.get_transpiler(args.trainer_id, fluid.default_main_program(), args.endpoints, args.trainers, args.sync_mode) trainer_prog = t.get_trainer_program() else: trainer_prog = fluid.default_main_program() if args.use_cuda: place = fluid.CUDAPlace(0) else: place = fluid.CPUPlace() startup_exe = fluid.Executor(place) startup_exe.run(fluid.default_startup_program()) strategy = fluid.ExecutionStrategy() strategy.num_threads = 1 strategy.allow_op_delay = False build_stra = fluid.BuildStrategy() if args.use_reduce: build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce else: build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce exe = fluid.ParallelExecutor(args.use_cuda, loss_name=avg_cost.name, exec_strategy=strategy, build_strategy=build_stra) feed_var_list = [ var for var in trainer_prog.global_block().vars.values() if var.is_data ] feeder = fluid.DataFeeder(feed_var_list, place) reader_generator = train_reader() def get_data(): origin_batch = next(reader_generator) if args.update_method == "pserver" and args.use_reader_alloc: new_batch = [] for offset, item in enumerate(origin_batch): if offset % 2 == args.trainer_id: new_batch.append(item) return new_batch else: return origin_batch need_save = bool(int(os.getenv("SAVE", "0"))) model_dir = os.getenv("MODEL_DIR", "") save_mode = os.getenv("SAVE_MODE", "") if save_mode == "LOCAL": if need_save: for _ in six.moves.xrange(RUN_STEP): loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(get_data())) if need_save and model_dir: io.save_persistables(startup_exe, model_dir, trainer_prog) var = np.array( fluid.global_scope().find_var('__fc_b__').get_tensor()) if six.PY2: print(pickle.dumps(np.ravel(var).tolist())) else: sys.stdout.buffer.write(pickle.dumps(np.ravel(var).tolist())) elif save_mode == "DIST": skip_steps = int(os.getenv("SKIP_STEPS")) loss = None if need_save: for idx in six.moves.xrange(8): loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(get_data())) if need_save and model_dir and idx == skip_steps and args.trainer_id == 0: io.save_persistables(startup_exe, model_dir, trainer_prog) else: for idx in six.moves.xrange(8): data = get_data() if idx <= skip_steps: continue loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data)) if six.PY2: print(pickle.dumps(loss.tolist())) else: sys.stdout.buffer.write(pickle.dumps(loss.tolist())) else: raise Exception("save_mode must be LOCAL or DIST")
def train(args, config, train_file_list, optimizer_method): learning_rate = args.learning_rate batch_size = args.batch_size height = args.resize_h width = args.resize_w use_gpu = args.use_gpu use_pyramidbox = args.use_pyramidbox model_save_dir = args.model_save_dir pretrained_model = args.pretrained_model num_iterations = args.num_iteration parallel = args.parallel num_classes = 2 image_shape = [3, height, width] startup_prog = fluid.Program() train_prog = fluid.Program() with fluid.program_guard(train_prog, startup_prog): py_reader = fluid.layers.py_reader( capacity=8, shapes=[[-1] + image_shape, [-1, 4], [-1, 4], [-1, 1]], lod_levels=[0, 1, 1, 1], dtypes=["float32", "float32", "float32", "int32"], use_double_buffer=True) with fluid.unique_name.guard(): image, face_box, head_box, gt_label = fluid.layers.read_file( py_reader) fetches = [] network = PyramidBox(image=image, face_box=face_box, head_box=head_box, gt_label=gt_label, sub_network=use_pyramidbox) if use_pyramidbox: face_loss, head_loss, loss = network.train() fetches = [face_loss, head_loss] else: loss = network.vgg_ssd_loss() fetches = [loss] devices = os.getenv("CUDA_VISIBLE_DEVICES") or "" devices_num = len(devices.split(",")) batch_size_per_device = batch_size // devices_num steps_per_pass = 12880 // batch_size boundaries = [ steps_per_pass * 50, steps_per_pass * 80, steps_per_pass * 120, steps_per_pass * 140 ] values = [ learning_rate, learning_rate * 0.5, learning_rate * 0.25, learning_rate * 0.1, learning_rate * 0.01 ] if optimizer_method == "momentum": optimizer = fluid.optimizer.Momentum( learning_rate=fluid.layers.piecewise_decay( boundaries=boundaries, values=values), momentum=0.9, regularization=fluid.regularizer.L2Decay(0.0005), ) else: optimizer = fluid.optimizer.RMSProp( learning_rate=fluid.layers.piecewise_decay( boundaries, values), regularization=fluid.regularizer.L2Decay(0.0005), ) optimizer.minimize(loss) fluid.memory_optimize(train_prog) place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup_prog) start_pass = 0 if pretrained_model: if pretrained_model.isdigit(): start_pass = int(pretrained_model) + 1 pretrained_model = os.path.join(model_save_dir, pretrained_model) print("Resume from %s " % (pretrained_model)) if not os.path.exists(pretrained_model): raise ValueError( "The pre-trained model path [%s] does not exist." 
% (pretrained_model)) def if_exist(var): return os.path.exists(os.path.join(pretrained_model, var.name)) fluid.io.load_vars(exe, pretrained_model, predicate=if_exist) if parallel: train_exe = fluid.ParallelExecutor(use_cuda=use_gpu, loss_name=loss.name, main_program=train_prog) train_reader = reader.train(config, train_file_list, batch_size_per_device, shuffle=False, use_multiprocessing=True, num_workers=8, max_queue=24) py_reader.decorate_paddle_reader(train_reader) def run(iterations): # global feed_data py_reader.start() run_time = [] for batch_id in range(iterations): start_time = time.time() if parallel: fetch_vars = train_exe.run( fetch_list=[v.name for v in fetches]) else: fetch_vars = exe.run(train_prog, fetch_list=fetches) end_time = time.time() run_time.append(end_time - start_time) fetch_vars = [np.mean(np.array(v)) for v in fetch_vars] if not args.use_pyramidbox: print("Batch {0}, loss {1}".format(batch_id, fetch_vars[0])) else: print("Batch {0}, face loss {1}, head loss {2}".format( batch_id, fetch_vars[0], fetch_vars[1])) return run_time # start-up run(2) # profiling start = time.time() if not parallel: with profiler.profiler('All', 'total', '/tmp/profile_file'): run_time = run(num_iterations) else: run_time = run(num_iterations) end = time.time() total_time = end - start print("Total time: {0}, reader time: {1} s, run time: {2} s".format( total_time, total_time - np.sum(run_time), np.sum(run_time)))
def main(args): ernie_config = ErnieConfig(args.ernie_config_path) ernie_config.print_config() if args.use_cuda: dev_list = fluid.cuda_places() place = dev_list[0] dev_count = len(dev_list) else: place = fluid.CPUPlace() dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) reader = task_reader.SequenceLabelReader( vocab_path=args.vocab_path, label_map_config=args.label_map_config, max_seq_len=args.max_seq_len, do_lower_case=args.do_lower_case, in_tokens=args.in_tokens, random_seed=args.random_seed, task_id=args.task_id) if not (args.do_train or args.do_val or args.do_test): raise ValueError("For args `do_train`, `do_val` and `do_test`, at " "least one of them must be True.") startup_prog = fluid.Program() if args.random_seed is not None: startup_prog.random_seed = args.random_seed if args.do_train: train_data_generator = reader.data_generator( input_file=args.train_set, batch_size=args.batch_size, epoch=args.epoch, shuffle=True, phase="train") num_train_examples = reader.get_num_examples(args.train_set) if args.in_tokens: if args.batch_size < args.max_seq_len: raise ValueError( 'if in_tokens=True, batch_size should greater than max_sqelen, got batch_size:%d seqlen:%d' % (args.batch_size, args.max_seq_len)) max_train_steps = args.epoch * num_train_examples // ( args.batch_size // args.max_seq_len) // dev_count else: max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count warmup_steps = int(max_train_steps * args.warmup_proportion) log.info("Device count: %d" % dev_count) log.info("Num train examples: %d" % num_train_examples) log.info("Max train steps: %d" % max_train_steps) log.info("Num warmup steps: %d" % warmup_steps) train_program = fluid.Program() with fluid.program_guard(train_program, startup_prog): with fluid.unique_name.guard(): train_pyreader, graph_vars = create_model( args, pyreader_name='train_reader', ernie_config=ernie_config) scheduled_lr, loss_scaling = optimization( loss=graph_vars["loss"], warmup_steps=warmup_steps, num_train_steps=max_train_steps, learning_rate=args.learning_rate, train_program=train_program, startup_prog=startup_prog, weight_decay=args.weight_decay, scheduler=args.lr_scheduler, use_fp16=args.use_fp16, use_dynamic_loss_scaling=args.use_dynamic_loss_scaling, init_loss_scaling=args.init_loss_scaling, incr_every_n_steps=args.incr_every_n_steps, decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf, incr_ratio=args.incr_ratio, decr_ratio=args.decr_ratio) if args.verbose: if args.in_tokens: lower_mem, upper_mem, unit = fluid.contrib.memory_usage( program=train_program, batch_size=args.batch_size // args.max_seq_len) else: lower_mem, upper_mem, unit = fluid.contrib.memory_usage( program=train_program, batch_size=args.batch_size) log.info("Theoretical memory usage in training: %.3f - %.3f %s" % (lower_mem, upper_mem, unit)) if args.do_val or args.do_test: test_prog = fluid.Program() with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): test_pyreader, graph_vars = create_model( args, pyreader_name='test_reader', ernie_config=ernie_config) test_prog = test_prog.clone(for_test=True) nccl2_num_trainers = 1 nccl2_trainer_id = 0 if args.is_distributed: trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS") current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") worker_endpoints = worker_endpoints_env.split(",") trainers_num = len(worker_endpoints) log.info("worker_endpoints:{} trainers_num:{} current_endpoint:{} \ 
trainer_id:{}".format(worker_endpoints, trainers_num, current_endpoint, trainer_id)) # prepare nccl2 env. config = fluid.DistributeTranspilerConfig() config.mode = "nccl2" t = fluid.DistributeTranspiler(config=config) t.transpile(trainer_id, trainers=worker_endpoints_env, current_endpoint=current_endpoint, program=train_program if args.do_train else test_prog, startup_program=startup_prog) nccl2_num_trainers = trainers_num nccl2_trainer_id = trainer_id exe = fluid.Executor(place) exe.run(startup_prog) if args.do_train: if args.init_checkpoint and args.init_pretraining_params: log.info( "WARNING: args 'init_checkpoint' and 'init_pretraining_params' " "both are set! Only arg 'init_checkpoint' is made valid.") if args.init_checkpoint: init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog, use_fp16=args.use_fp16) elif args.init_pretraining_params: init_pretraining_params(exe, args.init_pretraining_params, main_program=startup_prog, use_fp16=args.use_fp16) elif args.do_val or args.do_test: if not args.init_checkpoint: raise ValueError("args 'init_checkpoint' should be set if" "only doing validation or testing!") init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog, use_fp16=args.use_fp16) if args.do_train: exec_strategy = fluid.ExecutionStrategy() if args.use_fast_executor: exec_strategy.use_experimental_executor = True exec_strategy.num_threads = dev_count exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda, loss_name=graph_vars["loss"].name, exec_strategy=exec_strategy, main_program=train_program, num_trainers=nccl2_num_trainers, trainer_id=nccl2_trainer_id) train_pyreader.set_batch_generator(train_data_generator) else: train_exe = None if args.do_val or args.do_test: test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda, main_program=test_prog, share_vars_from=train_exe) if args.do_train: train_pyreader.start() steps = 0 graph_vars["learning_rate"] = scheduled_lr time_begin = time.time() while True: try: steps += 1 if steps % args.skip_steps != 0: train_exe.run(fetch_list=[]) else: fetch_list = [ graph_vars["num_infer"].name, graph_vars["num_label"].name, graph_vars["num_correct"].name, graph_vars["loss"].name, graph_vars['learning_rate'].name, ] out = train_exe.run(fetch_list=fetch_list) num_infer, num_label, num_correct, np_loss, np_lr = out lr = float(np_lr[0]) loss = np_loss.mean() precision, recall, f1 = calculate_f1( num_label, num_infer, num_correct) if args.verbose: log.info( "train pyreader queue size: %d, learning rate: %f" % (train_pyreader.queue.size(), lr if warmup_steps > 0 else args.learning_rate)) current_example, current_epoch = reader.get_train_progress( ) time_end = time.time() used_time = time_end - time_begin log.info( "epoch: %d, progress: %d/%d, step: %d, loss: %f, " "f1: %f, precision: %f, recall: %f, speed: %f steps/s" % (current_epoch, current_example, num_train_examples, steps, loss, f1, precision, recall, args.skip_steps / used_time)) time_begin = time.time() if nccl2_trainer_id == 0 and steps % args.save_steps == 0: save_path = os.path.join(args.checkpoints, "step_" + str(steps)) fluid.io.save_persistables(exe, save_path, train_program) if nccl2_trainer_id == 0 and steps % args.validation_steps == 0: # evaluate dev set if args.do_val: evaluate_wrapper(reader, exe, test_prog, test_pyreader, graph_vars, current_epoch, steps) # evaluate test set if args.do_test: predict_wrapper(reader, exe, test_prog, test_pyreader, graph_vars, current_epoch, steps) 
except fluid.core.EOFException: save_path = os.path.join(args.checkpoints, "step_" + str(steps)) fluid.io.save_persistables(exe, save_path, train_program) train_pyreader.reset() break # final eval on dev set if nccl2_trainer_id == 0 and args.do_val: if not args.do_train: current_example, current_epoch = reader.get_train_progress() evaluate_wrapper(reader, exe, test_prog, test_pyreader, graph_vars, current_epoch, 'final') if nccl2_trainer_id == 0 and args.do_test: if not args.do_train: current_example, current_epoch = reader.get_train_progress() predict_wrapper(reader, exe, test_prog, test_pyreader, graph_vars, current_epoch, 'final')
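# A rough sketch of what the calculate_f1 helper used above presumably computes; the
# actual implementation is not shown in this file. num_infer, num_label and num_correct
# are the predicted, gold, and correctly-predicted label counts fetched each step.
def calculate_f1_sketch(num_label, num_infer, num_correct):
    precision = num_correct / num_infer if num_infer else 0.0
    recall = num_correct / num_label if num_label else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1

print(calculate_f1_sketch(10.0, 8.0, 6.0))  # (0.75, 0.6, 0.666...)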
def main(): if args.data_set == "cifar10": classdim = 10 if args.data_format == 'NCHW': data_shape = [3, 32, 32] else: data_shape = [32, 32, 3] else: classdim = 102 if args.data_format == 'NCHW': data_shape = [3, 224, 224] else: data_shape = [224, 224, 3] # Input data images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') # Train program net = vgg16_bn_drop(images) predict = fluid.layers.fc(input=net, size=classdim, act='softmax') cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(x=cost) # Evaluator batch_size_tensor = fluid.layers.create_tensor(dtype='int64') batch_acc = fluid.layers.accuracy( input=predict, label=label, total=batch_size_tensor) # inference program inference_program = fluid.default_main_program().clone() with fluid.program_guard(inference_program): inference_program = fluid.io.get_inference_program( target_vars=[batch_acc, batch_size_tensor]) # Optimization optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) opts = optimizer.minimize(avg_cost) fluid.memory_optimize(fluid.default_main_program()) # Initialize executor place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) exe = fluid.Executor(place) # Parameter initialization exe.run(fluid.default_startup_program()) # data reader train_reader = paddle.batch( paddle.reader.shuffle( paddle.dataset.cifar.train10() if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), buf_size=5120), batch_size=args.batch_size) test_reader = paddle.batch( paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), batch_size=args.batch_size) # test def test(exe): test_accuracy = fluid.average.WeightedAverage() for batch_id, data in enumerate(test_reader()): img_data = np.array(map(lambda x: x[0].reshape(data_shape), data)).astype("float32") y_data = np.array(map(lambda x: x[1], data)).astype("int64") y_data = y_data.reshape([-1, 1]) acc, weight = exe.run(inference_program, feed={"pixel": img_data, "label": y_data}, fetch_list=[batch_acc, batch_size_tensor]) test_accuracy.add(value=acc, weight=weight) return test_accuracy.eval() iters, num_samples, start_time = 0, 0, time.time() accuracy = fluid.average.WeightedAverage() train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name) for pass_id in range(args.pass_num): accuracy.reset() train_accs = [] train_losses = [] for batch_id, data in enumerate(train_reader()): if iters == args.skip_batch_num: start_time = time.time() num_samples = 0 if iters == args.iterations: break img_data = np.array(map(lambda x: x[0].reshape(data_shape), data)).astype("float32") y_data = np.array(map(lambda x: x[1], data)).astype("int64") y_data = y_data.reshape([-1, 1]) loss, acc, weight = train_exe.run( feed={"pixel": img_data, "label": y_data}, fetch_list=[ avg_cost.name, batch_acc.name, batch_size_tensor.name ]) accuracy.add(value=np.array(np.mean(acc)), weight=np.mean(weight)) iters += 1 num_samples += len(y_data) loss = np.mean(np.array(loss)) acc = np.mean(np.array(acc)) print( "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" % (pass_id, iters, loss, acc) ) # The accuracy is the accumulation of batches, but not the current batch. 
# pass_train_acc = accuracy.eval() train_losses.append(loss) train_accs.append(acc) print("Pass: %d, Loss: %f, Train Accuray: %f\n" % (pass_id, np.mean(train_losses), np.mean(train_accs))) train_elapsed = time.time() - start_time examples_per_sec = num_samples / train_elapsed print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % (num_samples, train_elapsed, examples_per_sec)) # evaluation if args.with_test: pass_test_acc = test(exe) exit(0)
def main(): IMG_SIZE =[1536, 512] SUBMISSION_SIZE = [3384, 1710] save_test_logits = False num_classes = 8 batch_size = 4 log_iters = 100 network = 'unet_simple' # Define paths for each model if network == 'deeplabv3p': model_path = "./model_weights/paddle_deeplabv3p_8_end_060223" npy_dir = '/npy_save/deeplabv3p/' elif network == 'unet_base': model_path = "./model_weights/paddle_unet_base_10_end_059909" npy_dir = '/npy_save/unet_base/' elif network == 'unet_simple': model_path = "./model_weights/paddle_unet_simple_12_end_060577" npy_dir = '/npy_save/unet_simple/' program_choice = 2 # 1 - Validtion; 2 - Test show_label = False crop_offset = 690 data_dir = './data_list/val.csv' test_dir = '../PaddlePaddle/TestSet_Final/ColorImage/' sub_dir = './test_submission/' # Get data list and split it into train and validation set. val_list = pd.read_csv(data_dir) #Initialization images = fluid.layers.data(name='image', shape=[3, IMG_SIZE[1], IMG_SIZE[0]], dtype='float32') labels = fluid.layers.data(name='label', shape=[1, IMG_SIZE[1], IMG_SIZE[0]], dtype='float32') iter_id = 0 total_loss = 0.0 total_miou = 0.0 prev_time = time.time() # Validation if program_choice == 1: val_reader = val_image_gen(val_list, batch_size=batch_size, image_size=IMG_SIZE, crop_offset=crop_offset) reduced_loss, miou, pred = create_network(images, labels, num_classes, network=network, image_size=(IMG_SIZE[1], IMG_SIZE[0]), for_test=False) place = fluid.CUDAPlace(0) exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) fluid.io.load_params(exe, model_path) print("loaded model from: %s" % model_path) # Parallel Executor to use multi-GPUs exec_strategy = fluid.ExecutionStrategy() exec_strategy.allow_op_delay = True build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce train_exe = fluid.ParallelExecutor(use_cuda=True, build_strategy=build_strategy, exec_strategy=exec_strategy) print('Start Validation!') for iteration in range(int(len(val_list) / batch_size)): val_data = next(val_reader) results = train_exe.run( feed=get_feeder_data(val_data, place), fetch_list=[reduced_loss.name, miou.name, pred.name]) if iter_id % log_iters == 0: print('Finished Processing %d Images.' 
%(iter_id * batch_size)) iter_id += 1 total_loss += np.mean(results[0]) total_miou += np.mean(results[1]) # label to mask if show_label == True: label_image = val_data[1][0] color_label_mask = decode_color_labels(label_image) color_label_mask = np.transpose(color_label_mask, (1, 2, 0)) cv2.imshow('gt_label', cv2.resize(color_label_mask, (IMG_SIZE[0], IMG_SIZE[1]))) prediction = np.argmax(results[2][0], axis=0) color_pred_mask = decode_color_labels(prediction) color_pred_mask = np.transpose(color_pred_mask, (1, 2, 0)) cv2.imshow('pred_label', cv2.resize(color_pred_mask, (IMG_SIZE[0], IMG_SIZE[1]))) cv2.waitKey(0) end_time = time.time() print("validation loss: %.3f, mean iou: %.3f, time cost: %.3f s" % (total_loss / iter_id, total_miou / iter_id, end_time - prev_time)) # Test elif program_choice == 2: predictions = create_network(images, labels, num_classes, network=network, image_size=(IMG_SIZE[1], IMG_SIZE[0]), for_test=True) place = fluid.CUDAPlace(0) # place = fluid.CPUPlace() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) fluid.io.load_params(exe, model_path) print("loaded model from: %s" % model_path) print('Start Making Submissions!') test_list = os.listdir(test_dir) for test_name in test_list: test_ori_image = cv2.imread(os.path.join(test_dir, test_name)) test_image = crop_resize_data(test_ori_image, label=None, image_size=IMG_SIZE, offset=crop_offset) out_image = np.expand_dims(np.array(test_image), axis=0) out_image = out_image[:, :, :, ::-1].transpose(0, 3, 1, 2).astype(np.float32) / (255.0 / 2) - 1 feed_dict = {} feed_dict["image"] = out_image results_1 = exe.run( feed=feed_dict, fetch_list=[predictions]) if iter_id % 20 == 0: print('Finished Processing %d Images.' %(iter_id)) iter_id += 1 prediction = np.argmax(results_1[0][0], axis=0) # Save npy files if save_test_logits == True: np.save(npy_dir + test_name.replace('.jpg', '.npy'), results_1[0][0]) # Save Submission PNG submission_mask = expand_resize_data(prediction, SUBMISSION_SIZE, crop_offset) cv2.imwrite(os.path.join(sub_dir, test_name.replace('.jpg', '.png')), submission_mask) # Show Label if show_label == True: cv2.imshow('test_image', cv2.resize(test_ori_image,(IMG_SIZE[0], IMG_SIZE[1]))) cv2.imshow('pred_label', cv2.resize(submission_mask,(IMG_SIZE[0], IMG_SIZE[1]))) cv2.waitKey(0) sys.stdout.flush()
def train(args, data_args, train_params, train_file_list, val_file_list): model_save_dir = args.model_save_dir pretrained_model = args.pretrained_model use_gpu = args.use_gpu parallel = args.parallel enable_ce = args.enable_ce is_shuffle = True if not use_gpu: devices_num = int( os.environ.get('CPU_NUM', multiprocessing.cpu_count())) else: devices_num = fluid.core.get_cuda_device_count() batch_size = train_params['batch_size'] epoc_num = train_params['epoc_num'] batch_size_per_device = batch_size // devices_num num_workers = 8 startup_prog = fluid.Program() train_prog = fluid.Program() test_prog = fluid.Program() if enable_ce: import random random.seed(0) np.random.seed(0) is_shuffle = False startup_prog.random_seed = 111 train_prog.random_seed = 111 test_prog.random_seed = 111 train_py_reader, loss = build_program(main_prog=train_prog, startup_prog=startup_prog, train_params=train_params, is_train=True) test_py_reader, map_eval, _, _ = build_program(main_prog=test_prog, startup_prog=startup_prog, train_params=train_params, is_train=False) test_prog = test_prog.clone(for_test=True) place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup_prog) if pretrained_model: def if_exist(var): return os.path.exists(os.path.join(pretrained_model, var.name)) fluid.io.load_vars(exe, pretrained_model, main_program=train_prog, predicate=if_exist) if parallel: train_exe = fluid.ParallelExecutor(main_program=train_prog, use_cuda=use_gpu, loss_name=loss.name) train_reader = reader.train(data_args, train_file_list, batch_size_per_device, shuffle=is_shuffle, num_workers=num_workers, enable_ce=enable_ce) test_reader = reader.test(data_args, val_file_list, batch_size) train_py_reader.decorate_paddle_reader(train_reader) test_py_reader.decorate_paddle_reader(test_reader) def save_model(postfix, main_prog): model_path = os.path.join(model_save_dir, postfix) if os.path.isdir(model_path): shutil.rmtree(model_path) print('save models to %s' % (model_path)) fluid.io.save_persistables(exe, model_path, main_program=main_prog) best_map = 0. def test(epoc_id, best_map): _, accum_map = map_eval.get_map_var() map_eval.reset(exe) every_epoc_map = [] # for CE test_py_reader.start() try: batch_id = 0 while True: test_map, = exe.run(test_prog, fetch_list=[accum_map]) if batch_id % 10 == 0: every_epoc_map.append(test_map) print("Batch {0}, map {1}".format(batch_id, test_map)) batch_id += 1 except fluid.core.EOFException: test_py_reader.reset() mean_map = np.mean(every_epoc_map) print("Epoc {0}, test map {1}".format(epoc_id, test_map[0])) if test_map[0] > best_map: best_map = test_map[0] save_model('best_model', test_prog) return best_map, mean_map total_time = 0.0 for epoc_id in range(epoc_num): epoch_idx = epoc_id + 1 start_time = time.time() prev_start_time = start_time every_epoc_loss = [] batch_id = 0 train_py_reader.start() while True: try: prev_start_time = start_time start_time = time.time() if parallel: loss_v, = train_exe.run(fetch_list=[loss.name]) else: loss_v, = exe.run(train_prog, fetch_list=[loss]) loss_v = np.mean(np.array(loss_v)) every_epoc_loss.append(loss_v) if batch_id % 10 == 0: print("Epoc {:d}, batch {:d}, loss {:.6f}, time {:.5f}". 
format(epoc_id, batch_id, loss_v, start_time - prev_start_time)) batch_id += 1 except (fluid.core.EOFException, StopIteration): train_reader().close() train_py_reader.reset() break end_time = time.time() total_time += end_time - start_time best_map, mean_map = test(epoc_id, best_map) print("Best test map {0}".format(best_map)) if epoc_id % 10 == 0 or epoc_id == epoc_num - 1: save_model(str(epoc_id), train_prog) if enable_ce: train_avg_loss = np.mean(every_epoc_loss) if devices_num == 1: print("kpis train_cost %s" % train_avg_loss) print("kpis test_acc %s" % mean_map) print("kpis train_speed %s" % (total_time / epoch_idx)) else: print("kpis train_cost_card%s %s" % (devices_num, train_avg_loss)) print("kpis test_acc_card%s %s" % (devices_num, mean_map)) print("kpis train_speed_card%s %f" % (devices_num, total_time / epoch_idx))
def main(args): """main""" reader = task_reader.RoleSequenceLabelReader( vocab_path=args.vocab_path, labels_map=labels_map, max_seq_len=args.max_seq_len, do_lower_case=args.do_lower_case, in_tokens=args.in_tokens, random_seed=args.random_seed, task_id=args.task_id) if not (args.do_train or args.do_val or args.do_test): raise ValueError("For args `do_train`, `do_val` and `do_test`, at " "least one of them must be True.") startup_prog = fluid.Program() if args.random_seed is not None: startup_prog.random_seed = args.random_seed if args.do_train: train_data_generator = reader.data_generator( input_file=args.train_set, batch_size=args.batch_size, epoch=args.epoch, shuffle=True, phase="train") num_train_examples = reader.get_num_examples(args.train_set) if args.in_tokens: if args.batch_size < args.max_seq_len: raise ValueError( 'if in_tokens=True, batch_size should greater than max_sqelen, got batch_size:%d seqlen:%d' % (args.batch_size, args.max_seq_len)) max_train_steps = args.epoch * num_train_examples // ( args.batch_size // args.max_seq_len) // dev_count else: max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count warmup_steps = int(max_train_steps * args.warmup_proportion) print("Device count: %d" % dev_count) print("Num train examples: %d" % num_train_examples) print("Max train steps: %d" % max_train_steps) print("Num warmup steps: %d" % warmup_steps) train_program = fluid.Program() with fluid.program_guard(train_program, startup_prog): with fluid.unique_name.guard(): train_pyreader, graph_vars = create_model( args, pyreader_name='train_reader', ernie_config=ernie_config) scheduled_lr, loss_scaling = optimization( loss=graph_vars["loss"], warmup_steps=warmup_steps, num_train_steps=max_train_steps, learning_rate=args.learning_rate, train_program=train_program, startup_prog=startup_prog, weight_decay=args.weight_decay, scheduler=args.lr_scheduler, use_fp16=args.use_fp16, use_dynamic_loss_scaling=args.use_dynamic_loss_scaling, init_loss_scaling=args.init_loss_scaling, incr_every_n_steps=args.incr_every_n_steps, decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf, incr_ratio=args.incr_ratio, decr_ratio=args.decr_ratio) if args.verbose: if args.in_tokens: lower_mem, upper_mem, unit = fluid.contrib.memory_usage( program=train_program, batch_size=args.batch_size // args.max_seq_len) else: lower_mem, upper_mem, unit = fluid.contrib.memory_usage( program=train_program, batch_size=args.batch_size) print("Theoretical memory usage in training: %.3f - %.3f %s" % (lower_mem, upper_mem, unit)) if args.do_val or args.do_test: test_prog = fluid.Program() with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): test_pyreader, graph_vars = create_model( args, pyreader_name='test_reader', ernie_config=ernie_config) test_prog = test_prog.clone(for_test=True) nccl2_num_trainers = 1 nccl2_trainer_id = 0 exe = fluid.Executor(place) exe.run(startup_prog) if args.do_train: if args.init_checkpoint and args.init_pretraining_params: print( "WARNING: args 'init_checkpoint' and 'init_pretraining_params' " "both are set! 
Only arg 'init_checkpoint' is made valid.") if args.init_checkpoint: init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog, use_fp16=args.use_fp16) elif args.init_pretraining_params: init_pretraining_params(exe, args.init_pretraining_params, main_program=startup_prog, use_fp16=args.use_fp16) elif args.do_val or args.do_test: if not args.init_checkpoint: raise ValueError("args 'init_checkpoint' should be set if" "only doing validation or testing!") init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog, use_fp16=args.use_fp16) if args.do_train: exec_strategy = fluid.ExecutionStrategy() if args.use_fast_executor: exec_strategy.use_experimental_executor = True exec_strategy.num_threads = dev_count exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda, loss_name=graph_vars["loss"].name, exec_strategy=exec_strategy, main_program=train_program, num_trainers=nccl2_num_trainers, trainer_id=nccl2_trainer_id) train_pyreader.decorate_tensor_provider(train_data_generator) else: train_exe = None if args.do_val or args.do_test: test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda, main_program=test_prog, share_vars_from=train_exe) if args.do_train: train_pyreader.start() steps = 0 graph_vars["learning_rate"] = scheduled_lr time_begin = time.time() while True: try: steps += 1 if steps % args.skip_steps != 0: train_exe.run(fetch_list=[]) else: fetch_list = [ graph_vars["num_infer"].name, graph_vars["num_label"].name, graph_vars["num_correct"].name, graph_vars["loss"].name, graph_vars['learning_rate'].name, ] out = train_exe.run(fetch_list=fetch_list) num_infer, num_label, num_correct, np_loss, np_lr = out lr = float(np_lr[0]) loss = np_loss.mean() precision, recall, f1 = calculate_f1( num_label, num_infer, num_correct) if args.verbose: print( "train pyreader queue size: %d, learning rate: %f" % (train_pyreader.queue.size(), lr if warmup_steps > 0 else args.learning_rate)) current_example, current_epoch = reader.get_train_progress( ) time_end = time.time() used_time = time_end - time_begin print( u"【train】epoch: {}, step: {}, loss: {:.6f}, " "f1: {:.4f}, precision: {:.4f}, recall: {:.4f}, speed: {:.3f} steps/s" .format(current_epoch, steps, float(loss), float(f1), float(precision), float(recall), args.skip_steps / used_time)) time_begin = time.time() if steps % args.save_steps == 0: save_path = os.path.join(args.checkpoints, "step_" + str(steps)) fluid.io.save_persistables(exe, save_path, train_program) if steps % args.validation_steps == 0: # evaluate dev set if args.do_val: precision, recall, f1 = evaluate_wrapper( reader, exe, test_prog, test_pyreader, graph_vars, current_epoch, steps) print( u"【dev】precision {:.4f} , recall {:.4f}, f1-score {:.4f}" .format(float(precision), float(recall), float(f1))) # evaluate test set if args.do_test: precision, recall, f1 = evaluate_wrapper( reader, exe, test_prog, test_pyreader, graph_vars, current_epoch, steps) print( u"【test】precision {:.4f} , recall {:.4f}, f1-score {:.4f}" .format(float(precision), float(recall), float(f1))) except fluid.core.EOFException: save_path = os.path.join(args.checkpoints, "final_model") fluid.io.save_persistables(exe, save_path, train_program) train_pyreader.reset() break # final eval on dev set if args.do_val: precision, recall, f1 = evaluate_wrapper(reader, exe, test_prog, test_pyreader, graph_vars, 1, 'final') print(u"【dev】precision {:.4f} , recall {:.4f}, f1-score {:.4f}".format( float(precision), float(recall), 
float(f1))) if args.do_test: test_ret = predict_wrapper(reader, exe, test_prog, test_pyreader, graph_vars, 1, 'final') utils.write_by_lines(args.trigger_pred_save_path, test_ret)
def sync_weights_to(self,
                    target_model,
                    decay=0.0,
                    share_vars_parallel_executor=None):
    """Synchronize parameters of current model to another model.

    To speed up the synchronizing process, it will create a program
    implicitly to finish the process. It also stores a program as the
    cache to avoid creating program repeatedly.

    target_model_weights = decay * target_model_weights + (1 - decay) * current_model_weights

    Args:
        target_model (`parl.Model`): an instance of ``Model`` that has the
            same neural network architecture as the current model.
        decay (float): the rate of decline in copying parameters. 0 if no
            parameters decay when synchronizing the parameters.
        share_vars_parallel_executor (fluid.ParallelExecutor): Optional.
            If not None, will use ``fluid.ParallelExecutor`` to run program
            instead of ``fluid.Executor``.

    Example:

    .. code-block:: python

        import copy
        # create a model that has the same neural network structures.
        target_model = copy.deepcopy(model)

        # after initializing the parameters ...
        model.sync_weights_to(target_model)

    Note:
        Before calling ``sync_weights_to``, parameters of the model must
        have been initialized.
    """

    args_hash_id = hashlib.md5('{}_{}'.format(
        id(target_model), decay).encode('utf-8')).hexdigest()
    has_cached = False
    try:
        if self._cached_id == args_hash_id:
            has_cached = True
    except AttributeError:
        has_cached = False

    if not has_cached:
        # Cannot reuse the cached program; need to create a new one.
        self._cached_id = args_hash_id

        assert target_model is not self, "cannot copy between identical model"
        assert isinstance(target_model, Model)
        assert self.__class__.__name__ == target_model.__class__.__name__, \
            "must be the same class for params syncing!"
        assert (decay >= 0 and decay <= 1)

        param_pairs = self._get_parameter_pairs(self, target_model)

        self._cached_sync_weights_program = fluid.Program()

        with fluid.program_guard(self._cached_sync_weights_program):
            for (src_var_name, target_var_name) in param_pairs:
                src_var = fetch_framework_var(src_var_name)
                target_var = fetch_framework_var(target_var_name)
                fluid.layers.assign(
                    decay * target_var + (1 - decay) * src_var, target_var)

        if share_vars_parallel_executor is None:
            # use fluid.Executor
            place = fluid.CUDAPlace(0) if machine_info.is_gpu_available(
            ) else fluid.CPUPlace()
            self._cached_fluid_executor = fluid.Executor(place)
        else:
            # use fluid.ParallelExecutor

            # specify strategy to make ParallelExecutor run faster
            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.use_experimental_executor = True
            exec_strategy.num_threads = 4
            build_strategy = fluid.BuildStrategy()
            build_strategy.remove_unnecessary_lock = True

            with fluid.scope_guard(fluid.global_scope().new_scope()):
                self._cached_fluid_executor = fluid.ParallelExecutor(
                    use_cuda=machine_info.is_gpu_available(),
                    main_program=self._cached_sync_weights_program,
                    share_vars_from=share_vars_parallel_executor,
                    exec_strategy=exec_strategy,
                    build_strategy=build_strategy,
                )
    if share_vars_parallel_executor is None:
        self._cached_fluid_executor.run(self._cached_sync_weights_program)
    else:
        self._cached_fluid_executor.run(fetch_list=[])
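# Illustration only: the soft-update rule documented in sync_weights_to, applied to
# hypothetical numpy weights (the real method builds a fluid program that performs
# this assign for every parameter pair).
import numpy as np

decay = 0.9
target_w = np.array([1.0, 2.0])
current_w = np.array([3.0, 4.0])
target_w = decay * target_w + (1 - decay) * current_w
print(target_w)  # [1.2 2.2] -- the target moves 10% of the way toward the current weights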
def check_network_convergence(self, method, use_cuda=True, memory_opt=True, iter=50, batch_size=None, allow_op_delay=False, feed_dict=None, seed=None, use_parallel_executor=True, use_reduce=False, fuse_elewise_add_act_ops=False, optimizer=fluid.optimizer.Adam, use_fast_executor=False, enable_sequential_execution=False): def run_executor(exe, feed, fetch_list, program=None): if isinstance(exe, fluid.ParallelExecutor): res = exe.run(fetch_list=fetch_list, feed=feed) elif isinstance(exe, fluid.Executor): if program is None: program = fluid.default_main_program() res = exe.run(program=program, feed=feed, fetch_list=fetch_list) else: raise ValueError('Unkown type exe') return res main = fluid.Program() startup = fluid.Program() startup.random_seed = 1 # Fix random seed main.random_seed = 1 with fluid.program_guard(main, startup): if seed is not None: startup.random_seed = seed main.random_seed = seed loss = method(use_feed=feed_dict is not None) optimizer().minimize(loss) if memory_opt: fluid.memory_optimize(main) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() startup_exe = fluid.Executor(place) startup_exe.run(startup) exec_strategy = fluid.ExecutionStrategy() exec_strategy.allow_op_delay = allow_op_delay if use_fast_executor: exec_strategy.use_experimental_executor = True build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops build_strategy.enable_sequential_execution = enable_sequential_execution if use_cuda and core.is_compiled_with_cuda(): build_strategy.remove_unnecessary_lock = True if use_parallel_executor: exe = fluid.ParallelExecutor(use_cuda, loss_name=loss.name, exec_strategy=exec_strategy, build_strategy=build_strategy) else: exe = fluid.Executor(place=place) if batch_size is not None: batch_size *= fluid.core.get_cuda_device_count( ) if use_cuda else int( os.environ.get('CPU_NUM', multiprocessing.cpu_count())) begin = time.time() first_loss, = run_executor(exe=exe, feed=feed_dict, fetch_list=[loss.name]) for i in range(iter): run_executor(exe=exe, feed=feed_dict, fetch_list=[]) last_loss, = run_executor(exe=exe, feed=feed_dict, fetch_list=[loss.name]) end = time.time() if batch_size is not None: print("%.4f Instance per second" % ((batch_size * iter + 2) / (end - begin))) avg_last_loss_val = np.array(last_loss).mean() avg_first_loss_val = np.array(first_loss).mean() if math.isnan(float(avg_last_loss_val)) or math.isnan( float(avg_first_loss_val)): sys.exit("got NaN loss, training failed.") print(first_loss, last_loss) # self.assertGreater(first_loss[0], last_loss[0]) return first_loss, last_loss
def main(args): """main func""" unimo_config = UNIMOConfig(args.unimo_config_path) if args.task_type == "dialog": unimo_config["role_type_size"] = args.role_type_size unimo_config["turn_type_size"] = args.turn_type_size if args.hidden_dropout_prob >= 0: unimo_config["hidden_dropout_prob"] = args.hidden_dropout_prob if args.attention_probs_dropout_prob >= 0: unimo_config[ "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob unimo_config.print_config() if args.pred_batch_size <= 0: args.pred_batch_size = args.batch_size gpu_id = 0 gpus = fluid.core.get_cuda_device_count() if args.is_distributed and os.getenv("FLAGS_selected_gpus") is not None: gpu_list = os.getenv("FLAGS_selected_gpus").split(",") gpus = len(gpu_list) gpu_id = int(gpu_list[0]) if args.use_cuda: place = fluid.CUDAPlace(gpu_id) dev_count = gpus else: place = fluid.CPUPlace() dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) """load vocabulary""" tokenizer = GptBpeTokenizer(vocab_file=args.unimo_vocab_file, encoder_json_file=args.encoder_json_file, vocab_bpe_file=args.vocab_bpe_file, do_lower_case=True) reader = Seq2SeqReader(tokenizer, args) unimo_seq2seq = Seq2Seq(args, unimo_config, tokenizer) if not (args.do_train or args.do_val or args.do_test or args.do_pred): raise ValueError("For args `do_train`, `do_val` and `do_test`, at " "least one of them must be True.") startup_prog = fluid.Program() if args.random_seed is not None: startup_prog.random_seed = args.random_seed if args.do_train: trainers_num = int(os.getenv("PADDLE_TRAINERS_NUM", 1)) train_data_generator = reader.data_generator( input_file=args.train_set, batch_size=args.batch_size, epoch=args.epoch, dev_count=trainers_num, shuffle=True, phase="train") num_train_examples = reader.get_num_examples(args.train_set) if args.in_tokens: max_train_steps = args.epoch * num_train_examples // ( args.batch_size // args.max_seq_len) // trainers_num else: max_train_steps = args.epoch * num_train_examples // args.batch_size // trainers_num warmup_steps = int(max_train_steps * args.warmup_proportion) print("Device count: %d, gpu_id: %d" % (dev_count, gpu_id)) print("Num train examples: %d" % num_train_examples) print("Max train steps: %d" % max_train_steps) print("Num warmup steps: %d" % warmup_steps) train_program = fluid.Program() with fluid.program_guard(train_program, startup_prog): with fluid.unique_name.guard(): train_pyreader, graph_vars = unimo_seq2seq.create_model() scheduled_lr, loss_scaling = optimization( loss=graph_vars["loss"], warmup_steps=warmup_steps, num_train_steps=max_train_steps, learning_rate=args.learning_rate, train_program=train_program, weight_decay=args.weight_decay, scheduler=args.lr_scheduler, use_fp16=args.use_fp16, use_dynamic_loss_scaling=args.use_dynamic_loss_scaling, init_loss_scaling=args.init_loss_scaling, beta1=args.beta1, beta2=args.beta2, epsilon=args.epsilon) if args.verbose: if args.in_tokens: lower_mem, upper_mem, unit = fluid.contrib.memory_usage( program=train_program, batch_size=args.batch_size // args.max_seq_len) else: lower_mem, upper_mem, unit = fluid.contrib.memory_usage( program=train_program, batch_size=args.batch_size) print("Theoretical memory usage in training: %.3f - %.3f %s" % (lower_mem, upper_mem, unit)) if args.do_val or args.do_test or args.do_pred: test_prog = fluid.Program() with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): test_pyreader, test_graph_vars = unimo_seq2seq.create_model( decoding=args.do_decode) test_prog = test_prog.clone(for_test=True) 
nccl2_num_trainers = 1 nccl2_trainer_id = 0 print("args.is_distributed:", args.is_distributed) if args.is_distributed: trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS") current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") worker_endpoints = worker_endpoints_env.split(",") trainers_num = len(worker_endpoints) print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \ trainer_id:{}".format(worker_endpoints, trainers_num, current_endpoint, trainer_id)) # prepare nccl2 env. config = fluid.DistributeTranspilerConfig() config.mode = "nccl2" if args.nccl_comm_num > 1: config.nccl_comm_num = args.nccl_comm_num if args.use_hierarchical_allreduce and trainers_num > args.hierarchical_allreduce_inter_nranks: config.use_hierarchical_allreduce = args.use_hierarchical_allreduce config.hierarchical_allreduce_inter_nranks = args.hierarchical_allreduce_inter_nranks assert config.hierarchical_allreduce_inter_nranks > 1 assert trainers_num % config.hierarchical_allreduce_inter_nranks == 0 config.hierarchical_allreduce_exter_nranks = \ trainers_num / config.hierarchical_allreduce_inter_nranks t = fluid.DistributeTranspiler(config=config) t.transpile(trainer_id, trainers=worker_endpoints_env, current_endpoint=current_endpoint, program=train_program if args.do_train else test_prog, startup_program=startup_prog) nccl2_num_trainers = trainers_num nccl2_trainer_id = trainer_id exe = fluid.Executor(place) exe.run(startup_prog) init_model(args, exe, train_program if args.do_train else test_prog) if args.do_train: exec_strategy = fluid.ExecutionStrategy() if args.use_fast_executor: exec_strategy.use_experimental_executor = True exec_strategy.num_threads = 4 if args.use_fp16 else 2 # 2 for fp32 4 for fp16 exec_strategy.num_iteration_per_drop_scope = min( args.num_iteration_per_drop_scope, args.skip_steps) build_strategy = fluid.BuildStrategy() build_strategy.remove_unnecessary_lock = False if args.use_fuse: build_strategy.fuse_all_reduce_ops = True train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda, loss_name=graph_vars["loss"].name, build_strategy=build_strategy, exec_strategy=exec_strategy, main_program=train_program, num_trainers=nccl2_num_trainers, trainer_id=nccl2_trainer_id) train_pyreader.set_batch_generator(train_data_generator) train_resource = { "exe": train_exe, "program": train_program, "pyreader": train_pyreader } save_model = partial(save_checkpoint, program=train_program, exe=exe) test_dev_count = 1 if args.do_val or args.do_test or args.do_pred: test_exe = exe if args.use_multi_gpu_test: test_dev_count = nccl2_num_trainers test_resource = { "exe": test_exe, "program": test_prog, "pyreader": test_pyreader } eval_data_generator = partial(reader.data_generator, batch_size=args.pred_batch_size, epoch=1, dev_count=test_dev_count, shuffle=False, do_decode=args.do_decode, place=place) eval_func = partial(unimo_seq2seq.evaluate, resource=test_resource, graph_vars=test_graph_vars, dev_count=test_dev_count, output_path=args.checkpoints, gpu_id=nccl2_trainer_id) evaluate = partial(evaluate_datasets, pyreader=test_pyreader, reader=reader, eval_func=eval_func, data_generator=eval_data_generator) if args.do_train: train_pyreader.start() steps = 0 last_epoch = 0 if warmup_steps > 0: graph_vars["learning_rate"] = scheduled_lr time_begin = time.time() skip_steps = args.skip_steps while True: try: steps += 1 if args.save_and_valid_by_epoch: suffix = "epoch_" + str(last_epoch) else: suffix = "step_" + str(steps) if steps % skip_steps == 0: 
outputs = unimo_seq2seq.evaluate(train_resource, "train", graph_vars) if args.verbose: verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size( ) verbose += "learning rate: %.8f" % ( outputs["learning_rate"] if warmup_steps > 0 else args.learning_rate) print(verbose) if args.in_tokens: current_example, current_epoch = reader.get_train_progress( ) else: current_epoch = steps * args.batch_size * trainers_num // num_train_examples current_example = steps * args.batch_size * trainers_num % num_train_examples time_end = time.time() used_time = time_end - time_begin print("epoch: %d, progress: %d/%d, step: %d, loss: %f, " "ppl: %f, speed: %f steps/s" % (current_epoch, current_example, num_train_examples, steps, outputs["loss"], outputs["ppl"], args.skip_steps / used_time)) time_begin = time.time() if args.visualdl_log and nccl2_trainer_id == 0: visuallog_dict = OrderedDict() visuallog_dict["ppl"] = outputs["ppl"] visualdl_log(visuallog_dict, outputs["ppl"], steps, phase='train') else: train_exe.run(fetch_list=[]) if nccl2_trainer_id >= test_dev_count: continue do_save = False do_eval = False if not args.save_and_valid_by_epoch: if steps % args.save_steps == 0 and nccl2_trainer_id == 0: do_save = True if steps % args.validation_steps == 0: do_eval = True else: if args.in_tokens: current_example, current_epoch = reader.get_train_progress( ) else: current_epoch = steps * args.batch_size * trainers_num // num_train_examples if current_epoch != last_epoch: if nccl2_trainer_id == 0: do_save = True do_eval = True if do_save: save_model(suffix=suffix) if do_eval: evaluate(suffix=suffix) if args.save_and_valid_by_epoch: last_epoch = current_epoch except fluid.core.EOFException: save_model(suffix=suffix) train_pyreader.reset() break if nccl2_trainer_id >= test_dev_count: return if args.do_val or args.do_test or args.do_pred: suffix = "output" if args.do_train: if not args.save_and_valid_by_epoch: suffix = "step_" + str(steps) else: suffix = "epoch_" + str(last_epoch) evaluate(suffix=suffix, do_pred=True)
    init_pretraining_params(exe,
                            args.init_pretraining_params,
                            main_program=startup_prog)

    metric = Metric(**graph_model.metrics)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0

    if dev_count > 1:
        exec_strategy = F.ExecutionStrategy()
        exec_strategy.num_threads = dev_count

        train_exe = F.ParallelExecutor(use_cuda=args.use_cuda,
                                       loss_name=graph_model.loss.name,
                                       exec_strategy=exec_strategy,
                                       main_program=train_prog,
                                       num_trainers=nccl2_num_trainers,
                                       trainer_id=nccl2_trainer_id)
        test_exe = exe
    else:
        train_exe, test_exe = exe, exe

    train_and_evaluate(exe=exe,
                       train_exe=train_exe,
                       valid_exe=test_exe,
                       train_ds=train_ds,
                       valid_ds=valid_ds,
                       test_ds=test_ds,
                       train_prog=train_prog,
                       valid_prog=test_prog,
def train(): args = parse_args() if args.enable_ce: SEED = 102 fluid.default_main_program().random_seed = SEED fluid.default_startup_program().random_seed = SEED config_path = args.config_path train_path = args.train_dir epoch_num = args.epoch_num use_cuda = True if args.use_cuda else False use_parallel = True if args.parallel else False logger.info("reading data begins") user_count, item_count, cat_count = reader.config_read(config_path) data_reader, max_len = reader.prepare_reader( train_path, args.batch_size * args.num_devices) logger.info("reading data completes") avg_cost, pred = network.network(item_count, cat_count, max_len) fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm( clip_norm=5.0)) base_lr = args.base_lr boundaries = [410000] values = [base_lr, 0.2] sgd_optimizer = fluid.optimizer.SGD( learning_rate=fluid.layers.piecewise_decay(boundaries=boundaries, values=values)) sgd_optimizer.minimize(avg_cost) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) feeder = fluid.DataFeeder(feed_list=[ "hist_item_seq", "hist_cat_seq", "target_item", "target_cat", "label", "mask", "target_item_seq", "target_cat_seq" ], place=place) if use_parallel: train_exe = fluid.ParallelExecutor(use_cuda=use_cuda, loss_name=avg_cost.name) else: train_exe = exe logger.info("train begins") global_step = 0 PRINT_STEP = 1000 total_time = [] ce_info = [] start_time = time.time() loss_sum = 0.0 for id in range(epoch_num): epoch = id + 1 for data in data_reader(): global_step += 1 results = train_exe.run(feed=feeder.feed(data), fetch_list=[avg_cost.name, pred.name], return_numpy=True) loss_sum += results[0].mean() if global_step % PRINT_STEP == 0: ce_info.append(loss_sum / PRINT_STEP) total_time.append(time.time() - start_time) logger.info( "epoch: %d\tglobal_step: %d\ttrain_loss: %.4f\t\ttime: %.2f" % (epoch, global_step, loss_sum / PRINT_STEP, time.time() - start_time)) start_time = time.time() loss_sum = 0.0 if (global_step > 400000 and global_step % PRINT_STEP == 0) or (global_step <= 400000 and global_step % 50000 == 0): save_dir = os.path.join(args.model_dir, "global_step_" + str(global_step)) feed_var_name = [ "hist_item_seq", "hist_cat_seq", "target_item", "target_cat", "label", "mask", "target_item_seq", "target_cat_seq" ] fetch_vars = [avg_cost, pred] fluid.io.save_inference_model(save_dir, feed_var_name, fetch_vars, exe) logger.info("model saved in " + save_dir) if args.enable_ce and global_step >= args.batch_num: break # only for ce if args.enable_ce: gpu_num = get_cards(args) ce_loss = 0 ce_time = 0 try: ce_loss = ce_info[-1] ce_time = total_time[-1] except: print("ce info error") print("kpis\teach_pass_duration_card%s\t%s" % (gpu_num, ce_time)) print("kpis\ttrain_loss_card%s\t%s" % (gpu_num, ce_loss))
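# For reference, a plain-Python mirror of the lookup rule behind
# fluid.layers.piecewise_decay as configured above (boundaries=[410000],
# values=[base_lr, 0.2]); the base_lr value 0.01 below is purely illustrative.
def piecewise_lr(step, boundaries, values):
    for boundary, value in zip(boundaries, values):
        if step < boundary:
            return value
    return values[-1]

assert piecewise_lr(0, [410000], [0.01, 0.2]) == 0.01      # before the boundary: base_lr
assert piecewise_lr(500000, [410000], [0.01, 0.2]) == 0.2  # at/after the boundary: 0.2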
def run_benchmark(model, args): if args.use_cprof: pr = cProfile.Profile() pr.enable() if args.data_set == "cifar10": class_dim = 10 if args.data_format == 'NCHW': dshape = [3, 32, 32] else: dshape = [32, 32, 3] else: class_dim = 102 if args.data_format == 'NCHW': dshape = [3, 224, 224] else: dshape = [224, 224, 3] input = fluid.layers.data(name='data', shape=dshape, dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') predict = model(input, class_dim) cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(x=cost) batch_size_tensor = fluid.layers.create_tensor(dtype='int64') batch_acc = fluid.layers.accuracy(input=predict, label=label, total=batch_size_tensor) inference_program = fluid.default_main_program().clone() with fluid.program_guard(inference_program): inference_program = fluid.io.get_inference_program( target_vars=[batch_acc, batch_size_tensor]) optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9) opts = optimizer.minimize(avg_cost) fluid.memory_optimize(fluid.default_main_program()) train_reader = paddle.batch(paddle.reader.shuffle( paddle.dataset.cifar.train10() if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), buf_size=5120), batch_size=args.batch_size) test_reader = paddle.batch(paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), batch_size=args.batch_size) def test(exe): test_accuracy = fluid.average.WeightedAverage() for batch_id, data in enumerate(test_reader()): img_data = np.array(map(lambda x: x[0].reshape(dshape), data)).astype("float32") y_data = np.array(map(lambda x: x[1], data)).astype("int64") y_data = y_data.reshape([-1, 1]) acc, weight = exe.run(inference_program, feed={ "data": img_data, "label": y_data }, fetch_list=[batch_acc, batch_size_tensor]) test_accuracy.add(value=acc, weight=weight) return test_accuracy.eval() place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) accuracy = fluid.average.WeightedAverage() train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name) if args.use_fake_data: data = train_reader().next() image = np.array(map(lambda x: x[0].reshape(dshape), data)).astype('float32') label = np.array(map(lambda x: x[1], data)).astype('int64') label = label.reshape([-1, 1]) iters, num_samples, start_time = 0, 0, time.time() for pass_id in range(args.pass_num): accuracy.reset() train_accs = [] train_losses = [] for batch_id, data in enumerate(train_reader()): if iters == args.skip_batch_num: start_time = time.time() num_samples = 0 if iters == args.iterations: break if not args.use_fake_data: image = np.array(map(lambda x: x[0].reshape(dshape), data)).astype('float32') label = np.array(map(lambda x: x[1], data)).astype('int64') label = label.reshape([-1, 1]) loss, acc, weight = train_exe.run(feed={ 'data': image, 'label': label }, fetch_list=[ avg_cost.name, batch_acc.name, batch_size_tensor.name ]) iters += 1 num_samples += len(label) accuracy.add(value=np.array(np.mean(acc)), weight=np.mean(weight)) loss = np.mean(np.array(loss)) acc = np.mean(np.array(acc)) train_losses.append(loss) train_accs.append(acc) print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" % (pass_id, iters, loss, acc)) print("Pass: %d, Loss: %f, Train Accuray: %f\n" % (pass_id, np.mean(train_losses), np.mean(train_accs))) train_elapsed = time.time() - start_time examples_per_sec = num_samples / train_elapsed print('\nTotal examples: %d, 
total time: %.5f, %.5f examples/sec\n' % (num_samples, train_elapsed, examples_per_sec)) # evaluation if args.with_test: pass_test_acc = test(exe) exit(0)
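A simplified, standalone sketch of the throughput accounting in run_benchmark above (function and argument names here are illustrative): the timer and sample counter are reset once the warm-up batches have passed, so start-up cost does not skew the examples/sec figure.

import time

def benchmark(batches, run_batch, skip_batch_num=5):
    num_samples, start_time = 0, time.time()
    for iters, batch in enumerate(batches):
        if iters == skip_batch_num:      # discard warm-up iterations
            num_samples, start_time = 0, time.time()
        run_batch(batch)
        num_samples += len(batch)
    elapsed = time.time() - start_time
    return num_samples / elapsed         # examples per second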
def train(args): # parameters from arguments seg_num = args.seg_num class_dim = args.class_dim num_layers = args.num_layers num_epochs = args.num_epochs batch_size = args.batch_size pretrained_model = args.pretrained_model model_save_dir = args.model_save_dir image_shape = [int(m) for m in args.image_shape.split(",")] image_shape = [seg_num] + image_shape # model definition model = TSN_ResNet(layers=num_layers, seg_num=seg_num) image = fluid.layers.data(name='image', shape=image_shape, dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') out = model.net(input=image, class_dim=class_dim) cost = fluid.layers.cross_entropy(input=out, label=label) avg_cost = fluid.layers.mean(x=cost) acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) # for test inference_program = fluid.default_main_program().clone(for_test=True) # learning rate strategy epoch_points = [num_epochs / 3, num_epochs * 2 / 3] total_videos = args.total_videos step = int(total_videos / batch_size + 1) bd = [e * step for e in epoch_points] lr_init = args.lr_init lr = [lr_init, lr_init / 10, lr_init / 100] # initialize optimizer optimizer = fluid.optimizer.Momentum( learning_rate=fluid.layers.piecewise_decay(boundaries=bd, values=lr), momentum=0.9, regularization=fluid.regularizer.L2Decay(1e-4)) opts = optimizer.minimize(avg_cost) if args.with_mem_opt: fluid.memory_optimize(fluid.default_main_program()) place = fluid.CUDAPlace(0) exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) def is_parameter(var): if isinstance(var, Parameter): return isinstance(var, Parameter) and (not ("fc_0" in var.name)) if pretrained_model is not None: vars = filter(is_parameter, inference_program.list_vars()) fluid.io.load_vars(exe, pretrained_model, vars=vars) # reader train_reader = paddle.batch(reader.train(seg_num), batch_size=batch_size, drop_last=True) # test in single GPU test_reader = paddle.batch(reader.test(seg_num), batch_size=batch_size / 16) feeder = fluid.DataFeeder(place=place, feed_list=[image, label]) train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name) fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name] # train for pass_id in range(num_epochs): train_info = [[], [], []] test_info = [[], [], []] for batch_id, data in enumerate(train_reader()): t1 = time.time() loss, acc1, acc5 = train_exe.run(fetch_list, feed=feeder.feed(data)) t2 = time.time() period = t2 - t1 loss = np.mean(np.array(loss)) acc1 = np.mean(np.array(acc1)) acc5 = np.mean(np.array(acc5)) train_info[0].append(loss) train_info[1].append(acc1) train_info[2].append(acc5) if batch_id % 10 == 0: print( "[TRAIN] Pass: {0}\ttrainbatch: {1}\tloss: {2}\tacc1: {3}\tacc5: {4}\ttime: {5}" .format(pass_id, batch_id, '%.6f' % loss, acc1, acc5, "%2.2f sec" % period)) sys.stdout.flush() train_loss = np.array(train_info[0]).mean() train_acc1 = np.array(train_info[1]).mean() train_acc5 = np.array(train_info[2]).mean() # test cnt = 0 for batch_id, data in enumerate(test_reader()): t1 = time.time() loss, acc1, acc5 = exe.run(inference_program, fetch_list=fetch_list, feed=feeder.feed(data)) t2 = time.time() period = t2 - t1 loss = np.mean(loss) acc1 = np.mean(acc1) acc5 = np.mean(acc5) test_info[0].append(loss * len(data)) test_info[1].append(acc1 * len(data)) test_info[2].append(acc5 * len(data)) cnt += len(data) if batch_id % 10 == 0: print( "[TEST] Pass: {0}\ttestbatch: {1}\tloss: {2}\tacc1: {3}\tacc5: {4}\ttime: {5}" .format(pass_id, batch_id, 
'%.6f' % loss, acc1, acc5, "%2.2f sec" % period)) sys.stdout.flush() test_loss = np.sum(test_info[0]) / cnt test_acc1 = np.sum(test_info[1]) / cnt test_acc5 = np.sum(test_info[2]) / cnt print( "+ End pass: {0}, train_loss: {1}, train_acc1: {2}, train_acc5: {3}" .format(pass_id, '%.3f' % train_loss, '%.3f' % train_acc1, '%.3f' % train_acc5)) print( "+ End pass: {0}, test_loss: {1}, test_acc1: {2}, test_acc5: {3}". format(pass_id, '%.3f' % test_loss, '%.3f' % test_acc1, '%.3f' % test_acc5)) sys.stdout.flush() # save model model_path = os.path.join(model_save_dir, str(pass_id)) if not os.path.isdir(model_path): os.makedirs(model_path) fluid.io.save_persistables(exe, model_path)
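A sketch of the weighted test aggregation used above: per-batch metrics are scaled by the batch size before summing, so the final numbers are true per-sample averages even when the last batch is smaller (names and numbers are illustrative).

import numpy as np

def weighted_average(batch_metrics, batch_sizes):
    total = float(np.sum(batch_sizes))
    return [np.sum(np.array(m) * np.array(batch_sizes)) / total
            for m in batch_metrics]

losses = [0.50, 0.40]   # per-batch mean loss
acc1s = [0.80, 0.90]    # per-batch top-1 accuracy
sizes = [256, 64]       # last batch is smaller
print(weighted_average([losses, acc1s], sizes))  # approx [0.48, 0.82]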
def train(): learning_rate = cfg.learning_rate image_shape = [3, cfg.TRAIN.max_size, cfg.TRAIN.max_size] devices_num = get_device_num() total_batch_size = devices_num * cfg.TRAIN.im_per_batch use_random = True model = east_builder.East( add_conv_body_func=resnet. add_ResNet50_convs_body, # res4: [-1, 1024, 84, 84] add_feature_merging_func=resnet. add_feature_merging_func, # res5: [-1, 2048, 7, 7] use_pyreader=cfg.use_pyreader, use_random=use_random) model.build_model(image_shape) losses, keys = model.loss() loss = losses[0] fetch_list = losses boundaries = cfg.lr_steps gamma = cfg.lr_gamma step_num = len(cfg.lr_steps) values = [learning_rate * (gamma**i) for i in range(step_num + 1)] lr = exponential_with_warmup_decay(learning_rate=learning_rate, boundaries=boundaries, values=values, warmup_iter=cfg.warm_up_iter, warmup_factor=cfg.warm_up_factor) optimizer = fluid.optimizer.Momentum( learning_rate=lr, regularization=fluid.regularizer.L2Decay(cfg.weight_decay), momentum=cfg.momentum) optimizer.minimize(loss) fetch_list = fetch_list + [lr] for var in fetch_list: var.persistable = True gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) place = fluid.CUDAPlace(gpu_id) if cfg.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) if cfg.pretrained_model: def if_exist(var): return os.path.exists(os.path.join(cfg.pretrained_model, var.name)) fluid.io.load_vars(exe, cfg.pretrained_model, predicate=if_exist) if cfg.parallel: build_strategy = fluid.BuildStrategy() build_strategy.memory_optimize = False build_strategy.enable_inplace = True exec_strategy = fluid.ExecutionStrategy() exec_strategy.num_iteration_per_drop_scope = 10 if num_trainers > 1 and cfg.use_gpu: dist_utils.prepare_for_multi_process(exe, build_strategy, fluid.default_main_program()) # the process is fast when num_threads is 1 for multi-process training exec_strategy.num_threads = 1 train_exe = fluid.ParallelExecutor(use_cuda=bool(cfg.use_gpu), loss_name=loss.name, build_strategy=build_strategy, exec_strategy=exec_strategy) else: train_exe = exe shuffle = True # NOTE: do not shuffle dataset when using multi-process training shuffle_seed = None if num_trainers > 1: shuffle_seed = 1 if cfg.use_pyreader: train_reader = reader.train(batch_size=cfg.TRAIN.im_per_batch, total_batch_size=total_batch_size, padding_total=cfg.TRAIN.padding_minibatch, shuffle=shuffle, shuffle_seed=shuffle_seed) if num_trainers > 1: assert shuffle_seed is not None, "If num_trainers > 1, the shuffle_seed must be set, because the order of batch data generated by reader must be the same in the respective processes" train_reader = fluid.contrib.reader.distributed_batch_reader( train_reader) py_reader = model.py_reader py_reader.decorate_paddle_reader(train_reader) else: if num_trainers > 1: shuffle = False train_reader = reader.train(batch_size=total_batch_size, shuffle=shuffle) feeder = fluid.DataFeeder(place=place, feed_list=model.feeds()) def save_model(postfix): model_path = os.path.join(cfg.model_save_dir, postfix) if os.path.isdir(model_path): shutil.rmtree(model_path) fluid.io.save_persistables(exe, model_path) def train_loop_pyreader(): py_reader.start() train_stats = TrainingStats(cfg.log_window, keys) try: start_time = time.time() for iter_id in range(cfg.max_iter): prev_start_time = start_time start_time = time.time() outs = train_exe.run(fetch_list=[v.name for v in fetch_list]) stats = { k: np.array(v).mean() for k, v in zip(keys, outs[:-1]) } train_stats.update(stats) logs = train_stats.log() strs = '{}, 
iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format( now_time(), iter_id, np.mean(outs[-1]), logs, start_time - prev_start_time) print(strs) sys.stdout.flush() if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0: save_model("model_iter{}".format(iter_id)) end_time = time.time() total_time = end_time - start_time last_loss = np.array(outs[0]).mean() except (StopIteration, fluid.core.EOFException): py_reader.reset() def train_loop(): train_stats = TrainingStats(cfg.log_window, keys) start_time = time.time() for iter_id, data in enumerate(train_reader()): prev_start_time = start_time start_time = time.time() outs = train_exe.run(fetch_list=[v.name for v in fetch_list], feed=feeder.feed(data)) stats = {k: np.array(v).mean() for k, v in zip(keys, outs[:-1])} train_stats.update(stats) logs = train_stats.log() stats = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format( now_time(), iter_id, np.mean(outs[-1]), logs, start_time - prev_start_time) print(stats) sys.stdout.flush() if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0: save_model("model_iter{}".format(iter_id)) if (iter_id + 1) == cfg.max_iter: break end_time = time.time() total_time = end_time - start_time last_loss = np.array(outs[0]).mean() if cfg.use_pyreader: train_loop_pyreader() else: train_loop() save_model('model_final')
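For clarity, a tiny sketch of how the step-decay values above are assembled before being handed to exponential_with_warmup_decay: one value per interval delimited by cfg.lr_steps, each a factor of lr_gamma smaller than the last (example numbers only).

def make_piecewise_values(base_lr, boundaries, gamma):
    return [base_lr * (gamma ** i) for i in range(len(boundaries) + 1)]

print(make_piecewise_values(0.01, [120000, 160000], 0.1))
# -> three values: 0.01 before step 120000, 0.001 until 160000, 0.0001 after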
def main(args): train_prog = fluid.Program() startup_prog = fluid.Program() train_prog.random_seed = 1000 startup_prog.random_seed = 1000 with fluid.program_guard(train_prog, startup_prog): with fluid.unique_name.guard(): sum_cost, avg_cost, predict, token_num, pyreader = transformer( ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, ModelHyperParams.max_length + 1, ModelHyperParams.n_layer, ModelHyperParams.n_head, ModelHyperParams.d_key, ModelHyperParams.d_value, ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, ModelHyperParams.prepostprocess_dropout, ModelHyperParams.attention_dropout, ModelHyperParams.relu_dropout, ModelHyperParams.preprocess_cmd, ModelHyperParams.postprocess_cmd, ModelHyperParams.weight_sharing, TrainTaskConfig.label_smooth_eps, use_py_reader=args.use_py_reader, is_test=False) lr_decay = fluid.layers.learning_rate_scheduler.noam_decay( ModelHyperParams.d_model, TrainTaskConfig.warmup_steps) optimizer = fluid.optimizer.Adam( learning_rate=lr_decay * TrainTaskConfig.learning_rate, beta1=TrainTaskConfig.beta1, beta2=TrainTaskConfig.beta2, epsilon=TrainTaskConfig.eps) optimizer.minimize(avg_cost) if args.use_mem_opt: fluid.memory_optimize(train_prog) if TrainTaskConfig.use_gpu: place = fluid.CUDAPlace(0) dev_count = fluid.core.get_cuda_device_count() else: place = fluid.CPUPlace() dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) exe = fluid.Executor(place) # Initialize the parameters. if TrainTaskConfig.ckpt_path: fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path) else: exe.run(startup_prog) exec_strategy = fluid.ExecutionStrategy() # For faster executor exec_strategy.use_experimental_executor = True exec_strategy.num_iteration_per_drop_scope = 5 build_strategy = fluid.BuildStrategy() # Since the token number differs among devices, customize gradient scale to # use token average cost among multi-devices. and the gradient scale is # `1 / token_number` for average cost. build_strategy.gradient_scale_strategy = fluid.BuildStrategy.GradientScaleStrategy.Customized train_exe = fluid.ParallelExecutor( use_cuda=TrainTaskConfig.use_gpu, loss_name=avg_cost.name, main_program=train_prog, build_strategy=build_strategy, exec_strategy=exec_strategy) # the best cross-entropy value with label smoothing loss_normalizer = -((1. - TrainTaskConfig.label_smooth_eps) * np.log( (1. 
- TrainTaskConfig.label_smooth_eps )) + TrainTaskConfig.label_smooth_eps * np.log(TrainTaskConfig.label_smooth_eps / ( ModelHyperParams.trg_vocab_size - 1) + 1e-20)) train_data = prepare_data_generator( args, is_test=False, count=dev_count, pyreader=pyreader) if args.use_py_reader: pyreader.start() data_generator = None else: data_generator = train_data() def run(iter_num): reader_time = [] run_time = [] for step_idx in six.moves.xrange(iter_num): try: start_time = time.time() feed_dict_list = prepare_feed_dict_list(data_generator, init_flag, dev_count) end_time = time.time() reader_time.append(end_time - start_time) start_time = time.time() if args.use_parallel_exe: outs = train_exe.run( fetch_list=[sum_cost.name, token_num.name], feed=feed_dict_list) else: outs = exe.run(program=train_prog, fetch_list=[sum_cost.name, token_num.name], feed=feed_dict_list[0] if feed_dict_list is not None else None) end_time = time.time() run_time.append(end_time - start_time) sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[ 1]) # sum the cost from multi-devices total_sum_cost = sum_cost_val.sum() total_token_num = token_num_val.sum() total_avg_cost = total_sum_cost / total_token_num print("step_idx: %d, avg loss: %f, " "normalized loss: %f, ppl: %f" % (step_idx, total_avg_cost, total_avg_cost - loss_normalizer, np.exp([min(total_avg_cost, 100)]))) except (StopIteration, fluid.core.EOFException): # The current pass is over. if args.use_py_reader: pyreader.reset() pyreader.start() return reader_time, run_time @contextlib.contextmanager def profile_context(profile=True): if profile: with profiler.profiler('All', 'total', '/tmp/profile_file'): yield else: yield # start-up init_flag = True run(5) init_flag = False # profiling start = time.time() # currently only support profiling on one device with profile_context(args.profile_ops): reader_time, run_time = run(args.iter_num) end = time.time() total_time = end - start print( "Total time: {0}, reader time: {1} s, run time: {2} s, step number: {3}". format(total_time, np.sum(reader_time), np.sum(run_time), args.iter_num))
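The loss_normalizer term above is the entropy of the smoothed target distribution, i.e. the best cross-entropy a model could reach when the true class gets probability (1 - eps) and the remaining eps is spread over the other trg_vocab_size - 1 classes. A small standalone sketch of the same formula:

import numpy as np

def label_smoothing_normalizer(eps, vocab_size):
    return -((1. - eps) * np.log(1. - eps) +
             eps * np.log(eps / (vocab_size - 1) + 1e-20))

print(label_smoothing_normalizer(0.1, 32000))  # approx 1.36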
def train(): learning_rate = cfg.learning_rate image_shape = [3, cfg.TRAIN.max_size, cfg.TRAIN.max_size] if cfg.enable_ce: fluid.default_startup_program().random_seed = 1000 fluid.default_main_program().random_seed = 1000 import random random.seed(0) np.random.seed(0) devices = os.getenv("CUDA_VISIBLE_DEVICES") or "" devices_num = len(devices.split(",")) total_batch_size = devices_num * cfg.TRAIN.im_per_batch use_random = True if cfg.enable_ce: use_random = False model = model_builder.RCNN( add_conv_body_func=resnet.add_ResNet50_conv4_body, add_roi_box_head_func=resnet.add_ResNet_roi_conv5_head, use_pyreader=cfg.use_pyreader, use_random=use_random) model.build_model(image_shape) losses, keys = model.loss() loss = losses[0] fetch_list = losses boundaries = cfg.lr_steps gamma = cfg.lr_gamma step_num = len(cfg.lr_steps) values = [learning_rate * (gamma**i) for i in range(step_num + 1)] lr = exponential_with_warmup_decay(learning_rate=learning_rate, boundaries=boundaries, values=values, warmup_iter=cfg.warm_up_iter, warmup_factor=cfg.warm_up_factor) optimizer = fluid.optimizer.Momentum( learning_rate=lr, regularization=fluid.regularizer.L2Decay(cfg.weight_decay), momentum=cfg.momentum) optimizer.minimize(loss) fetch_list = fetch_list + [lr] fluid.memory_optimize(fluid.default_main_program(), skip_opt_set=set(fetch_list)) place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) if cfg.pretrained_model: def if_exist(var): return os.path.exists(os.path.join(cfg.pretrained_model, var.name)) fluid.io.load_vars(exe, cfg.pretrained_model, predicate=if_exist) if cfg.parallel: train_exe = fluid.ParallelExecutor(use_cuda=bool(cfg.use_gpu), loss_name=loss.name) shuffle = True if cfg.enable_ce: shuffle = False if cfg.use_pyreader: train_reader = reader.train(batch_size=cfg.TRAIN.im_per_batch, total_batch_size=total_batch_size, padding_total=cfg.TRAIN.padding_minibatch, shuffle=shuffle) py_reader = model.py_reader py_reader.decorate_paddle_reader(train_reader) else: train_reader = reader.train(batch_size=total_batch_size, shuffle=shuffle) feeder = fluid.DataFeeder(place=place, feed_list=model.feeds()) def save_model(postfix): model_path = os.path.join(cfg.model_save_dir, postfix) if os.path.isdir(model_path): shutil.rmtree(model_path) fluid.io.save_persistables(exe, model_path) def train_loop_pyreader(): py_reader.start() train_stats = TrainingStats(cfg.log_window, keys) try: start_time = time.time() prev_start_time = start_time for iter_id in range(cfg.max_iter): prev_start_time = start_time start_time = time.time() outs = train_exe.run(fetch_list=[v.name for v in fetch_list]) stats = { k: np.array(v).mean() for k, v in zip(keys, outs[:-1]) } train_stats.update(stats) logs = train_stats.log() strs = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format( now_time(), iter_id, np.mean(outs[-1]), logs, start_time - prev_start_time) print(strs) sys.stdout.flush() if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0: save_model("model_iter{}".format(iter_id)) end_time = time.time() total_time = end_time - start_time last_loss = np.array(outs[0]).mean() if cfg.enable_ce: gpu_num = devices_num epoch_idx = iter_id + 1 loss = last_loss print("kpis\teach_pass_duration_card%s\t%s" % (gpu_num, total_time / epoch_idx)) print("kpis\ttrain_loss_card%s\t%s" % (gpu_num, loss)) except (StopIteration, fluid.core.EOFException): py_reader.reset() def train_loop(): start_time = time.time() prev_start_time = start_time start = start_time train_stats = 
TrainingStats(cfg.log_window, keys) for iter_id, data in enumerate(train_reader()): prev_start_time = start_time start_time = time.time() outs = train_exe.run(fetch_list=[v.name for v in fetch_list], feed=feeder.feed(data)) stats = {k: np.array(v).mean() for k, v in zip(keys, outs[:-1])} train_stats.update(stats) logs = train_stats.log() strs = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format( now_time(), iter_id, np.mean(outs[-1]), logs, start_time - prev_start_time) print(strs) sys.stdout.flush() if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0: save_model("model_iter{}".format(iter_id)) if (iter_id + 1) == cfg.max_iter: break end_time = time.time() total_time = end_time - start_time last_loss = np.array(outs[0]).mean() # only for ce if cfg.enable_ce: gpu_num = devices_num epoch_idx = iter_id + 1 loss = last_loss print("kpis\teach_pass_duration_card%s\t%s" % (gpu_num, total_time / epoch_idx)) print("kpis\ttrain_loss_card%s\t%s" % (gpu_num, loss)) return last_loss if cfg.use_pyreader: train_loop_pyreader() else: train_loop() save_model('model_final')
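A hypothetical stand-in for the TrainingStats helper used in the two loops above: it keeps a sliding window of the last log_window values per loss key and reports their means, which is what the per-iteration log line prints.

import collections
import numpy as np

class WindowStats(object):
    def __init__(self, window, keys):
        self.deques = {k: collections.deque(maxlen=window) for k in keys}

    def update(self, stats):
        for k, v in stats.items():
            self.deques[k].append(v)

    def log(self):
        return ', '.join('%s: %.4f' % (k, np.mean(d))
                         for k, d in self.deques.items())

stats = WindowStats(window=20, keys=['loss_cls', 'loss_bbox'])
stats.update({'loss_cls': 0.7, 'loss_bbox': 0.3})
print(stats.log())  # -> loss_cls: 0.7000, loss_bbox: 0.3000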
def validation(inference_program, avg_cost, s_probs, e_probs, match, feed_order, place, dev_count, vocab, brc_data, logger, args): """ do inference with given inference_program """ parallel_executor = fluid.ParallelExecutor(main_program=inference_program, use_cuda=bool(args.use_gpu), loss_name=avg_cost.name) print_para(inference_program, parallel_executor, logger, args) # Use test set as validation each pass total_loss = 0.0 count = 0 n_batch_cnt = 0 n_batch_loss = 0.0 pred_answers, ref_answers = [], [] val_feed_list = [ inference_program.global_block().var(var_name) for var_name in feed_order ] val_feeder = fluid.DataFeeder(val_feed_list, place) pad_id = vocab.get_id(vocab.pad_token) dev_reader = lambda: brc_data.gen_mini_batches( 'dev', args.batch_size, pad_id, shuffle=False) dev_reader = read_multiple(dev_reader, dev_count) for batch_id, batch_list in enumerate(dev_reader(), 1): feed_data = batch_reader(batch_list, args) val_fetch_outs = parallel_executor.run( feed=list(val_feeder.feed_parallel(feed_data, dev_count)), fetch_list=[avg_cost.name, s_probs.name, e_probs.name, match.name], return_numpy=False) total_loss += np.array(val_fetch_outs[0]).sum() start_probs_m = LodTensor_Array(val_fetch_outs[1]) end_probs_m = LodTensor_Array(val_fetch_outs[2]) match_lod = val_fetch_outs[3].lod() count += len(np.array(val_fetch_outs[0])) n_batch_cnt += len(np.array(val_fetch_outs[0])) n_batch_loss += np.array(val_fetch_outs[0]).sum() log_every_n_batch = args.log_interval if log_every_n_batch > 0 and batch_id % log_every_n_batch == 0: logger.info('Average dev loss from batch {} to {} is {}'.format( batch_id - log_every_n_batch + 1, batch_id, "%.10f" % (n_batch_loss / n_batch_cnt))) n_batch_loss = 0.0 n_batch_cnt = 0 batch_offset = 0 for idx, batch in enumerate(batch_list): #one batch batch_size = len(batch['raw_data']) batch_range = match_lod[0][batch_offset:batch_offset + batch_size + 1] batch_lod = [[batch_range[x], batch_range[x + 1]] for x in range(len(batch_range[:-1]))] start_prob_batch = start_probs_m[batch_offset:batch_offset + batch_size + 1] end_prob_batch = end_probs_m[batch_offset:batch_offset + batch_size + 1] for sample, start_prob_inst, end_prob_inst, inst_range in zip( batch['raw_data'], start_prob_batch, end_prob_batch, batch_lod): #one instance inst_lod = match_lod[1][inst_range[0]:inst_range[1] + 1] best_answer, best_span = find_best_answer_for_inst( sample, start_prob_inst, end_prob_inst, inst_lod) pred = { 'question_id': sample['question_id'], 'question_type': sample['question_type'], 'answers': [best_answer], 'entity_answers': [[]], 'yesno_answers': [] } pred_answers.append(pred) if 'answers' in sample: ref = { 'question_id': sample['question_id'], 'question_type': sample['question_type'], 'answers': sample['answers'], 'entity_answers': [[]], 'yesno_answers': [] } ref_answers.append(ref) batch_offset = batch_offset + batch_size result_dir = args.result_dir result_prefix = args.result_name if result_dir is not None and result_prefix is not None: if not os.path.exists(args.result_dir): os.makedirs(args.result_dir) result_file = os.path.join(result_dir, result_prefix + '.json') with open(result_file, 'w') as fout: for pred_answer in pred_answers: fout.write(json.dumps(pred_answer, ensure_ascii=False) + '\n') logger.info('Saving {} results to {}'.format(result_prefix, result_file)) ave_loss = 1.0 * total_loss / count # compute the bleu and rouge scores if reference answers is provided if len(ref_answers) > 0: pred_dict, ref_dict = {}, {} for pred, ref in zip(pred_answers, 
ref_answers): question_id = ref['question_id'] if len(ref['answers']) > 0: pred_dict[question_id] = normalize(pred['answers']) ref_dict[question_id] = normalize(ref['answers']) bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict) else: bleu_rouge = None return ave_loss, bleu_rouge
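A standalone sketch of the pred/ref pairing done just above before scoring: answers are keyed by question_id and only questions that actually have reference answers are kept (normalize() and the BLEU/ROUGE scorer are assumed from the surrounding code and omitted here).

def pair_answers(pred_answers, ref_answers):
    pred_dict, ref_dict = {}, {}
    for pred, ref in zip(pred_answers, ref_answers):
        qid = ref['question_id']
        if len(ref['answers']) > 0:
            pred_dict[qid] = pred['answers']
            ref_dict[qid] = ref['answers']
    return pred_dict, ref_dict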
def main(args): bert_config = BertConfig(args.bert_config_path) bert_config.print_config() if args.use_cuda: place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0'))) dev_count = fluid.core.get_cuda_device_count() else: place = fluid.CPUPlace() dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) exe = fluid.Executor(place) task_name = args.task_name.lower() processors = { 'xnli': reader.XnliProcessor, 'cola': reader.ColaProcessor, 'mrpc': reader.MrpcProcessor, 'mnli': reader.MnliProcessor, } processor = processors[task_name](data_dir=args.data_dir, vocab_path=args.vocab_path, max_seq_len=args.max_seq_len, do_lower_case=args.do_lower_case, in_tokens=args.in_tokens, random_seed=args.random_seed) num_labels = len(processor.get_labels()) if not (args.do_train or args.do_val or args.do_test): raise ValueError("For args `do_train`, `do_val` and `do_test`, at " "least one of them must be True.") startup_prog = fluid.Program() if args.random_seed is not None: startup_prog.random_seed = args.random_seed if args.do_train: train_data_generator = processor.data_generator( batch_size=args.batch_size, phase='train', epoch=args.epoch, dev_count=dev_count, shuffle=True) num_train_examples = processor.get_num_examples(phase='train') if args.in_tokens: max_train_steps = args.epoch * num_train_examples // ( args.batch_size // args.max_seq_len) // dev_count else: max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count warmup_steps = int(max_train_steps * args.warmup_proportion) print("Device count: %d" % dev_count) print("Num train examples: %d" % num_train_examples) print("Max train steps: %d" % max_train_steps) print("Num warmup steps: %d" % warmup_steps) train_program = fluid.Program() with fluid.program_guard(train_program, startup_prog): with fluid.unique_name.guard(): train_pyreader, loss, probs, accuracy, num_seqs = create_model( args, pyreader_name='train_reader', bert_config=bert_config, num_labels=num_labels) scheduled_lr = optimization(loss=loss, warmup_steps=warmup_steps, num_train_steps=max_train_steps, learning_rate=args.learning_rate, train_program=train_program, startup_prog=startup_prog, weight_decay=args.weight_decay, scheduler=args.lr_scheduler, use_fp16=args.use_fp16, loss_scaling=args.loss_scaling) fluid.memory_optimize(input_program=train_program, skip_opt_set=[ loss.name, probs.name, accuracy.name, num_seqs.name ]) if args.verbose: if args.in_tokens: lower_mem, upper_mem, unit = fluid.contrib.memory_usage( program=train_program, batch_size=args.batch_size // args.max_seq_len) else: lower_mem, upper_mem, unit = fluid.contrib.memory_usage( program=train_program, batch_size=args.batch_size) print("Theoretical memory usage in training: %.3f - %.3f %s" % (lower_mem, upper_mem, unit)) if args.do_val or args.do_test: test_prog = fluid.Program() with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): test_pyreader, loss, probs, accuracy, num_seqs = create_model( args, pyreader_name='test_reader', bert_config=bert_config, num_labels=num_labels) test_prog = test_prog.clone(for_test=True) exe.run(startup_prog) if args.do_train: if args.init_checkpoint and args.init_pretraining_params: print( "WARNING: args 'init_checkpoint' and 'init_pretraining_params' " "both are set! 
Only arg 'init_checkpoint' is made valid.") if args.init_checkpoint: init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog, use_fp16=args.use_fp16) elif args.init_pretraining_params: init_pretraining_params(exe, args.init_pretraining_params, main_program=startup_prog, use_fp16=args.use_fp16) elif args.do_val or args.do_test: if not args.init_checkpoint: raise ValueError("args 'init_checkpoint' should be set if" "only doing validation or testing!") init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog, use_fp16=args.use_fp16) if args.do_train: exec_strategy = fluid.ExecutionStrategy() exec_strategy.use_experimental_executor = args.use_fast_executor exec_strategy.num_threads = dev_count exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda, loss_name=loss.name, exec_strategy=exec_strategy, main_program=train_program) train_pyreader.decorate_tensor_provider(train_data_generator) else: train_exe = None if args.do_val or args.do_test: test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda, main_program=test_prog, share_vars_from=train_exe) if args.do_train: train_pyreader.start() steps = 0 total_cost, total_acc, total_num_seqs = [], [], [] time_begin = time.time() while True: try: steps += 1 if steps % args.skip_steps == 0: if warmup_steps <= 0: fetch_list = [loss.name, accuracy.name, num_seqs.name] else: fetch_list = [ loss.name, accuracy.name, scheduled_lr.name, num_seqs.name ] else: fetch_list = [] outputs = train_exe.run(fetch_list=fetch_list) if steps % args.skip_steps == 0: if warmup_steps <= 0: np_loss, np_acc, np_num_seqs = outputs else: np_loss, np_acc, np_lr, np_num_seqs = outputs total_cost.extend(np_loss * np_num_seqs) total_acc.extend(np_acc * np_num_seqs) total_num_seqs.extend(np_num_seqs) if args.verbose: verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size( ) verbose += "learning rate: %f" % (np_lr[0] if warmup_steps > 0 else args.learning_rate) print(verbose) current_example, current_epoch = processor.get_train_progress( ) time_end = time.time() used_time = time_end - time_begin print( "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, " "ave acc: %f, speed: %f steps/s" % (current_epoch, current_example, num_train_examples, steps, np.sum(total_cost) / np.sum(total_num_seqs), np.sum(total_acc) / np.sum(total_num_seqs), args.skip_steps / used_time)) total_cost, total_acc, total_num_seqs = [], [], [] time_begin = time.time() if steps % args.save_steps == 0: save_path = os.path.join(args.checkpoints, "step_" + str(steps)) fluid.io.save_persistables(exe, save_path, train_program) if steps % args.validation_steps == 0: # evaluate dev set if args.do_val: test_pyreader.decorate_tensor_provider( processor.data_generator( batch_size=args.batch_size, phase='dev', epoch=1, dev_count=1, shuffle=False)) evaluate(exe, test_prog, test_pyreader, [loss.name, accuracy.name, num_seqs.name], "dev") # evaluate test set if args.do_test: test_pyreader.decorate_tensor_provider( processor.data_generator( batch_size=args.batch_size, phase='test', epoch=1, dev_count=1, shuffle=False)) evaluate(exe, test_prog, test_pyreader, [loss.name, accuracy.name, num_seqs.name], "test") except fluid.core.EOFException: save_path = os.path.join(args.checkpoints, "step_" + str(steps)) fluid.io.save_persistables(exe, save_path, train_program) train_pyreader.reset() break # final eval on dev set if args.do_val: test_pyreader.decorate_tensor_provider( 
processor.data_generator(batch_size=args.batch_size, phase='dev', epoch=1, dev_count=1, shuffle=False)) print("Final validation result:") evaluate(exe, test_prog, test_pyreader, [loss.name, accuracy.name, num_seqs.name], "dev") # final eval on test set if args.do_test: test_pyreader.decorate_tensor_provider( processor.data_generator(batch_size=args.batch_size, phase='test', epoch=1, dev_count=1, shuffle=False)) print("Final test result:") evaluate(exe, test_prog, test_pyreader, [loss.name, accuracy.name, num_seqs.name], "test")
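A small sketch of the step budget computed in main() above: the total number of optimizer steps follows from epochs, dataset size, batch size and device count, and a fixed fraction of those steps is reserved for warmup (example numbers only).

def train_step_budget(epoch, num_examples, batch_size, dev_count,
                      warmup_proportion):
    max_train_steps = epoch * num_examples // batch_size // dev_count
    warmup_steps = int(max_train_steps * warmup_proportion)
    return max_train_steps, warmup_steps

print(train_step_budget(3, 392702, 32, 8, 0.1))  # -> (4601, 460)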
def train(logger, args): """train a model""" logger.info('Load data_set and vocab...') with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: if six.PY2: vocab = pickle.load(fin) else: vocab = pickle.load(fin, encoding='bytes') logger.info('vocab size is {} and embed dim is {}'.format( vocab.size(), vocab.embed_dim)) brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, args.trainset, args.devset) logger.info('Converting text into ids...') brc_data.convert_to_ids(vocab) logger.info('Initialize the model...') if not args.use_gpu: place = fluid.CPUPlace() dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) else: place = fluid.CUDAPlace(0) dev_count = fluid.core.get_cuda_device_count() # build model main_program = fluid.Program() startup_prog = fluid.Program() if args.enable_ce: main_program.random_seed = args.random_seed startup_prog.random_seed = args.random_seed with fluid.program_guard(main_program, startup_prog): with fluid.unique_name.guard(): avg_cost, s_probs, e_probs, match, feed_order = rc_model.rc_model( args.hidden_size, vocab, args) # clone from default main program and use it as the validation program inference_program = main_program.clone(for_test=True) # build optimizer if args.optim == 'sgd': optimizer = fluid.optimizer.SGD( learning_rate=args.learning_rate) elif args.optim == 'adam': optimizer = fluid.optimizer.Adam( learning_rate=args.learning_rate) elif args.optim == 'rprop': optimizer = fluid.optimizer.RMSPropOptimizer( learning_rate=args.learning_rate) else: logger.error('Unsupported optimizer: {}'.format(args.optim)) exit(-1) if args.weight_decay > 0.0: obj_func = avg_cost + args.weight_decay * l2_loss(main_program) optimizer.minimize(obj_func) else: obj_func = avg_cost optimizer.minimize(obj_func) # initialize parameters place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() exe = Executor(place) if args.load_dir: logger.info('load from {}'.format(args.load_dir)) fluid.io.load_persistables(exe, args.load_dir, main_program=main_program) else: exe.run(startup_prog) embedding_para = fluid.global_scope().find_var( 'embedding_para').get_tensor() embedding_para.set(vocab.embeddings.astype(np.float32), place) # prepare data feed_list = [ main_program.global_block().var(var_name) for var_name in feed_order ] feeder = fluid.DataFeeder(feed_list, place) logger.info('Training the model...') parallel_executor = fluid.ParallelExecutor( main_program=main_program, use_cuda=bool(args.use_gpu), loss_name=avg_cost.name) print_para(main_program, parallel_executor, logger, args) for pass_id in range(1, args.pass_num + 1): pass_start_time = time.time() pad_id = vocab.get_id(vocab.pad_token) if args.enable_ce: train_reader = lambda: brc_data.gen_mini_batches( 'train', args.batch_size, pad_id, shuffle=False) else: train_reader = lambda: brc_data.gen_mini_batches( 'train', args.batch_size, pad_id, shuffle=True) train_reader = read_multiple(train_reader, dev_count) log_every_n_batch, n_batch_loss = args.log_interval, 0 total_num, total_loss = 0, 0 for batch_id, batch_list in enumerate(train_reader(), 1): feed_data = batch_reader(batch_list, args) fetch_outs = parallel_executor.run( feed=list(feeder.feed_parallel(feed_data, dev_count)), fetch_list=[obj_func.name], return_numpy=False) cost_train = np.array(fetch_outs[0]).mean() total_num += args.batch_size * dev_count n_batch_loss += cost_train total_loss += cost_train * args.batch_size * dev_count if args.enable_ce and batch_id >= 100: break if log_every_n_batch > 0 and batch_id % 
log_every_n_batch == 0: print_para(main_program, parallel_executor, logger, args) logger.info( 'Average loss from batch {} to {} is {}'.format( batch_id - log_every_n_batch + 1, batch_id, "%.10f" % (n_batch_loss / log_every_n_batch))) n_batch_loss = 0 if args.dev_interval > 0 and batch_id % args.dev_interval == 0: if brc_data.dev_set is not None: eval_loss, bleu_rouge = validation( inference_program, avg_cost, s_probs, e_probs, match, feed_order, place, dev_count, vocab, brc_data, logger, args) logger.info( 'Dev eval result: {}'.format(bleu_rouge)) pass_end_time = time.time() time_consumed = pass_end_time - pass_start_time logger.info('epoch: {0}, epoch_time_cost: {1:.2f}'.format( pass_id, time_consumed)) logger.info( 'Evaluating the model after epoch {}'.format(pass_id)) if brc_data.dev_set is not None: eval_loss, bleu_rouge = validation(inference_program, avg_cost, s_probs, e_probs, match, feed_order, place, dev_count, vocab, brc_data, logger, args) logger.info('Dev eval result: {}'.format(bleu_rouge)) else: logger.warning( 'No dev set is loaded for evaluation in the dataset!') logger.info('Average train loss for epoch {} is {}'.format( pass_id, "%.10f" % (1.0 * total_loss / total_num))) if pass_id % args.save_interval == 0: model_path = os.path.join(args.save_dir, str(pass_id)) if not os.path.isdir(model_path): os.makedirs(model_path) fluid.io.save_persistables(executor=exe, dirname=model_path, main_program=main_program) if args.enable_ce: # For CE print("kpis\ttrain_cost_card%d\t%f" % (dev_count, total_loss / total_num)) if brc_data.dev_set is not None: print("kpis\ttest_cost_card%d\t%f" % (dev_count, eval_loss)) print("kpis\ttrain_duration_card%d\t%f" % (dev_count, time_consumed))
def train(num_pass=300, use_cuda=False, mem_opt=False): dict_size = 100000 hash_size = 100000 print_iter = 100 eval_iter = 6000 batch_size = 1280 cpu_num = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) debug = False fluid.default_startup_program().random_seed = 1 fluid.default_main_program().random_seed = 1 np.random.seed = 1 # construct network loss, pos_sim, train_program, test_program = net(hash_size=hash_size, dict_size=dict_size) # optimizer = fluid.optimizer.Adam(learning_rate=1e-4) # optimizer = fluid.optimizer.SGD(learning_rate=1e-4) # optimizer.minimize(loss) # memory optimize if mem_opt: fluid.memory_optimize(fluid.default_main_program()) for var in train_program.blocks[0].vars: # if "GRAD" not in var and not train_program.blocks[0].var(var).is_data: if not train_program.blocks[0].var(var).is_data: train_program.blocks[0].var(var).persistable = True print(var, train_program.blocks[0].var(var).persistable, train_program.blocks[0].var(var).shape) # initialize place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) print('startup_program', fluid.default_startup_program()) print('train_program', train_program) # print('test_program', test_program) if debug: var_name_list = ( "cos_sim_1.tmp_0@GRAD", "fc_2.tmp_1@GRAD", "fc_2.tmp_0@GRAD", "softsign_2.tmp_0@GRAD", "reduce_sum_2.tmp_0@GRAD", "stack_2.tmp_0@GRAD", "sequence_pool_23.tmp_0@GRAD", "sequence_pool_23.tmp_0@GRAD", "embedding_23.tmp_0@GRAD", "PyramidHash_emb_0@GRAD@RENAME@0", "PyramidHash_emb_0@GRAD@RENAME@1", "PyramidHash_emb_0@GRAD@RENAME@2", "PyramidHash_emb_0@GRAD@RENAME@3", "PairwiseMarginLoss_0.tmp_0@GRAD", "cos_sim_1.tmp_0", "cos_sim_1.tmp_0@GRAD", "fc_2.tmp_1@GRAD", "fc_2.tmp_0@GRAD", "softsign_2.tmp_0@GRAD", "reduce_sum_2.tmp_0@GRAD", "stack_2.tmp_0@GRAD", "sequence_pool_23.tmp_0@GRAD", "embedding_23.tmp_0@GRAD", "PyramidHash_emb_0@GRAD", "FC_1@GRAD", "EmbeddingWithVSum_emb_0@GRAD", "fc_0.w_0@GRAD", "PairwiseMarginLoss_0.tmp_0", "PairwiseMarginLoss_0.tmp_1") # var_name_list = ("sequence_pool_23.tmp_0@GRAD", "embedding_23.tmp_0@GRAD", "PyramidHash_emb_0@GRAD@RENAME@0", "PyramidHash_emb_0@GRAD", "FC_1@GRAD", "EmbeddingWithVSum_emb_0@GRAD", "fc_0.w_0@GRAD", "PairwiseMarginLoss_0.tmp_0", "PairwiseMarginLoss_0.tmp_1") for name in var_name_list: train_program.blocks[0].var(name).persistable = True print('find var', name, train_program.blocks[0].var(name).persistable) # PE exec_strategy = fluid.ExecutionStrategy() exec_strategy.use_cuda = use_cuda exec_strategy.allow_op_delay = True exec_strategy.num_threads = 1 # exec_strategy.num_threads = int(os.environ.get('THREAD_NUM', 1)) * cpu_num - 1 # exec_strategy.num_threads = 25 exec_strategy.use_experimental_executor = True build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce # build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce # build_strategy.optimize_strategy = fluid.BuildStrategy.OptimizeStrategy.NoLock # pass_builder = build_strategy._create_passes_from_strategy() # pass_builder.insert_pass(0, "lock_free_optimize_pass") train_exe = fluid.ParallelExecutor(use_cuda=use_cuda, loss_name=None, main_program=train_program, build_strategy=build_strategy, exec_strategy=exec_strategy) test_exe = fluid.ParallelExecutor( use_cuda=use_cuda, main_program=test_program, share_vars_from=train_exe, ) # DataFeeder feed_var_names = [ 'query_basic', 'query_phrase', 'pos_title_basic', 'pos_title_phrase', 'neg_title_basic', 
'neg_title_phrase', 'label' ] feed_list = [ train_program.global_block().var(var_name) for var_name in feed_var_names ] feeder = fluid.DataFeeder(feed_list, place) # batch_train_reader = feeder.decorate_reader( # paddle.batch(reader.train_reader, batch_size=batch_size // cpu_num), # multi_devices=true) batch_train_reader = feeder.decorate_reader(paddle.batch( reader.train_reader, batch_size=1280), multi_devices=True) test_feed_var_names = [ 'query_basic', 'query_phrase', 'pos_title_basic', 'pos_title_phrase', 'neg_title_basic', 'neg_title_phrase' ] test_feed_list = [ train_program.global_block().var(var_name) for var_name in test_feed_var_names ] test_feeder = fluid.DataFeeder(test_feed_list, place) # train for epoch in six.moves.xrange(num_pass): count = 0 total_loss = .0 total_time = .0 read_data_start = time.time() for train_data in batch_train_reader(): read_data_end = time.time() # print('read data: ', read_data_end - read_data_start) if count == 1 and epoch >= 1: # if count % eval_iter == 0: print('start eval') t2 = time.time() # with open('./eval_log/train_mini_data_' + str(epoch) + '_' + str(count) + '_' + str(time.time()), 'w') as f: with open( './eval_res/z_' + paddle.version.commit + 'sgd_nolock_result_' + str(epoch) + '_' + str(time.time()), 'w') as f: test_batch_reader = paddle.batch( reader.test_reader, # batch_size=cpu_num * 128) batch_size=1280) for test_data in test_batch_reader(): qids = [] labels = [] data_list = [] for one_data in test_data: qids.append(one_data[0]) labels.append(int(one_data[-1][0])) data_list.append((one_data[1:-1])) predicts = test_exe.run( feed=test_feeder.feed(data_list), fetch_list=[pos_sim.name]) scores = np.array(predicts[0]) for qid, label, score in six.moves.zip( qids, labels, scores): f.write( str(qid) + '\t' + str(score[0]) + '\t' + str(label) + '\n') print('end eval', time.time() - t2) start = time.time() if epoch == 0 and count == 5: profiler.start_profiler("CPU") elif epoch == 0 and count == 10: profiler.stop_profiler("total", "/paddle/Pyramid_DNN/fluid/profile") t1 = time.time() cost = train_exe.run(feed=train_data, fetch_list=[]) total_time += time.time() - t1 # total_loss += np.array(cost[0]).mean() count += 1 if debug: for name in var_name_list: var = np.array( fluid.executor._fetch_var(name, return_numpy=False)) if name == "PyramidHash_emb_0@GRAD@RENAME@0": print('fetch var', name, var) print('check not zero', name, np.count_nonzero(var)) print('fetch var', name, var) print('check nan var', name, np.isnan(var).any()) print('check inf var', name, np.isinf(var).any()) if count % print_iter == 0: print('epoch: %d, batch_id: %d, avg_cost: %s, avg_time: %f' % (epoch, count, total_loss / print_iter, float(total_time) / print_iter)) import sys sys.stdout.flush() total_time = .0 total_loss = .0 read_data_start = time.time()
def fast_infer(args): """ Inference by beam search decoder based solely on Fluid operators. """ out_ids, out_scores, pyreader = fast_decoder( ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, ModelHyperParams.max_length + 1, ModelHyperParams.n_layer, ModelHyperParams.n_head, ModelHyperParams.d_key, ModelHyperParams.d_value, ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, ModelHyperParams.prepostprocess_dropout, ModelHyperParams.attention_dropout, ModelHyperParams.relu_dropout, ModelHyperParams.preprocess_cmd, ModelHyperParams.postprocess_cmd, ModelHyperParams.weight_sharing, InferTaskConfig.beam_size, InferTaskConfig.max_out_len, ModelHyperParams.eos_idx, use_py_reader=args.use_py_reader) # This is used here to set dropout to the test mode. infer_program = fluid.default_main_program().clone(for_test=True) if args.use_mem_opt: fluid.memory_optimize(infer_program) if InferTaskConfig.use_gpu: place = fluid.CUDAPlace(0) dev_count = fluid.core.get_cuda_device_count() else: place = fluid.CPUPlace() dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) fluid.io.load_vars(exe, InferTaskConfig.model_path, vars=[ var for var in infer_program.list_vars() if isinstance(var, fluid.framework.Parameter) ]) exec_strategy = fluid.ExecutionStrategy() # For faster executor exec_strategy.use_experimental_executor = True exec_strategy.num_threads = 1 build_strategy = fluid.BuildStrategy() infer_exe = fluid.ParallelExecutor(use_cuda=TrainTaskConfig.use_gpu, main_program=infer_program, build_strategy=build_strategy, exec_strategy=exec_strategy) # data reader settings for inference args.train_file_pattern = args.test_file_pattern args.use_token_batch = False args.sort_type = reader.SortType.NONE args.shuffle = False args.shuffle_batch = False test_data = prepare_data_generator( args, is_test=False, count=dev_count, pyreader=pyreader, py_reader_provider_wrapper=py_reader_provider_wrapper, place=place) if args.use_py_reader: pyreader.start() data_generator = None else: data_generator = test_data() trg_idx2word = reader.DataReader.load_dict(dict_path=args.trg_vocab_fpath, reverse=True) while True: try: feed_dict_list = prepare_feed_dict_list(data_generator, dev_count, place) if args.use_parallel_exe: seq_ids, seq_scores = infer_exe.run( fetch_list=[out_ids.name, out_scores.name], feed=feed_dict_list, return_numpy=False) else: seq_ids, seq_scores = exe.run( program=infer_program, fetch_list=[out_ids.name, out_scores.name], feed=feed_dict_list[0] if feed_dict_list is not None else None, return_numpy=False, use_program_cache=True) seq_ids_list, seq_scores_list = [ seq_ids ], [seq_scores] if isinstance( seq_ids, paddle.fluid.LoDTensor) else (seq_ids, seq_scores) for seq_ids, seq_scores in zip(seq_ids_list, seq_scores_list): # How to parse the results: # Suppose the lod of seq_ids is: # [[0, 3, 6], [0, 12, 24, 40, 54, 67, 82]] # then from lod[0]: # there are 2 source sentences, beam width is 3. 
# from lod[1]: # the first source sentence has 3 hyps; the lengths are 12, 12, 16 # the second source sentence has 3 hyps; the lengths are 14, 13, 15 hyps = [[] for i in range(len(seq_ids.lod()[0]) - 1)] scores = [[] for i in range(len(seq_scores.lod()[0]) - 1)] for i in range(len(seq_ids.lod()[0]) - 1): # for each source sentence start = seq_ids.lod()[0][i] end = seq_ids.lod()[0][i + 1] for j in range(end - start): # for each candidate sub_start = seq_ids.lod()[1][start + j] sub_end = seq_ids.lod()[1][start + j + 1] hyps[i].append(" ".join([ trg_idx2word[idx] for idx in post_process_seq( np.array(seq_ids)[sub_start:sub_end]) ])) scores[i].append(np.array(seq_scores)[sub_end - 1]) print(hyps[i][-1]) if len(hyps[i]) >= InferTaskConfig.n_best: break except (StopIteration, fluid.core.EOFException): # The data pass is over. if args.use_py_reader: pyreader.reset() break
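The LoD parsing above can be hard to follow inline, so here is a standalone sketch using the exact example from the comments: lod[0] splits the flat ids into source sentences (beam groups) and lod[1] splits each group into individual hypotheses.

def split_by_lod(flat_ids, lod):
    sentences = []
    for i in range(len(lod[0]) - 1):            # each source sentence
        start, end = lod[0][i], lod[0][i + 1]
        hyps = []
        for j in range(start, end):             # each candidate
            sub_start, sub_end = lod[1][j], lod[1][j + 1]
            hyps.append(flat_ids[sub_start:sub_end])
        sentences.append(hyps)
    return sentences

lod = [[0, 3, 6], [0, 12, 24, 40, 54, 67, 82]]
ids = list(range(82))
print([len(h) for h in split_by_lod(ids, lod)[0]])  # -> [12, 12, 16]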
def backward_value_helper(self, cond_func, use_cuda, use_parallel_exe): """ Helper function that compares calculated backward value is close to dy/dx """ main_program = Program() main_program.random_seed = 123 startup_program = Program() startup_program.random_seed = 123 with program_guard(main_program, startup_program): img = fluid.data(name='image', shape=[-1, 9], dtype='float32') img.stop_gradient = False label = fluid.data(name='label', shape=[-1, 1], dtype='int64') i = fluid.data(name="i", shape=[1], dtype='int32') loss = cond_func(i, img, label) append_backward(loss) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup_program) num_devices = 1 if use_parallel_exe: os.environ['CPU_NUM'] = str(2) exe = fluid.ParallelExecutor( use_cuda=use_cuda, main_program=main_program, loss_name=loss.name) num_devices = exe.device_count delta = 0.005 for feed_i in range(0, 10): feed_img = np.random.random(size=[1, 9]).astype(np.float32) feed_label = np.random.randint( low=0, high=10, size=[1, 1], dtype=np.int64) if use_parallel_exe: img_grad, loss_value = exe.run( feed={ 'i': np.full((num_devices), feed_i, np.int32), 'image': np.repeat( feed_img, num_devices, axis=0), 'label': np.repeat( feed_label, num_devices, axis=0) }, fetch_list=[img.grad_name, loss.name]) else: img_grad, loss_value = exe.run( main_program, feed={ 'i': np.full((1), feed_i, np.int32), 'image': feed_img, 'label': feed_label }, fetch_list=[img.grad_name, loss.name]) numerical_grad = np.zeros(shape=[num_devices, 9], dtype=np.float32) feed_img_delta = np.copy(feed_img) for j in range(9): feed_img_delta[0][j] = feed_img[0][j] + delta if use_parallel_exe: loss_delta = exe.run(feed={ 'i': np.full((num_devices), feed_i, np.int32), 'image': np.repeat( feed_img_delta, num_devices, axis=0), 'label': np.repeat( feed_label, num_devices, axis=0) }, fetch_list=[loss.name]) multi_device_grad = ( loss_delta[0] - loss_value[0]) / delta / num_devices for d in range(num_devices): numerical_grad[d][j] = multi_device_grad[d] else: loss_delta = exe.run(main_program, feed={ 'i': np.full((1), feed_i, np.int32), 'image': feed_img_delta, 'label': feed_label }, fetch_list=[loss.name]) numerical_grad[0][j] = ( loss_delta[0] - loss_value[0]) / delta feed_img_delta[0][j] = feed_img[0][j] self.assertTrue( np.isclose( img_grad, numerical_grad, atol=0.05, rtol=0.05).all())
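A simplified numpy sketch of the finite-difference check performed above: perturb one input element at a time by delta and compare the forward-difference slope with the analytic gradient (single-device version, toy function only).

import numpy as np

def numerical_gradient(f, x, delta=0.005):
    grad = np.zeros_like(x)
    for j in range(x.size):
        x_delta = x.copy()
        x_delta.flat[j] += delta
        grad.flat[j] = (f(x_delta) - f(x)) / delta
    return grad

f = lambda x: float(np.sum(x ** 2))
x = np.array([1.0, -2.0, 3.0])
print(numerical_gradient(f, x))   # close to the analytic gradient 2 * x
print(np.allclose(numerical_gradient(f, x), 2 * x, atol=0.05))  # True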
def train(args): bert_config = BertConfig(args.bert_config_path) bert_config.print_config() if not (args.do_train or args.do_predict): raise ValueError("For args `do_train` and `do_predict`, at " "least one of them must be True.") if args.use_cuda: place = fluid.CUDAPlace(0) dev_count = fluid.core.get_cuda_device_count() else: place = fluid.CPUPlace() dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) exe = fluid.Executor(place) processor = DataProcessor(vocab_path=args.vocab_path, do_lower_case=args.do_lower_case, max_seq_length=args.max_seq_len, in_tokens=args.in_tokens, doc_stride=args.doc_stride, max_query_length=args.max_query_length) startup_prog = fluid.Program() if args.random_seed is not None: startup_prog.random_seed = args.random_seed if args.do_train: train_data_generator = processor.data_generator( data_path=args.train_file, batch_size=args.batch_size, phase='train', shuffle=True, dev_count=dev_count, version_2_with_negative=args.version_2_with_negative, epoch=args.epoch) num_train_examples = processor.get_num_examples(phase='train') if args.in_tokens: max_train_steps = args.epoch * num_train_examples // ( args.batch_size // args.max_seq_len) // dev_count else: max_train_steps = args.epoch * num_train_examples // ( args.batch_size) // dev_count warmup_steps = int(max_train_steps * args.warmup_proportion) print("Device count: %d" % dev_count) print("Num train examples: %d" % num_train_examples) print("Max train steps: %d" % max_train_steps) print("Num warmup steps: %d" % warmup_steps) train_program = fluid.Program() with fluid.program_guard(train_program, startup_prog): with fluid.unique_name.guard(): train_pyreader, loss, num_seqs, input_mask = create_model( pyreader_name='train_reader', bert_config=bert_config, is_training=True) scheduled_lr = optimization(loss=loss, warmup_steps=warmup_steps, num_train_steps=max_train_steps, learning_rate=args.learning_rate, train_program=train_program, startup_prog=startup_prog, weight_decay=args.weight_decay, scheduler=args.lr_scheduler, use_fp16=args.use_fp16, loss_scaling=args.loss_scaling) fluid.memory_optimize(train_program, skip_opt_set=[loss.name, num_seqs.name]) if args.verbose: if args.in_tokens: lower_mem, upper_mem, unit = fluid.contrib.memory_usage( program=train_program, batch_size=args.batch_size // args.max_seq_len) else: lower_mem, upper_mem, unit = fluid.contrib.memory_usage( program=train_program, batch_size=args.batch_size) print("Theoretical memory usage in training: %.3f - %.3f %s" % (lower_mem, upper_mem, unit)) if args.do_predict: test_prog = fluid.Program() with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): start_logits, end_logits, num_seqs, input_mask = create_model( pyreader_name='test_reader', bert_config=bert_config, is_training=False) fluid.memory_optimize(test_prog, skip_opt_set=[ start_logits.name, end_logits.name, num_seqs.name, input_mask.name ]) test_prog = test_prog.clone(for_test=True) exe.run(startup_prog) if args.do_train: if args.init_checkpoint and args.init_pretraining_params: print( "WARNING: args 'init_checkpoint' and 'init_pretraining_params' " "both are set! 
Only arg 'init_checkpoint' is made valid.") if args.init_checkpoint: init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog, use_fp16=args.use_fp16) elif args.init_pretraining_params: init_pretraining_params(exe, args.init_pretraining_params, main_program=startup_prog, use_fp16=args.use_fp16) elif args.do_predict: if not args.init_checkpoint: raise ValueError("args 'init_checkpoint' should be set if" "only doing prediction!") init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog, use_fp16=args.use_fp16) if args.do_train: exec_strategy = fluid.ExecutionStrategy() exec_strategy.use_experimental_executor = args.use_fast_executor exec_strategy.num_threads = dev_count exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda, loss_name=loss.name, exec_strategy=exec_strategy, main_program=train_program) train_pyreader.decorate_tensor_provider(train_data_generator) train_pyreader.start() steps = 0 total_cost, total_num_seqs = [], [] time_begin = time.time() best_f1 = -1 while steps < max_train_steps: try: steps += 1 if steps % args.skip_steps == 0: if warmup_steps <= 0: fetch_list = [loss.name, num_seqs.name] else: fetch_list = [ loss.name, scheduled_lr.name, num_seqs.name, input_mask.name ] else: fetch_list = [] outputs = train_exe.run(fetch_list=fetch_list) if steps % args.skip_steps == 0: if warmup_steps <= 0: np_loss, np_num_seqs = outputs else: np_loss, np_lr, np_num_seqs = outputs total_cost.extend(np_loss * np_num_seqs) total_num_seqs.extend(np_num_seqs) if args.verbose: verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size( ) verbose += "learning rate: %f" % (np_lr[0] if warmup_steps > 0 else args.learning_rate) print(verbose) time_end = time.time() used_time = time_end - time_begin current_example, epoch = processor.get_train_progress() print("epoch: %d, progress: %d/%d, step: %d, loss: %f, " "speed: %f steps/s" % (epoch, current_example, num_train_examples, steps, np.sum(total_cost) / np.sum(total_num_seqs), args.skip_steps / used_time)) total_cost, total_num_seqs = [], [] time_begin = time.time() if (steps % args.save_steps == 0 or steps == max_train_steps ) and steps > int(max_train_steps / 3.0): if args.do_predict: test_pyreader.decorate_tensor_provider( processor.data_generator( data_path=args.predict_file, batch_size=args.batch_size, phase='predict', shuffle=False, dev_count=1, epoch=1)) adv_f1 = predict(exe, test_prog, test_pyreader, [ unique_ids.name, start_logits.name, end_logits.name, num_seqs.name ], processor) # print(adv_f1) # continue # if steps != max_train_steps: if adv_f1 > best_f1: best_f1 = adv_f1 save_path = os.path.join(args.checkpoints, "step_best") print("best adv model saved") # else: # save_path = os.path.join(args.checkpoints, # "step_last") fluid.io.save_persistables(exe, save_path, train_program) except fluid.core.EOFException: save_path = os.path.join(args.checkpoints, "step_" + str(steps) + "_final") fluid.io.save_persistables(exe, save_path, train_program) train_pyreader.reset() break if args.do_predict: test_data_generator = processor.data_generator( data_path=args.predict_file, batch_size=args.batch_size, phase='predict', shuffle=False, dev_count=1, epoch=1) predict(exe, test_prog, test_data_generator(), [ start_logits.name, end_logits.name, num_seqs.name, input_mask.name ], processor)
def test(args):
    import lib.mpii_reader as reader

    if args.dataset == 'coco':
        IMAGE_SIZE = [288, 384]
        FLIP_PAIRS = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12],
                      [13, 14], [15, 16]]
        args.kp_dim = 17
    elif args.dataset == 'mpii':
        IMAGE_SIZE = [384, 384]
        FLIP_PAIRS = [[0, 5], [1, 4], [2, 3], [10, 15], [11, 14], [12, 13]]
        args.kp_dim = 16
    else:
        raise ValueError('The dataset {} is not supported yet.'.format(
            args.dataset))

    print_arguments(args)

    # Image and target
    image = layers.data(name='image',
                        shape=[3, IMAGE_SIZE[1], IMAGE_SIZE[0]],
                        dtype='float32')
    file_id = layers.data(name='file_id', shape=[1, ], dtype='int')

    # Build model
    model = pose_resnet.ResNet(layers=50, kps_num=args.kp_dim, test_mode=True)

    # Output
    output = model.net(input=image, target=None, target_weight=None)

    if args.with_mem_opt:
        fluid.memory_optimize(fluid.default_main_program(),
                              skip_opt_set=[output.name])

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if args.checkpoint is not None:
        fluid.io.load_persistables(exe, args.checkpoint)

    # Dataloader
    test_reader = paddle.batch(reader.test(), batch_size=args.batch_size)
    feeder = fluid.DataFeeder(place=place, feed_list=[image, file_id])

    test_exe = fluid.ParallelExecutor(
        use_cuda=True if args.use_gpu else False,
        main_program=fluid.default_main_program().clone(for_test=True),
        loss_name=None)

    fetch_list = [image.name, output.name]

    for batch_id, data in enumerate(test_reader()):
        print_immediately("Processing batch #%d" % batch_id)
        num_images = len(data)

        file_ids = []
        for i in range(num_images):
            file_ids.append(data[i][1])

        input_image, out_heatmaps = test_exe.run(fetch_list=fetch_list,
                                                 feed=feeder.feed(data))

        if args.flip_test:
            # Flip all the images in a same batch
            data_fliped = []
            for i in range(num_images):
                data_fliped.append((data[i][0][:, :, ::-1], data[i][1]))

            # Inference again
            _, output_flipped = test_exe.run(fetch_list=fetch_list,
                                             feed=feeder.feed(data_fliped))

            # Flip back
            output_flipped = flip_back(output_flipped, FLIP_PAIRS)

            # Feature is not aligned, shift flipped heatmap for higher accuracy
            if args.shift_heatmap:
                output_flipped[:, :, :, 1:] = \
                    output_flipped.copy()[:, :, :, 0:-1]

            # Aggregate
            out_heatmaps = (out_heatmaps + output_flipped) * 0.5

        save_predict_results(input_image, out_heatmaps, file_ids,
                             fold_name='results')
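# A small numpy-only sketch of the flip-test aggregation performed above (this
# is not the repo's flip_back implementation; it assumes the flipped heatmaps
# have already been mapped back to the original joint order): the flipped
# prediction is optionally shifted one pixel to compensate for feature
# misalignment, then averaged with the original prediction.
import numpy as np

def aggregate_flip_test(heatmaps, heatmaps_flipped_back, shift_heatmap=True):
    # both inputs: [N, num_joints, H, W]
    if shift_heatmap:
        shifted = heatmaps_flipped_back.copy()
        shifted[:, :, :, 1:] = heatmaps_flipped_back[:, :, :, 0:-1]
        heatmaps_flipped_back = shifted
    return (heatmaps + heatmaps_flipped_back) * 0.5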
def train_async(args):
    # parameters from arguments
    logging.debug('enter train')
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    model_save_dir = args.model_save_dir

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    tmp_prog = fluid.Program()

    train_py_reader, train_cost, global_lr, train_feas, train_label = build_program(
        is_train=True,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args)
    test_feas, image, label = build_program(is_train=False,
                                            main_prog=tmp_prog,
                                            startup_prog=startup_prog,
                                            args=args)
    test_prog = tmp_prog.clone(for_test=True)

    train_fetch_list = [
        global_lr.name, train_cost.name, train_feas.name, train_label.name
    ]
    test_fetch_list = [test_feas.name]

    if args.with_mem_opt:
        fluid.memory_optimize(train_prog, skip_opt_set=set(train_fetch_list))

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)
    logging.debug('after run startup program')

    if checkpoint is not None:
        fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)

    if pretrained_model:
        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(exe,
                           pretrained_model,
                           main_program=train_prog,
                           predicate=if_exist)

    devicenum = get_gpu_num()
    assert (args.train_batch_size % devicenum) == 0
    # integer division keeps the per-device batch size an int under Python 3
    train_batch_size = args.train_batch_size // devicenum
    test_batch_size = args.test_batch_size

    train_reader = paddle.batch(reader.train(args),
                                batch_size=train_batch_size,
                                drop_last=True)
    test_reader = paddle.batch(reader.test(args),
                               batch_size=test_batch_size,
                               drop_last=False)
    test_feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
    train_py_reader.decorate_paddle_reader(train_reader)

    train_exe = fluid.ParallelExecutor(main_program=train_prog,
                                       use_cuda=args.use_gpu,
                                       loss_name=train_cost.name)

    totalruntime = 0
    train_py_reader.start()
    iter_no = 0
    train_info = [0, 0, 0]
    while iter_no <= args.total_iter_num:
        t1 = time.time()
        lr, loss, feas, label = train_exe.run(fetch_list=train_fetch_list)
        t2 = time.time()
        period = t2 - t1
        lr = np.mean(np.array(lr))
        train_info[0] += np.mean(np.array(loss))
        train_info[1] += recall_topk(feas, label, k=1)
        train_info[2] += 1

        if iter_no % args.display_iter_step == 0:
            avgruntime = totalruntime / args.display_iter_step
            avg_loss = train_info[0] / train_info[2]
            avg_recall = train_info[1] / train_info[2]
            print("[%s] trainbatch %d, lr %.6f, loss %.6f, "
                  "recall %.4f, time %2.2f sec" %
                  (fmt_time(), iter_no, lr, avg_loss, avg_recall, avgruntime))
            sys.stdout.flush()
            totalruntime = 0
        if iter_no % 1000 == 0:
            train_info = [0, 0, 0]

        totalruntime += period

        if iter_no % args.test_iter_step == 0 and iter_no != 0:
            f, l = [], []
            for batch_id, data in enumerate(test_reader()):
                t1 = time.time()
                [feas] = exe.run(test_prog,
                                 fetch_list=test_fetch_list,
                                 feed=test_feeder.feed(data))
                label = np.asarray([x[1] for x in data])
                f.append(feas)
                l.append(label)

                t2 = time.time()
                period = t2 - t1
                if batch_id % 20 == 0:
                    print("[%s] testbatch %d, time %2.2f sec" %
                          (fmt_time(), batch_id, period))

            f = np.vstack(f)
            l = np.hstack(l)
            recall = recall_topk(f, l, k=1)
            print("[%s] test_img_num %d, trainbatch %d, test_recall %.5f" %
                  (fmt_time(), len(f), iter_no, recall))
            sys.stdout.flush()

        if iter_no % args.save_iter_step == 0 and iter_no != 0:
            model_path = os.path.join(model_save_dir + '/' + model_name,
                                      str(iter_no))
            if not os.path.isdir(model_path):
                os.makedirs(model_path)
            fluid.io.save_persistables(exe, model_path, main_program=train_prog)

        iter_no += 1
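# recall_topk above is a repo helper whose implementation is not shown here.
# As a rough, hypothetical numpy sketch of the Recall@K idea it evaluates
# (leave-one-out nearest-neighbour retrieval over L2-normalized features; the
# actual helper may batch the similarity computation differently):
import numpy as np

def recall_at_k(feas, labels, k=1):
    labels = np.asarray(labels)
    feas = feas / np.linalg.norm(feas, axis=1, keepdims=True)
    sim = feas.dot(feas.T)
    np.fill_diagonal(sim, -np.inf)          # exclude the query itself
    topk = np.argsort(-sim, axis=1)[:, :k]  # indices of the k nearest neighbours
    hits = [(labels[nbrs] == labels[i]).any() for i, nbrs in enumerate(topk)]
    return float(np.mean(hits))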
def main(args):
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        dev_list = fluid.cuda_places()
        place = dev_list[0]
        dev_count = len(dev_list)
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    reader = task_reader.ClassifyReader(vocab_path=args.vocab_path,
                                        label_map_config=args.label_map_config,
                                        max_seq_len=args.max_seq_len,
                                        do_lower_case=args.do_lower_case,
                                        in_tokens=args.in_tokens,
                                        random_seed=args.random_seed,
                                        tokenizer=args.tokenizer,
                                        is_classify=args.is_classify,
                                        is_regression=args.is_regression,
                                        for_cn=args.for_cn,
                                        task_id=args.task_id)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    if args.do_test:
        assert args.test_save is not None

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            dev_count=dev_count,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        if args.in_tokens:
            if args.batch_size < args.max_seq_len:
                raise ValueError(
                    'if in_tokens=True, batch_size should be greater than '
                    'max_seq_len, got batch_size:%d seqlen:%d' %
                    (args.batch_size, args.max_seq_len))

            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        log.info("Device count: %d" % dev_count)
        log.info("Num train examples: %d" % num_train_examples)
        log.info("Max train steps: %d" % max_train_steps)
        log.info("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        if args.random_seed is not None and args.enable_ce:
            train_program.random_seed = args.random_seed

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config,
                    is_classify=args.is_classify,
                    is_regression=args.is_regression)
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            log.info("Theoretical memory usage in training: %.3f - %.3f %s" %
                     (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config,
                    is_classify=args.is_classify,
                    is_regression=args.is_regression)

        test_prog = test_prog.clone(for_test=True)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    if args.is_distributed:
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)

        log.info("worker_endpoints:{} trainers_num:{} current_endpoint:{} "
                 "trainer_id:{}".format(worker_endpoints, trainers_num,
                                        current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(trainer_id,
                    trainers=worker_endpoints_env,
                    current_endpoint=current_endpoint,
                    program=train_program if args.do_train else test_prog,
                    startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            log.warning(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=graph_vars["loss"].name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program,
                                           num_trainers=nccl2_num_trainers,
                                           trainer_id=nccl2_trainer_id)

        train_pyreader.set_batch_generator(train_data_generator)
    else:
        train_exe = None

    test_exe = exe
    if args.do_val or args.do_test:
        if args.use_multi_gpu_test:
            test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                              main_program=test_prog,
                                              share_vars_from=train_exe)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr

        ce_info = []
        time_begin = time.time()
        last_epoch = 0
        current_epoch = 0
        while True:
            try:
                steps += 1
                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[])
                else:
                    outputs = evaluate(train_exe,
                                       train_program,
                                       train_pyreader,
                                       graph_vars,
                                       "train",
                                       metric=args.metric,
                                       is_classify=args.is_classify,
                                       is_regression=args.is_regression)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size()
                        verbose += "learning rate: %f" % (
                            outputs["learning_rate"]
                            if warmup_steps > 0 else args.learning_rate)
                        log.info(verbose)

                    current_example, current_epoch = reader.get_train_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin

                    if args.is_classify:
                        log.info(
                            "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                            "ave acc: %f, speed: %f steps/s" %
                            (current_epoch, current_example, num_train_examples,
                             steps, outputs["loss"], outputs["accuracy"],
                             args.skip_steps / used_time))
                        ce_info.append(
                            [outputs["loss"], outputs["accuracy"], used_time])
                    if args.is_regression:
                        log.info(
                            "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                            "speed: %f steps/s" %
                            (current_epoch, current_example, num_train_examples,
                             steps, outputs["loss"],
                             args.skip_steps / used_time))
                    time_begin = time.time()

                if nccl2_trainer_id == 0:
                    if steps % args.save_steps == 0:
                        save_path = os.path.join(args.checkpoints,
                                                 "step_" + str(steps))
                        fluid.io.save_persistables(exe, save_path, train_program)

                    if steps % args.validation_steps == 0 or last_epoch != current_epoch:
                        # evaluate dev set
                        if args.do_val:
                            evaluate_wrapper(args, reader, exe, test_prog,
                                             test_pyreader, graph_vars,
                                             current_epoch, steps)
                        if args.do_test:
                            predict_wrapper(args, reader, exe, test_prog,
                                            test_pyreader, graph_vars,
                                            current_epoch, steps)

                if last_epoch != current_epoch:
                    last_epoch = current_epoch

            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    if args.enable_ce:
        card_num = get_cards()
        ce_loss = 0
        ce_acc = 0
        ce_time = 0
        try:
            ce_loss = ce_info[-2][0]
            ce_acc = ce_info[-2][1]
            ce_time = ce_info[-2][2]
        except:
            log.info("ce info error")
        log.info("kpis\ttrain_duration_card%s\t%s" % (card_num, ce_time))
        log.info("kpis\ttrain_loss_card%s\t%f" % (card_num, ce_loss))
        log.info("kpis\ttrain_acc_card%s\t%f" % (card_num, ce_acc))

    # final eval on dev set
    if args.do_val:
        evaluate_wrapper(args, reader, exe, test_prog, test_pyreader,
                         graph_vars, current_epoch, steps)

    # final eval on test set
    if args.do_test:
        predict_wrapper(args, reader, exe, test_prog, test_pyreader,
                        graph_vars, current_epoch, steps)

    # final eval on diagnostic, hack for glue-ax
    if args.diagnostic:
        test_pyreader.set_batch_generator(
            reader.data_generator(args.diagnostic,
                                  batch_size=args.batch_size,
                                  epoch=1,
                                  dev_count=1,
                                  shuffle=False))

        log.info("Final diagnostic")
        qids, preds, probs = predict(test_exe,
                                     test_prog,
                                     test_pyreader,
                                     graph_vars,
                                     is_classify=args.is_classify,
                                     is_regression=args.is_regression)
        assert len(qids) == len(preds), '{} v.s. {}'.format(
            len(qids), len(preds))
        with open(args.diagnostic_save, 'w') as f:
            for id, s, p in zip(qids, preds, probs):
                f.write('{}\t{}\t{}\n'.format(id, s, p))
        log.info("Done final diagnostic, saving to {}".format(
            args.diagnostic_save))
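# In both this function and the BERT one above, optimization(...) is configured
# with scheduler=args.lr_scheduler together with warmup_steps and
# max_train_steps. As a hedged illustration only (an assumption about the
# commonly used "linear warmup then linear decay" option, not a copy of the
# repo's optimization module), the per-step learning rate looks roughly like:
def linear_warmup_decay_lr(step, base_lr, warmup_steps, max_train_steps):
    if warmup_steps > 0 and step < warmup_steps:
        # ramp linearly from 0 to base_lr during warmup
        return base_lr * step / warmup_steps
    # then decay linearly towards 0 at max_train_steps
    remaining = max(max_train_steps - step, 0)
    return base_lr * remaining / max(max_train_steps - warmup_steps, 1)

# e.g. base_lr=5e-5, warmup_steps=281, max_train_steps=2812:
# step 100 -> ~1.78e-5, step 281 -> 5e-5, step 2812 -> 0.0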
def train_loop(args,
               logger,
               vocab,
               train_progs,
               infer_progs,
               optimizer,
               nccl2_num_trainers=1,
               nccl2_trainer_id=0,
               worker_endpoints=None):
    train_prog, train_startup_prog, train_model = train_progs
    infer_prog, infer_startup_prog, infer_model = infer_progs

    # prepare device
    place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
    exe = Executor(place)
    if not args.use_gpu:
        place = fluid.CPUPlace()
        import multiprocessing
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    else:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()

    if args.load_dir:
        logger.info('load pretrained checkpoints from {}'.format(args.load_dir))
        # Todo: why not need to run train_startup_prog before load_persistables
        fluid.io.load_persistables(exe, args.load_dir, main_program=train_prog)
    elif args.load_pretraning_params:
        logger.info('load pretrained params from {}'.format(
            args.load_pretraning_params))
        exe.run(train_startup_prog)
        init_pretraining_params(exe,
                                args.load_pretraning_params,
                                main_program=train_prog)
    else:
        exe.run(train_startup_prog)

    # prepare data
    feed_list = [
        train_prog.global_block().var(var_name)
        for var_name in train_model.feed_order
    ]
    feeder = fluid.DataFeeder(feed_list, place)

    logger.info('Training the model...')
    exe_strategy = fluid.parallel_executor.ExecutionStrategy()
    if args.para_print:
        exe_strategy.num_threads = 1
        debug_init(train_prog, train_model.grad_vars,
                   train_model.grad_vars_name)
        with open("program.desc", 'w') as f:
            print(str(train_prog), file=f)
    parallel_executor = fluid.ParallelExecutor(
        loss_name=train_model.loss.name,
        main_program=train_prog,
        use_cuda=bool(args.use_gpu),
        exec_strategy=exe_strategy,
        num_trainers=nccl2_num_trainers,
        trainer_id=nccl2_trainer_id)
    load_params(train_prog, parallel_executor, place, logger, args)
    print_para(train_prog, parallel_executor, logger, optimizer, args)

    logger.info("begin to load data")
    train_data = data.BidirectionalLMDataset(args.train_path,
                                             vocab,
                                             test=(not args.shuffle),
                                             shuffle_on_load=args.shuffle)
    logger.info("finished load vocab")

    # get train epoch size
    log_interval = args.log_interval
    total_time = 0.0
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    custom_samples_array = np.zeros(
        (batch_size, args.num_steps, args.n_negative_samples_batch + 1),
        dtype='int64')
    custom_probabilities_array = np.zeros(
        (batch_size, args.num_steps, args.n_negative_samples_batch + 1),
        dtype='float32')
    for i in range(batch_size):
        for j in range(0, args.num_steps):
            for k in range(0, args.n_negative_samples_batch + 1):
                custom_samples_array[i][j][k] = k
                custom_probabilities_array[i][j][k] = 1.0

    for epoch_id in range(args.max_epoch):
        start_time = time.time()
        logger.info("epoch id {}".format(epoch_id))
        train_data_iter = lambda: train_data.iter_batches(
            batch_size * dev_count, args.num_steps)
        train_reader = read_multiple(train_data_iter, batch_size, dev_count)

        total_num = 0
        n_batch_loss = 0.0
        n_batch_cnt = 0
        last_hidden_values = np.zeros(
            (dev_count, args.num_layers * 2 * batch_size * args.embed_size),
            dtype='float32')
        last_cell_values = np.zeros(
            (dev_count, args.num_layers * 2 * batch_size * hidden_size),
            dtype='float32')

        begin_time = time.time()
        for batch_id, batch_list in enumerate(train_reader(), 1):
            feed_data = batch_reader(batch_list, args)
            feed = list(feeder.feed_parallel(feed_data, dev_count))
            for i in range(dev_count):
                init_hidden_tensor = fluid.core.LoDTensor()
                if args.use_gpu:
                    placex = fluid.CUDAPlace(i)
                else:
                    placex = fluid.CPUPlace()
                init_hidden_tensor.set(last_hidden_values[i], placex)
                init_cell_tensor = fluid.core.LoDTensor()
                init_cell_tensor.set(last_cell_values[i], placex)

                feed[i]['init_hiddens'] = init_hidden_tensor
                feed[i]['init_cells'] = init_cell_tensor

            fetch_outs = parallel_executor.run(
                feed=feed,
                fetch_list=[
                    train_model.loss.name, train_model.last_hidden.name,
                    train_model.last_cell.name
                ],
                # + [x[0] for x in names] + [x[0] for x in grad_names],
                return_numpy=False)
            cost_train = np.array(fetch_outs[0]).mean()
            #import pdb; pdb.set_trace()
            last_hidden_values = np.array(fetch_outs[1])
            last_hidden_values = last_hidden_values.reshape(
                (dev_count,
                 args.num_layers * 2 * batch_size * args.embed_size))
            last_cell_values = np.array(fetch_outs[2])
            last_cell_values = last_cell_values.reshape(
                (dev_count,
                 args.num_layers * 2 * batch_size * args.hidden_size))
            #vars = fetch_outs[2:2+len(names)]
            #grad_vars = fetch_outs[2+len(names):]

            total_num += args.batch_size * dev_count
            n_batch_loss += np.array(fetch_outs[0]).sum()
            #logger.info("n_batch_loss from {} to {} is {}, {} ".format(
            #    batch_id - log_interval, batch_id, n_batch_loss,
            #    np.array(fetch_outs[0]).sum()))
            n_batch_cnt += len(np.array(fetch_outs[0]))

            if batch_id > 0 and batch_id % log_interval == 0:
                #vars_print(logger, args, vars=(vars, names), grad_vars=(grad_vars, grad_names))
                print_para(train_prog, parallel_executor, logger, optimizer,
                           args)
                smoothed_ppl = np.exp(n_batch_loss / n_batch_cnt)
                ppl = np.exp(
                    np.array(fetch_outs[0]).sum() /
                    len(np.array(fetch_outs[0])))
                used_time = time.time() - begin_time
                speed = log_interval / used_time
                logger.info(
                    "[train] epoch:{}, step:{}, loss:{:.3f}, ppl:{:.3f}, "
                    "smoothed_ppl:{:.3f}, speed:{:.3f}".format(
                        epoch_id, batch_id, n_batch_loss / n_batch_cnt, ppl,
                        smoothed_ppl, speed))
                n_batch_loss = 0.0
                n_batch_cnt = 0
                begin_time = time.time()
            if batch_id > 0 and batch_id % args.dev_interval == 0:
                valid_ppl = eval(vocab, infer_progs, dev_count, logger, args)
                logger.info("valid ppl {}".format(valid_ppl))
            if batch_id > 0 and batch_id % args.save_interval == 0:
                model_path = os.path.join(args.para_save_dir,
                                          str(batch_id + epoch_id))
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                fluid.io.save_persistables(executor=exe,
                                           dirname=model_path,
                                           main_program=train_prog)
            if args.detail and batch_id > 100:
                exit()

        end_time = time.time()
        total_time += end_time - start_time
        logger.info("train ppl {}".format(ppl))

        if epoch_id == args.max_epoch - 1 and args.enable_ce:
            logger.info("lstm_language_model_duration\t%s" %
                        (total_time / args.max_epoch))
            # NOTE: ppl is a scalar here; the flattened source indexed it as ppl[0]
            logger.info("lstm_language_model_loss\t%s" % ppl)

        model_path = os.path.join(args.para_save_dir, str(epoch_id))
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        fluid.io.save_persistables(executor=exe,
                                   dirname=model_path,
                                   main_program=train_prog)
        valid_ppl = eval(vocab, infer_progs, dev_count, logger, args)
        logger.info("valid ppl {}".format(valid_ppl))

    # NOTE: call aligned with the validation calls above, which do not pass place
    test_ppl = eval(vocab, infer_progs, dev_count, logger, args)
    logger.info("test ppl {}".format(test_ppl))
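# A small numpy sketch of the perplexity bookkeeping in the loop above: per-token
# losses are accumulated between log points, and perplexity is the exponential of
# the mean loss (the numbers below are made up, purely for illustration).
import numpy as np

def perplexity(loss_values):
    losses = np.asarray(loss_values, dtype='float64')
    return float(np.exp(losses.sum() / len(losses)))

# e.g. perplexity([5.2, 5.0, 4.9]) ~= 153.4; the "smoothed_ppl" logged above is
# the same computation taken over every loss seen since the previous log_interval,
# while "ppl" uses only the most recent batch of per-device losses.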