def run_trainer(self, args):
    self.lr = args.lr
    if args.nccl2_reduce_layer_local_run:
        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
            self.get_model(batch_size=args.batch_size, single_device=True)
    elif args.use_dgc:
        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
            self.get_model(batch_size=args.batch_size, use_dgc=args.use_dgc)
    else:
        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
            self.get_model(batch_size=args.batch_size)

    if args.update_method == "pserver":
        print_to_err(
            type(self).__name__,
            "begin to run transpile on trainer with pserver mode")
        t = self.get_transpiler(
            trainer_id=args.trainer_id,
            main_program=fluid.default_main_program(),
            pserver_endpoints=args.endpoints,
            trainers=args.trainers,
            sync_mode=args.sync_mode,
            dc_asgd=args.dc_asgd,
            hogwild_mode=args.hogwild)
        trainer_prog = t.get_trainer_program()
        print_to_err(
            type(self).__name__,
            "get trainer program done with pserver mode.")
    elif args.update_method == "nccl2" or args.update_method == "nccl2_reduce_layer":
        # transpile for nccl2
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        config.nccl_comm_num = args.nccl_comm_num
        if args.use_hallreduce:
            config.use_hierarchical_allreduce = True
            config.hierarchical_allreduce_inter_nranks = args.hallreduce_inter_nranks
        print_to_err(
            type(self).__name__,
            "begin to run transpile on trainer with nccl2 mode")
        nccl2_t = fluid.DistributeTranspiler(config=config)
        nccl2_t.transpile(
            args.trainer_id,
            program=fluid.default_main_program(),
            startup_program=fluid.default_startup_program(),
            trainers=args.endpoints,
            current_endpoint=args.current_endpoint)
        print_to_err(
            type(self).__name__,
            "get trainer program done with nccl2 mode.")
        trainer_prog = fluid.default_main_program()
    else:
        print_to_err(
            type(self).__name__,
            "do nothing about main program, just use it")
        trainer_prog = fluid.default_main_program()
        print_to_err(type(self).__name__, "use main program done.")

    if args.use_cuda:
        device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
        place = fluid.CUDAPlace(device_id)
    else:
        place = fluid.CPUPlace()

    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    print_to_err(type(self).__name__, "run worker startup program done.")

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = 1

    build_stra = fluid.BuildStrategy()
    # FIXME force disable enable_inplace and memory_optimize
    build_stra.enable_inplace = False
    build_stra.memory_optimize = False

    if args.hogwild:
        build_stra.async_mode = True

    if args.enable_backward_deps:
        build_stra.enable_backward_optimizer_op_deps = True

    if args.use_reduce:
        build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
    else:
        build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce

    pass_builder = None
    if args.batch_merge_repeat > 1:
        pass_builder = build_stra._finalize_strategy_and_create_passes()
        mypass = pass_builder.insert_pass(0, "multi_batch_merge_pass")
        mypass.set("num_repeats", args.batch_merge_repeat)

    if args.update_method == "nccl2" or args.update_method == "nccl2_reduce_layer":
        build_stra.num_trainers = len(args.endpoints.split(","))
        build_stra.trainer_id = args.trainer_id
    else:
        # local or pserver mode: a single trainer from the build strategy's
        # point of view
        build_stra.num_trainers = 1
        build_stra.trainer_id = 0

    print_to_err(type(self).__name__, "begin to compile with data parallel")
    binary = compiler.CompiledProgram(trainer_prog).with_data_parallel(
        loss_name=avg_cost.name,
        build_strategy=build_stra,
        exec_strategy=exec_strategy)
    print_to_err(type(self).__name__, "program compiled with data parallel")

    feed_var_list = [
        var for var in trainer_prog.global_block().vars.values()
        if var.is_data
    ]

    feeder = fluid.DataFeeder(feed_var_list, place)
    reader_generator = train_reader()

    def get_data():
        origin_batch = next(reader_generator)
        if args.update_method != "local" and args.use_reader_alloc:
            new_batch = []
            # shard the batch round-robin: each trainer keeps the samples
            # whose offset matches its trainer_id
            for offset, item in enumerate(origin_batch):
                if offset % 2 == args.trainer_id:
                    new_batch.append(item)
            return new_batch
        else:
            return origin_batch

    print_to_err(type(self).__name__, "begin to train on trainer")
    out_losses = []
    for i in six.moves.xrange(RUN_STEP):
        loss, = exe.run(binary,
                        fetch_list=[avg_cost.name],
                        feed=feeder.feed(get_data()))
        out_losses.append(loss[0])
        print_to_err(type(self).__name__, "run step %d finished" % i)
    print_to_err(type(self).__name__, "trainer run finished")

    print_to_out(out_losses)
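
# --- Illustrative sketch (not part of the original test harness) ---
# get_data() above shards a batch round-robin across trainers by sample
# offset. A standalone version of that idea, with hypothetical names
# (shard_batch, num_trainers):
def shard_batch(batch, trainer_id, num_trainers=2):
    """Keep every num_trainers-th sample, starting at offset trainer_id."""
    return [item for offset, item in enumerate(batch)
            if offset % num_trainers == trainer_id]

# e.g. shard_batch(list(range(6)), trainer_id=1) -> [1, 3, 5]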
def main():
    env = os.environ
    FLAGS.local_rank = int(env.get('PADDLE_TRAINER_ID', 0))
    FLAGS.world_size = int(env.get('PADDLE_TRAINERS_NUM', 1))
    FLAGS.device_id = int(env['FLAGS_selected_gpus'])
    FLAGS.whole_batch_size = FLAGS.world_size * FLAGS.batch_size

    pipe = HybridTrainPipe()
    pipe.build()
    sample_per_shard = len(pipe) // FLAGS.world_size
    train_loader = DALIClassificationIterator(
        pipe, reader_name="Reader", fill_last_batch=False)

    if FLAGS.local_rank == 0:
        pipe = HybridValPipe()
        pipe.build()
        val_loader = DALIClassificationIterator(
            pipe, reader_name="Reader", fill_last_batch=False)

    place = fluid.CUDAPlace(FLAGS.device_id)
    exe = fluid.Executor(place)

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    eval_prog = fluid.Program()

    step_per_epoch = int(math.ceil(sample_per_shard / FLAGS.batch_size))
    milestones = [step_per_epoch * e for e in (30, 60, 80)]
    values = [FLAGS.lr * (0.1 ** i) for i in range(len(milestones) + 1)]

    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            train_fetch_list = build()
            learning_rate = fluid.layers.piecewise_decay(
                boundaries=milestones, values=values)
            learning_rate = fluid.layers.linear_lr_warmup(
                learning_rate=learning_rate,
                warmup_steps=5 * step_per_epoch,
                start_lr=0.,
                end_lr=FLAGS.lr)
            decay = FLAGS.weight_decay
            optimizer = fluid.optimizer.Momentum(
                learning_rate=learning_rate,
                momentum=FLAGS.momentum,
                regularization=fluid.regularizer.L2Decay(decay))
            avg_loss = train_fetch_list[0]
            optimizer.minimize(avg_loss)

    with fluid.program_guard(eval_prog, startup_prog):
        with fluid.unique_name.guard():
            eval_fetch_list = build()
    eval_prog = eval_prog.clone(True)

    build_strategy = fluid.BuildStrategy()
    build_strategy.trainer_id = FLAGS.local_rank
    build_strategy.num_trainers = FLAGS.world_size
    config = fluid.DistributeTranspilerConfig()
    config.mode = "nccl2"
    t = fluid.DistributeTranspiler(config=config)
    t.transpile(
        FLAGS.local_rank,
        trainers=os.environ.get('PADDLE_TRAINER_ENDPOINTS'),
        current_endpoint=os.environ.get('PADDLE_CURRENT_ENDPOINT'),
        startup_program=startup_prog,
        program=train_prog)
    exec_strategy = fluid.ExecutionStrategy()

    exe.run(startup_prog)
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=avg_loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)
    compiled_eval_prog = fluid.compiler.CompiledProgram(eval_prog)

    total_time = AverageMeter()
    for epoch in range(FLAGS.epochs):
        if FLAGS.local_rank == 0:
            print("==== train epoch {:02d} ====".format(epoch + 1))
        avg_time, _, _ = run(
            exe, compiled_train_prog, train_fetch_list, train_loader, epoch)
        total_time.update(avg_time)
        # reset DALI iterators
        train_loader.reset()

        # validation, checkpointing and the final report only run on rank 0,
        # where val_loader exists
        if FLAGS.local_rank == 0:
            print("==== validation epoch {:02d} ====".format(epoch + 1))
            _, prec1, prec5 = run(
                exe, compiled_eval_prog, eval_fetch_list, val_loader, epoch)
            val_loader.reset()

            ckpt_path = os.path.join('checkpoint', "{:02d}".format(epoch + 1))
            if os.path.isdir(ckpt_path):
                shutil.rmtree(ckpt_path)
            print('Save model to {}.'.format(ckpt_path))
            fluid.io.save_persistables(exe, ckpt_path, train_prog)

            time_per_sample = FLAGS.whole_batch_size / total_time.avg
            if epoch == FLAGS.epochs - 1:
                print('##Top-1 {0}\n'
                      '##Top-5 {1}\n'
                      '##Perf {2}'.format(
                          prec1 * 100, prec5 * 100, time_per_sample))
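
# --- Illustrative sketch (assumed helper, not from the original file) ---
# The schedule above decays the LR 10x at epochs 30/60/80 (converted to
# steps) after a 5-epoch linear warmup. A plain-Python rendering of the
# piecewise part, to make the boundaries/values pairing explicit:
def piecewise_lr(step, step_per_epoch, base_lr):
    boundaries = [step_per_epoch * e for e in (30, 60, 80)]
    values = [base_lr * (0.1 ** i) for i in range(len(boundaries) + 1)]
    for boundary, value in zip(boundaries, values):
        if step < boundary:
            return value
    return values[-1]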
def main(args): """main""" model_config = UNIMOConfig(args.unimo_config_path) model_config.print_config() gpu_id = 0 gpus = fluid.core.get_cuda_device_count() if args.is_distributed and os.getenv("FLAGS_selected_gpus") is not None: gpu_list = os.getenv("FLAGS_selected_gpus").split(",") gpus = len(gpu_list) gpu_id = int(gpu_list[0]) if args.use_cuda: place = fluid.CUDAPlace(gpu_id) dev_count = gpus else: place = fluid.CPUPlace() dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) tokenizer = GptBpeTokenizer(vocab_file=args.unimo_vocab_file, encoder_json_file=args.encoder_json_file, vocab_bpe_file=args.vocab_bpe_file, do_lower_case=args.do_lower_case) if not (args.do_train or args.do_val or args.do_test or args.do_test_hard): raise ValueError( "For args `do_train`, `do_val`, `do_test`, `do_test_hard`, at " "least one of them must be True.") startup_prog = fluid.Program() if args.random_seed is not None: startup_prog.random_seed = args.random_seed trainers_num = int(os.getenv("PADDLE_TRAINERS_NUM", "1")) if args.do_train: train_data_reader = ClassifyReader(args.train_filelist, args.max_seq_len, tokenizer) train_data_generator = train_data_reader.data_generator( batch_size=args.batch_size, epoch=args.epoch, phase="train") if args.num_train_examples: num_train_examples = args.num_train_examples else: num_train_examples = train_data_reader.get_num_examples() step_num_per_epoch = num_train_examples // args.batch_size // trainers_num max_train_steps = args.epoch * step_num_per_epoch warmup_steps = int(max_train_steps * args.warmup_proportion) print("Device count: %d, gpu_id: %d" % (dev_count, gpu_id)) print("Num train examples: %d" % num_train_examples) print("Max train steps: %d" % max_train_steps) print("Num warmup steps: %d" % warmup_steps) train_program = fluid.Program() with fluid.program_guard(train_program, startup_prog): with fluid.unique_name.guard(): train_pyreader, graph_vars = create_model( args, config=model_config, pyreader_name="train_reader", is_train=True) scheduled_lr, loss_scaling = optimization( loss=graph_vars["loss"], warmup_steps=warmup_steps, num_train_steps=max_train_steps, learning_rate=args.learning_rate, train_program=train_program, weight_decay=args.weight_decay, scheduler=args.lr_scheduler, use_fp16=args.use_fp16, use_dynamic_loss_scaling=args.use_dynamic_loss_scaling, init_loss_scaling=args.init_loss_scaling, beta1=args.beta1, beta2=args.beta2, epsilon=args.epsilon) if args.do_val or args.do_test or args.do_test_hard: test_prog = fluid.Program() with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): test_pyreader, test_graph_vars = create_model( args, config=model_config, pyreader_name="dev_reader", is_train=False) test_prog = test_prog.clone(for_test=True) if args.do_val: dev_data_reader = ClassifyReader(args.dev_filelist, args.max_seq_len, tokenizer) dev_data_generator = dev_data_reader.data_generator( batch_size=args.test_batch_size, epoch=1, phase="dev") if args.do_test: test_data_reader = ClassifyReader(args.test_filelist, args.max_seq_len, tokenizer) test_data_generator = test_data_reader.data_generator( batch_size=args.test_batch_size, epoch=1, phase="test") if args.do_test_hard: test_hard_data_reader = ClassifyReader(args.test_hard_filelist, args.max_seq_len, tokenizer) test_hard_data_generator = test_hard_data_reader.data_generator( batch_size=args.test_batch_size, epoch=1, phase="test_hard") nccl2_num_trainers = 1 nccl2_trainer_id = 0 print("args.is_distributed:", args.is_distributed) if args.is_distributed: 
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS") current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") worker_endpoints = worker_endpoints_env.split(",") trainers_num = len(worker_endpoints) print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \ trainer_id:{}".format(worker_endpoints, trainers_num, current_endpoint, trainer_id)) # prepare nccl2 env. config = fluid.DistributeTranspilerConfig() config.mode = "nccl2" if args.nccl_comm_num > 1: config.nccl_comm_num = args.nccl_comm_num if args.use_hierarchical_allreduce and trainers_num > args.hierarchical_allreduce_inter_nranks: config.use_hierarchical_allreduce = args.use_hierarchical_allreduce config.hierarchical_allreduce_inter_nranks = args.hierarchical_allreduce_inter_nranks assert config.hierarchical_allreduce_inter_nranks > 1 assert trainers_num % config.hierarchical_allreduce_inter_nranks == 0 config.hierarchical_allreduce_exter_nranks = \ trainers_num / config.hierarchical_allreduce_inter_nranks t = fluid.DistributeTranspiler(config=config) t.transpile(trainer_id, trainers=worker_endpoints_env, current_endpoint=current_endpoint, program=train_program if args.do_train else test_prog, startup_program=startup_prog) nccl2_num_trainers = trainers_num nccl2_trainer_id = trainer_id exe = fluid.Executor(place) exe.run(startup_prog) if args.do_train: if args.init_checkpoint and args.init_pretraining_params: print( "WARNING: args 'init_checkpoint' and 'init_pretraining_params' " "both are set! Only arg 'init_checkpoint' is made valid.") if args.init_checkpoint: init_checkpoint(exe, args.init_checkpoint, main_program=train_program) elif args.init_pretraining_params: init_pretraining_params(exe, args.init_pretraining_params, main_program=train_program) elif args.do_val or args.do_test or args.do_test_hard: args.init_checkpoint = args.init_pretraining_params if not args.init_checkpoint: raise ValueError("args 'init_checkpoint' should be set if" "only doing validation or testing!") init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog) if args.do_train: exec_strategy = fluid.ExecutionStrategy() if args.use_fast_executor: exec_strategy.use_experimental_executor = True exec_strategy.num_threads = 4 if args.use_fp16 else 2 exec_strategy.num_iteration_per_drop_scope = min( args.num_iteration_per_drop_scope, args.skip_steps) build_strategy = fluid.BuildStrategy() build_strategy.remove_unnecessary_lock = False if args.use_fuse: build_strategy.fuse_all_reduce_ops = True train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda, loss_name=graph_vars["loss"].name, build_strategy=build_strategy, exec_strategy=exec_strategy, main_program=train_program, num_trainers=nccl2_num_trainers, trainer_id=nccl2_trainer_id) train_pyreader.decorate_tensor_provider(train_data_generator) else: train_exe = None if args.do_val or args.do_test or args.do_test_hard: test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda, main_program=test_prog, share_vars_from=train_exe) dev_ret_history = [] # (steps, key_eval, eval) test_ret_history = [] # (steps, key_eval, eval) test_hard_ret_history = [] # (steps, key_eval, eval) steps = 0 if args.do_train: train_pyreader.start() time_begin = time.time() skip_steps = args.skip_steps while True: try: steps += 1 if steps % skip_steps == 0: train_fetch_list = [ graph_vars["loss"].name, scheduled_lr.name ] res = train_exe.run(fetch_list=train_fetch_list) outputs = { "loss": np.mean(res[0]), 'learning_rate': float(res[1][0]) } if args.verbose: verbose 
= "train pyreader queue size: %d, learning_rate: %.10f" % \ (train_pyreader.queue.size(), outputs['learning_rate']) print(verbose) current_epoch, current_example, current_file_index, total_file, current_file = \ train_data_reader.get_progress() time_end = time.time() used_time = time_end - time_begin print("%s - epoch: %d, progress: %d/%d, %d/%d, step: %d, ave loss: %f, speed: %f steps/s" % \ (get_time(), current_epoch, current_example, num_train_examples, current_file_index, \ total_file, steps, outputs["loss"], args.skip_steps / used_time)) time_begin = time.time() else: train_exe.run(fetch_list=[]) if nccl2_trainer_id == 0: if steps % args.save_steps == 0 and args.save_checkpoints: save_path = os.path.join(args.checkpoints, "step_" + str(steps)) fluid.io.save_persistables(exe, save_path, train_program) if steps % args.validation_steps == 0: # evaluate dev set if args.do_val: test_pyreader.decorate_tensor_provider( dev_data_generator) outputs = evaluate(args, test_exe, test_pyreader, test_graph_vars, \ "dev", trainers_num, nccl2_trainer_id) if nccl2_trainer_id == 0: dev_ret_history.append( (steps, outputs['key_eval'], outputs[outputs['key_eval']])) # evaluate test set if args.do_test: test_pyreader.decorate_tensor_provider( test_data_generator) outputs = evaluate(args, test_exe, test_pyreader, test_graph_vars, \ "test", trainers_num, nccl2_trainer_id) if nccl2_trainer_id == 0: test_ret_history.append( (steps, outputs['key_eval'], outputs[outputs['key_eval']])) # evaluate test set if args.do_test_hard: test_pyreader.decorate_tensor_provider( test_hard_data_generator) outputs = evaluate(args, test_exe, test_pyreader, test_graph_vars, \ "test_hard", trainers_num, nccl2_trainer_id) if nccl2_trainer_id == 0: test_hard_ret_history.append( (steps, outputs['key_eval'], outputs[outputs['key_eval']])) except fluid.core.EOFException: if args.save_checkpoints: save_path = os.path.join(args.checkpoints, "step_" + str(steps)) fluid.io.save_persistables(exe, save_path, train_program) train_pyreader.reset() break # final eval on dev set if args.do_val: test_pyreader.decorate_tensor_provider(dev_data_generator) outputs = evaluate(args, test_exe, test_pyreader, test_graph_vars, "dev", trainers_num, nccl2_trainer_id) if nccl2_trainer_id == 0: dev_ret_history.append( (steps, outputs['key_eval'], outputs[outputs['key_eval']])) # final eval on test set if args.do_test: test_pyreader.decorate_tensor_provider(test_data_generator) outputs = evaluate(args, test_exe, test_pyreader, test_graph_vars, "test", trainers_num, nccl2_trainer_id) if nccl2_trainer_id == 0: test_ret_history.append( (steps, outputs['key_eval'], outputs[outputs['key_eval']])) # final eval on test_hard set if args.do_test_hard: test_pyreader.decorate_tensor_provider(test_hard_data_generator) outputs = evaluate(args, test_exe, test_pyreader, test_graph_vars, "test_hard", trainers_num, nccl2_trainer_id) if nccl2_trainer_id == 0: test_hard_ret_history.append( (steps, outputs['key_eval'], outputs[outputs['key_eval']])) if nccl2_trainer_id == 0: if args.do_val: dev_ret_history = sorted(dev_ret_history, key=lambda a: a[2], reverse=True) print("Best validation result: step %d %s %f" % \ (dev_ret_history[0][0], dev_ret_history[0][1], dev_ret_history[0][2]))
def train():
    logger = logging.getLogger("lm")
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(formatter)
    # attach the console handler so the INFO logs below are actually emitted
    logger.addHandler(console_handler)

    args = parse_args()
    logger.info('Running with args : {}'.format(args))
    logger.info('Running paddle : {}'.format(paddle.version.commit))

    hidden_size = args.hidden_size
    batch_size = args.batch_size
    data_path = args.data_path
    logger.info("begin to load vocab")
    vocab = data.Vocabulary(args.vocab_path, validate_file=True)
    vocab_size = vocab.size
    logger.info("finished load vocab")

    if args.enable_ce:
        random.seed(args.random_seed)
        np.random.seed(args.random_seed)

    logger.info('build the model...')
    # build model
    train_prog = fluid.Program()
    train_startup_prog = fluid.Program()
    if args.enable_ce:
        train_prog.random_seed = args.random_seed
        train_startup_prog.random_seed = args.random_seed

    # build infer model
    infer_prog = fluid.Program()
    infer_startup_prog = fluid.Program()
    with fluid.program_guard(infer_prog, infer_startup_prog):
        with fluid.unique_name.guard():
            # Infer process
            infer_model = lm_model.LanguageModel(
                args, vocab_size, test_mode=True)
            infer_model.build()
    infer_progs = infer_prog, infer_startup_prog, infer_model

    with fluid.program_guard(train_prog, train_startup_prog):
        with fluid.unique_name.guard():
            # Training process
            train_model = lm_model.LanguageModel(
                args, vocab_size, test_mode=False)
            train_model.build()
            fluid.clip.set_gradient_clip(
                clip=fluid.clip.GradientClipByGlobalNorm(
                    clip_norm=args.max_grad_norm))

            # build optimizer
            if args.optim == 'adagrad':
                optimizer = fluid.optimizer.Adagrad(
                    learning_rate=args.learning_rate,
                    epsilon=0.0,
                    initial_accumulator_value=1.0)
            elif args.optim == 'sgd':
                optimizer = fluid.optimizer.SGD(
                    learning_rate=args.learning_rate)
            elif args.optim == 'adam':
                optimizer = fluid.optimizer.Adam(
                    learning_rate=args.learning_rate)
            elif args.optim == 'rprop':
                optimizer = fluid.optimizer.RMSPropOptimizer(
                    learning_rate=args.learning_rate)
            else:
                logger.error('Unsupported optimizer: {}'.format(args.optim))
                exit(-1)
            optimizer.minimize(train_model.loss * args.num_steps)

    # initialize parameters
    place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
    exe = Executor(place)

    train_progs = train_prog, train_startup_prog, train_model

    if args.local:
        logger.info("local start_up:")
        train_loop(args, logger, vocab, train_progs, infer_progs, optimizer)
    else:
        if args.update_method == "nccl2":
            trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
            if args.test_nccl:
                worker_endpoints_env = os.getenv("PADDLE_WORK_ENDPOINTS")
                worker_endpoints = worker_endpoints_env.split(',')
                trainers_num = len(worker_endpoints)
                current_endpoint = worker_endpoints[trainer_id]
            else:
                port = os.getenv("PADDLE_PORT")
                worker_ips = os.getenv("PADDLE_TRAINERS")
                worker_endpoints = []
                for ip in worker_ips.split(","):
                    worker_endpoints.append(':'.join([ip, port]))
                worker_endpoints_env = ','.join(worker_endpoints)
                trainers_num = len(worker_endpoints)
                current_endpoint = os.getenv("POD_IP") + ":" + port

            if trainer_id == 0:
                logger.info("train_id == 0, sleep 60s")
                time.sleep(60)
            logger.info("trainers_num:{}".format(trainers_num))
            logger.info("worker_endpoints:{}".format(worker_endpoints))
            logger.info("current_endpoint:{}".format(current_endpoint))

            config = fluid.DistributeTranspilerConfig()
            config.mode = "nccl2"
            t = fluid.DistributeTranspiler(config=config)
            t.transpile(
                trainer_id,
                trainers=worker_endpoints_env,
                current_endpoint=current_endpoint,
                program=train_prog,
                startup_program=train_startup_prog)
            train_progs = train_prog, train_startup_prog, train_model
            train_loop(args, logger, vocab, train_progs, infer_progs,
                       optimizer, trainers_num, trainer_id, worker_endpoints)
        else:
            port = os.getenv("PADDLE_PORT", "6174")
            pserver_ips = os.getenv("PADDLE_PSERVERS")
            eplist = []
            for ip in pserver_ips.split(","):
                eplist.append(':'.join([ip, port]))
            pserver_endpoints = ",".join(eplist)
            trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
            current_endpoint = os.getenv("POD_IP") + ":" + port
            trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
            # the process role comes from the TRAINING_ROLE env var
            training_role = os.getenv("TRAINING_ROLE", "TRAINER")

            logger.info("pserver_endpoints:{}".format(pserver_endpoints))
            logger.info("current_endpoint:{}".format(current_endpoint))
            logger.info("trainer_id:{}".format(trainer_id))
            logger.info("pserver_ips:{}".format(pserver_ips))
            logger.info("port:{}".format(port))

            t = fluid.DistributeTranspiler()
            t.transpile(
                trainer_id,
                pservers=pserver_endpoints,
                trainers=trainers,
                program=train_prog,
                startup_program=train_startup_prog)
            if training_role == "PSERVER":
                logger.info("distributed: pserver started")
                current_endpoint = os.getenv("POD_IP") + ":" + os.getenv(
                    "PADDLE_PORT")
                if not current_endpoint:
                    logger.critical("need env SERVER_ENDPOINT")
                    exit(1)
                pserver_prog = t.get_pserver_program(current_endpoint)
                pserver_startup = t.get_startup_program(current_endpoint,
                                                        pserver_prog)
                exe.run(pserver_startup)
                exe.run(pserver_prog)
            elif training_role == "TRAINER":
                logger.info("distributed: trainer started")
                trainer_prog = t.get_trainer_program()
                train_loop(args, logger, vocab, train_progs, infer_progs,
                           optimizer)
            else:
                logger.critical(
                    "environment var TRAINING_ROLE should be TRAINER or PSERVER")
                exit(1)
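
# --- Illustrative sketch (assumed helper name) ---
# Both distributed branches above assemble "ip:port" endpoint strings from
# a comma-separated IP list plus a shared port; factored out it is just:
def build_endpoints(ips_csv, port):
    """'10.0.0.1,10.0.0.2' + '6174' -> '10.0.0.1:6174,10.0.0.2:6174'"""
    return ",".join(":".join([ip, port]) for ip in ips_csv.split(","))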
def main(args):
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        dev_list = fluid.cuda_places()
        place = dev_list[0]
        dev_count = len(dev_list)
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    reader = task_reader.SequenceLabelReader(
        vocab_path=args.vocab_path,
        label_map_config=args.label_map_config,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed,
        task_id=args.task_id)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        if args.in_tokens:
            if args.batch_size < args.max_seq_len:
                raise ValueError(
                    'if in_tokens=True, batch_size should be greater than '
                    'max_seq_len, got batch_size:%d seqlen:%d' %
                    (args.batch_size, args.max_seq_len))
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        log.info("Device count: %d" % dev_count)
        log.info("Num train examples: %d" % num_train_examples)
        log.info("Max train steps: %d" % max_train_steps)
        log.info("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config)
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            log.info("Theoretical memory usage in training: %.3f - %.3f %s" %
                     (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config)

        test_prog = test_prog.clone(for_test=True)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    if args.is_distributed:
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)
        log.info("worker_endpoints:{} trainers_num:{} current_endpoint:{} "
                 "trainer_id:{}".format(worker_endpoints, trainers_num,
                                        current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(
            trainer_id,
            trainers=worker_endpoints_env,
            current_endpoint=current_endpoint,
            program=train_program if args.do_train else test_prog,
            startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            log.info(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=graph_vars["loss"].name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program,
                                           num_trainers=nccl2_num_trainers,
                                           trainer_id=nccl2_trainer_id)

        train_pyreader.set_batch_generator(train_data_generator)
    else:
        train_exe = None

    if args.do_val or args.do_test:
        test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                          main_program=test_prog,
                                          share_vars_from=train_exe)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        graph_vars["learning_rate"] = scheduled_lr

        time_begin = time.time()
        while True:
            try:
                steps += 1
                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[])
                else:
                    fetch_list = [
                        graph_vars["num_infer"].name,
                        graph_vars["num_label"].name,
                        graph_vars["num_correct"].name,
                        graph_vars["loss"].name,
                        graph_vars['learning_rate'].name,
                    ]

                    out = train_exe.run(fetch_list=fetch_list)
                    num_infer, num_label, num_correct, np_loss, np_lr = out
                    lr = float(np_lr[0])
                    loss = np_loss.mean()
                    precision, recall, f1 = calculate_f1(
                        num_label, num_infer, num_correct)
                    if args.verbose:
                        log.info(
                            "train pyreader queue size: %d, learning rate: %f"
                            % (train_pyreader.queue.size(),
                               lr if warmup_steps > 0 else args.learning_rate))

                    current_example, current_epoch = reader.get_train_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin
                    log.info(
                        "epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                        "f1: %f, precision: %f, recall: %f, speed: %f steps/s"
                        % (current_epoch, current_example, num_train_examples,
                           steps, loss, f1, precision, recall,
                           args.skip_steps / used_time))
                    time_begin = time.time()

                if nccl2_trainer_id == 0 and steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if nccl2_trainer_id == 0 and steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        evaluate_wrapper(reader, exe, test_prog, test_pyreader,
                                         graph_vars, current_epoch, steps)
                    # evaluate test set
                    if args.do_test:
                        predict_wrapper(reader, exe, test_prog, test_pyreader,
                                        graph_vars, current_epoch, steps)
            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if nccl2_trainer_id == 0 and args.do_val:
        if not args.do_train:
            current_example, current_epoch = reader.get_train_progress()
        evaluate_wrapper(reader, exe, test_prog, test_pyreader, graph_vars,
                         current_epoch, 'final')

    if nccl2_trainer_id == 0 and args.do_test:
        if not args.do_train:
            current_example, current_epoch = reader.get_train_progress()
        predict_wrapper(reader, exe, test_prog, test_pyreader, graph_vars,
                        current_epoch, 'final')
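
# --- Illustrative sketch (hypothetical helper) ---
# The step-count arithmetic above: with in_tokens=True, batch_size is a
# token budget, so batch_size // max_seq_len sequences fit per batch;
# either way the step total is split across dev_count devices.
def compute_max_train_steps(epochs, num_examples, batch_size, dev_count,
                            in_tokens=False, max_seq_len=512):
    seqs_per_batch = batch_size // max_seq_len if in_tokens else batch_size
    return epochs * num_examples // seqs_per_batch // dev_count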
def main(args): """main""" model_config = UNIMOConfig(args.unimo_config_path) model_config.print_config() gpu_id = 0 gpus = fluid.core.get_cuda_device_count() if args.is_distributed and os.getenv("FLAGS_selected_gpus") is not None: gpu_list = os.getenv("FLAGS_selected_gpus").split(",") gpus = len(gpu_list) gpu_id = int(gpu_list[0]) if args.use_cuda: place = fluid.CUDAPlace(gpu_id) dev_count = gpus else: place = fluid.CPUPlace() dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) tokenizer = GptBpeTokenizer(vocab_file=args.unimo_vocab_file, encoder_json_file=args.encoder_json_file, vocab_bpe_file=args.vocab_bpe_file, do_lower_case=args.do_lower_case) data_reader = ClassifyReader(tokenizer, args) if not (args.do_train or args.do_val or args.do_val_hard \ or args.do_test or args.do_test_hard or args.do_diagnostic): raise ValueError("For args `do_train`, `do_val`, `do_val_hard`, `do_test`," \ " `do_test_hard` and `do_diagnostic`, at least one of them must be True.") startup_prog = fluid.Program() if args.random_seed is not None: startup_prog.random_seed = args.random_seed if args.do_train: trainers_num = int(os.getenv("PADDLE_TRAINERS_NUM", "1")) train_data_generator = data_reader.data_generator( input_file=args.train_set, batch_size=args.batch_size, epoch=args.epoch, dev_count=trainers_num, shuffle=True, phase="train") num_train_examples = data_reader.get_num_examples(args.train_set) if args.in_tokens: max_train_steps = args.epoch * num_train_examples // ( args.batch_size // args.max_seq_len) // trainers_num else: max_train_steps = args.epoch * num_train_examples // args.batch_size // trainers_num warmup_steps = int(max_train_steps * args.warmup_proportion) print("Device count: %d, gpu_id: %d" % (dev_count, gpu_id)) print("Num train examples: %d" % num_train_examples) print("Max train steps: %d" % max_train_steps) print("Num warmup steps: %d" % warmup_steps) train_program = fluid.Program() with fluid.program_guard(train_program, startup_prog): with fluid.unique_name.guard(): train_pyreader, graph_vars = create_model( args, pyreader_name='train_reader', config=model_config) scheduled_lr, loss_scaling = optimization( loss=graph_vars["loss"], warmup_steps=warmup_steps, num_train_steps=max_train_steps, learning_rate=args.learning_rate, train_program=train_program, weight_decay=args.weight_decay, scheduler=args.lr_scheduler, use_fp16=args.use_fp16, use_dynamic_loss_scaling=args.use_dynamic_loss_scaling, init_loss_scaling=args.init_loss_scaling, beta1=args.beta1, beta2=args.beta2, epsilon=args.epsilon) if args.verbose: if args.in_tokens: lower_mem, upper_mem, unit = fluid.contrib.memory_usage( program=train_program, batch_size=args.batch_size // args.max_seq_len) else: lower_mem, upper_mem, unit = fluid.contrib.memory_usage( program=train_program, batch_size=args.batch_size) print("Theoretical memory usage in training: %.3f - %.3f %s" % (lower_mem, upper_mem, unit)) if args.do_val or args.do_val_hard or args.do_test or args.do_test_hard \ or args.do_pred or args.do_pred_hard or args.do_diagnostic: test_prog = fluid.Program() with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): test_pyreader, graph_vars = create_model( args, pyreader_name='test_reader', config=model_config) test_prog = test_prog.clone(for_test=True) nccl2_num_trainers = 1 nccl2_trainer_id = 0 print("args.is_distributed:", args.is_distributed) if args.is_distributed: trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS") 
current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") worker_endpoints = worker_endpoints_env.split(",") trainers_num = len(worker_endpoints) print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \ trainer_id:{}".format(worker_endpoints, trainers_num, current_endpoint, trainer_id)) # prepare nccl2 env. config = fluid.DistributeTranspilerConfig() config.mode = "nccl2" if args.nccl_comm_num > 1: config.nccl_comm_num = args.nccl_comm_num if args.use_hierarchical_allreduce and trainers_num > args.hierarchical_allreduce_inter_nranks: config.use_hierarchical_allreduce = args.use_hierarchical_allreduce config.hierarchical_allreduce_inter_nranks = args.hierarchical_allreduce_inter_nranks assert config.hierarchical_allreduce_inter_nranks > 1 assert trainers_num % config.hierarchical_allreduce_inter_nranks == 0 config.hierarchical_allreduce_exter_nranks = \ trainers_num / config.hierarchical_allreduce_inter_nranks t = fluid.DistributeTranspiler(config=config) t.transpile(trainer_id, trainers=worker_endpoints_env, current_endpoint=current_endpoint, program=train_program if args.do_train else test_prog, startup_program=startup_prog) nccl2_num_trainers = trainers_num nccl2_trainer_id = trainer_id exe = fluid.Executor(place) exe.run(startup_prog) if args.do_train: if args.init_checkpoint and args.init_pretraining_params: print( "WARNING: args 'init_checkpoint' and 'init_pretraining_params' " "both are set! Only arg 'init_checkpoint' is made valid.") if args.init_checkpoint: init_checkpoint(exe, args.init_checkpoint, main_program=train_program) elif args.init_pretraining_params: init_pretraining_params(exe, args.init_pretraining_params, main_program=train_program) elif args.do_val or args.do_val_hard or args.do_test or args.do_test_hard \ or args.do_pred or args.do_pred_hard or args.do_diagnostic: if not args.init_checkpoint: raise ValueError("args 'init_checkpoint' should be set if" "only doing validation or testing!") init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog) if args.do_train: exec_strategy = fluid.ExecutionStrategy() if args.use_fast_executor: exec_strategy.use_experimental_executor = True exec_strategy.num_threads = dev_count exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda, loss_name=graph_vars["loss"].name, exec_strategy=exec_strategy, main_program=train_program, num_trainers=nccl2_num_trainers, trainer_id=nccl2_trainer_id) train_pyreader.decorate_tensor_provider(train_data_generator) else: train_exe = None test_exe = exe if args.do_val or args.do_val_hard or args.do_test or args.do_test_hard \ or args.do_pred or args.do_pred_hard or args.do_diagnostic: if args.use_multi_gpu_test: test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda, main_program=test_prog, share_vars_from=train_exe) dev_ret_history = [] # (steps, key_eval, eval) dev_hard_ret_history = [] # (steps, key_eval, eval) test_ret_history = [] # (steps, key_eval, eval) test_hard_ret_history = [] # (steps, key_eval, eval) if args.do_train: train_pyreader.start() steps = 0 if warmup_steps > 0: graph_vars["learning_rate"] = scheduled_lr time_begin = time.time() skip_steps = args.skip_steps while True: try: steps += 1 if steps % skip_steps == 0: train_fetch_list = [ graph_vars["loss"].name, graph_vars["accuracy"].name, graph_vars["num_seqs"].name ] if "learning_rate" in graph_vars: train_fetch_list.append( graph_vars["learning_rate"].name) res = train_exe.run(fetch_list=train_fetch_list) outputs = {"loss": 
np.mean(res[0])} if "learning_rate" in graph_vars: outputs["learning_rate"] = float(res[3][0]) if args.verbose: verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size( ) verbose += "learning rate: %f" % ( outputs["learning_rate"] if warmup_steps > 0 else args.learning_rate) print(verbose) current_example, current_epoch = data_reader.get_train_progress( ) time_end = time.time() used_time = time_end - time_begin print("%s - epoch: %d, progress: %d/%d, step: %d, ave loss: %f, speed: %f steps/s" % \ (get_time(), current_epoch, current_example, num_train_examples, steps, \ outputs["loss"], args.skip_steps / used_time)) time_begin = time.time() else: train_exe.run(fetch_list=[]) if nccl2_trainer_id == 0: if steps % args.save_steps == 0 and args.save_checkpoints: save_path = os.path.join(args.checkpoints, "step_" + str(steps)) fluid.io.save_persistables(exe, save_path, train_program) if steps % args.validation_steps == 0: # evaluate dev set if args.do_val: test_pyreader.decorate_tensor_provider( data_reader.data_generator( args.dev_set, batch_size=args.batch_size, epoch=1, dev_count=1, shuffle=False)) outputs = evaluate(args, test_exe, test_prog, test_pyreader, graph_vars, "dev") dev_ret_history.append( (steps, outputs['key_eval'], outputs[outputs['key_eval']])) # evaluate dev_hard set if args.do_val_hard: test_pyreader.decorate_tensor_provider( data_reader.data_generator( args.dev_hard_set, batch_size=args.batch_size, epoch=1, dev_count=1, shuffle=False)) outputs = evaluate(args, test_exe, test_prog, test_pyreader, graph_vars, "dev_hard") dev_hard_ret_history.append( (steps, outputs['key_eval'], outputs[outputs['key_eval']])) # evaluate test set if args.do_test: test_pyreader.decorate_tensor_provider( data_reader.data_generator( args.test_set, batch_size=args.batch_size, epoch=1, dev_count=1, shuffle=False)) outputs = evaluate(args, test_exe, test_prog, test_pyreader, graph_vars, "test") test_ret_history.append( (steps, outputs['key_eval'], outputs[outputs['key_eval']])) # evaluate test_hard set if args.do_test_hard: test_pyreader.decorate_tensor_provider( data_reader.data_generator( args.test_hard_set, batch_size=args.batch_size, epoch=1, dev_count=1, shuffle=False)) outputs = evaluate(args, test_exe, test_prog, test_pyreader, graph_vars, "test_hard") test_hard_ret_history.append( (steps, outputs['key_eval'], outputs[outputs['key_eval']])) # pred diagnostic set if args.do_diagnostic: test_pyreader.decorate_tensor_provider( data_reader.data_generator( args.diagnostic_set, batch_size=args.batch_size, epoch=1, dev_count=1, shuffle=False)) qids, preds, probs = predict(test_exe, test_prog, test_pyreader, graph_vars, dev_count=1) save_path = args.pred_save + '.diagnostic.' + str( steps) + '.txt' print("testing {}, save to {}".format( args.diagnostic_set, save_path)) with open(save_path, 'w') as f: for id, s, p in zip(qids, preds, probs): f.write('{}\t{}\t{}\n'.format(id, s, p)) # pred test set if args.do_pred: test_pyreader.decorate_tensor_provider( data_reader.data_generator( args.test_set, batch_size=args.batch_size, epoch=1, dev_count=1, shuffle=False)) qids, preds, probs = predict(test_exe, test_prog, test_pyreader, graph_vars, dev_count=1) save_path = args.pred_save + '.test.' 
+ str( steps) + '.txt' print("testing {}, save to {}".format( args.test_set, save_path)) with open(save_path, 'w') as f: for id, s, p in zip(qids, preds, probs): f.write('{}\t{}\t{}\n'.format(id, s, p)) # pred test hard set if args.do_pred_hard: test_pyreader.decorate_tensor_provider( data_reader.data_generator( args.test_hard_set, batch_size=args.batch_size, epoch=1, dev_count=1, shuffle=False)) qids, preds, probs = predict(test_exe, test_prog, test_pyreader, graph_vars, dev_count=1) save_path = args.pred_save + '.test_hard.' + str( steps) + '.txt' print("testing {}, save to {}".format( args.test_hard_set, save_path)) with open(save_path, 'w') as f: for id, s, p in zip(qids, preds, probs): f.write('{}\t{}\t{}\n'.format(id, s, p)) except fluid.core.EOFException: if args.save_checkpoints: save_path = os.path.join(args.checkpoints, "step_" + str(steps)) fluid.io.save_persistables(exe, save_path, train_program) train_pyreader.reset() break if nccl2_trainer_id == 0: # final pred on diagnostic set if args.do_diagnostic: test_pyreader.decorate_tensor_provider( data_reader.data_generator(args.diagnostic_set, batch_size=args.batch_size, epoch=1, dev_count=1, shuffle=False)) qids, preds, probs = predict(test_exe, test_prog, test_pyreader, graph_vars, dev_count=1) save_path = args.pred_save + '.diagnostic.' + str(steps) + '.txt' print("testing {}, save to {}".format(args.diagnostic_set, save_path)) with open(save_path, 'w') as f: for id, s, p in zip(qids, preds, probs): f.write('{}\t{}\t{}\n'.format(id, s, p)) # final pred on test set if args.do_pred: test_pyreader.decorate_tensor_provider( data_reader.data_generator(args.test_set, batch_size=args.batch_size, epoch=1, dev_count=1, shuffle=False)) qids, preds, probs = predict(test_exe, test_prog, test_pyreader, graph_vars, dev_count=1) save_path = args.pred_save + '.test.' + str(steps) + '.txt' print("testing {}, save to {}".format(args.test_set, save_path)) with open(save_path, 'w') as f: for id, s, p in zip(qids, preds, probs): f.write('{}\t{}\t{}\n'.format(id, s, p)) # final pred on test_hard set if args.do_pred_hard: test_pyreader.decorate_tensor_provider( data_reader.data_generator(args.test_hard_set, batch_size=args.batch_size, epoch=1, dev_count=1, shuffle=False)) qids, preds, probs = predict(test_exe, test_prog, test_pyreader, graph_vars, dev_count=1) save_path = args.pred_save + '.test_hard.' 
+ str(steps) + '.txt' print("testing {}, save to {}".format(args.test_hard_set, save_path)) with open(save_path, 'w') as f: for id, s, p in zip(qids, preds, probs): f.write('{}\t{}\t{}\n'.format(id, s, p)) # final eval on test set if args.do_test: test_pyreader.decorate_tensor_provider( data_reader.data_generator(args.test_set, batch_size=args.batch_size, epoch=1, dev_count=1, shuffle=False)) print("Final test result:") outputs = evaluate(args, test_exe, test_prog, test_pyreader, graph_vars, "test") test_ret_history.append( (steps, outputs['key_eval'], outputs[outputs['key_eval']])) test_ret_history = sorted(test_ret_history, key=lambda a: a[2], reverse=True) print("Best testing result: step %d %s %f" % (test_ret_history[0][0], test_ret_history[0][1], test_ret_history[0][2])) # final eval on test hard set if args.do_test_hard: test_pyreader.decorate_tensor_provider( data_reader.data_generator(args.test_hard_set, batch_size=args.batch_size, epoch=1, dev_count=1, shuffle=False)) print("Final test_hard result:") outputs = evaluate(args, test_exe, test_prog, test_pyreader, graph_vars, "test_hard") test_hard_ret_history.append( (steps, outputs['key_eval'], outputs[outputs['key_eval']])) test_hard_ret_history = sorted(test_hard_ret_history, key=lambda a: a[2], reverse=True) print("Best testing hard result: step %d %s %f" % (test_hard_ret_history[0][0], test_hard_ret_history[0][1], test_hard_ret_history[0][2])) # final eval on dev set if args.do_val: test_pyreader.decorate_tensor_provider( data_reader.data_generator(args.dev_set, batch_size=args.batch_size, epoch=1, dev_count=1, shuffle=False)) print("Final validation result:") outputs = evaluate(args, test_exe, test_prog, test_pyreader, graph_vars, "dev") dev_ret_history.append( (steps, outputs['key_eval'], outputs[outputs['key_eval']])) dev_ret_history = sorted(dev_ret_history, key=lambda a: a[2], reverse=True) print("Best validation result: step %d %s %f" % (dev_ret_history[0][0], dev_ret_history[0][1], dev_ret_history[0][2])) # final eval on dev hard set if args.do_val_hard: test_pyreader.decorate_tensor_provider( data_reader.data_generator(args.dev_hard_set, batch_size=args.batch_size, epoch=1, dev_count=1, shuffle=False)) print("Final validation_hard result:") outputs = evaluate(args, test_exe, test_prog, test_pyreader, graph_vars, "dev_hard") dev_hard_ret_history.append( (steps, outputs['key_eval'], outputs[outputs['key_eval']])) dev_hard_ret_history = sorted(dev_hard_ret_history, key=lambda a: a[2], reverse=True) print("Best validation_hard result: step %d %s %f" % (dev_hard_ret_history[0][0], dev_hard_ret_history[0][1], dev_hard_ret_history[0][2]))
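
# --- Illustrative sketch (hypothetical names) ---
# The hierarchical-allreduce configuration above splits trainers_num ranks
# into groups of inter_nranks; the group count ("exter" nranks) must divide
# evenly, and integer division keeps it an int under Python 3:
def hierarchical_allreduce_split(trainers_num, inter_nranks):
    assert inter_nranks > 1
    assert trainers_num % inter_nranks == 0
    return inter_nranks, trainers_num // inter_nranks

# e.g. hierarchical_allreduce_split(32, 8) -> (8, 4): 4 groups of 8 ranks.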
def train(args): """ train """ is_local = os.getenv("PADDLE_IS_LOCAL", "1") if is_local == '0': args.local = False print(args) if args.device == 'CPU': TrainTaskConfig.use_gpu = False training_role = os.getenv("TRAINING_ROLE", "TRAINER") gpus = os.getenv("FLAGS_selected_gpus").split(",") gpu_id = int(gpus[0]) if training_role == "PSERVER" or (not TrainTaskConfig.use_gpu): place = fluid.CPUPlace() dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) else: place = fluid.CUDAPlace(gpu_id) dev_count = len(gpus) exe = fluid.Executor(place) train_prog = fluid.Program() startup_prog = fluid.Program() if args.enable_ce: train_prog.random_seed = 1000 startup_prog.random_seed = 1000 with fluid.program_guard(train_prog, startup_prog): logits_list = [] data_input_names = encoder_data_input_fields + \ decoder_data_input_fields[:-1] + label_data_input_fields + dense_bias_input_fields all_data_inputs, pyreader = make_all_py_reader_inputs(data_input_names) with fluid.unique_name.guard("new_forward"): new_forward_sum_cost, new_forward_avg_cost, new_forward_token_num, new_forward_logits, new_forward_xent, new_forward_loss, new_forward_label, new_forward_non_zeros = forward_transformer( ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, ModelHyperParams.max_length + 50, ModelHyperParams.n_layer, ModelHyperParams.n_head, ModelHyperParams.d_key, ModelHyperParams.d_value, ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, ModelHyperParams.prepostprocess_dropout, ModelHyperParams.attention_dropout, ModelHyperParams.relu_dropout, ModelHyperParams.preprocess_cmd, ModelHyperParams.postprocess_cmd, ModelHyperParams.weight_sharing, ModelHyperParams.embedding_sharing, TrainTaskConfig.label_smooth_eps, use_py_reader=True, is_test=False, params_type="new", all_data_inputs=all_data_inputs) with fluid.unique_name.guard("new_relative_position"): new_relative_position_sum_cost, new_relative_position_avg_cost, new_relative_position_token_num, new_relative_position_logits, new_relative_position_xent, new_relative_position_loss, new_relative_position_label, new_relative_position_non_zeros = relative_transformer( ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, ModelHyperParams.max_length + 50, ModelHyperParams.n_layer, ModelHyperParams.n_head, ModelHyperParams.d_key, ModelHyperParams.d_value, ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, ModelHyperParams.prepostprocess_dropout, ModelHyperParams.attention_dropout, ModelHyperParams.relu_dropout, ModelHyperParams.preprocess_cmd, ModelHyperParams.postprocess_cmd, ModelHyperParams.weight_sharing, ModelHyperParams.embedding_sharing, TrainTaskConfig.label_smooth_eps, use_py_reader=args.use_py_reader, is_test=False, params_type="new", all_data_inputs=all_data_inputs) DenseModelHyperParams.src_vocab_size = ModelHyperParams.src_vocab_size DenseModelHyperParams.trg_vocab_size = ModelHyperParams.trg_vocab_size DenseModelHyperParams.weight_sharing = ModelHyperParams.weight_sharing DenseModelHyperParams.embedding_sharing = ModelHyperParams.embedding_sharing with fluid.unique_name.guard("new_dense"): new_dense_sum_cost, new_dense_avg_cost, new_dense_token_num, new_dense_logits, new_dense_xent, new_dense_loss, new_dense_label, _ = dense_transformer( DenseModelHyperParams.src_vocab_size, DenseModelHyperParams.trg_vocab_size, DenseModelHyperParams.max_length + 50, DenseModelHyperParams.n_layer, DenseModelHyperParams.enc_n_layer, DenseModelHyperParams.n_head, DenseModelHyperParams.d_key, DenseModelHyperParams.d_value, 
DenseModelHyperParams.d_model, DenseModelHyperParams.d_inner_hid, DenseModelHyperParams.prepostprocess_dropout, DenseModelHyperParams.attention_dropout, DenseModelHyperParams.relu_dropout, DenseModelHyperParams.preprocess_cmd, DenseModelHyperParams.postprocess_cmd, DenseModelHyperParams.weight_sharing, DenseModelHyperParams.embedding_sharing, TrainTaskConfig.label_smooth_eps, use_py_reader=args.use_py_reader, is_test=False, params_type="new", all_data_inputs=all_data_inputs) with fluid.unique_name.guard("fixed_forward"): fixed_forward_sum_cost, fixed_forward_avg_cost, fixed_forward_token_num, fixed_forward_logits, fixed_forward_xent, fixed_forward_loss, fixed_forward_label, fixed_forward_non_zeros = forward_transformer( ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, ModelHyperParams.max_length + 50, ModelHyperParams.n_layer, ModelHyperParams.n_head, ModelHyperParams.d_key, ModelHyperParams.d_value, ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, ModelHyperParams.prepostprocess_dropout, ModelHyperParams.attention_dropout, ModelHyperParams.relu_dropout, ModelHyperParams.preprocess_cmd, ModelHyperParams.postprocess_cmd, ModelHyperParams.weight_sharing, ModelHyperParams.embedding_sharing, TrainTaskConfig.label_smooth_eps, use_py_reader=args.use_py_reader, is_test=False, params_type="fixed", all_data_inputs=all_data_inputs) logits_list.append(fixed_forward_logits) DenseModelHyperParams.src_vocab_size = ModelHyperParams.src_vocab_size DenseModelHyperParams.trg_vocab_size = ModelHyperParams.trg_vocab_size DenseModelHyperParams.weight_sharing = ModelHyperParams.weight_sharing DenseModelHyperParams.embedding_sharing = ModelHyperParams.embedding_sharing with fluid.unique_name.guard("fixed_dense"): fixed_dense_sum_cost, fixed_dense_avg_cost, fixed_dense_token_num, fixed_dense_logits, fixed_dense_xent, fixed_dense_loss, fixed_dense_label, _ = dense_transformer( DenseModelHyperParams.src_vocab_size, DenseModelHyperParams.trg_vocab_size, DenseModelHyperParams.max_length + 50, DenseModelHyperParams.n_layer, DenseModelHyperParams.enc_n_layer, DenseModelHyperParams.n_head, DenseModelHyperParams.d_key, DenseModelHyperParams.d_value, DenseModelHyperParams.d_model, DenseModelHyperParams.d_inner_hid, DenseModelHyperParams.prepostprocess_dropout, DenseModelHyperParams.attention_dropout, DenseModelHyperParams.relu_dropout, DenseModelHyperParams.preprocess_cmd, DenseModelHyperParams.postprocess_cmd, DenseModelHyperParams.weight_sharing, DenseModelHyperParams.embedding_sharing, TrainTaskConfig.label_smooth_eps, use_py_reader=args.use_py_reader, is_test=False, params_type="fixed", all_data_inputs=all_data_inputs) logits_list.append(fixed_dense_logits) with fluid.unique_name.guard("fixed_relative_position"): fixed_relative_sum_cost, fixed_relative_avg_cost, fixed_relative_token_num, fixed_relative_logits, fixed_relative_xent, fixed_relative_loss, fixed_relative_label, _ = relative_transformer( ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, ModelHyperParams.max_length + 50, ModelHyperParams.n_layer, ModelHyperParams.n_head, ModelHyperParams.d_key, ModelHyperParams.d_value, ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, ModelHyperParams.prepostprocess_dropout, ModelHyperParams.attention_dropout, ModelHyperParams.relu_dropout, ModelHyperParams.preprocess_cmd, ModelHyperParams.postprocess_cmd, ModelHyperParams.weight_sharing, ModelHyperParams.embedding_sharing, TrainTaskConfig.label_smooth_eps, use_py_reader=args.use_py_reader, is_test=False, params_type="fixed", 
all_data_inputs=all_data_inputs) logits_list.append(fixed_relative_logits) # normalizing confidence = 1.0 - TrainTaskConfig.label_smooth_eps low_confidence = (1.0 - confidence) / (ModelHyperParams.trg_vocab_size - 1) normalizing = -(confidence * math.log(confidence) + (ModelHyperParams.trg_vocab_size - 1) * low_confidence * math.log(low_confidence + 1e-20)) batch_size = layers.shape(new_forward_logits)[0] seq_length = layers.shape(new_forward_logits)[1] trg_voc_size = layers.shape(new_forward_logits)[2] # ensemble teacher_logits = logits_list[0] for index in xrange(1, len(logits_list)): teacher_logits += logits_list[index] teacher_logits = teacher_logits / len(logits_list) # new_target new_target = layers.softmax(teacher_logits) new_target.stop_gradient = True # agent_1: forward fdistill_xent = layers.softmax_with_cross_entropy( logits=new_forward_logits, label=new_target, soft_label=True) fdistill_xent -= normalizing fdistill_loss = layers.reduce_sum( fdistill_xent * new_forward_non_zeros) / new_forward_token_num # agent_2: relative rdistill_xent = layers.softmax_with_cross_entropy( logits=new_relative_position_logits, label=new_target, soft_label=True) rdistill_xent -= normalizing rdistill_loss = layers.reduce_sum( rdistill_xent * new_forward_non_zeros) / new_forward_token_num # agent_3: dense ddistill_xent = layers.softmax_with_cross_entropy( logits=new_dense_logits, label=new_target, soft_label=True) ddistill_xent -= normalizing ddistill_loss = layers.reduce_sum( ddistill_xent * new_forward_non_zeros) / new_forward_token_num teacher_loss = fixed_forward_avg_cost + fixed_dense_avg_cost + fixed_relative_avg_cost avg_cost = TrainTaskConfig.beta * new_forward_avg_cost + ( 1.0 - TrainTaskConfig.beta ) * fdistill_loss + TrainTaskConfig.beta * new_relative_position_avg_cost + ( 1.0 - TrainTaskConfig.beta ) * rdistill_loss + TrainTaskConfig.beta * new_dense_avg_cost + ( 1.0 - TrainTaskConfig.beta) * ddistill_loss + teacher_loss avg_cost.persistable = True teacher_loss.persistable = True optimizer = None if args.sync: lr_decay = fluid.layers.learning_rate_scheduler.noam_decay( ModelHyperParams.d_model, TrainTaskConfig.warmup_steps) logging.info("before adam") with fluid.default_main_program()._lr_schedule_guard(): learning_rate = lr_decay * TrainTaskConfig.learning_rate optimizer = fluid.optimizer.Adam(learning_rate=learning_rate, beta1=TrainTaskConfig.beta1, beta2=TrainTaskConfig.beta2, epsilon=TrainTaskConfig.eps) else: optimizer = fluid.optimizer.SGD(0.003) if args.use_fp16: #black_varnames={"src_slf_attn_bias", "trg_slf_attn_bias", "trg_src_attn_bias", "dense_src_slf_attn_bias", "dense_trg_slf_attn_bias", "dense_trg_src_attn_bias"} #amp_lists=fluid.contrib.mixed_precision.AutoMixedPrecisionLists(custom_black_varnames=black_varnames, # custom_black_list=["dropout"]) #optimizer = fluid.contrib.mixed_precision.decorate(optimizer, amp_lists=amp_lists, optimizer = fluid.contrib.mixed_precision.decorate( optimizer, init_loss_scaling=32768, incr_every_n_steps=2000, use_dynamic_loss_scaling=True) optimizer.minimize(avg_cost) loss_scaling = None scaled_cost = None if args.use_fp16: scaled_cost = optimizer.get_scaled_loss() loss_scaling = optimizer.get_loss_scaling() if args.local: logging.info("local start_up:") train_loop(exe, train_prog, startup_prog, args, dev_count, avg_cost, teacher_loss, new_relative_position_sum_cost, new_relative_position_avg_cost, new_relative_position_token_num, pyreader, place) else: trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) worker_endpoints_env = 
os.getenv("PADDLE_TRAINER_ENDPOINTS") current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") worker_endpoints = worker_endpoints_env.split(",") trainers_num = len(worker_endpoints) logging.info("worker_endpoints:{} trainers_num:{} current_endpoint:{} \ trainer_id:{}".format(worker_endpoints, trainers_num, current_endpoint, trainer_id)) config = fluid.DistributeTranspilerConfig() config.mode = "nccl2" if args.nccl_comm_num > 1: config.nccl_comm_num = args.nccl_comm_num if args.use_hierarchical_allreduce and trainers_num > args.hierarchical_allreduce_inter_nranks: logging.info("use_hierarchical_allreduce") config.use_hierarchical_allreduce = args.use_hierarchical_allreduce config.hierarchical_allreduce_inter_nranks = 8 if config.hierarchical_allreduce_inter_nranks > 1: config.hierarchical_allreduce_inter_nranks = args.hierarchical_allreduce_inter_nranks assert config.hierarchical_allreduce_inter_nranks > 1 assert trainers_num % config.hierarchical_allreduce_inter_nranks == 0 config.hierarchical_allreduce_exter_nranks = \ trainers_num / config.hierarchical_allreduce_inter_nranks t = fluid.DistributeTranspiler(config=config) t.transpile(trainer_id, trainers=worker_endpoints_env, current_endpoint=current_endpoint, program=train_prog, startup_program=startup_prog) train_loop(exe, train_prog, startup_prog, args, dev_count, avg_cost, teacher_loss, new_relative_position_sum_cost, new_relative_position_avg_cost, new_relative_position_token_num, pyreader, place, trainers_num, trainer_id, scaled_cost=scaled_cost, loss_scaling=loss_scaling)
def main(args): """ Main func for downstream tasks """ print("finetuning tasks start") ernie_config = ErnieVilConfig(args.ernie_config_path) ernie_config.print_config() with open(args.task_group_json) as f: task_group = json.load(f) print('task: ', task_group) startup_prog = fluid.Program() if args.do_train and args.do_test: print("can not set both do_train and do_test as True") return model_name = MODELS[args.task_name] if args.do_train: train_program = fluid.Program() with fluid.program_guard(train_program, startup_prog): with fluid.unique_name.guard(): train_pyreader, model_outputs = model_name( pyreader_name='train_reader', ernie_config=ernie_config, task_group=task_group) total_loss = model_outputs[0] scheduled_lr = get_optimizer(total_loss, train_program, startup_prog, args) if args.do_test: test_prog = fluid.Program() with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): test_pyreader, model_outputs = model_name( pyreader_name='test_reader', ernie_config=ernie_config, task_group=task_group, is_prediction=True) total_loss = model_outputs[0] test_prog = test_prog.clone(for_test=True) if args.use_gpu: gpu_id = 0 if os.getenv("FLAGS_selected_gpus"): gpu_id = int(os.getenv("FLAGS_selected_gpus")) place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace() print("theoretical memory usage: ") if args.do_train: print( fluid.contrib.memory_usage(program=train_program, batch_size=args.batch_size)) if args.do_test: print( fluid.contrib.memory_usage(program=test_prog, batch_size=args.batch_size)) nccl2_num_trainers = 1 nccl2_trainer_id = 0 print("args.is_distributed:", args.is_distributed) trainer_id = 0 if args.is_distributed: trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS") current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") worker_endpoints = worker_endpoints_env.split(",") trainers_num = len(worker_endpoints) print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \ trainer_id:{}".format(worker_endpoints, trainers_num, current_endpoint, trainer_id)) # prepare nccl2 env. config = fluid.DistributeTranspilerConfig() config.mode = "nccl2" if args.nccl_comm_num > 1: config.nccl_comm_num = args.nccl_comm_num if args.use_hierarchical_allreduce and trainers_num > args.hierarchical_allreduce_inter_nranks: config.use_hierarchical_allreduce = args.use_hierarchical_allreduce config.hierarchical_allreduce_inter_nranks = args.hierarchical_allreduce_inter_nranks assert config.hierarchical_allreduce_inter_nranks > 1 assert trainers_num % config.hierarchical_allreduce_inter_nranks == 0 config.hierarchical_allreduce_exter_nranks = \ trainers_num / config.hierarchical_allreduce_inter_nranks t = fluid.DistributeTranspiler(config=config) t.transpile(trainer_id, trainers=worker_endpoints_env, current_endpoint=current_endpoint, program=train_program, startup_program=startup_prog) nccl2_num_trainers = trainers_num nccl2_trainer_id = trainer_id exe = fluid.Executor(place) exe.run(startup_prog) if args.do_train: if args.init_checkpoint and args.init_checkpoint != "": sys.stderr.write( '############################WARNING############################' ) sys.stderr.write( '####### using init_pretraining_params, not init_checkpoint ####' ) sys.stderr.write( '## meaning hyper param e.g. 
lr won\'t inherit from checkpoint##' ) sys.stderr.write( '###############################################################' ) init_pretraining_params(exe, args.init_checkpoint, train_program) reader_name = READERS[args.task_name] data_reader = reader_name( task_group, split=args.split, vocab_path=args.vocab_path, batch_size=args.batch_size, epoch=args.epoch, ) exec_strategy = fluid.ExecutionStrategy() if args.use_fast_executor: exec_strategy.use_experimental_executor = True exec_strategy.num_threads = 2 exec_strategy.num_iteration_per_drop_scope = min(10, args.skip_steps) build_strategy = fluid.compiler.BuildStrategy() build_strategy.fuse_all_reduce_ops = False if args.use_fuse: build_strategy.fuse_all_reduce_ops = True if args.do_train: train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda, loss_name=total_loss.name, build_strategy=build_strategy, exec_strategy=exec_strategy, main_program=train_program, num_trainers=nccl2_num_trainers, trainer_id=nccl2_trainer_id) if args.do_test: predict = predict_wrapper(args, exe, ernie_config, task_group, test_prog=test_prog, pyreader=test_pyreader, graph_vars=model_outputs) result = predict() if args.do_train: train_pyreader.decorate_tensor_provider(data_reader.data_generator()) train_pyreader.start() # For testing purposes preds = [] targets = [] steps = 0 time_begin = time.time() node_nums = 1 #int(os.getenv("PADDLE_NODES_NUM")) used_time_all = 0 while steps < args.num_train_steps: try: steps += node_nums skip_steps = args.skip_steps * node_nums fetch_list = [] if nccl2_trainer_id == 0 and steps % skip_steps == 0: task_name_list = [v.name for v in model_outputs] fetch_list = task_name_list fetch_list.append(scheduled_lr.name) time_begin = time.time() outputs = train_exe.run(fetch_list=fetch_list) if outputs: print("feed_queue size", train_pyreader.queue.size()) progress_file = data_reader.get_progress() epoch = progress_file["current_epoch"] current_file_index = progress_file["current_file_index"] total_file = progress_file["total_file"] current_file = progress_file["current_file"] print("epoch: %d, progress: %d/%d, step: %d, loss: %f, " "acc: %f" % (epoch, current_file_index, total_file, steps, outputs[0][0], outputs[1][0])) print("steps:", steps) print("save_steps:", args.save_steps) # For Validation & testing purposes preds.append(outputs[2][0]) targets.append(outputs[3][0]) if steps % 500 == 0: print("Train-ROC-AUC", roc_auc_score(targets, preds)) preds = [] targets = [] np_lr = outputs[-1:] date_str = datetime.datetime.now().strftime( "%Y%m%d %H:%M:%S") np_lr = float(np.mean(np_lr[0])) print("%s current learning_rate:%.8f" % (date_str, np_lr)) if steps % args.save_steps == 0: save_path = os.path.join( args.checkpoints, "step_" + str(steps) + str(args.split)) print("save_path:", save_path) fluid.io.save_persistables(exe, save_path, train_program) time_end = time.time() used_time = time_end - time_begin print("used_time:", used_time) if steps == args.stop_steps: break except fluid.core.EOFException: train_pyreader.reset() break
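# NOTE: the periodic "Train-ROC-AUC" report above accumulates scores and
# binary labels across steps, scores them once, then clears the buffers.
# A minimal sketch of that pattern (values are illustrative only):
#
#   from sklearn.metrics import roc_auc_score
#   preds, targets = [0.9, 0.2, 0.7, 0.4], [1, 0, 1, 0]
#   print("Train-ROC-AUC", roc_auc_score(targets, preds))
#   preds, targets = [], []  # reset after reporting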
def main(args): ernie_config = ErnieConfig(args.ernie_config_path) if args.task_type == "dialog": ernie_config["role_type_size"] = args.role_type_size ernie_config["turn_type_size"] = args.turn_type_size if args.hidden_dropout_prob >= 0: ernie_config["hidden_dropout_prob"] = args.hidden_dropout_prob if args.attention_probs_dropout_prob >= 0: ernie_config[ "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob ernie_config.print_config() if args.pred_batch_size <= 0: args.pred_batch_size = args.batch_size gpu_id = 0 gpus = fluid.core.get_cuda_device_count() if args.is_distributed: gpus = os.getenv("FLAGS_selected_gpus").split(",") gpu_id = int(gpus[0]) if args.use_cuda: place = fluid.CUDAPlace(gpu_id) dev_count = len(gpus) if args.is_distributed else gpus else: place = fluid.CPUPlace() dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) reader = Seq2SeqReader(args) ernie_gen = ErnieGenFinetune(args, ernie_config, reader.tokenizer) if not (args.do_train or args.do_val or args.do_test or args.do_pred): raise ValueError("For args `do_train`, `do_val`, `do_test` and " "`do_pred`, at least one of them must be True.") startup_prog = fluid.Program() if args.random_seed is not None: startup_prog.random_seed = args.random_seed if args.do_train: trainers_num = int(os.getenv("PADDLE_TRAINERS_NUM", "1")) train_data_generator = reader.data_generator( input_file=args.train_set, batch_size=args.batch_size, epoch=args.epoch, dev_count=trainers_num, shuffle=True, phase="train") num_train_examples = reader.get_num_examples(args.train_set) if args.in_tokens: max_train_steps = args.epoch * num_train_examples // ( args.batch_size // args.max_seq_len) // trainers_num else: max_train_steps = args.epoch * num_train_examples // args.batch_size // trainers_num warmup_steps = int(max_train_steps * args.warmup_proportion) print("Device count: %d, gpu_id: %d" % (dev_count, gpu_id)) print("Num train examples: %d" % num_train_examples) print("Max train steps: %d" % max_train_steps) print("Num warmup steps: %d" % warmup_steps) train_program = fluid.Program() with fluid.program_guard(train_program, startup_prog): with fluid.unique_name.guard(): train_pyreader, graph_vars = ernie_gen.create_model() scheduled_lr, loss_scaling = optimization( loss=graph_vars["loss"], warmup_steps=warmup_steps, num_train_steps=max_train_steps, learning_rate=args.learning_rate, train_program=train_program, startup_prog=startup_prog, weight_decay=args.weight_decay, scheduler=args.lr_scheduler, use_fp16=args.use_fp16, use_dynamic_loss_scaling=args.use_dynamic_loss_scaling, init_loss_scaling=args.init_loss_scaling, incr_every_n_steps=args.incr_every_n_steps, decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf, incr_ratio=args.incr_ratio, decr_ratio=args.decr_ratio) if args.verbose: if args.in_tokens: lower_mem, upper_mem, unit = fluid.contrib.memory_usage( program=train_program, batch_size=args.batch_size // args.max_seq_len) else: lower_mem, upper_mem, unit = fluid.contrib.memory_usage( program=train_program, batch_size=args.batch_size) print("Theoretical memory usage in training: %.3f - %.3f %s" % (lower_mem, upper_mem, unit)) if args.do_val or args.do_test or args.do_pred: test_prog = fluid.Program() with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): test_pyreader, test_graph_vars = ernie_gen.create_model( decoding=args.do_decode) test_prog = test_prog.clone(for_test=True) nccl2_num_trainers = 1 nccl2_trainer_id = 0 print("args.is_distributed:", args.is_distributed) if args.is_distributed: 
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS") current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") worker_endpoints = worker_endpoints_env.split(",") trainers_num = len(worker_endpoints) print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \ trainer_id:{}".format(worker_endpoints, trainers_num, current_endpoint, trainer_id)) # prepare nccl2 env. config = fluid.DistributeTranspilerConfig() config.mode = "nccl2" t = fluid.DistributeTranspiler(config=config) t.transpile(trainer_id, trainers=worker_endpoints_env, current_endpoint=current_endpoint, program=train_program if args.do_train else test_prog, startup_program=startup_prog) nccl2_num_trainers = trainers_num nccl2_trainer_id = trainer_id exe = fluid.Executor(place) exe.run(startup_prog) init_model(args, exe, startup_prog) if args.do_train: exec_strategy = fluid.ExecutionStrategy() if args.use_fast_executor: exec_strategy.use_experimental_executor = True exec_strategy.num_threads = dev_count exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda, loss_name=graph_vars["loss"].name, exec_strategy=exec_strategy, main_program=train_program, num_trainers=nccl2_num_trainers, trainer_id=nccl2_trainer_id) train_pyreader.set_batch_generator(train_data_generator) train_resource = { "exe": train_exe, "program": train_program, "pyreader": train_pyreader } save_model = partial(save_checkpoint, program=train_program, exe=exe) test_dev_count = 1 if args.do_val or args.do_test or args.do_pred: test_exe = exe if args.use_multi_gpu_test: test_dev_count = min(trainers_num, int(os.getenv("PADDLE_PROC_PER_NODE", "1"))) test_resource = { "exe": test_exe, "program": test_prog, "pyreader": test_pyreader } eval_data_generator = partial(reader.data_generator, batch_size=args.pred_batch_size, epoch=1, dev_count=test_dev_count, shuffle=False, do_decode=args.do_decode, place=place) eval_func = partial(ernie_gen.evaluate, resource=test_resource, graph_vars=test_graph_vars, dev_count=test_dev_count, output_path=args.checkpoints, gpu_id=trainer_id) evaluate = partial(evaluate_datasets, pyreader=test_pyreader, reader=reader, eval_func=eval_func, data_generator=eval_data_generator) if args.do_train: train_pyreader.start() steps = 0 last_epoch = 0 if warmup_steps > 0: graph_vars["learning_rate"] = scheduled_lr time_begin = time.time() skip_steps = args.skip_steps while True: try: steps += 1 if args.save_and_valid_by_epoch: suffix = "epoch_" + str(last_epoch) else: suffix = "step_" + str(steps) if steps % skip_steps == 0: outputs = ernie_gen.evaluate(train_resource, "train", graph_vars) if args.verbose: verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size( ) verbose += "learning rate: %f" % ( outputs["learning_rate"] if warmup_steps > 0 else args.learning_rate) print(verbose) if args.in_tokens: current_example, current_epoch = reader.get_train_progress( ) else: current_epoch = steps * args.batch_size * trainers_num // num_train_examples current_example = steps * args.batch_size * trainers_num % num_train_examples time_end = time.time() used_time = time_end - time_begin print("epoch: %d, progress: %d/%d, step: %d, loss: %f, " "ppl: %f, speed: %f steps/s" % (current_epoch, current_example, num_train_examples, steps, outputs["loss"], outputs["ppl"], args.skip_steps / used_time)) time_begin = time.time() else: train_exe.run(fetch_list=[]) if nccl2_trainer_id >= test_dev_count: continue do_save = 
False do_eval = False if not args.save_and_valid_by_epoch: if steps % args.save_steps == 0 and nccl2_trainer_id == 0: do_save = True if steps % args.validation_steps == 0: do_eval = True else: if args.in_tokens: current_example, current_epoch = reader.get_train_progress( ) else: current_epoch = steps * args.batch_size * trainers_num // num_train_examples if current_epoch != last_epoch: if nccl2_trainer_id == 0: do_save = True do_eval = True if do_save: save_model(suffix=suffix) if do_eval: evaluate(suffix=suffix) last_epoch = current_epoch except fluid.core.EOFException: save_model(suffix=suffix) train_pyreader.reset() break if nccl2_trainer_id >= test_dev_count: return if args.do_val or args.do_test or args.do_pred: suffix = "output" if args.do_train: if not args.save_and_valid_by_epoch: suffix = "step_" + str(steps) else: suffix = "epoch_" + str(last_epoch) evaluate(suffix=suffix, do_pred=True)
def run_trainer(self, args): self.lr = args.lr if args.nccl2_reduce_layer_local_run: test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ self.get_model(batch_size=args.batch_size, single_device=True) elif args.use_dgc: test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ self.get_model(batch_size=args.batch_size, use_dgc=args.use_dgc) else: test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ self.get_model(batch_size=args.batch_size) if args.mem_opt: fluid.memory_optimize(fluid.default_main_program(), skip_grads=True) if args.update_method == "pserver": t = self.get_transpiler(args.trainer_id, fluid.default_main_program(), args.endpoints, args.trainers, args.sync_mode, args.dc_asgd) trainer_prog = t.get_trainer_program() elif args.update_method == "nccl2" or args.update_method == "nccl2_reduce_layer": # transpile for nccl2 config = fluid.DistributeTranspilerConfig() config.mode = "nccl2" config.nccl_comm_num = args.nccl_comm_num nccl2_t = fluid.DistributeTranspiler(config=config) nccl2_t.transpile(args.trainer_id, program=fluid.default_main_program(), startup_program=fluid.default_startup_program(), trainers=args.endpoints, current_endpoint=args.current_endpoint) trainer_prog = fluid.default_main_program() else: trainer_prog = fluid.default_main_program() if args.use_cuda: device_id = int(os.getenv("FLAGS_selected_gpus", "0")) place = fluid.CUDAPlace(device_id) else: place = fluid.CPUPlace() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) exec_strategy = fluid.ExecutionStrategy() exec_strategy.num_threads = 1 exec_strategy.allow_op_delay = False build_stra = fluid.BuildStrategy() # FIXME force disable enable_inplace and memory_optimize build_stra.enable_inplace = False build_stra.memory_optimize = False if args.use_reduce: build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce else: build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce pass_builder = None if args.batch_merge_repeat > 1: pass_builder = build_stra._finalize_strategy_and_create_passes() mypass = pass_builder.insert_pass(0, "multi_batch_merge_pass") mypass.set("num_repeats", args.batch_merge_repeat) if args.update_method == "nccl2" or args.update_method == "nccl2_reduce_layer": build_stra.num_trainers = len(args.endpoints.split(",")) build_stra.trainer_id = args.trainer_id else: # case args.update_method == "nccl2_reduce_layer": build_stra.num_trainers = 1 build_stra.trainer_id = 0 binary = compiler.CompiledProgram(trainer_prog).with_data_parallel( loss_name=avg_cost.name, build_strategy=build_stra, exec_strategy=exec_strategy) feed_var_list = [ var for var in trainer_prog.global_block().vars.values() if var.is_data ] feeder = fluid.DataFeeder(feed_var_list, place) reader_generator = train_reader() def get_data(): origin_batch = next(reader_generator) if args.update_method != "local" and args.use_reader_alloc: new_batch = [] for offset, item in enumerate(origin_batch): if offset % 2 == args.trainer_id: new_batch.append(item) return new_batch else: return origin_batch out_losses = [] for _ in six.moves.xrange(RUN_STEP): loss, = exe.run(binary, fetch_list=[avg_cost.name], feed=feeder.feed(get_data())) out_losses.append(loss[0]) if six.PY2: print(pickle.dumps(out_losses)) else: sys.stdout.buffer.write(pickle.dumps(out_losses))
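# NOTE: the trainer above emits its per-step losses as a single pickled list
# on stdout. A sketch of how a launching harness might read it back (the
# entry-point name is hypothetical; the real harness is assumed to live in
# the surrounding test framework):
#
#   import pickle, subprocess
#   proc = subprocess.run(["python", "dist_trainer_main.py"],
#                         capture_output=True)
#   out_losses = pickle.loads(proc.stdout)  # one float per RUN_STEP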
def get_transpile(self, mode, trainers="127.0.0.1:6174"): config = fluid.DistributeTranspilerConfig() config.mode = 'collective' config.collective_mode = mode t = fluid.DistributeTranspiler(config=config) return t
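# NOTE: a minimal usage sketch for the helper above, assuming it is invoked
# from inside the same test class and that "grad_allreduce" is one of the
# supported collective modes; endpoints are placeholders. In collective mode
# the trainer list is the comma-separated endpoint string, mirroring the
# nccl2 transpile calls elsewhere in this document:
#
#   endpoints = "127.0.0.1:6174,127.0.0.1:6175"
#   t = self.get_transpile("grad_allreduce", trainers=endpoints)
#   t.transpile(0,  # trainer_id
#               trainers=endpoints,
#               current_endpoint="127.0.0.1:6174",
#               program=fluid.default_main_program(),
#               startup_program=fluid.default_startup_program())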
def train(args): if not os.path.isdir(args.model_output_dir): os.mkdir(args.model_output_dir) filelist = GetFileList(args.train_data_path) word2vec_reader = None if args.is_local or os.getenv("PADDLE_IS_LOCAL", "1") == "1": word2vec_reader = reader.Word2VecReader( args.dict_path, args.train_data_path, filelist, 0, 1) else: trainer_id = int(os.environ["PADDLE_TRAINER_ID"]) trainers = int(os.environ["PADDLE_TRAINERS"]) word2vec_reader = reader.Word2VecReader(args.dict_path, args.train_data_path, filelist, trainer_id, trainers) logger.info("dict_size: {}".format(word2vec_reader.dict_size)) loss, py_reader = skip_gram_word2vec( word2vec_reader.dict_size, word2vec_reader.word_frequencys, args.embedding_size, args.max_code_length, args.with_hs, args.with_nce, is_sparse=args.is_sparse) optimizer = None if args.with_Adam: optimizer = fluid.optimizer.Adam(learning_rate=1e-4) else: optimizer = fluid.optimizer.SGD(learning_rate=1e-4) optimizer.minimize(loss) # do local training if args.is_local or os.getenv("PADDLE_IS_LOCAL", "1") == "1": logger.info("run local training") main_program = fluid.default_main_program() with open("local.main.proto", "w") as f: f.write(str(main_program)) train_loop(args, main_program, word2vec_reader, py_reader, loss, 0) # do distribute training else: logger.info("run dist training") trainer_id = int(os.environ["PADDLE_TRAINER_ID"]) trainers = int(os.environ["PADDLE_TRAINERS"]) training_role = os.environ["PADDLE_TRAINING_ROLE"] port = os.getenv("PADDLE_PSERVER_PORT", "6174") pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "") eplist = [] for ip in pserver_ips.split(","): eplist.append(':'.join([ip, port])) pserver_endpoints = ",".join(eplist) current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port config = fluid.DistributeTranspilerConfig() config.slice_var_up = False t = fluid.DistributeTranspiler(config=config) t.transpile( trainer_id, pservers=pserver_endpoints, trainers=trainers, sync_mode=True) if training_role == "PSERVER": logger.info("run pserver") prog = t.get_pserver_program(current_endpoint) startup = t.get_startup_program( current_endpoint, pserver_program=prog) with open("pserver.main.proto.{}".format(os.getenv("CUR_PORT")), "w") as f: f.write(str(prog)) exe = fluid.Executor(fluid.CPUPlace()) exe.run(startup) exe.run(prog) elif training_role == "TRAINER": logger.info("run trainer") train_prog = t.get_trainer_program() with open("trainer.main.proto.{}".format(trainer_id), "w") as f: f.write(str(train_prog)) train_loop(args, train_prog, word2vec_reader, py_reader, loss, trainer_id)
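# NOTE: the distributed branch above is driven entirely by environment
# variables. A sketch of the minimum set for a one-pserver, two-trainer run
# (addresses and ports are placeholders):
#
#   import os
#   os.environ["PADDLE_IS_LOCAL"] = "0"
#   os.environ["PADDLE_TRAINING_ROLE"] = "TRAINER"  # or "PSERVER"
#   os.environ["PADDLE_TRAINER_ID"] = "0"
#   os.environ["PADDLE_TRAINERS"] = "2"
#   os.environ["PADDLE_PSERVER_PORT"] = "6174"
#   os.environ["PADDLE_PSERVER_IPS"] = "127.0.0.1"
#   os.environ["PADDLE_CURRENT_IP"] = "127.0.0.1"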
def main(args): """Run GraphSum model.""" model_config = GraphSumConfig(args.config_path) model_config.print_config() gpu_id = 0 gpus = fluid.core.get_cuda_device_count() if args.is_distributed: gpus = os.getenv("FLAGS_selected_gpus").split(",") gpu_id = int(gpus[0]) if args.use_cuda: place = fluid.CUDAPlace(gpu_id) dev_count = len(gpus) if args.is_distributed else gpus else: place = fluid.CPUPlace() dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) """load vocabulary""" spm = sentencepiece.SentencePieceProcessor() spm.Load(args.vocab_path) symbols = {'BOS': spm.PieceToId('<S>'), 'EOS': spm.PieceToId('</S>'), 'PAD': spm.PieceToId('<PAD>'), 'EOT': spm.PieceToId('<T>'), 'EOP': spm.PieceToId('<P>'), 'EOQ': spm.PieceToId('<Q>'), 'UNK': spm.PieceToId('<UNK>')} logger.info(symbols) vocab_size = len(spm) """create transformer model""" graphsum = GraphSumModel(args=args, config=model_config, padding_idx=symbols['PAD'], bos_idx=symbols['BOS'], eos_idx=symbols['EOS'], tokenizer=spm) reader = task_reader.GraphSumReader( max_para_num=args.max_para_num, max_para_len=args.max_para_len, max_tgt_len=args.max_tgt_len, in_tokens=args.in_tokens, random_seed=args.random_seed, bos_idx=symbols['BOS'], eos_idx=symbols['EOS'], pad_idx=symbols['PAD'], n_head=model_config['num_attention_heads']) if not (args.do_train or args.do_val or args.do_test): raise ValueError("For args `do_train`, `do_val` and `do_test`, at " "least one of them must be True.") startup_prog = fluid.Program() if args.random_seed is not None: startup_prog.random_seed = args.random_seed if args.do_train: trainers_num = int(os.getenv("PADDLE_TRAINERS_NUM", 1)) train_data_generator = reader.data_generator_with_buffer( data_path=args.train_set, batch_size=args.batch_size, epoch=args.epoch, dev_count=trainers_num, shuffle=True, phase="train") num_train_examples = reader.get_num_examples(args.train_set) if args.in_tokens: max_train_steps = args.epoch * num_train_examples // ( args.batch_size // args.max_tgt_len) // trainers_num else: max_train_steps = args.epoch * num_train_examples // args.batch_size // trainers_num if args.lr_scheduler == 'linear_warmup_decay': warmup_steps = int(max_train_steps * args.warmup_proportion) else: warmup_steps = args.warmup_steps logger.info("Device count: %d, gpu_id: %d" % (dev_count, gpu_id)) logger.info("Num train examples: %d" % num_train_examples) logger.info("Max train steps: %d" % max_train_steps) logger.info("Num warmup steps: %d" % warmup_steps) train_program = fluid.Program() with fluid.program_guard(train_program, startup_prog): with fluid.unique_name.guard(): train_pyreader, graph_vars = graphsum.create_model( pyreader_name='train_reader') scheduled_lr, _ = optimization( loss=graph_vars["loss"], warmup_steps=warmup_steps, num_train_steps=max_train_steps, learning_rate=args.learning_rate, train_program=train_program, startup_prog=startup_prog, weight_decay=args.weight_decay, d_model=model_config['hidden_size'], scheduler=args.lr_scheduler, use_fp16=args.use_fp16, use_dynamic_loss_scaling=args.use_dynamic_loss_scaling, init_loss_scaling=args.init_loss_scaling, incr_every_n_steps=args.incr_every_n_steps, decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf, incr_ratio=args.incr_ratio, decr_ratio=args.decr_ratio, grad_norm=args.grad_norm, beta1=args.beta1, beta2=args.beta2, epsilon=float(args.eps)) """ fluid.memory_optimize( input_program=train_program, skip_opt_set=[ graph_vars["loss"].name ]) """ # if args.verbose: # if args.in_tokens: # lower_mem, upper_mem, unit = 
fluid.contrib.memory_usage( # program=train_program, # batch_size=args.batch_size // args.max_tgt_len) # else: # lower_mem, upper_mem, unit = fluid.contrib.memory_usage( # program=train_program, batch_size=args.batch_size) # logger.info("Theoretical memory usage in training: %.3f - %.3f %s" % # (lower_mem, upper_mem, unit)) if args.do_val or args.do_test: test_prog = fluid.Program() with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): test_pyreader, test_graph_vars = graphsum.create_model( pyreader_name='test_reader', is_prediction=args.do_dec) test_prog = test_prog.clone(for_test=True) print_model_params(test_prog) if args.do_dec: if not os.path.exists(args.decode_path): os.mkdir(args.decode_path) nccl2_num_trainers = 1 nccl2_trainer_id = 0 logger.info("args.is_distributed: %s" % str(args.is_distributed)) if args.is_distributed: trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS") current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") worker_endpoints = worker_endpoints_env.split(",") trainers_num = len(worker_endpoints) logger.info("worker_endpoints:{} trainers_num:{} current_endpoint:{} \ trainer_id:{}".format(worker_endpoints, trainers_num, current_endpoint, trainer_id)) # prepare nccl2 env. config = fluid.DistributeTranspilerConfig() config.mode = "nccl2" t = fluid.DistributeTranspiler(config=config) t.transpile( trainer_id, trainers=worker_endpoints_env, current_endpoint=current_endpoint, program=train_program if args.do_train else test_prog, startup_program=startup_prog) nccl2_num_trainers = trainers_num nccl2_trainer_id = trainer_id exe = fluid.Executor(place) exe.run(startup_prog) if args.do_train: # init position_encoding enc_word_pos_emb_param = fluid.global_scope().find_var( model_config['enc_word_pos_embedding_name']).get_tensor() enc_word_pos_emb_param.set( position_encoding_init(model_config['max_position_embeddings'], model_config['hidden_size'] // 2), place) enc_sent_pos_emb_param = fluid.global_scope().find_var( model_config['enc_sen_pos_embedding_name']).get_tensor() enc_sent_pos_emb_param.set( position_encoding_init(model_config['max_position_embeddings'], model_config['hidden_size'] // 2), place) dec_word_pos_emb_param = fluid.global_scope().find_var( model_config['dec_word_pos_embedding_name']).get_tensor() dec_word_pos_emb_param.set( position_encoding_init(model_config['max_position_embeddings'], model_config['hidden_size']), place) if args.init_checkpoint and args.init_pretraining_params: logger.info( "WARNING: args 'init_checkpoint' and 'init_pretraining_params' " "both are set! 
Only arg 'init_checkpoint' is made valid.") if args.init_checkpoint: init_checkpoint( exe, args.init_checkpoint, main_program=startup_prog, use_fp16=args.use_fp16) elif args.init_pretraining_params: init_pretraining_params( exe, args.init_pretraining_params, main_program=startup_prog, use_fp16=args.use_fp16) elif args.do_val or args.do_test: if not args.init_checkpoint: raise ValueError("args 'init_checkpoint' should be set if " "only doing validation or testing!") init_checkpoint( exe, args.init_checkpoint, main_program=startup_prog, use_fp16=args.use_fp16) if args.do_train: exec_strategy = fluid.ExecutionStrategy() if args.use_fast_executor: exec_strategy.use_experimental_executor = True exec_strategy.num_threads = dev_count exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope train_exe = fluid.ParallelExecutor( use_cuda=args.use_cuda, loss_name=graph_vars["loss"].name, exec_strategy=exec_strategy, main_program=train_program, num_trainers=nccl2_num_trainers, trainer_id=nccl2_trainer_id) train_pyreader.decorate_tensor_provider(train_data_generator) else: train_exe = None test_exe = exe if args.do_val or args.do_test: if args.use_multi_gpu_test: test_exe = fluid.ParallelExecutor( use_cuda=args.use_cuda, main_program=test_prog, share_vars_from=train_exe) if args.do_train: train_pyreader.start() steps = 0 if warmup_steps > 0: graph_vars["learning_rate"] = scheduled_lr time_begin = time.time() skip_steps = args.skip_steps while True: try: steps += 1 if steps % skip_steps == 0: outputs = evaluate(args=args, exe=train_exe, program=train_program, pyreader=train_pyreader, graph_vars=graph_vars, eval_phase="train", vocab_size=vocab_size) if args.verbose: verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size() verbose += "learning rate: %f" % ( outputs["learning_rate"] if warmup_steps > 0 else args.learning_rate) logger.info(verbose) current_example, current_epoch = reader.get_train_progress() time_end = time.time() used_time = time_end - time_begin logger.info("epoch: %d, progress: %d/%d, step: %d, loss: %f, " "ppl: %f, acc: %f, learning rate: %.8f, speed: %f steps/s" % (current_epoch, current_example, num_train_examples, steps, outputs["loss"], outputs["ppl"], outputs["acc"], outputs["learning_rate"] if warmup_steps > 0 else args.learning_rate, args.skip_steps / used_time)) time_begin = time.time() else: train_exe.run(fetch_list=[]) if nccl2_trainer_id == 0: if steps % args.save_steps == 0: save_path = os.path.join(args.checkpoints, "step_" + str(steps)) fluid.io.save_persistables(exe, save_path, train_program) if steps % args.validation_steps == 0: # evaluate dev set if args.do_val: test_pyreader.decorate_tensor_provider( reader.data_generator( args.dev_set, batch_size=args.batch_size, epoch=1, dev_count=1, shuffle=False, phase='dev', do_dec=args.do_dec, place=place)) evaluate(args=args, exe=test_exe, program=test_prog, pyreader=test_pyreader, graph_vars=test_graph_vars, eval_phase="dev", vocab_size=vocab_size, do_dec=args.do_dec, vocab_path=args.vocab_path, features=reader.get_features("dev"), decode_path=args.decode_path + "/valid_" + str(steps) + "_preds") # evaluate test set if args.do_test: test_pyreader.decorate_tensor_provider( reader.data_generator( args.test_set, batch_size=args.batch_size, epoch=1, dev_count=1, shuffle=False, phase='test', do_dec=args.do_dec, place=place)) evaluate(args=args, exe=test_exe, program=test_prog, pyreader=test_pyreader, graph_vars=test_graph_vars, eval_phase="test", vocab_size=vocab_size, do_dec=args.do_dec, 
vocab_path=args.vocab_path, features=reader.get_features("test"), decode_path=args.decode_path + "/test_" + str(steps) + "_preds") except fluid.core.EOFException: save_path = os.path.join(args.checkpoints, "step_" + str(steps)) fluid.io.save_persistables(exe, save_path, train_program) train_pyreader.reset() break if nccl2_trainer_id == 0: # final eval on dev set if args.do_val: test_pyreader.decorate_tensor_provider( reader.data_generator( args.dev_set, batch_size=args.batch_size, epoch=1, dev_count=1, shuffle=False, phase='dev', do_dec=args.do_dec, place=place)) logger.info("Final validation result:") evaluate(args=args, exe=test_exe, program=test_prog, pyreader=test_pyreader, graph_vars=test_graph_vars, eval_phase="dev", vocab_size=vocab_size, do_dec=args.do_dec, vocab_path=args.vocab_path, features=reader.get_features("dev"), decode_path=args.decode_path + "/valid_final_preds") # final eval on test set if args.do_test: test_pyreader.decorate_tensor_provider( reader.data_generator( args.test_set, batch_size=args.batch_size, epoch=1, dev_count=1, shuffle=False, phase='test', do_dec=args.do_dec, place=place)) logger.info("Final test result:") evaluate(args=args, exe=test_exe, program=test_prog, pyreader=test_pyreader, graph_vars=test_graph_vars, eval_phase="test", vocab_size=vocab_size, do_dec=args.do_dec, vocab_path=args.vocab_path, features=reader.get_features("test"), decode_path=args.decode_path + "/test_final_preds")
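# NOTE: position_encoding_init used above is defined elsewhere in the
# project; a sketch of the standard Transformer sinusoid table it is
# assumed to produce (sin block concatenated with cos block, shape
# [n_position, d_pos_vec], float32):
def _position_encoding_sketch(n_position, d_pos_vec):
    import numpy as np
    channels = np.arange(d_pos_vec // 2)
    rates = 1.0 / np.power(10000.0, 2.0 * channels / d_pos_vec)  # [d/2]
    scaled = np.arange(n_position)[:, None] * rates[None, :]     # [n, d/2]
    return np.concatenate([np.sin(scaled), np.cos(scaled)],
                          axis=1).astype("float32")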
def train(args): print("pretraining start") ernie_config = ErnieConfig(args.ernie_config_path) ernie_config.print_config() train_program = fluid.Program() startup_prog = fluid.Program() with fluid.program_guard(train_program, startup_prog): with fluid.unique_name.guard(): train_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model( pyreader_name='train_reader', ernie_config=ernie_config) scheduled_lr = optimization(loss=total_loss, warmup_steps=args.warmup_steps, num_train_steps=args.num_train_steps, learning_rate=args.learning_rate, train_program=train_program, startup_prog=startup_prog, weight_decay=args.weight_decay, scheduler=args.lr_scheduler, use_fp16=args.use_fp16, loss_scaling=args.loss_scaling) fluid.memory_optimize(input_program=train_program, skip_opt_set=[ next_sent_acc.name, mask_lm_loss.name, total_loss.name ]) test_prog = fluid.Program() with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): test_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model( pyreader_name='test_reader', ernie_config=ernie_config) test_prog = test_prog.clone(for_test=True) if args.use_cuda: place = fluid.CUDAPlace(0) dev_count = fluid.core.get_cuda_device_count() else: place = fluid.CPUPlace() dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) print("Device count %d" % dev_count) print("theoretical memory usage: ") if args.in_tokens: print( fluid.contrib.memory_usage(program=train_program, batch_size=args.batch_size // args.max_seq_len)) else: print( fluid.contrib.memory_usage(program=train_program, batch_size=args.batch_size)) nccl2_num_trainers = 1 nccl2_trainer_id = 0 print("args.is_distributed:", args.is_distributed) if args.is_distributed: worker_endpoints_env = os.getenv("worker_endpoints") worker_endpoints = worker_endpoints_env.split(",") trainers_num = len(worker_endpoints) current_endpoint = os.getenv("current_endpoint") trainer_id = worker_endpoints.index(current_endpoint) if trainer_id == 0: print("train_id == 0, sleep 60s") time.sleep(60) print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \ trainer_id:{}".format(worker_endpoints, trainers_num, current_endpoint, trainer_id)) # prepare nccl2 env. 
config = fluid.DistributeTranspilerConfig() config.mode = "nccl2" t = fluid.DistributeTranspiler(config=config) t.transpile(trainer_id, trainers=worker_endpoints_env, current_endpoint=current_endpoint, program=train_program, startup_program=startup_prog) nccl2_num_trainers = trainers_num nccl2_trainer_id = trainer_id exe = fluid.Executor(place) exe.run(startup_prog) if args.init_checkpoint and args.init_checkpoint != "": init_checkpoint(exe, args.init_checkpoint, train_program, args.use_fp16) data_reader = ErnieDataReader(filelist=args.train_filelist, batch_size=args.batch_size, vocab_path=args.vocab_path, voc_size=ernie_config['vocab_size'], epoch=args.epoch, max_seq_len=args.max_seq_len, generate_neg_sample=args.generate_neg_sample, in_tokens=args.in_tokens, is_bidirection=args.is_bidirection) exec_strategy = fluid.ExecutionStrategy() if args.use_fast_executor: exec_strategy.use_experimental_executor = True exec_strategy.num_threads = dev_count exec_strategy.num_iteration_per_drop_scope = min(10, args.skip_steps) build_strategy = fluid.BuildStrategy() build_strategy.remove_unnecessary_lock = False train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda, loss_name=total_loss.name, build_strategy=build_strategy, exec_strategy=exec_strategy, main_program=train_program, num_trainers=nccl2_num_trainers, trainer_id=nccl2_trainer_id) if args.valid_filelist and args.valid_filelist != "": predict = predict_wrapper(args, exe, ernie_config, test_prog=test_prog, pyreader=test_pyreader, fetch_list=[ next_sent_acc.name, mask_lm_loss.name, total_loss.name ]) train_pyreader.decorate_tensor_provider(data_reader.data_generator()) train_pyreader.start() steps = 0 cost = [] lm_cost = [] acc = [] time_begin = time.time() while steps < args.num_train_steps: try: steps += nccl2_num_trainers skip_steps = args.skip_steps * nccl2_num_trainers if nccl2_trainer_id != 0: train_exe.run(fetch_list=[]) continue if steps % skip_steps != 0: train_exe.run(fetch_list=[]) else: each_next_acc, each_mask_lm_cost, each_total_cost, np_lr = train_exe.run( fetch_list=[ next_sent_acc.name, mask_lm_loss.name, total_loss.name, scheduled_lr.name ]) acc.extend(each_next_acc) lm_cost.extend(each_mask_lm_cost) cost.extend(each_total_cost) print("feed_queue size", train_pyreader.queue.size()) time_end = time.time() used_time = time_end - time_begin epoch, current_file_index, total_file, current_file, mask_type = data_reader.get_progress( ) print("current learning_rate:%f" % np_lr[0]) print( "epoch: %d, progress: %d/%d, step: %d, loss: %f, " "ppl: %f, next_sent_acc: %f, speed: %f steps/s, file: %s, mask_type: %s" % (epoch, current_file_index, total_file, steps, np.mean(np.array(cost)), np.mean(np.exp( np.array(lm_cost))), np.mean(np.array(acc)), skip_steps / used_time, current_file, mask_type)) cost = [] lm_cost = [] acc = [] time_begin = time.time() if steps % args.save_steps == 0: save_path = os.path.join(args.checkpoints, "step_" + str(steps)) fluid.io.save_persistables(exe, save_path, train_program) if args.valid_filelist and steps % args.validation_steps == 0: vali_cost, vali_lm_cost, vali_acc, vali_steps, vali_speed = predict( ) print("[validation_set] epoch: %d, step: %d, " "loss: %f, global ppl: %f, batch-averaged ppl: %f, " "next_sent_acc: %f, speed: %f steps/s" % (epoch, steps, np.mean(np.array(vali_cost) / vali_steps), np.exp(np.mean(np.array(vali_lm_cost) / vali_steps)), np.mean(np.exp(np.array(vali_lm_cost) / vali_steps)), np.mean(np.array(vali_acc) / vali_steps), vali_speed)) except fluid.core.EOFException: 
train_pyreader.reset() break
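# NOTE: the nccl2 branch in the pretraining entry point above reads
# lowercase env vars, unlike the PADDLE_* names used elsewhere in this
# document. A two-worker sketch (addresses are placeholders):
#
#   import os
#   os.environ["worker_endpoints"] = "10.0.0.1:6170,10.0.0.2:6170"
#   os.environ["current_endpoint"] = "10.0.0.1:6170"
#   # trainer_id is then recovered as
#   # worker_endpoints.split(",").index(current_endpoint)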
def train(args): print("pretraining start") bert_config = BertConfig(args.bert_config_path) bert_config.print_config() train_program = fluid.Program() startup_prog = fluid.Program() with fluid.program_guard(train_program, startup_prog): with fluid.unique_name.guard(): train_data_loader, next_sent_acc, mask_lm_loss, total_loss = create_model( bert_config=bert_config) scheduled_lr, loss_scaling = optimization( loss=total_loss, warmup_steps=args.warmup_steps, num_train_steps=args.num_train_steps, learning_rate=args.learning_rate, train_program=train_program, startup_prog=startup_prog, weight_decay=args.weight_decay, scheduler=args.lr_scheduler, use_fp16=args.use_fp16, use_dynamic_loss_scaling=args.use_dynamic_loss_scaling, init_loss_scaling=args.init_loss_scaling, incr_every_n_steps=args.incr_every_n_steps, decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf, incr_ratio=args.incr_ratio, decr_ratio=args.decr_ratio) test_prog = fluid.Program() with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): test_data_loader, next_sent_acc, mask_lm_loss, total_loss = create_model( bert_config=bert_config) test_prog = test_prog.clone(for_test=True) if args.use_cuda: place = fluid.CUDAPlace(0) dev_count = fluid.core.get_cuda_device_count() else: place = fluid.CPUPlace() dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) print("Device count %d" % dev_count) nccl2_num_trainers = 1 nccl2_trainer_id = 0 print("args.is_distributed:", args.is_distributed) if args.is_distributed: worker_endpoints_env = os.getenv("worker_endpoints") worker_endpoints = worker_endpoints_env.split(",") trainers_num = len(worker_endpoints) current_endpoint = os.getenv("current_endpoint") trainer_id = worker_endpoints.index(current_endpoint) if trainer_id == 0: print("train_id == 0, sleep 60s") time.sleep(60) print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \ trainer_id:{}".format(worker_endpoints, trainers_num, current_endpoint, trainer_id)) # prepare nccl2 env. 
config = fluid.DistributeTranspilerConfig() config.mode = "nccl2" t = fluid.DistributeTranspiler(config=config) t.transpile(trainer_id, trainers=worker_endpoints_env, current_endpoint=current_endpoint, program=train_program, startup_program=startup_prog) nccl2_num_trainers = trainers_num nccl2_trainer_id = trainer_id exe = fluid.Executor(place) exe.run(startup_prog) if args.init_checkpoint and args.init_checkpoint != "": init_checkpoint(exe, args.init_checkpoint, train_program, args.use_fp16) data_reader = DataReader(data_dir=args.data_dir, batch_size=args.batch_size, in_tokens=args.in_tokens, vocab_path=args.vocab_path, voc_size=bert_config['vocab_size'], epoch=args.epoch, max_seq_len=args.max_seq_len, generate_neg_sample=args.generate_neg_sample) exec_strategy = fluid.ExecutionStrategy() exec_strategy.use_experimental_executor = args.use_fast_executor exec_strategy.num_threads = dev_count exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope build_strategy = fluid.BuildStrategy() if not sys.platform == "win32": build_strategy.num_trainers = nccl2_num_trainers elif nccl2_num_trainers > 1: raise ValueError( "Windows platform doesn't support distributed training!") build_strategy.trainer_id = nccl2_trainer_id # use_ngraph is for CPU only, please refer to README_ngraph.md for details use_ngraph = os.getenv('FLAGS_use_ngraph') if not use_ngraph: train_compiled_program = fluid.CompiledProgram( train_program).with_data_parallel(loss_name=total_loss.name, exec_strategy=exec_strategy, build_strategy=build_strategy) if args.validation_set_dir and args.validation_set_dir != "": predict = predict_wrapper(args, exe, bert_config, test_prog=test_prog, data_loader=test_data_loader, fetch_list=[ next_sent_acc.name, mask_lm_loss.name, total_loss.name ]) train_data_loader.set_batch_generator(data_reader.data_generator()) train_data_loader.start() steps = 0 cost = [] lm_cost = [] acc = [] time_begin = time.time() while steps < args.num_train_steps: try: steps += 1 skip_steps = args.skip_steps * nccl2_num_trainers if nccl2_trainer_id != 0: if use_ngraph: exe.run(fetch_list=[], program=train_program) else: exe.run(fetch_list=[], program=train_compiled_program) continue if steps % args.skip_steps != 0: if use_ngraph: exe.run(fetch_list=[], program=train_program) else: exe.run(fetch_list=[], program=train_compiled_program) else: fetch_list = [ next_sent_acc.name, mask_lm_loss.name, total_loss.name, scheduled_lr.name ] if args.use_fp16: fetch_list.append(loss_scaling.name) if use_ngraph: outputs = exe.run(fetch_list=fetch_list, program=train_program) else: outputs = exe.run(fetch_list=fetch_list, program=train_compiled_program) if args.use_fp16: each_next_acc, each_mask_lm_cost, each_total_cost, np_lr, np_scaling = outputs else: each_next_acc, each_mask_lm_cost, each_total_cost, np_lr = outputs acc.extend(each_next_acc) lm_cost.extend(each_mask_lm_cost) cost.extend(each_total_cost) time_end = time.time() used_time = time_end - time_begin epoch, current_file_index, total_file, current_file = data_reader.get_progress( ) if args.verbose: verbose = "feed_queue size: %d, " % train_data_loader.queue.size( ) verbose += "current learning_rate: %f, " % np_lr[0] if args.use_fp16: verbose += "loss scaling: %f" % np_scaling[0] print(verbose) print( "epoch: %d, progress: %d/%d, step: %d, loss: %f, " "ppl: %f, next_sent_acc: %f, speed: %f steps/s, file: %s" % (epoch, current_file_index, total_file, steps, np.mean(np.array(cost)), np.mean(np.exp( np.array(lm_cost))), np.mean(np.array(acc)), skip_steps 
/ used_time, current_file)) cost = [] lm_cost = [] acc = [] time_begin = time.time() if steps % args.save_steps == 0: save_path = os.path.join(args.checkpoints, "step_" + str(steps)) fluid.save(program=train_program, model_path=save_path) if args.validation_set_dir and steps % args.validation_steps == 0: vali_cost, vali_lm_cost, vali_acc, vali_steps, vali_speed = predict( ) print("[validation_set] epoch: %d, step: %d, " "loss: %f, global ppl: %f, batch-averaged ppl: %f, " "next_sent_acc: %f, speed: %f steps/s" % (epoch, steps, np.mean(np.array(vali_cost) / vali_steps), np.exp(np.mean(np.array(vali_lm_cost) / vali_steps)), np.mean(np.exp(np.array(vali_lm_cost) / vali_steps)), np.mean(np.array(vali_acc) / vali_steps), vali_speed)) except fluid.core.EOFException: train_data_loader.reset() break
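# NOTE: the fp16 path above fetches `loss_scaling` configured through
# optimization(...)'s incr_every_n_steps / decr_every_n_nan_or_inf /
# incr_ratio / decr_ratio knobs. A sketch of the usual dynamic loss-scaling
# policy those knobs describe (this mirrors the generic scheme, not
# Paddle's internal implementation):
def _update_loss_scaling_sketch(scale, good_steps, found_inf,
                                incr_every_n_steps=1000,
                                incr_ratio=2.0, decr_ratio=0.5):
    if found_inf:                          # overflow: skip step, shrink scale
        return scale * decr_ratio, 0
    good_steps += 1
    if good_steps >= incr_every_n_steps:   # long clean run: grow scale
        return scale * incr_ratio, 0
    return scale, good_steps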
def main(args): ernie_config = ErnieConfig(args.ernie_config_path) ernie_config.print_config() if args.use_cuda: dev_list = fluid.cuda_places() place = dev_list[0] dev_count = len(dev_list) else: place = fluid.CPUPlace() dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) exe = fluid.Executor(place) reader = task_reader.ClassifyReader( vocab_path=args.vocab_path, label_map_config=args.label_map_config, max_seq_len=args.max_seq_len, do_lower_case=args.do_lower_case, in_tokens=args.in_tokens, random_seed=args.random_seed, tokenizer=args.tokenizer, is_classify=args.is_classify, is_regression=args.is_regression, for_cn=args.for_cn, task_id=args.task_id) if not (args.do_train or args.do_val or args.do_test): raise ValueError("For args `do_train`, `do_val` and `do_test`, at " "least one of them must be True.") if args.do_test: assert args.test_save is not None startup_prog = fluid.Program() if args.random_seed is not None: startup_prog.random_seed = args.random_seed if args.do_train: train_data_generator = reader.data_generator( input_file=args.train_set, batch_size=args.batch_size, epoch=args.epoch, dev_count=dev_count, shuffle=True, phase="train") num_train_examples = reader.get_num_examples(args.train_set) if args.in_tokens: max_train_steps = args.epoch * num_train_examples // ( args.batch_size // args.max_seq_len) // dev_count else: max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count warmup_steps = int(max_train_steps * args.warmup_proportion) log.info("Device count: %d" % dev_count) log.info("Num train examples: %d" % num_train_examples) log.info("Max train steps: %d" % max_train_steps) log.info("Num warmup steps: %d" % warmup_steps) train_program = fluid.Program() if args.random_seed is not None and args.enable_ce: train_program.random_seed = args.random_seed with fluid.program_guard(train_program, startup_prog): with fluid.unique_name.guard(): train_pyreader, graph_vars = create_model( args, pyreader_name='train_reader', ernie_config=ernie_config, is_classify=args.is_classify, is_regression=args.is_regression) scheduled_lr, loss_scaling = optimization( loss=graph_vars["loss"], warmup_steps=warmup_steps, num_train_steps=max_train_steps, learning_rate=args.learning_rate, train_program=train_program, startup_prog=startup_prog, weight_decay=args.weight_decay, scheduler=args.lr_scheduler, use_fp16=args.use_fp16, use_dynamic_loss_scaling=args.use_dynamic_loss_scaling, init_loss_scaling=args.init_loss_scaling, incr_every_n_steps=args.incr_every_n_steps, decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf, incr_ratio=args.incr_ratio, decr_ratio=args.decr_ratio) if args.verbose: if args.in_tokens: lower_mem, upper_mem, unit = fluid.contrib.memory_usage( program=train_program, batch_size=args.batch_size // args.max_seq_len) else: lower_mem, upper_mem, unit = fluid.contrib.memory_usage( program=train_program, batch_size=args.batch_size) log.info("Theoretical memory usage in training: %.3f - %.3f %s" % (lower_mem, upper_mem, unit)) if args.do_val or args.do_test: test_prog = fluid.Program() with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): test_pyreader, graph_vars = create_model( args, pyreader_name='test_reader', ernie_config=ernie_config, is_classify=args.is_classify, is_regression=args.is_regression) test_prog = test_prog.clone(for_test=True) nccl2_num_trainers = 1 nccl2_trainer_id = 0 if args.is_distributed: trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) worker_endpoints_env = 
os.getenv("PADDLE_TRAINER_ENDPOINTS") current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") worker_endpoints = worker_endpoints_env.split(",") trainers_num = len(worker_endpoints) log.info("worker_endpoints:{} trainers_num:{} current_endpoint:{} \ trainer_id:{}".format(worker_endpoints, trainers_num, current_endpoint, trainer_id)) # prepare nccl2 env. config = fluid.DistributeTranspilerConfig() config.mode = "nccl2" t = fluid.DistributeTranspiler(config=config) t.transpile( trainer_id, trainers=worker_endpoints_env, current_endpoint=current_endpoint, program=train_program if args.do_train else test_prog, startup_program=startup_prog) nccl2_num_trainers = trainers_num nccl2_trainer_id = trainer_id exe = fluid.Executor(place) exe.run(startup_prog) if args.do_train: if args.init_checkpoint and args.init_pretraining_params: log.warning( "WARNING: args 'init_checkpoint' and 'init_pretraining_params' " "both are set! Only arg 'init_checkpoint' is made valid.") if args.init_checkpoint: init_checkpoint( exe, args.init_checkpoint, main_program=startup_prog, use_fp16=args.use_fp16) elif args.init_pretraining_params: init_pretraining_params( exe, args.init_pretraining_params, main_program=startup_prog, use_fp16=args.use_fp16) elif args.do_val or args.do_test: if not args.init_checkpoint: raise ValueError("args 'init_checkpoint' should be set if" "only doing validation or testing!") init_checkpoint( exe, args.init_checkpoint, main_program=startup_prog, use_fp16=args.use_fp16) if args.do_train: exec_strategy = fluid.ExecutionStrategy() if args.use_fast_executor: exec_strategy.use_experimental_executor = True exec_strategy.num_threads = dev_count exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope train_exe = fluid.ParallelExecutor( use_cuda=args.use_cuda, loss_name=graph_vars["loss"].name, exec_strategy=exec_strategy, main_program=train_program, num_trainers=nccl2_num_trainers, trainer_id=nccl2_trainer_id) train_pyreader.decorate_tensor_provider(train_data_generator) else: train_exe = None test_exe = exe if args.do_val or args.do_test: if args.use_multi_gpu_test: test_exe = fluid.ParallelExecutor( use_cuda=args.use_cuda, main_program=test_prog, share_vars_from=train_exe) if args.do_train: train_pyreader.start() steps = 0 if warmup_steps > 0: graph_vars["learning_rate"] = scheduled_lr ce_info = [] time_begin = time.time() last_epoch = 0 current_epoch = 0 while True: try: steps += 1 if steps % args.skip_steps != 0: train_exe.run(fetch_list=[]) else: outputs = evaluate( train_exe, train_program, train_pyreader, graph_vars, "train", metric=args.metric, is_classify=args.is_classify, is_regression=args.is_regression) if args.verbose: verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size( ) verbose += "learning rate: %f" % ( outputs["learning_rate"] if warmup_steps > 0 else args.learning_rate) log.info(verbose) current_example, current_epoch = reader.get_train_progress() time_end = time.time() used_time = time_end - time_begin if args.is_classify: log.info( "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, " "ave acc: %f, speed: %f steps/s" % (current_epoch, current_example, num_train_examples, steps, outputs["loss"], outputs["accuracy"], args.skip_steps / used_time)) ce_info.append( [outputs["loss"], outputs["accuracy"], used_time]) if args.is_regression: log.info( "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, " " speed: %f steps/s" % (current_epoch, current_example, num_train_examples, steps, outputs["loss"], args.skip_steps / used_time)) time_begin = 
time.time() if nccl2_trainer_id == 0: if steps % args.save_steps == 0: save_path = os.path.join(args.checkpoints, "step_" + str(steps)) fluid.io.save_persistables(exe, save_path, train_program) if steps % args.validation_steps == 0 or last_epoch != current_epoch: # evaluate dev set if args.do_val: evaluate_wrapper(args, reader, exe, test_prog, test_pyreader, graph_vars, current_epoch, steps) if args.do_test: predict_wrapper(args, reader, exe, test_prog, test_pyreader, graph_vars, current_epoch, steps) if last_epoch != current_epoch: last_epoch = current_epoch except fluid.core.EOFException: save_path = os.path.join(args.checkpoints, "step_" + str(steps)) fluid.io.save_persistables(exe, save_path, train_program) train_pyreader.reset() break if args.enable_ce: card_num = get_cards() ce_loss = 0 ce_acc = 0 ce_time = 0 try: ce_loss = ce_info[-2][0] ce_acc = ce_info[-2][1] ce_time = ce_info[-2][2] except Exception: log.info("ce info error") log.info("kpis\ttrain_duration_card%s\t%s" % (card_num, ce_time)) log.info("kpis\ttrain_loss_card%s\t%f" % (card_num, ce_loss)) log.info("kpis\ttrain_acc_card%s\t%f" % (card_num, ce_acc)) # final eval on dev set if args.do_val: evaluate_wrapper(args, reader, exe, test_prog, test_pyreader, graph_vars, current_epoch, steps) # final eval on test set if args.do_test: predict_wrapper(args, reader, exe, test_prog, test_pyreader, graph_vars, current_epoch, steps) # final eval on diagnostic, hack for glue-ax if args.diagnostic: test_pyreader.decorate_tensor_provider( reader.data_generator( args.diagnostic, batch_size=args.batch_size, epoch=1, dev_count=1, shuffle=False)) log.info("Final diagnostic") qids, preds, probs = predict( test_exe, test_prog, test_pyreader, graph_vars, is_classify=args.is_classify, is_regression=args.is_regression) assert len(qids) == len(preds), '{} v.s. {}'.format( len(qids), len(preds)) with open(args.diagnostic_save, 'w') as f: for qid, s, p in zip(qids, preds, probs): f.write('{}\t{}\t{}\n'.format(qid, s, p)) log.info("Done final diagnostic, saving to {}".format( args.diagnostic_save))
def main(args): """main func""" unimo_config = UNIMOConfig(args.unimo_config_path) if args.hidden_dropout_prob >= 0: unimo_config["hidden_dropout_prob"] = args.hidden_dropout_prob if args.attention_probs_dropout_prob >= 0: unimo_config[ "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob unimo_config.print_config() if args.pred_batch_size <= 0: args.pred_batch_size = args.batch_size gpu_id = 0 gpus = fluid.core.get_cuda_device_count() if args.is_distributed and os.getenv("FLAGS_selected_gpus") is not None: gpu_list = os.getenv("FLAGS_selected_gpus").split(",") gpus = len(gpu_list) gpu_id = int(gpu_list[0]) if args.use_cuda: place = fluid.CUDAPlace(gpu_id) dev_count = gpus else: place = fluid.CPUPlace() dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) """load vocabulary""" tokenizer = GptBpeTokenizer(vocab_file=args.unimo_vocab_file, encoder_json_file=args.encoder_json_file, vocab_bpe_file=args.vocab_bpe_file, do_lower_case=True) reader = Img2TxtReader(tokenizer, args) img2txt = Img2Txt(args, unimo_config, tokenizer) if not (args.do_train or args.do_val or args.do_test or args.do_pred): raise ValueError("For args `do_train`, `do_val` and `do_test`, at " "least one of them must be True.") startup_prog = fluid.Program() if args.random_seed is not None: startup_prog.random_seed = args.random_seed if args.do_train: trainers_num = int(os.getenv("PADDLE_TRAINERS_NUM", 1)) train_data_generator = reader.data_generator( filelist=args.train_filelist, batch_size=args.batch_size, epoch=args.epoch, dev_count=trainers_num, shuffle=True, phase="train") num_train_examples = reader.get_num_examples( args.train_filelist) # 566747 max_train_steps = args.epoch * num_train_examples // args.batch_size // trainers_num warmup_steps = int(max_train_steps * args.warmup_proportion) print("Device count: %d, gpu_id: %d" % (dev_count, gpu_id)) print("Num train examples: %d" % num_train_examples) print("Max train steps: %d" % max_train_steps) print("Num warmup steps: %d" % warmup_steps) train_program = fluid.Program() with fluid.program_guard(train_program, startup_prog): with fluid.unique_name.guard(): print("using adv_type is ", args.adv_type) if args.adv_type == "freelb_text": train_pyreader, graph_vars = img2txt.create_model_freelb_text( ) elif args.adv_type == "freelb_image": train_pyreader, graph_vars = img2txt.create_model_freelb_image( ) elif args.adv_type == "villa": train_pyreader, graph_vars = img2txt.create_model_villa() else: print( "Unsupported adv_type, run model without adversial training" ) train_pyreader, graph_vars = img2txt.create_model() scheduled_lr, loss_scaling = optimization( loss=graph_vars["loss"], warmup_steps=warmup_steps, num_train_steps=max_train_steps, learning_rate=args.learning_rate, train_program=train_program, weight_decay=args.weight_decay, scheduler=args.lr_scheduler, use_fp16=args.use_fp16, use_dynamic_loss_scaling=args.use_dynamic_loss_scaling, init_loss_scaling=args.init_loss_scaling, beta1=args.beta1, beta2=args.beta2, epsilon=args.epsilon) if args.do_val or args.do_test or args.do_pred: test_prog = fluid.Program() with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): test_pyreader, test_graph_vars = img2txt.create_model( decoding=args.do_decode) test_prog = test_prog.clone(for_test=True) nccl2_num_trainers = 1 nccl2_trainer_id = 0 print("args.is_distributed:", args.is_distributed) if args.is_distributed: trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) worker_endpoints_env = 
os.getenv("PADDLE_TRAINER_ENDPOINTS") current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") worker_endpoints = worker_endpoints_env.split(",") trainers_num = len(worker_endpoints) print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \ trainer_id:{}".format(worker_endpoints, trainers_num, current_endpoint, trainer_id)) # prepare nccl2 env. config = fluid.DistributeTranspilerConfig() config.mode = "nccl2" if args.nccl_comm_num > 1: config.nccl_comm_num = args.nccl_comm_num if args.use_hierarchical_allreduce and trainers_num > args.hierarchical_allreduce_inter_nranks: config.use_hierarchical_allreduce = args.use_hierarchical_allreduce config.hierarchical_allreduce_inter_nranks = args.hierarchical_allreduce_inter_nranks assert config.hierarchical_allreduce_inter_nranks > 1 assert trainers_num % config.hierarchical_allreduce_inter_nranks == 0 config.hierarchical_allreduce_exter_nranks = \ trainers_num / config.hierarchical_allreduce_inter_nranks t = fluid.DistributeTranspiler(config=config) t.transpile(trainer_id, trainers=worker_endpoints_env, current_endpoint=current_endpoint, program=train_program if args.do_train else test_prog, startup_program=startup_prog) nccl2_num_trainers = trainers_num nccl2_trainer_id = trainer_id exe = fluid.Executor(place) exe.run(startup_prog) init_model(args, exe, train_program if args.do_train else test_prog) if args.do_train: exec_strategy = fluid.ExecutionStrategy() if args.use_fast_executor: exec_strategy.use_experimental_executor = True exec_strategy.num_threads = 4 if args.use_fp16 else 2 # 2 for fp32 4 for fp16 exec_strategy.num_iteration_per_drop_scope = min( args.num_iteration_per_drop_scope, args.skip_steps) build_strategy = fluid.BuildStrategy() build_strategy.remove_unnecessary_lock = False if args.use_fuse: build_strategy.fuse_all_reduce_ops = True train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda, loss_name=graph_vars["loss"].name, build_strategy=build_strategy, exec_strategy=exec_strategy, main_program=train_program, num_trainers=nccl2_num_trainers, trainer_id=nccl2_trainer_id) train_pyreader.set_batch_generator(train_data_generator) train_resource = { "exe": train_exe, "program": train_program, "pyreader": train_pyreader } save_model = partial(save_checkpoint, program=train_program, exe=exe) test_dev_count = 1 if args.do_val or args.do_test or args.do_pred: test_exe = exe if args.use_multi_gpu_test: test_dev_count = nccl2_num_trainers test_resource = { "exe": test_exe, "program": test_prog, "pyreader": test_pyreader } eval_data_generator = partial(reader.data_generator, batch_size=args.pred_batch_size, epoch=1, dev_count=test_dev_count, shuffle=False, do_decode=args.do_decode, place=place) eval_func = partial(img2txt.evaluate, resource=test_resource, graph_vars=test_graph_vars, dev_count=test_dev_count, output_path=args.checkpoints, gpu_id=nccl2_trainer_id) evaluate = partial(evaluate_datasets, pyreader=test_pyreader, reader=reader, eval_func=eval_func, data_generator=eval_data_generator) if args.do_train: train_pyreader.start() steps = 0 last_epoch = 0 if warmup_steps > 0: graph_vars["learning_rate"] = scheduled_lr time_begin = time.time() skip_steps = args.skip_steps while True: try: steps += 1 if args.save_and_valid_by_epoch: suffix = "epoch_" + str(last_epoch) else: suffix = "step_" + str(steps) if steps % skip_steps == 0: outputs = img2txt.evaluate(train_resource, "train", graph_vars) if args.verbose: verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size( ) verbose += "learning rate: %.8f" % ( 
outputs["learning_rate"] if warmup_steps > 0 else args.learning_rate) print(verbose) current_epoch = steps * args.batch_size * trainers_num // num_train_examples current_example = steps * args.batch_size * trainers_num % num_train_examples time_end = time.time() used_time = time_end - time_begin print( "%s - epoch: %d, progress: %d/%d, step: %d, loss: %f, " "ppl: %f, speed: %f steps/s" % (get_time(), current_epoch, current_example, num_train_examples, steps, outputs["loss"], outputs["ppl"], args.skip_steps / used_time)) time_begin = time.time() if args.visualdl_log and nccl2_trainer_id == 0: visuallog_dict = OrderedDict() visuallog_dict["ppl"] = outputs["ppl"] visualdl_log(visuallog_dict, outputs["ppl"], steps, phase='train') else: train_exe.run(fetch_list=[]) if nccl2_trainer_id >= test_dev_count: continue do_save = False do_eval = False if not args.save_and_valid_by_epoch: if steps % args.save_steps == 0 and nccl2_trainer_id == 0: do_save = True if steps % args.validation_steps == 0: do_eval = True else: current_epoch = steps * args.batch_size * trainers_num // num_train_examples if current_epoch != last_epoch: if nccl2_trainer_id == 0: do_save = True do_eval = True if do_save: save_model(suffix=suffix) if do_eval: if args.do_val or args.do_test or args.do_pred: evaluate(suffix=suffix) if args.save_and_valid_by_epoch: last_epoch = current_epoch except fluid.core.EOFException: save_model(suffix=suffix) train_pyreader.reset() break if nccl2_trainer_id >= test_dev_count: return if args.do_val or args.do_test or args.do_pred: suffix = "output" if args.do_train: if not args.save_and_valid_by_epoch: suffix = "step_" + str(steps) else: suffix = "epoch_" + str(last_epoch) evaluate(suffix=suffix, do_pred=True)