def test_recompute_checkpoints(self):
    optimizer = fluid.optimizer.AdamOptimizer()
    dist_strategy = DistributedStrategy()
    dist_strategy.forward_recompute = True

    # a non-list value for recompute_checkpoints must be rejected at construction
    dist_strategy.recompute_checkpoints = "NoneListTest"
    self.assertRaises(ValueError, CollectiveOptimizer, optimizer, dist_strategy)

    # an empty checkpoint list passes construction but fails at minimize()
    dist_strategy.recompute_checkpoints = []
    dist_optimizer = CollectiveOptimizer(optimizer, dist_strategy)
    self.assertRaises(ValueError, dist_optimizer.minimize, None)
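# NOTE(sketch): a minimal happy-path counterpart to the failure cases tested
# above. Assumes `loss` and `checkpoints` come from an already-built network
# (as create_model() returns in train()/main() below); names are illustrative,
# not part of this repo.
def _recompute_minimize_sketch(loss, checkpoints):
    optimizer = fluid.optimizer.AdamOptimizer()
    dist_strategy = DistributedStrategy()
    dist_strategy.forward_recompute = True
    # must be a non-empty list of Variables, otherwise ValueError as tested above
    dist_strategy.recompute_checkpoints = checkpoints
    dist_optimizer = CollectiveOptimizer(optimizer, dist_strategy)
    return dist_optimizer.minimize(loss)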
def train(args):
    print("pretraining start")
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    with open(args.task_group_json) as f:
        task_group = json.load(f)

    exec_strategy = fluid.ExecutionStrategy()
    if args.use_fast_executor:
        exec_strategy.use_experimental_executor = True
    exec_strategy.num_threads = 4 if args.use_amp else 2
    exec_strategy.num_iteration_per_drop_scope = min(1, args.skip_steps)

    node_nums = int(os.getenv("PADDLE_NODES_NUM"))
    print("args.is_distributed:", args.is_distributed)
    num_trainers = 1
    trainer_id = 0

    if args.is_distributed:
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        trainer_id = fleet.worker_index()
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = fleet.worker_endpoints()
        trainers_num = len(worker_endpoints)
        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"
              .format(worker_endpoints, trainers_num, current_endpoint, trainer_id))

        dist_strategy = DistributedStrategy()
        dist_strategy.exec_strategy = exec_strategy
        dist_strategy.remove_unnecessary_lock = False  # not useful
        dist_strategy.fuse_all_reduce_ops = True if args.use_fuse else False
        dist_strategy.nccl_comm_num = args.nccl_comm_num

        if args.use_hierarchical_allreduce \
                and trainers_num > args.hierarchical_allreduce_inter_nranks:
            dist_strategy.use_hierarchical_allreduce = args.use_hierarchical_allreduce
            dist_strategy.hierarchical_allreduce_inter_nranks = \
                args.hierarchical_allreduce_inter_nranks
            # the inter-node group must have more than one rank and divide the
            # total trainer count evenly (the original asserted on the boolean
            # use_hierarchical_allreduce here, which can never exceed 1)
            assert dist_strategy.hierarchical_allreduce_inter_nranks > 1
            assert trainers_num % dist_strategy.hierarchical_allreduce_inter_nranks == 0
            dist_strategy.hierarchical_allreduce_exter_nranks = \
                trainers_num // dist_strategy.hierarchical_allreduce_inter_nranks

        if args.use_amp:
            dist_strategy.use_amp = True
            dist_strategy.amp_loss_scaling = args.init_loss_scaling
        if args.use_recompute:
            dist_strategy.forward_recompute = True
            dist_strategy.enable_sequential_execution = True
    else:
        dist_strategy = None

    gpu_id = 0
    gpus = fluid.core.get_cuda_device_count()
    if args.is_distributed:
        gpus = os.getenv("FLAGS_selected_gpus").split(",")
        gpu_id = int(gpus[0])

    if args.use_cuda:
        place = fluid.CUDAPlace(gpu_id)
        # `gpus` is a list of device ids in distributed mode but a plain count otherwise
        dev_count = len(gpus) if args.is_distributed else gpus
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    print("Device count %d, gpu_id:%d" % (dev_count, gpu_id))

    train_program = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            train_pyreader, fetch_vars = create_model(
                pyreader_name='train_reader',
                ernie_config=ernie_config,
                task_group=task_group)
            graph_vars = fetch_vars["graph_vars"]
            checkpoints = fetch_vars["checkpoints"]
            total_loss = graph_vars[-1]
            if args.use_recompute:
                dist_strategy.recompute_checkpoints = checkpoints
            scheduled_lr, loss_scaling = optimization(
                loss=total_loss,
                warmup_steps=args.warmup_steps,
                num_train_steps=args.num_train_steps,
                learning_rate=args.learning_rate,
                train_program=train_program,
                startup_prog=startup_prog,
                weight_decay=args.weight_decay,
                scheduler=args.lr_scheduler,
                use_fp16=args.use_amp,
                use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                init_loss_scaling=args.init_loss_scaling,
                incr_every_n_steps=args.incr_every_n_steps,
                decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                incr_ratio=args.incr_ratio,
                decr_ratio=args.decr_ratio,
                dist_strategy=dist_strategy)

    origin_train_program = train_program
    if args.is_distributed:
        # wrapped by fleet; need to assign fleet's modified train_program back
        train_program = fleet.main_program
        origin_train_program = fleet._origin_program

    test_prog = fluid.Program()
    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            test_pyreader, fetch_vars = create_model(
                pyreader_name='test_reader',
                ernie_config=ernie_config,
                task_group=task_group)
            graph_vars = fetch_vars["graph_vars"]
            total_loss = graph_vars[-1]
    test_prog = test_prog.clone(for_test=True)

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.init_checkpoint and args.init_checkpoint != "":
        # init_checkpoint(exe, args.init_checkpoint, origin_train_program, args.use_amp)
        init_pretraining_params(exe, args.init_checkpoint, origin_train_program,
                                args.use_amp)

    data_reader = ErnieDataReader(
        task_group,
        False,
        batch_size=args.batch_size,
        vocab_path=args.vocab_path,
        voc_size=ernie_config['vocab_size'],
        epoch=args.epoch,
        max_seq_len=args.max_seq_len,
        generate_neg_sample=args.generate_neg_sample,
        hack_old_trainset=args.hack_old_data)

    # only fleet
    train_exe = exe

    predict = predict_wrapper(
        args,
        exe,
        ernie_config,
        task_group,
        test_prog=test_prog,
        pyreader=test_pyreader,
        fetch_list=[var.name for var in graph_vars])

    train_pyreader.set_batch_generator(data_reader.data_generator())
    train_pyreader.start()
    steps = 112000  # hard-coded starting step; the stop condition below is disabled
    time_begin = time.time()
    node_nums = int(os.getenv("PADDLE_NODES_NUM"))
    while True:  # steps < args.num_train_steps
        try:
            steps += 1  # node_nums
            skip_steps = args.skip_steps  # * node_nums

            fetch_list = []
            if trainer_id == 0 and steps % skip_steps == 0:
                fetch_list = [var.name for var in graph_vars] + [scheduled_lr.name]
                if args.use_amp:
                    fetch_list.append(loss_scaling.name)

            outputs = train_exe.run(fetch_list=fetch_list, program=train_program)
            time_end = time.time()
            used_time = time_end - time_begin

            if outputs:
                each_mask_lm_cost, lm_w = outputs[:2]
                if args.use_amp:
                    each_total_constract_loss, each_total_cost, np_lr, l_scaling = \
                        outputs[-4:]
                else:
                    each_total_constract_loss, each_total_cost, np_lr = outputs[-3:]
                acc_list = []
                index = 2
                for task in task_group:
                    each_task_acc = outputs[index]
                    task_w = outputs[index + 1]
                    acc = np.sum(each_task_acc * task_w) / np.sum(task_w)
                    acc_list.append("%s acc: %f" % (task["task_name"], acc))
                    index += 2

                print("feed_queue size", train_pyreader.queue.size())
                epoch, current_file_index, total_file, current_file, mask_type = \
                    data_reader.get_progress()
                if args.use_amp:
                    print("current learning_rate:%f, loss scaling:%f" %
                          (np_lr[0], l_scaling[0]))
                else:
                    print("current learning_rate:%f" % np_lr[0])
                print(
                    "epoch: %d, progress: %d/%d, step: %d, constract_loss: %f, loss: %f, "
                    "ppl: %f, %s, speed: %f steps/s, file: %s, mask_type: %s"
                    % (epoch, current_file_index, total_file, steps,
                       np.mean(each_total_constract_loss), np.mean(each_total_cost),
                       np.exp(np.sum(each_mask_lm_cost * lm_w) / np.sum(lm_w)),
                       ", ".join(acc_list), skip_steps / used_time,
                       current_file, mask_type))
                time_begin = time.time()
            elif steps % skip_steps == 0:
                epoch, current_file_index, total_file, current_file, mask_type = \
                    data_reader.get_progress()
                print("feed_queue size", train_pyreader.queue.size())
                print("epoch: %d, progress: %d/%d, step: %d, "
                      "speed: %f steps/s, file: %s, mask_type: %s"
                      % (epoch, current_file_index, total_file, steps,
                         skip_steps / used_time, current_file, mask_type))
                time_begin = time.time()

            if trainer_id != 0:
                continue

            if steps % args.save_steps == 0:
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, origin_train_program)

            if steps % args.validation_steps == 0:
                valid_list = predict()
                print("[validation_set] epoch: %d, step: %d, %s" %
                      (epoch, steps, ", ".join(valid_list)))
        except fluid.core.EOFException:
            train_pyreader.reset()
            break
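# NOTE(sketch): the hierarchical-allreduce setup in train() reduces to this
# arithmetic; pulled out here as a standalone illustration with assumed
# trainer counts (the function name is illustrative, not part of this repo).
def _hierarchical_nranks_sketch(trainers_num, inter_nranks):
    # inter_nranks must exceed 1 and divide the total trainer count evenly,
    # matching the asserts in train()
    assert inter_nranks > 1
    assert trainers_num % inter_nranks == 0
    exter_nranks = trainers_num // inter_nranks
    return exter_nranks

# e.g. 4 nodes x 8 GPUs with inter_nranks=8 gives exter_nranks=4:
# _hierarchical_nranks_sketch(32, 8) == 4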
def main(args):
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        dev_list = fluid.cuda_places()
        place = dev_list[0]
        dev_count = len(dev_list)
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    exe = fluid.Executor(place)

    reader = reader_de.ClassifyReader(
        vocab_path=args.vocab_path,
        label_map_config=args.label_map_config,
        q_max_seq_len=args.q_max_seq_len,
        p_max_seq_len=args.p_max_seq_len,
        total_num=args.train_data_size,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed,
        tokenizer=args.tokenizer,
        for_cn=args.for_cn,
        task_id=args.task_id)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")
    if args.do_test:
        assert args.test_save is not None

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.predict_batch_size is None:
        args.predict_batch_size = args.batch_size

    if args.do_train:
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        dev_count = fleet.worker_num()

        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            dev_count=1,
            trainer_id=fleet.worker_index(),
            trainer_num=fleet.worker_num(),
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        if args.in_tokens:
            if args.batch_size < args.max_seq_len:
                raise ValueError(
                    'if in_tokens=True, batch_size should be greater than '
                    'max_seq_len, got batch_size: %d, seq_len: %d'
                    % (args.batch_size, args.max_seq_len))
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // \
                args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        log.info("Device count: %d" % dev_count)
        log.info("Num train examples: %d" % num_train_examples)
        log.info("Max train steps: %d" % max_train_steps)
        log.info("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        # use fleet api
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        if args.is_distributed:
            exec_strategy.num_threads = 3
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        dist_strategy = DistributedStrategy()
        dist_strategy.exec_strategy = exec_strategy
        dist_strategy.nccl_comm_num = 1
        if args.is_distributed:
            dist_strategy.nccl_comm_num = 2
            dist_strategy.use_hierarchical_allreduce = True
        if args.use_recompute:
            dist_strategy.forward_recompute = True
            dist_strategy.enable_sequential_execution = True
        if args.use_mix_precision:
            dist_strategy.use_amp = True

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars, checkpoints = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config,
                    batch_size=args.batch_size,
                    fleet_handle=fleet)
                if args.use_recompute:
                    dist_strategy.recompute_checkpoints = checkpoints
                scheduled_lr = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio,
                    dist_strategy=dist_strategy,
                    use_lamb=args.use_lamb)

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, test_graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config,
                    batch_size=args.predict_batch_size,
                    is_prediction=True)
        test_prog = test_prog.clone(for_test=True)

    if args.do_train:
        # fleet is only initialized in the do_train branch above, so the
        # fleet-wrapped program is only fetched there (the original did this
        # unconditionally)
        train_program = fleet.main_program

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            log.warning(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog)
        elif args.init_pretraining_params:
            init_pretraining_params(exe, args.init_pretraining_params,
                                    main_program=startup_prog)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing validation or testing!")
        init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog)

    if args.do_train:
        train_exe = exe
        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None
    test_exe = exe

    # defaults so the final eval below has values even when do_train is False
    steps = 0
    current_epoch = 0

    if args.do_train:
        train_pyreader.start()
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr

        ce_info = []
        time_begin = time.time()
        last_epoch = 0
        total_loss = []
        while True:
            try:
                steps += 1
                if fleet.worker_index() != 0:
                    train_exe.run(fetch_list=[], program=train_program)
                    continue
                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[], program=train_program)
                else:
                    outputs = evaluate(train_exe, train_program, train_pyreader,
                                       graph_vars, "train", metric=args.metric)
                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % \
                            train_pyreader.queue.size()
                        verbose += "learning rate: %f" % (
                            outputs["learning_rate"]
                            if warmup_steps > 0 else args.learning_rate)
                        log.info(verbose)

                    current_example, current_epoch = reader.get_train_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin
                    total_loss.append(outputs["loss"])
                    log.info(
                        "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                        "ave acc: %f, speed: %f steps/s"
                        % (current_epoch, current_example * dev_count,
                           num_train_examples, steps, np.mean(total_loss),
                           outputs["accuracy"], args.skip_steps / used_time))
                    ce_info.append([outputs["loss"], outputs["accuracy"], used_time])
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, fleet._origin_program)

                # if steps % args.validation_steps == 0 or last_epoch != current_epoch:
                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        evaluate_wrapper(args, reader, exe, test_prog, test_pyreader,
                                         test_graph_vars, current_epoch, steps)
                    if args.do_test:
                        predict_wrapper(args, reader, exe, test_prog, test_pyreader,
                                        test_graph_vars, current_epoch, steps)

                if last_epoch != current_epoch:
                    last_epoch = current_epoch
            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, fleet._origin_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        evaluate_wrapper(args, reader, exe, test_prog, test_pyreader,
                         test_graph_vars, current_epoch, steps)

    # final eval on test set
    if args.do_test:
        predict_wrapper(args, reader, exe, test_prog, test_pyreader,
                        test_graph_vars, current_epoch, steps)
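# NOTE(sketch): the step-budget arithmetic from main(), pulled out for clarity;
# mirrors the in_tokens branch where batch_size is a token budget rather than
# an example count (the function name is illustrative, not part of this repo).
def _max_train_steps_sketch(epoch, num_train_examples, batch_size, max_seq_len,
                            dev_count, in_tokens):
    if in_tokens:
        # examples per batch is batch_size // max_seq_len when batching by tokens
        return epoch * num_train_examples // (batch_size // max_seq_len) // dev_count
    return epoch * num_train_examples // batch_size // dev_count

# e.g. 3 epochs, 100k examples, 8192-token batches, seq len 512, 8 workers:
# _max_train_steps_sketch(3, 100000, 8192, 512, 8, True) == 2343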