def init_predict_checkpoint(args, exe, startup_prog):
    """Load model weights for a prediction-only run.

    Args:
        args: parsed command-line namespace; reads `dataset`,
            `sen_candli_file`, `sen_trivial_file`, `init_checkpoint`
            and `use_fp16`.
        exe: fluid Executor used to load the checkpoint variables.
        startup_prog: the startup Program whose variables are restored.

    Raises:
        AssertionError: for the pathQuery datasets when the candidate /
            trivial sentence files are not supplied.
        ValueError: when `init_checkpoint` is unset (prediction needs
            trained weights).
    """
    if args.dataset in ["pathQueryWN", "pathQueryFB"]:
        # pathQuery evaluation needs the candidate and trivial-sentence files.
        assert args.sen_candli_file is not None and args.sen_trivial_file is not None, "during test, pathQuery sen_candli_file and path_trivial_file must be set "
    if not args.init_checkpoint:
        # BUGFIX: the two adjacent string literals previously concatenated to
        # "...set ifonly doing prediction!"; a trailing space was added.
        raise ValueError("args 'init_checkpoint' should be set if "
                         "only doing prediction!")
    init_checkpoint(exe,
                    args.init_checkpoint,
                    main_program=startup_prog,
                    use_fp16=args.use_fp16)
def convert(args):
    """Convert a Paddle Fluid checkpoint into a TensorFlow checkpoint.

    Parses a TF checkpoint (``args.init_tf_checkpoint``) to obtain the
    fluid<->TF parameter name/shape maps, loads the fluid weights from the
    local ``"params"`` directory, copies each tensor into a same-named TF
    variable, and saves a TF checkpoint to ``"erine.ckpt"``.

    NOTE(review): "erine.ckpt" looks like a typo for "ernie.ckpt", but
    downstream tooling may depend on the exact filename — confirm before
    renaming.
    """
    # Name map: {tf_name: fluid_name}; shape map: {tf_name: shape}.
    tf_fluid_param_name_map, tf_param_name_shape_map = parse(
        args.init_tf_checkpoint)
    # Build an empty fluid program declaring every parameter so the
    # checkpoint loader has variables to fill.
    program = fluid.Program()
    global_block = program.global_block()
    for param in tf_fluid_param_name_map:
        global_block.create_parameter(
            name=tf_fluid_param_name_map[param],
            shape=tf_param_name_shape_map[param],
            dtype='float32',
            initializer=fluid.initializer.Constant(value=0.0))
    place = fluid.core.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(program)
    #load the fluid model
    init_checkpoint(exe, "params", program)
    print(
        '---------------------- Converted Parameters -----------------------')
    print(
        '###### [Fluid param name] --> [TF param name] [param shape] ######')
    print(
        '-------------------------------------------------------------------')
    nodes_value = []
    var_list = []
    for param in tf_fluid_param_name_map:
        # Pull the loaded fluid tensor out of the global scope as a numpy array.
        init_value = fluid.global_scope().find_var(
            tf_fluid_param_name_map[param]).get_tensor()
        init_value = np.array(init_value)
        # These two weight matrices are stored transposed between the
        # fluid and TF layouts.
        if param == 'cls/seq_relationship/output_weights':
            init_value = np.transpose(init_value)
        if param == 'cls/squad/output_weights':
            init_value = np.transpose(init_value)
        fluid_shape = init_value.shape
        # tf.Variable(init_value,name=param)
        n = tf.get_variable(name=param, dtype=tf.float32, shape=fluid_shape)
        var_list.append(n)
        nodes_value.append((n, init_value))
        print(tf_fluid_param_name_map[param], ' --> ', param, ' ', fluid_shape)
    saver = tf.train.Saver(var_list=var_list)
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    # Assign the fluid values into the TF variables in one batch, then save.
    ops = []
    for n, v in nodes_value:
        ops.append(tf.assign(n, v))
    sess.run(ops)
    saver.save(sess, "erine.ckpt", write_meta_graph=False, write_state=False)
def predict_trigger(trigger_dict):
    """Run trigger prediction and write results to a file.

    Args:
        trigger_dict: bundle of everything needed for inference — keys
            'log', 'args', 'labels_map', 'ernie_config', 'place',
            'dev_count', 'reader', 'startup_prog', 'test_prog',
            'test_pyreader', 'graph_vars', 'nccl2_num_trainers',
            'nccl2_trainer_id', and optionally a reusable 'exe'.

    Side effects: removes any stale prediction file, may create a fresh
    Executor, loads the checkpoint, and writes predictions to
    ``args.trigger_pred_save_path``.
    """
    log = trigger_dict['log']
    args = trigger_dict['args']
    labels_map = trigger_dict['labels_map']
    ernie_config = trigger_dict['ernie_config']
    place = trigger_dict['place']
    dev_count = trigger_dict['dev_count']
    reader = trigger_dict['reader']
    startup_prog = trigger_dict['startup_prog']
    test_prog = trigger_dict['test_prog']
    test_pyreader = trigger_dict['test_pyreader']
    graph_vars = trigger_dict['graph_vars']
    nccl2_num_trainers = trigger_dict['nccl2_num_trainers']
    nccl2_trainer_id = trigger_dict['nccl2_trainer_id']
    if 'exe' in trigger_dict:
        exe = trigger_dict['exe']
    else:
        exe = None
    # Remove a stale prediction file so the new run starts clean.
    trigger_pred_save_path = args.trigger_pred_save_path
    if os.path.exists(trigger_pred_save_path):
        print('delete old file : %s' % trigger_pred_save_path)
        os.remove(trigger_pred_save_path)
    if args.do_test:
        # TODO 2020-11-24
        # TODO 2021-01-20: drop the reused executor for test runs so a
        # fresh one is built below (original comment in Chinese).
        # BUGFIX: this was `del exe`, which made the following
        # `if not exe` raise NameError whenever do_test was set;
        # rebinding to None achieves the intended "rebuild" behavior.
        exe = None
    if not exe:
        exe = fluid.Executor(place)
        exe.run(startup_prog)
    if args.do_val or args.do_test:
        if not args.init_checkpoint:
            # BUGFIX: added the missing space ("ifonly" -> "if only").
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)
    # NOTE(review): log.error is used here only to make the banner stand
    # out; the message is informational, not an error.
    log.error(
        "********************** trigger predict begin **********************"
    )
    test_ret = predict_wrapper(args, reader, exe, test_prog, test_pyreader,
                               graph_vars, 1, 'final')
    utils.write_by_lines(args.trigger_pred_save_path, test_ret)
    del exe
def init_train_checkpoint(args, exe, startup_prog):
    """Warm-start training from a checkpoint or pretraining params.

    A full checkpoint (`args.init_checkpoint`) takes precedence over
    pretraining parameters (`args.init_pretraining_params`); when both
    are supplied a warning is logged and only the checkpoint is loaded.
    When neither is supplied this is a no-op.
    """
    ckpt_path = args.init_checkpoint
    pretrain_path = args.init_pretraining_params
    if ckpt_path and pretrain_path:
        logger.info(
            "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
            "both are set! Only arg 'init_checkpoint' is made valid.")
    if ckpt_path:
        # Checkpoint wins: restore every persistable variable.
        init_checkpoint(exe,
                        ckpt_path,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16,
                        print_var_verbose=False)
    elif pretrain_path:
        # Otherwise load only the pretrained backbone parameters.
        init_pretraining_params(exe,
                                pretrain_path,
                                main_program=startup_prog,
                                use_fp16=args.use_fp16)
def re_use_exe(tri_dict):
    """Build a fresh Executor, run startup, and load the eval checkpoint.

    Args:
        tri_dict: dict with keys 'startup_prog', 'args' and 'place'.

    Returns:
        The initialized fluid Executor.

    Raises:
        ValueError: when do_val/do_test is requested without an
            `init_checkpoint` to load.
    """
    startup_prog = tri_dict["startup_prog"]
    args = tri_dict["args"]
    place = tri_dict["place"]
    exe = fluid.Executor(place)
    exe.run(startup_prog)
    if args.do_val or args.do_test:
        if not args.init_checkpoint:
            # BUGFIX: added the missing space ("ifonly" -> "if only").
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)
    return exe
def re_use_some(tri_dict, suf):
    """Rebuild the role-labeling reader, test program and executor.

    Args:
        tri_dict: dict with keys 'ernie_config', 'args', 'labels_map',
            'place' (a 'startup_prog' key may be present but a fresh
            startup program is always created here).
        suf: suffix appended to the pyreader name so repeated calls do
            not collide on reader names.

    Returns:
        (reader, startup_prog, test_prog, test_pyreader, graph_vars, exe).
        The test_prog/test_pyreader/graph_vars entries are None when
        neither do_val nor do_test is set (previously this raised
        NameError).
    """
    ernie_config = tri_dict["ernie_config"]
    args = tri_dict["args"]
    labels_map = tri_dict["labels_map"]
    place = tri_dict["place"]
    reader = task_reader.RoleSequenceLabelReader(
        vocab_path=args.vocab_path,
        labels_map=labels_map,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed,
        task_id=args.task_id)
    # A fresh startup program is built on purpose; the one in tri_dict
    # was previously read and then immediately overwritten (dead read,
    # removed).
    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed
    # BUGFIX: pre-bind so the return statement cannot hit unbound locals
    # when neither do_val nor do_test is requested.
    test_prog = None
    test_pyreader = None
    graph_vars = None
    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                # TODO: adjust pyreader_name again so it is unique per call.
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader_role' + suf,
                    ernie_config=ernie_config)
        test_prog = test_prog.clone(for_test=True)
    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    exe = fluid.Executor(place)
    exe.run(startup_prog)
    if args.do_val or args.do_test:
        if not args.init_checkpoint:
            # BUGFIX: added the missing space ("ifonly" -> "if only").
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)
    return reader, startup_prog, test_prog, test_pyreader, graph_vars, exe
def main(args):
    """Drive XLNet fine-tuning: build programs, train, evaluate, predict.

    Depending on the do_train / do_eval / do_predict flags this builds
    the corresponding fluid programs, restores weights, runs the
    training loop with periodic save/eval, and finally runs a last
    eval/predict pass.
    """
    if not (args.do_train or args.do_eval or args.do_predict):
        raise ValueError("For args `do_train`, `do_eval` and `do_predict`, at "
                         "least one of them must be True.")
    if args.do_predict and not args.predict_dir:
        raise ValueError("args 'predict_dir' should be given when doing predict")
    # NOTE(review): this existence check runs even when do_predict is
    # False; os.path.exists(None) raises TypeError if predict_dir is
    # unset — confirm callers always pass a predict_dir.
    if not os.path.exists(args.predict_dir):
        os.makedirs(args.predict_dir)
    xlnet_config = XLNetConfig(args.model_config_path)
    xlnet_config.print_config()
    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = get_device_num()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)
    # Map task name to its dataset processor class.
    task_name = args.task_name.lower()
    processors = {
        "mnli_matched": reader.MnliMatchedProcessor,
        "mnli_mismatched": reader.MnliMismatchedProcessor,
        'sts-b': reader.StsbProcessor,
        'imdb': reader.ImdbProcessor,
        "yelp5": reader.Yelp5Processor
    }
    processor = processors[task_name](args)
    # Regression tasks have no discrete label set.
    label_list = processor.get_labels() if not args.is_regression else None
    num_labels = len(label_list) if label_list is not None else None
    train_program = fluid.Program()
    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed
        train_program.random_seed = args.random_seed
    if args.do_train:
        # NOTE: If num_trainers > 1, the shuffle_seed must be set, because
        # the order of batch data generated by reader
        # must be the same in the respective processes.
        # NOTE(review): num_trainers appears to be a module-level global
        # (not a parameter of this function) — confirm where it is set.
        shuffle_seed = 1 if num_trainers > 1 else None
        train_data_generator = processor.data_generator(
            batch_size=args.train_batch_size,
            is_regression=args.is_regression,
            phase='train',
            epoch=args.epoch,
            dev_count=dev_count,
            shuffle=args.shuffle)
        num_train_examples = processor.get_num_examples(phase='train')
        print("Device count: %d" % dev_count)
        print("Max num of epoches: %d" % args.epoch)
        print("Num of train examples: %d" % num_train_examples)
        print("Num of train steps: %d" % args.train_steps)
        print("Num of warmup steps: %d" % args.warmup_steps)
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_data_loader, loss, logits, num_seqs, label_ids = create_model(
                    args, xlnet_config=xlnet_config, n_class=num_labels)
                scheduled_lr = optimization(
                    loss=loss,
                    warmup_steps=args.warmup_steps,
                    num_train_steps=args.train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    lr_layer_decay_rate=args.lr_layer_decay_rate,
                    scheduler=args.lr_scheduler)
    if args.do_eval:
        dev_prog = fluid.Program()
        with fluid.program_guard(dev_prog, startup_prog):
            with fluid.unique_name.guard():
                dev_data_loader, loss, logits, num_seqs, label_ids = create_model(
                    args, xlnet_config=xlnet_config, n_class=num_labels)
        dev_prog = dev_prog.clone(for_test=True)
        dev_data_loader.set_batch_generator(
            processor.data_generator(
                batch_size=args.eval_batch_size,
                is_regression=args.is_regression,
                phase=args.eval_split,
                epoch=1,
                dev_count=1,
                shuffle=False), place)
    if args.do_predict:
        predict_prog = fluid.Program()
        with fluid.program_guard(predict_prog, startup_prog):
            with fluid.unique_name.guard():
                predict_data_loader, loss, logits, num_seqs, label_ids = create_model(
                    args, xlnet_config=xlnet_config, n_class=num_labels)
        predict_prog = predict_prog.clone(for_test=True)
        predict_data_loader.set_batch_generator(
            processor.data_generator(
                batch_size=args.predict_batch_size,
                is_regression=args.is_regression,
                phase=args.eval_split,
                epoch=1,
                dev_count=1,
                shuffle=False), place)
    exe.run(startup_prog)
    # Restore weights: checkpoint takes precedence over pretraining params.
    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(
                exe, args.init_checkpoint, main_program=startup_prog)
        elif args.init_pretraining_params:
            init_pretraining_params(
                exe, args.init_pretraining_params, main_program=startup_prog)
    elif args.do_eval or args.do_predict:
        if not args.init_checkpoint:
            # NOTE(review): adjacent literals concatenate to "set ifonly"
            # — missing a space.
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(
            exe, args.init_checkpoint, main_program=startup_prog)
    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        build_strategy = fluid.BuildStrategy()
        if args.use_cuda and num_trainers > 1:
            assert shuffle_seed is not None
            dist_utils.prepare_for_multi_process(exe, build_strategy,
                                                 train_program)
            train_data_generator = fluid.contrib.reader.distributed_batch_reader(
                train_data_generator)
        train_compiled_program = fluid.CompiledProgram(train_program).with_data_parallel(
            loss_name=loss.name, build_strategy=build_strategy)
        train_data_loader.set_batch_generator(train_data_generator, place)
    if args.do_train:
        train_data_loader.start()
        steps = 0
        total_cost, total_num_seqs, total_time = [], [], 0.0
        throughput = []
        ce_info = []
        while steps < args.train_steps:
            try:
                time_begin = time.time()
                steps += 1
                # Only fetch metrics every skip_steps steps; empty fetch
                # list keeps the other iterations cheap.
                if steps % args.skip_steps == 0:
                    fetch_list = [loss.name, scheduled_lr.name, num_seqs.name]
                else:
                    fetch_list = []
                outputs = exe.run(train_compiled_program,
                                  fetch_list=fetch_list)
                time_end = time.time()
                used_time = time_end - time_begin
                total_time += used_time
                if steps % args.skip_steps == 0:
                    np_loss, np_lr, np_num_seqs = outputs
                    # Weight the loss by sequence count for a proper average.
                    total_cost.extend(np_loss * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)
                    if args.verbose:
                        verbose = "train data_loader queue size: %d, " % train_data_loader.queue.size(
                        )
                        verbose += "learning rate: %f" % np_lr[0]
                        print(verbose)
                    current_example, current_epoch = processor.get_train_progress(
                    )
                    log_record = "epoch: {}, progress: {}/{}, step: {}, ave loss: {}".format(
                        current_epoch, current_example, num_train_examples,
                        steps,
                        np.sum(total_cost) / np.sum(total_num_seqs))
                    ce_info.append(
                        [np.sum(total_cost) / np.sum(total_num_seqs),
                         used_time])
                    if steps > 0:
                        throughput.append(args.skip_steps / total_time)
                        log_record = log_record + ", speed: %f steps/s" % (
                            args.skip_steps / total_time)
                        print(log_record)
                    else:
                        print(log_record)
                    total_cost, total_num_seqs, total_time = [], [], 0.0
                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)
                if steps % args.validation_steps == 0:
                    print("Average throughtput: %s" % (np.average(throughput)))
                    throughput = []
                    # evaluate dev set
                    if args.do_eval:
                        evaluate(exe, dev_prog, dev_data_loader, [
                            loss.name, num_seqs.name, logits.name,
                            label_ids.name
                        ], args.eval_split,
                                 processor.get_num_examples(
                                     phase=args.eval_split))
            except fluid.core.EOFException:
                # Data exhausted: save a final snapshot and stop.
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_data_loader.reset()
                break
        if args.enable_ce:
            # Continuous-evaluation KPI output; best-effort, hence the
            # broad except below.
            card_num = get_cards()
            ce_cost = 0
            ce_time = 0
            try:
                ce_cost = ce_info[-2][0]
                ce_time = ce_info[-2][1]
            except:
                print("ce info error")
            print("kpis\ttrain_duration_%s_card%s\t%s" %
                  (args.task_name.replace("-", "_"), card_num, ce_time))
            print("kpis\ttrain_cost_%s_card%s\t%f" %
                  (args.task_name.replace("-", "_"), card_num, ce_cost))
    # final eval on dev set
    if args.do_eval:
        # NOTE(review): this passes `label_ids` (a Variable) while the
        # in-loop eval passes `label_ids.name` — likely a missing `.name`.
        evaluate(exe, dev_prog, dev_data_loader,
                 [loss.name, num_seqs.name, logits.name, label_ids],
                 args.eval_split,
                 processor.get_num_examples(phase=args.eval_split))
    # final eval on test set
    if args.do_predict:
        predict(exe, predict_prog, predict_data_loader, task_name, label_list,
                [logits.name])
def train(multitask_config):
    """Multi-task training driver.

    Loads the main task and all auxiliary task configs from
    TASKSET_PATH, dynamically imports each task's reader/model modules,
    builds one joint reader and a shared backbone with per-task heads,
    then trains (and/or predicts with the main task's head) according to
    the do_train/do_predict flags.
    """
    # load task config
    print("Loading multi_task configure...................")
    args = PDConfig(yaml_file=[multitask_config])
    args.build()
    index = 0
    reader_map_task = dict()   # reader key -> task name
    task_args_list = list()    # [(task_name, task_args)] in load order
    reader_args_list = list()  # [(reader_key, task_args)]
    id_map_task = {index: args.main_task}  # task index -> task name
    print("Loading main task configure....................")
    main_task_name = args.main_task
    task_config_files = [
        i for i in os.listdir(TASKSET_PATH) if i.endswith('.yaml')
    ]
    main_config_list = [
        config for config in task_config_files
        if config.split('.')[0] == main_task_name
    ]
    main_args = None
    for config in main_config_list:
        main_yaml = os.path.join(TASKSET_PATH, config)
        main_args = PDConfig(yaml_file=[multitask_config, main_yaml])
        main_args.build()
        main_args.Print()
        if not task_args_list or main_task_name != task_args_list[-1][0]:
            task_args_list.append((main_task_name, main_args))
        # NOTE(review): str.strip('.yaml') strips a *character set*
        # ('.', 'y', 'a', 'm', 'l') from both ends, not the suffix; keys
        # are consistent across these maps but may be mangled for names
        # that start/end with those characters.
        reader_args_list.append((config.strip('.yaml'), main_args))
        reader_map_task[config.strip('.yaml')] = main_task_name
    print("Loading auxiliary tasks configure...................")
    aux_task_name_list = args.auxiliary_task.strip().split()
    for aux_task_name in aux_task_name_list:
        index += 1
        id_map_task[index] = aux_task_name
        print("Loading %s auxiliary tasks configure......." % aux_task_name)
        aux_config_list = [
            config for config in task_config_files
            if config.split('.')[0] == aux_task_name
        ]
        for aux_yaml in aux_config_list:
            aux_yaml = os.path.join(TASKSET_PATH, aux_yaml)
            aux_args = PDConfig(yaml_file=[multitask_config, aux_yaml])
            aux_args.build()
            aux_args.Print()
            if aux_task_name != task_args_list[-1][0]:
                task_args_list.append((aux_task_name, aux_args))
            reader_args_list.append((aux_yaml.strip('.yaml'), aux_args))
            reader_map_task[aux_yaml.strip('.yaml')] = aux_task_name
    # import tasks reader module and build joint_input_shape
    input_shape_list = []
    reader_module_dict = {}
    input_shape_dict = {}
    for params in task_args_list:
        # Each task ships a "<task>_reader" module exposing get_input_shape.
        task_reader_mdl = "%s_reader" % params[0]
        reader_module = importlib.import_module(task_reader_mdl)
        reader_servlet_cls = getattr(reader_module, "get_input_shape")
        reader_input_shape = reader_servlet_cls(params[1])
        reader_module_dict[params[0]] = reader_module
        input_shape_list.append(reader_input_shape)
        input_shape_dict[params[0]] = reader_input_shape
    train_input_shape, test_input_shape, task_map_id = joint_reader.joint_input_shape(
        input_shape_list)
    # import backbone model
    backbone_mdl = args.backbone_model
    backbone_cls = "Model"
    backbone_module = importlib.import_module(backbone_mdl)
    backbone_servlet = getattr(backbone_module, backbone_cls)
    if not (args.do_train or args.do_predict):
        raise ValueError("For args `do_train` and `do_predict`, at "
                         "least one of them must be True.")
    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)
    startup_prog = fluid.default_startup_program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed
    if args.do_train:
        #create joint pyreader
        print('creating readers...')
        gens = []
        main_generator = ""
        for params in reader_args_list:
            generator_cls = getattr(
                reader_module_dict[reader_map_task[params[0]]],
                "DataProcessor")
            generator_inst = generator_cls(params[1])
            reader_generator = generator_inst.data_generator(
                phase='train', shuffle=True, dev_count=dev_count)
            # First reader in the list is treated as the main task's.
            if not main_generator:
                main_generator = generator_inst
            gens.append((reader_generator, params[1].mix_ratio,
                         reader_map_task[params[0]]))
        joint_generator, train_pyreader, model_inputs = create_reader(
            "train_reader", train_input_shape, True, task_map_id, gens)
        train_pyreader.decorate_tensor_provider(joint_generator)
        # build task inputs
        task_inputs_list = []
        main_test_input = []
        # model_inputs[0] is the task-id tensor; task_map_id gives the
        # [start, end) slice of inputs per task.
        task_id = model_inputs[0]
        backbone_inputs = model_inputs[task_map_id[0][0]:task_map_id[0][1]]
        for i in range(1, len(task_map_id)):
            task_inputs = backbone_inputs + model_inputs[
                task_map_id[i][0]:task_map_id[i][1]]
            task_inputs_list.append(task_inputs)
        # build backbone model
        print('building model backbone...')
        conf = vars(args)
        if args.pretrain_config_path is not None:
            model_conf = JsonConfig(args.pretrain_config_path).asdict()
            for k, v in model_conf.items():
                if k in conf:
                    # NOTE(review): this compares the *key* to the stored
                    # value; the do_predict branch below uses
                    # `v == conf[k]` — this is almost certainly the bug,
                    # and the message also lacks `.format(k)`.
                    assert k == conf[
                        k], "ERROR: argument {} in pretrain_model_config is NOT consistent with which in main.yaml"
            conf.update(model_conf)
        backbone_inst = backbone_servlet(conf, is_training=True)
        print('building task models...')
        num_train_examples = main_generator.get_num_examples()
        if main_args.in_tokens:
            max_train_steps = int(main_args.epoch * num_train_examples) // (
                main_args.batch_size // main_args.max_seq_len) // dev_count
        else:
            max_train_steps = int(main_args.epoch * num_train_examples) // (
                main_args.batch_size) // dev_count
        # Scale step budget by the summed sampling mix ratios.
        mix_ratio_list = [
            task_args[1].mix_ratio for task_args in task_args_list
        ]
        args.max_train_steps = int(max_train_steps *
                                   (sum(mix_ratio_list) /
                                    main_args.mix_ratio))
        print("Max train steps: %d" % max_train_steps)
        build_strategy = fluid.BuildStrategy()
        train_program = fluid.default_main_program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                backbone_inst.build_model(backbone_inputs)
                all_loss_list = []
                for i in range(len(task_args_list)):
                    task_name = task_args_list[i][0]
                    task_args = task_args_list[i][1]
                    # A task may delegate its head to a shared "paradigm"
                    # module; otherwise the module is named after the task.
                    if hasattr(task_args, 'paradigm'):
                        task_net = task_args.paradigm
                    else:
                        task_net = task_name
                    task_net_mdl = importlib.import_module(task_net)
                    task_net_cls = getattr(task_net_mdl, "create_model")
                    output_tensor = task_net_cls(task_inputs_list[i],
                                                 base_model=backbone_inst,
                                                 is_training=True,
                                                 args=task_args)
                    loss_cls = getattr(task_net_mdl, "compute_loss")
                    task_loss = loss_cls(output_tensor, task_args)
                    all_loss_list.append(task_loss)
                    num_seqs = output_tensor['num_seqs']
                # Select the active task's loss via a one-hot mask over
                # the per-task losses.
                task_one_hot = fluid.layers.one_hot(task_id,
                                                    len(task_args_list))
                all_loss = fluid.layers.concat(all_loss_list, axis=0)
                loss = fluid.layers.reduce_sum(task_one_hot * all_loss)
                programs = [train_program, startup_prog]
                optimizer_mdl = importlib.import_module(args.optimizer)
                optimizer_inst = getattr(optimizer_mdl, "optimization")
                optimizer_inst(loss, programs, args=args)
                loss.persistable = True
                num_seqs.persistable = True
                ema = fluid.optimizer.ExponentialMovingAverage(args.ema_decay)
                ema.update()
        train_compiled_program = fluid.CompiledProgram(
            train_program).with_data_parallel(loss_name=loss.name,
                                              build_strategy=build_strategy)
    if args.do_predict:
        conf = vars(args)
        if args.pretrain_config_path is not None:
            model_conf = JsonConfig(args.pretrain_config_path).asdict()
            for k, v in model_conf.items():
                if k in conf:
                    assert v == conf[
                        k], "ERROR: argument {} in pretrain_model_config is NOT consistent with which in main.yaml".format(
                            k)
            conf.update(model_conf)
        # Prediction uses only the main task's reader and head.
        mod = reader_module_dict[main_task_name]
        DataProcessor = getattr(mod, 'DataProcessor')
        predict_processor = DataProcessor(main_args)
        test_generator = predict_processor.data_generator(phase='predict',
                                                          shuffle=False,
                                                          dev_count=dev_count)
        new_test_input_shape = input_shape_dict[main_task_name][1][
            'backbone'] + input_shape_dict[main_task_name][1]['task']
        assert new_test_input_shape == test_input_shape
        build_strategy = fluid.BuildStrategy()
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                placeholder = Placeholder(test_input_shape)
                test_pyreader, model_inputs = placeholder.build(
                    capacity=100, reader_name="test_reader")
                test_pyreader.decorate_tensor_provider(test_generator)
                # create model
                backbone_inst = backbone_servlet(conf, is_training=False)
                backbone_inst.build_model(model_inputs)
                task_net_mdl = importlib.import_module(main_task_name)
                task_net_cls = getattr(task_net_mdl, "create_model")
                postprocess = getattr(task_net_mdl, "postprocess")
                global_postprocess = getattr(task_net_mdl,
                                             "global_postprocess")
                output_tensor = task_net_cls(model_inputs,
                                             base_model=backbone_inst,
                                             is_training=False,
                                             args=main_args)
                # Create the EMA object if training didn't already
                # (predict-only run). NOTE(review): dir() here inspects
                # the local scope — confirm this behaves as intended.
                if 'ema' not in dir():
                    ema = fluid.optimizer.ExponentialMovingAverage(
                        args.ema_decay)
                pred_fetch_names = []
                fetch_vars = []
                for i, j in output_tensor.items():
                    pred_fetch_names.append(i)
                    fetch_vars.append(j)
                for var in fetch_vars:
                    var.persistable = True
                pred_fetch_list = [i.name for i in fetch_vars]
        test_prog = test_prog.clone(for_test=True)
        test_compiled_program = fluid.CompiledProgram(
            test_prog).with_data_parallel(build_strategy=build_strategy)
    exe.run(startup_prog)
    # Weight restoration: pretraining params and/or checkpoint for
    # training; checkpoint is mandatory for predict-only runs.
    if args.do_train:
        if args.pretrain_model_path:
            init_pretraining_params(exe,
                                    args.pretrain_model_path,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
        if args.checkpoint_path:
            if os.path.exists(args.checkpoint_path):
                init_checkpoint(exe,
                                args.checkpoint_path,
                                main_program=startup_prog,
                                use_fp16=args.use_fp16)
            else:
                os.makedirs(args.checkpoint_path)
    elif args.do_predict:
        if not args.checkpoint_path:
            # NOTE(review): adjacent literals concatenate to "set ifonly"
            # — missing a space.
            raise ValueError("args 'checkpoint_path' should be set if"
                             "only doing prediction!")
        init_checkpoint(exe,
                        args.checkpoint_path,
                        main_program=test_prog,
                        use_fp16=args.use_fp16)
    if args.do_train:
        print('start training...')
        train_pyreader.start()
        steps = 0
        total_cost, total_num_seqs = [], []
        time_begin = time.time()
        while True:
            try:
                steps += 1
                # Fetch metrics only every skip_steps iterations.
                if steps % args.skip_steps == 0:
                    fetch_list = [loss.name, num_seqs.name, task_id.name]
                else:
                    fetch_list = []
                outputs = exe.run(train_compiled_program,
                                  fetch_list=fetch_list)
                if steps % args.skip_steps == 0:
                    np_loss, np_num_seqs, np_task_id = outputs
                    total_cost.extend(np_loss * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)
                    time_end = time.time()
                    used_time = time_end - time_begin
                    current_example, epoch = main_generator.get_train_progress(
                    )
                    cur_task_name = id_map_task[np_task_id[0][0]]
                    print(
                        "epoch: %d, task_name: %s, progress: %d/%d, step: %d, loss: %f, "
                        "speed: %f steps/s" %
                        (epoch, cur_task_name, current_example,
                         num_train_examples, steps,
                         np.sum(total_cost) / np.sum(total_num_seqs),
                         args.skip_steps / used_time))
                    total_cost, total_num_seqs = [], []
                    time_begin = time.time()
                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoint_path,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)
                if steps == max_train_steps:
                    save_path = os.path.join(args.checkpoint_path,
                                             "step_" + str(steps) + "_final")
                    fluid.io.save_persistables(exe, save_path, train_program)
                    break
            except paddle.fluid.core.EOFException as err:
                # Data exhausted before max_train_steps: save and stop.
                save_path = os.path.join(args.checkpoint_path,
                                         "step_" + str(steps) + "_final")
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break
    if args.do_predict:
        print('start predicting...')
        cnt = 0
        if args.use_ema:
            # Swap in the EMA-averaged weights for the duration of predict.
            with ema.apply(exe):
                test_pyreader.start()
                pred_buf = []
                while True:
                    try:
                        fetch_res = exe.run(fetch_list=pred_fetch_list,
                                            program=test_compiled_program)
                        cnt += 1
                        if cnt % 200 == 0:
                            print('predicting {}th batch...'.format(cnt))
                        fetch_dict = {}
                        for key, val in zip(pred_fetch_names, fetch_res):
                            fetch_dict[key] = val
                        res = postprocess(fetch_dict)
                        if res is not None:
                            pred_buf.extend(res)
                    except fluid.core.EOFException:
                        test_pyreader.reset()
                        break
                global_postprocess(pred_buf, predict_processor, args,
                                   main_args)
        else:
            test_pyreader.start()
            pred_buf = []
            while True:
                try:
                    fetch_res = exe.run(fetch_list=pred_fetch_list,
                                        program=test_compiled_program)
                    cnt += 1
                    if cnt % 200 == 0:
                        print('predicting {}th batch...'.format(cnt))
                    fetch_dict = {}
                    for key, val in zip(pred_fetch_names, fetch_res):
                        fetch_dict[key] = val
                    res = postprocess(fetch_dict)
                    if res is not None:
                        pred_buf.extend(res)
                except fluid.core.EOFException:
                    test_pyreader.reset()
                    break
            global_postprocess(pred_buf, predict_processor, args, main_args)
# Script-level prediction setup. NOTE(review): this fragment references
# `args`, `startup_prog` and `exe`, which must be defined earlier in the
# file (out of view here) — confirm against the full script.
ernie_config = ErnieConfig(args.ernie_config_path)
ernie_config.print_config()
test_prog = fluid.Program()
with fluid.program_guard(test_prog, startup_prog):
    with fluid.unique_name.guard():
        _, graph_vars = create_model_predict(args,
                                             ernie_config=ernie_config,
                                             is_prediction=True)
# String labels "0".."num_labels-2"; presumably the last label id is
# reserved/excluded — TODO confirm.
LABELMODE = [str(x) for x in range(args.num_labels - 1)]
test_prog = test_prog.clone(for_test=True)
exe.run(startup_prog)
init_checkpoint(exe,
                args.init_checkpoint,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
reader = task_reader.ClassifyReader(vocab_path=args.vocab_path,
                                    label_map_config=args.label_map_config,
                                    max_seq_len=args.max_seq_len,
                                    do_lower_case=args.do_lower_case,
                                    in_tokens=args.in_tokens,
                                    random_seed=args.random_seed)
#print(datetime.datetime.now())


def rm_space(line):
    """Remove whitespace found between two CJK (Chinese) characters.

    NOTE(review): the ``ur''`` literal is Python-2-only syntax (a
    SyntaxError under Python 3); under Python 3 this would be a plain
    raw string ``r''``.
    """
    line = re.sub(ur'(?<=[\u4e00-\u9fa5])\s+(?=[\u4e00-\u9fa5])', '', line)
    return line
def main(args):
    """ERNIE misspelling sequence-labeling driver.

    Builds train/test programs per the do_train/do_val/do_test flags,
    optionally prepares NCCL2 distributed training, restores weights,
    runs the training loop with periodic save/eval, and finishes with a
    final eval/predict pass on trainer 0.
    """
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()
    if args.use_cuda:
        dev_list = fluid.cuda_places()
        place = dev_list[0]
        dev_count = len(dev_list)
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    reader = task_reader.MisspellingReader(
        vocab_path=args.vocab_path,
        label_map_config=args.label_map_config,
        max_seq_len=args.max_seq_len,
        tokenizer=args.tokenizer,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed,
        task_id=args.task_id)
    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")
    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed
    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            shuffle=True,
            phase="train")
        num_train_examples = reader.get_num_examples(args.train_set)
        if args.in_tokens:
            if args.batch_size < args.max_seq_len:
                # NOTE(review): "max_sqelen" is a typo in this message.
                raise ValueError(
                    'if in_tokens=True, batch_size should greater than max_sqelen, got batch_size:%d seqlen:%d'
                    % (args.batch_size, args.max_seq_len))
            # batch_size counts tokens in in_tokens mode.
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count
        warmup_steps = int(max_train_steps * args.warmup_proportion)
        log.info("Device count: %d" % dev_count)
        log.info("Num train examples: %d" % num_train_examples)
        log.info("Max train steps: %d" % max_train_steps)
        log.info("Num warmup steps: %d" % warmup_steps)
        train_program = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config)
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio)
        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            log.info("Theoretical memory usage in training: %.3f - %.3f %s" %
                     (lower_mem, upper_mem, unit))
    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config)
        test_prog = test_prog.clone(for_test=True)
    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    if args.is_distributed:
        # Distributed setup is driven entirely by PADDLE_* env vars.
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)
        log.info("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
              trainer_id:{}".format(worker_endpoints, trainers_num,
                                    current_endpoint, trainer_id))
        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        # `trainers` takes the comma-separated endpoint string here.
        t.transpile(trainer_id,
                    trainers=worker_endpoints_env,
                    current_endpoint=current_endpoint,
                    program=train_program if args.do_train else test_prog,
                    startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id
    exe = fluid.Executor(place)
    exe.run(startup_prog)
    # Weight restoration: checkpoint takes precedence over pretraining
    # params when training; eval/test-only runs require a checkpoint.
    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            log.info(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            # NOTE(review): adjacent literals concatenate to "set ifonly"
            # — missing a space.
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)
    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope
        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=graph_vars["loss"].name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program,
                                           num_trainers=nccl2_num_trainers,
                                           trainer_id=nccl2_trainer_id)
        train_pyreader.set_batch_generator(train_data_generator)
    else:
        train_exe = None
    if args.do_val or args.do_test:
        test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                          main_program=test_prog,
                                          share_vars_from=train_exe)
    if args.do_train:
        train_pyreader.start()
        steps = 0
        graph_vars["learning_rate"] = scheduled_lr
        time_begin = time.time()
        while True:
            try:
                steps += 1
                # Cheap run with no fetches between logging steps.
                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[])
                else:
                    fetch_list = [
                        graph_vars["num_infer"].name,
                        graph_vars["num_label"].name,
                        graph_vars["num_correct"].name,
                        graph_vars["loss"].name,
                        graph_vars['learning_rate'].name,
                    ]
                    out = train_exe.run(fetch_list=fetch_list)
                    num_infer, num_label, num_correct, np_loss, np_lr = out
                    lr = float(np_lr[0])
                    loss = np_loss.mean()
                    precision, recall, f1 = calculate_f1(
                        num_label, num_infer, num_correct)
                    if args.verbose:
                        log.info(
                            "train pyreader queue size: %d, learning rate: %f"
                            % (train_pyreader.queue.size(),
                               lr if warmup_steps > 0 else args.learning_rate))
                    current_example, current_epoch = reader.get_train_progress(
                    )
                    time_end = time.time()
                    used_time = time_end - time_begin
                    log.info(
                        "epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                        "f1: %f, precision: %f, recall: %f, speed: %f steps/s"
                        % (current_epoch, current_example, num_train_examples,
                           steps, loss, f1, precision, recall,
                           args.skip_steps / used_time))
                    time_begin = time.time()
                if nccl2_trainer_id == 0 and steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    latest_path = os.path.join(
                        args.checkpoints, "latest"
                    )  # Always save the current copy and cover with the latest copy
                    fluid.io.save_persistables(exe, save_path, train_program)
                    fluid.io.save_persistables(exe, latest_path, train_program)
                if nccl2_trainer_id == 0 and steps % args.validation_steps == 0:
                    # NOTE(review): `current_epoch` is only bound on
                    # skip_steps iterations; if validation_steps is not a
                    # multiple of skip_steps this can raise NameError on
                    # the first validation — confirm the configured values.
                    # evaluate dev set
                    if args.do_val:
                        evaluate_wrapper(reader, exe, test_prog, test_pyreader,
                                         graph_vars, current_epoch, steps)
                    # evaluate test set
                    if args.do_test:
                        predict_wrapper(reader, exe, test_prog, test_pyreader,
                                        graph_vars, current_epoch, steps)
            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break
    # final eval on dev set
    if nccl2_trainer_id == 0 and args.do_val:
        current_example, current_epoch = reader.get_train_progress()
        evaluate_wrapper(reader, exe, test_prog, test_pyreader, graph_vars,
                         current_epoch, 'final')
    if nccl2_trainer_id == 0 and args.do_test:
        current_example, current_epoch = reader.get_train_progress()
        predict_wrapper(reader, exe, test_prog, test_pyreader, graph_vars,
                        current_epoch, 'final')
def train(args):
    """Finetune / predict entry point for an ERNIE MRC-style task.

    Depending on ``args.do_train`` / ``args.do_predict`` this builds a training
    program, a test program, or both, restores weights, runs the training loop
    with periodic logging/checkpointing, and finally runs prediction over every
    file matched by ``args.predict_file`` glob patterns.
    """
    ernie_config = ErnieConfig(args.ernie_config)
    ernie_config.print_config()

    if not (args.do_train or args.do_predict):
        raise ValueError("For args `do_train` and `do_predict`, at "
                         "least one of them must be True.")

    # Device selection: single fixed GPU (device 0) or CPU_NUM CPU threads.
    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    processor = DataProcessor(vocab_path=args.vocab_path,
                              do_lower_case=args.do_lower_case,
                              max_seq_length=args.max_seq_len,
                              in_tokens=args.in_tokens,
                              doc_stride=args.doc_stride,
                              max_query_length=args.max_query_length)

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = processor.data_generator(
            data_path=args.train_file,
            batch_size=args.batch_size,
            phase='train',
            shuffle=True,
            dev_count=dev_count,
            version_2_with_negative=args.version_2_with_negative,
            epoch=args.epoch)

        num_train_examples = processor.get_num_examples(phase='train')
        # In token mode, batch_size is a token budget, so the number of
        # examples per batch is batch_size // max_seq_len.
        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size) // dev_count
        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_data_loader, loss, num_seqs = create_model(
                    ernie_config=ernie_config, is_training=True)

                # optimization() appends the optimizer ops and returns the lr
                # variable plus the fp16 loss-scaling variable.
                scheduled_lr, loss_scaling = optimization(
                    loss=loss,
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio)

    if args.do_predict:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_data_loader, unique_ids, start_logits, end_logits, num_seqs = create_model(
                    ernie_config=ernie_config, is_training=False)

        test_prog = test_prog.clone(for_test=True)

    exe.run(startup_prog)

    # Weight restoration: a full checkpoint takes precedence over
    # pretraining params; prediction-only mode requires a checkpoint.
    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_predict:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing prediction!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_compiled_program = fluid.CompiledProgram(
            train_program).with_data_parallel(loss_name=loss.name,
                                              exec_strategy=exec_strategy)

        train_data_loader.set_batch_generator(train_data_generator, place)

        train_data_loader.start()
        steps = 0
        total_cost, total_num_seqs = [], []
        time_begin = time.time()
        while True:
            try:
                steps += 1
                # Only fetch metrics every skip_steps steps; fetching is
                # expensive because it synchronizes the devices.
                if steps % args.skip_steps == 0:
                    if args.use_fp16:
                        fetch_list = [
                            loss.name, scheduled_lr.name, num_seqs.name,
                            loss_scaling.name
                        ]
                    else:
                        fetch_list = [
                            loss.name, scheduled_lr.name, num_seqs.name
                        ]
                else:
                    fetch_list = []

                outputs = exe.run(train_compiled_program,
                                  fetch_list=fetch_list)

                if steps % args.skip_steps == 0:
                    # Unpack order must match fetch_list above.
                    if args.use_fp16:
                        np_loss, np_lr, np_num_seqs, np_scaling = outputs
                    else:
                        np_loss, np_lr, np_num_seqs = outputs
                    # Weight the (mean) loss by sequence count so the report
                    # is a per-sequence average across devices.
                    total_cost.extend(np_loss * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train data_loader queue size: %d, " % train_data_loader.queue.size(
                        )
                        verbose += "learning rate: %f " % np_lr[0]
                        if args.use_fp16:
                            verbose += ", loss scaling: %f" % np_scaling[0]
                        print(verbose)

                    time_end = time.time()
                    used_time = time_end - time_begin
                    current_example, epoch = processor.get_train_progress()

                    print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                          "speed: %f steps/s" %
                          (epoch, current_example, num_train_examples, steps,
                           np.sum(total_cost) / np.sum(total_num_seqs),
                           args.skip_steps / used_time))
                    total_cost, total_num_seqs = [], []
                    time_begin = time.time()

                if steps % args.save_steps == 0 or steps == max_train_steps:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)
            except fluid.core.EOFException:
                # Data exhausted: save a final checkpoint and stop.
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps) + "_final")
                fluid.io.save_persistables(exe, save_path, train_program)
                train_data_loader.reset()
                break

    if args.do_predict:
        # Expand every glob pattern in predict_file and predict per file,
        # using the file's basename (minus .json) as the output prefix.
        input_files = []
        for input_pattern in args.predict_file:
            input_files.extend(glob.glob(input_pattern))
        assert len(input_files) > 0, 'Can not find predict_file {}'.format(
            args.predict_file)
        for input_file in input_files:
            print('Run prediction on {}'.format(input_file))
            prefix = os.path.basename(input_file)
            # NOTE(review): '.json' is an unescaped regex here, so it matches
            # any char followed by "json", not only a literal dot.
            prefix = re.sub('.json', '', prefix)

            test_data_loader.set_batch_generator(
                processor.data_generator(data_path=input_file,
                                         batch_size=args.batch_size,
                                         phase='predict',
                                         shuffle=False,
                                         dev_count=1,
                                         epoch=1), place)

            predict(exe,
                    test_prog,
                    test_data_loader, [
                        unique_ids.name, start_logits.name, end_logits.name,
                        num_seqs.name
                    ],
                    processor,
                    prefix=prefix)
def main(args):
    """Train/evaluate a BERT-based dialogue-task model.

    Selects a task-specific data processor (udc/swda/mrda/atis_slot/
    atis_intent/dstc2), builds train and/or test programs, restores weights,
    runs the training loop with periodic dev/test evaluation, emits CE kpi
    lines when enabled, and finishes with a final dev/test evaluation.
    """
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    task_name = args.task_name.lower()
    paradigm_inst = define_paradigm.Paradigm(task_name)

    # Per-task reader class and whether batch_size counts tokens (True)
    # or examples (False).
    processors = {
        'udc': reader.UDCProcessor,
        'swda': reader.SWDAProcessor,
        'mrda': reader.MRDAProcessor,
        'atis_slot': reader.ATISSlotProcessor,
        'atis_intent': reader.ATISIntentProcessor,
        'dstc2': reader.DSTC2Processor,
    }
    in_tokens = {
        'udc': True,
        'swda': True,
        'mrda': True,
        'atis_slot': False,
        'atis_intent': True,
        'dstc2': True,
    }

    processor = processors[task_name](data_dir=args.data_dir,
                                      vocab_path=args.vocab_path,
                                      max_seq_len=args.max_seq_len,
                                      do_lower_case=args.do_lower_case,
                                      in_tokens=in_tokens[task_name],
                                      task_name=task_name,
                                      random_seed=args.random_seed)
    num_labels = len(processor.get_labels())

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = processor.data_generator(
            batch_size=args.batch_size,
            phase='train',
            epoch=args.epoch,
            shuffle=True)
        num_train_examples = processor.get_num_examples(phase='train')

        if in_tokens[task_name]:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        if args.random_seed is not None:
            train_program.random_seed = args.random_seed

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                # create_model returns a dict; "accuracy" may be absent for
                # tasks whose paradigm does not compute it.
                results = create_model(
                    args,
                    pyreader_name='train_reader',
                    bert_config=bert_config,
                    num_labels=num_labels,
                    paradigm_inst=paradigm_inst)
                train_pyreader = results.get("pyreader", None)
                loss = results.get("loss", None)
                probs = results.get("probs", None)
                accuracy = results.get("accuracy", None)
                num_seqs = results.get("num_seqs", None)
                scheduled_lr = optimization(
                    loss=loss,
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    loss_scaling=args.loss_scaling)

                # Keep the fetched variables out of memory optimization so
                # they remain readable after each run.
                if accuracy is not None:
                    skip_opt_set = [loss.name, probs.name, accuracy.name, num_seqs.name]
                else:
                    skip_opt_set = [loss.name, probs.name, num_seqs.name]
                fluid.memory_optimize(
                    input_program=train_program, skip_opt_set=skip_opt_set)

        if args.verbose:
            if in_tokens[task_name]:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                # NOTE: these rebind loss/probs/accuracy/num_seqs to the test
                # graph's variables when do_train is False.
                test_results = create_model(
                    args,
                    pyreader_name='test_reader',
                    bert_config=bert_config,
                    num_labels=num_labels,
                    paradigm_inst=paradigm_inst)
                test_pyreader = test_results.get("pyreader", None)
                loss = test_results.get("loss", None)
                probs = test_results.get("probs", None)
                accuracy = test_results.get("accuracy", None)
                num_seqs = test_results.get("num_seqs", None)
        test_prog = test_prog.clone(for_test=True)

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(
                exe,
                args.init_checkpoint,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(
                exe,
                args.init_pretraining_params,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(
            exe,
            args.init_checkpoint,
            main_program=startup_prog,
            use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            loss_name=loss.name,
            exec_strategy=exec_strategy,
            main_program=train_program)

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    if args.do_val or args.do_test:
        # share_vars_from=train_exe is None in eval-only mode; the test
        # executor then owns its own copy of the restored weights.
        test_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            main_program=test_prog,
            share_vars_from=train_exe)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        total_cost, total_acc, total_num_seqs = [], [], []
        time_begin = time.time()
        ce_info = []
        while True:
            try:
                steps += 1
                # Fetch metrics only every skip_steps steps; the fetch_list
                # shape depends on warmup (lr var) and accuracy availability.
                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        if accuracy is not None:
                            fetch_list = [loss.name, accuracy.name, num_seqs.name]
                        else:
                            fetch_list = [loss.name, num_seqs.name]
                    else:
                        if accuracy is not None:
                            fetch_list = [
                                loss.name, accuracy.name, scheduled_lr.name,
                                num_seqs.name
                            ]
                        else:
                            fetch_list = [loss.name, scheduled_lr.name, num_seqs.name]
                else:
                    fetch_list = []
                # NOTE(review): fetch_test_list is only assigned inside this
                # training loop; final eval below relies on do_train having run.
                if accuracy is not None:
                    fetch_test_list = [loss.name, accuracy.name, num_seqs.name]
                else:
                    fetch_test_list = [loss.name, num_seqs.name]

                outputs = train_exe.run(fetch_list=fetch_list)

                if steps % args.skip_steps == 0:
                    # Unpack order mirrors the fetch_list construction above.
                    if warmup_steps <= 0:
                        if accuracy is not None:
                            np_loss, np_acc, np_num_seqs = outputs
                        else:
                            np_loss, np_num_seqs = outputs
                    else:
                        if accuracy is not None:
                            np_loss, np_acc, np_lr, np_num_seqs = outputs
                        else:
                            np_loss, np_lr, np_num_seqs = outputs

                    total_cost.extend(np_loss * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)
                    if accuracy is not None:
                        total_acc.extend(np_acc * np_num_seqs)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size()
                        verbose += "learning rate: %f" % (
                            np_lr[0]
                            if warmup_steps > 0 else args.learning_rate)
                        print(verbose)

                    current_example, current_epoch = processor.get_train_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin
                    current_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                                 time.localtime(time.time()))
                    if accuracy is not None:
                        print("%s epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                              "ave acc: %f, speed: %f steps/s" %
                              (current_time, current_epoch, current_example,
                               num_train_examples, steps,
                               np.sum(total_cost) / np.sum(total_num_seqs),
                               np.sum(total_acc) / np.sum(total_num_seqs),
                               args.skip_steps / used_time))
                        ce_info.append([np.sum(total_cost) / np.sum(total_num_seqs),
                                        np.sum(total_acc) / np.sum(total_num_seqs),
                                        args.skip_steps / used_time])
                    else:
                        print("%s epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                              "speed: %f steps/s" %
                              (current_time, current_epoch, current_example,
                               num_train_examples, steps,
                               np.sum(total_cost) / np.sum(total_num_seqs),
                               args.skip_steps / used_time))
                        ce_info.append([np.sum(total_cost) / np.sum(total_num_seqs),
                                        args.skip_steps / used_time])
                    total_cost, total_acc, total_num_seqs = [], [], []
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps % args.validation_steps == 0:
                    #evaluate dev set
                    if args.do_val:
                        test_pyreader.decorate_tensor_provider(
                            processor.data_generator(
                                batch_size=args.batch_size,
                                phase='dev',
                                epoch=1,
                                shuffle=False))
                        evaluate(test_exe, test_prog, test_pyreader,
                                 fetch_test_list, "dev")
                    #evaluate test set
                    if args.do_test:
                        test_pyreader.decorate_tensor_provider(
                            processor.data_generator(
                                batch_size=args.batch_size,
                                phase='test',
                                epoch=1,
                                shuffle=False))
                        evaluate(test_exe, test_prog, test_pyreader,
                                 fetch_test_list, "test")
            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # Continuous-evaluation kpi reporting; reads the second-to-last metric
    # sample and deliberately tolerates a too-short ce_info.
    if args.do_train and args.enable_ce:
        card_num = get_cards()
        print("zytest_card_num", card_num)
        ce_loss = 0
        ce_acc = 0
        ce_time = 0
        try:
            ce_loss = ce_info[-2][0]
            ce_acc = ce_info[-2][1]
            ce_time = ce_info[-2][2]
        except:
            # NOTE(review): bare except silently covers IndexError and
            # anything else; defaults above are reported instead.
            print("ce info error")
        print("kpis\teach_step_duration_%s_card%s\t%s" %
              (task_name, card_num, ce_time))
        print("kpis\ttrain_loss_%s_card%s\t%f" %
              (task_name, card_num, ce_loss))
        print("kpis\ttrain_acc_%s_card%s\t%f" %
              (task_name, card_num, ce_acc))

    #final eval on dev set
    if args.do_val:
        test_pyreader.decorate_tensor_provider(
            processor.data_generator(
                batch_size=args.batch_size,
                phase='dev',
                epoch=1,
                shuffle=False))
        print("Final validation result:")
        evaluate(test_exe, test_prog, test_pyreader, fetch_test_list, "dev")

    #final eval on test set
    if args.do_test:
        test_pyreader.decorate_tensor_provider(
            processor.data_generator(
                batch_size=args.batch_size,
                phase='test',
                epoch=1,
                shuffle=False))
        print("Final test result:")
        evaluate(test_exe, test_prog, test_pyreader, fetch_test_list, "test")
def main(args):
    """Train/evaluate an ERNIE sequence-classification (or regression) model.

    Builds train and/or test programs, restores weights, then trains with an
    accuracy-gated checkpoint policy: whenever train accuracy improves (or
    exceeds 0.95) it evaluates on dev, and only saves + predicts when the dev
    accuracy beats the previous best. Optionally runs the glue-ax diagnostic
    set at the end and writes its predictions to ``args.diagnostic_save``.
    """
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    reader = task_reader.ClassifyReader(vocab_path=args.vocab_path,
                                        label_map_config=args.label_map_config,
                                        max_seq_len=args.max_seq_len,
                                        do_lower_case=args.do_lower_case,
                                        in_tokens=args.in_tokens,
                                        random_seed=args.random_seed,
                                        tokenizer=args.tokenizer,
                                        is_classify=args.is_classify,
                                        is_regression=args.is_regression,
                                        for_cn=args.for_cn,
                                        task_id=args.task_id)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    if args.do_test:
        assert args.test_save is not None

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.predict_batch_size is None:
        args.predict_batch_size = args.batch_size

    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            dev_count=dev_count,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        # Token-budget vs example-count batch sizing.
        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        """
        if args.random_seed is not None and args.enable_ce:
            train_program.random_seed = args.random_seed
        """

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config,
                    is_classify=args.is_classify,
                    is_regression=args.is_regression)
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                # NOTE: graph_vars is rebound to the test graph when
                # do_train is False.
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config,
                    is_classify=args.is_classify,
                    is_regression=args.is_regression)
        test_prog = test_prog.clone(for_test=True)

    # Single-process run: distributed trainer count/id are fixed here.
    nccl2_num_trainers = 1
    nccl2_trainer_id = 0

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=graph_vars["loss"].name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program,
                                           num_trainers=nccl2_num_trainers,
                                           trainer_id=nccl2_trainer_id)

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    test_exe = exe
    if args.do_val or args.do_test:
        if args.use_multi_gpu_test:
            test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                              main_program=test_prog,
                                              share_vars_from=train_exe)

    steps = 10000
    current_epoch = 1
    if args.do_train:
        train_pyreader.start()
        steps = 0
        if warmup_steps > 0:
            # Expose the scheduled lr so evaluate() can report it.
            graph_vars["learning_rate"] = scheduled_lr

        ce_info = []
        time_begin = time.time()
        last_epoch = 0
        current_epoch = 0
        # Gating thresholds: only evaluate on dev when train accuracy
        # improves (or exceeds 0.95); only save when dev accuracy improves.
        previous_eval_acc = 0.80
        previous_train_acc = 0.90
        while True:
            try:
                steps += 1
                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[])
                else:
                    outputs = evaluate(train_exe,
                                       train_program,
                                       train_pyreader,
                                       graph_vars,
                                       "train",
                                       metric=args.metric,
                                       is_classify=args.is_classify,
                                       is_regression=args.is_regression)
                    acc = outputs["accuracy"]
                    if acc > previous_train_acc or acc > 0.95:
                        print(
                            "previous train accuracy is %f and current train accuracy is %f "
                            % (previous_train_acc, acc))
                        previous_train_acc = acc
                        eval_acc = evaluate_wrapper(args, reader, exe,
                                                    test_prog, test_pyreader,
                                                    graph_vars, current_epoch,
                                                    steps)
                        print(
                            "previous evaluate accuracy is %f and current evaluate accuracy is %f "
                            % (previous_eval_acc, eval_acc))
                        if eval_acc > previous_eval_acc:
                            previous_eval_acc = eval_acc
                            # Checkpoint dir is named after the fractional
                            # part of the dev accuracy, e.g. "evalacc_8123".
                            save_path = os.path.join(
                                args.checkpoints,
                                "evalacc_" + str(eval_acc).split('.')[1])
                            fluid.io.save_persistables(exe, save_path,
                                                       train_program)
                            predict_wrapper(args,
                                            reader,
                                            exe,
                                            test_prog,
                                            test_pyreader,
                                            graph_vars,
                                            current_epoch,
                                            steps="evalacc_" +
                                            str(eval_acc).split('.')[1])
                            print(
                                "predict and save model!!!!!!!!!!!!!!!!!!!!!!!!!! in %s"
                                % (save_path))

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            outputs["learning_rate"]
                            if warmup_steps > 0 else args.learning_rate)
                        print(verbose)

                    current_example, current_epoch = reader.get_train_progress(
                    )
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print(
                        "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                        "ave acc: %f, speed: %f steps/s" %
                        (current_epoch, current_example, num_train_examples,
                         steps, outputs["loss"], outputs["accuracy"],
                         args.skip_steps / used_time))
                    ce_info.append(
                        [outputs["loss"], outputs["accuracy"], used_time])
                    time_begin = time.time()
                # Periodic step-based save/eval intentionally disabled in
                # favour of the accuracy-gated policy above.
                # if steps % args.save_steps == 0:
                #     save_path = os.path.join(args.checkpoints,
                #                              "step_" + str(steps))
                #     fluid.io.save_persistables(exe, save_path, train_program)

                # if steps % args.validation_steps == 0 or last_epoch != current_epoch:
                #     # evaluate dev set
                #     if args.do_val:
                #         ret=evaluate_wrapper(args, reader, exe, test_prog,
                #                          test_pyreader, graph_vars,
                #                          current_epoch, steps)
                #     if args.do_test:
                #         predict_wrapper(args, reader, exe,
                #                 test_prog, test_pyreader, graph_vars,
                #                 current_epoch, steps)

                if last_epoch != current_epoch:
                    last_epoch = current_epoch

            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    # if args.do_val:
    #     evaluate_wrapper(args, reader, exe, test_prog, test_pyreader,
    #                      graph_vars, current_epoch, steps)

    # final eval on test set
    steps = 0
    # if args.do_test:
    #     current_epoch = 0
    #     predict_wrapper(args, reader, exe, test_prog, test_pyreader, graph_vars,
    #                     current_epoch, steps)

    # final eval on dianostic, hack for glue-ax
    if args.diagnostic:
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(args.diagnostic,
                                  batch_size=args.batch_size,
                                  epoch=1,
                                  dev_count=1,
                                  shuffle=False))

        print("Final diagnostic")
        qids, preds, probs = predict(test_exe,
                                     test_prog,
                                     test_pyreader,
                                     graph_vars,
                                     is_classify=args.is_classify,
                                     is_regression=args.is_regression)
        assert len(qids) == len(preds), '{} v.s. {}'.format(
            len(qids), len(preds))
        # TSV output: qid, predicted label, probability.
        with open(args.diagnostic_save, 'w') as f:
            for id, s, p in zip(qids, preds, probs):
                f.write('{}\t{}\t{}\n'.format(id, s, p))
        print("Done final diagnostic, saving to {}".format(
            args.diagnostic_save))
def train(args):
    """BERT pretraining entry point (masked LM + next-sentence prediction).

    Builds train/test programs, optionally transpiles for nccl2 multi-node
    training, then runs the pretraining loop with periodic logging,
    checkpointing via ``fluid.save`` and optional validation-set evaluation.
    Supports an nGraph CPU execution path selected via FLAGS_use_ngraph.
    """
    print("pretraining start")
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    train_program = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            train_data_loader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                bert_config=bert_config)
            scheduled_lr, loss_scaling = optimization(
                loss=total_loss,
                warmup_steps=args.warmup_steps,
                num_train_steps=args.num_train_steps,
                learning_rate=args.learning_rate,
                train_program=train_program,
                startup_prog=startup_prog,
                weight_decay=args.weight_decay,
                scheduler=args.lr_scheduler,
                use_fp16=args.use_fp16,
                use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                init_loss_scaling=args.init_loss_scaling,
                incr_every_n_steps=args.incr_every_n_steps,
                decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                incr_ratio=args.incr_ratio,
                decr_ratio=args.decr_ratio)

    test_prog = fluid.Program()
    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            # Rebinds the metric variables to the test graph; the train
            # graph's variables were already consumed by optimization().
            test_data_loader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                bert_config=bert_config)

    test_prog = test_prog.clone(for_test=True)

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    print("Device count %d" % dev_count)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    print("args.is_distributed:", args.is_distributed)
    if args.is_distributed:
        # Endpoints come from the launcher via env vars; this trainer's id
        # is its position in the comma-separated worker list.
        worker_endpoints_env = os.getenv("worker_endpoints")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)
        current_endpoint = os.getenv("current_endpoint")
        trainer_id = worker_endpoints.index(current_endpoint)
        if trainer_id == 0:
            # Give the other workers time to come up before rank 0 starts.
            print("train_id == 0, sleep 60s")
            time.sleep(60)
        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
              trainer_id:{}".format(worker_endpoints, trainers_num,
                                    current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(trainer_id,
                    trainers=worker_endpoints_env,
                    current_endpoint=current_endpoint,
                    program=train_program,
                    startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.init_checkpoint and args.init_checkpoint != "":
        init_checkpoint(exe, args.init_checkpoint, train_program,
                        args.use_fp16)

    data_reader = DataReader(data_dir=args.data_dir,
                             batch_size=args.batch_size,
                             in_tokens=args.in_tokens,
                             vocab_path=args.vocab_path,
                             voc_size=bert_config['vocab_size'],
                             epoch=args.epoch,
                             max_seq_len=args.max_seq_len,
                             generate_neg_sample=args.generate_neg_sample)

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.use_experimental_executor = args.use_fast_executor
    exec_strategy.num_threads = dev_count
    exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

    build_strategy = fluid.BuildStrategy()
    if not sys.platform == "win32":
        build_strategy.num_trainers = nccl2_num_trainers
    elif nccl2_num_trainers > 1:
        raise ValueError(
            "Windows platform doesn't support distributed training!")
    build_strategy.trainer_id = nccl2_trainer_id

    # use_ngraph is for CPU only, please refer to README_ngraph.md for details
    use_ngraph = os.getenv('FLAGS_use_ngraph')
    if not use_ngraph:
        train_compiled_program = fluid.CompiledProgram(
            train_program).with_data_parallel(loss_name=total_loss.name,
                                              exec_strategy=exec_strategy,
                                              build_strategy=build_strategy)

    if args.validation_set_dir and args.validation_set_dir != "":
        # predict_wrapper returns a closure that runs a full pass over the
        # validation set when called.
        predict = predict_wrapper(args,
                                  exe,
                                  bert_config,
                                  test_prog=test_prog,
                                  data_loader=test_data_loader,
                                  fetch_list=[
                                      next_sent_acc.name, mask_lm_loss.name,
                                      total_loss.name
                                  ])

    train_data_loader.set_batch_generator(data_reader.data_generator())
    train_data_loader.start()
    steps = 0
    cost = []
    lm_cost = []
    acc = []
    time_begin = time.time()
    while steps < args.num_train_steps:
        try:
            steps += 1
            skip_steps = args.skip_steps * nccl2_num_trainers

            # Only rank 0 fetches and logs; other ranks just run.
            if nccl2_trainer_id != 0:
                if use_ngraph:
                    exe.run(fetch_list=[], program=train_program)
                else:
                    exe.run(fetch_list=[], program=train_compiled_program)
                continue

            if steps % args.skip_steps != 0:
                if use_ngraph:
                    exe.run(fetch_list=[], program=train_program)
                else:
                    exe.run(fetch_list=[], program=train_compiled_program)

            else:
                fetch_list = [
                    next_sent_acc.name, mask_lm_loss.name, total_loss.name,
                    scheduled_lr.name
                ]
                if args.use_fp16:
                    fetch_list.append(loss_scaling.name)

                if use_ngraph:
                    outputs = exe.run(fetch_list=fetch_list,
                                      program=train_program)
                else:
                    outputs = exe.run(fetch_list=fetch_list,
                                      program=train_compiled_program)

                # Unpack order must match fetch_list above.
                if args.use_fp16:
                    each_next_acc, each_mask_lm_cost, each_total_cost, np_lr, np_scaling = outputs
                else:
                    each_next_acc, each_mask_lm_cost, each_total_cost, np_lr = outputs

                acc.extend(each_next_acc)
                lm_cost.extend(each_mask_lm_cost)
                cost.extend(each_total_cost)

                time_end = time.time()
                used_time = time_end - time_begin
                epoch, current_file_index, total_file, current_file = data_reader.get_progress(
                )
                if args.verbose:
                    verbose = "feed_queue size: %d, " % train_data_loader.queue.size(
                    )
                    verbose += "current learning_rate: %f, " % np_lr[0]
                    if args.use_fp16:
                        verbose += "loss scaling: %f" % np_scaling[0]
                    print(verbose)
                # ppl is exp of the mean masked-LM loss accumulated since
                # the last report.
                print(
                    "epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                    "ppl: %f, next_sent_acc: %f, speed: %f steps/s, file: %s"
                    % (epoch, current_file_index, total_file, steps,
                       np.mean(np.array(cost)),
                       np.mean(np.exp(np.array(lm_cost))),
                       np.mean(np.array(acc)), skip_steps / used_time,
                       current_file))
                cost = []
                lm_cost = []
                acc = []
                time_begin = time.time()

            if steps % args.save_steps == 0:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.save(program=train_program, model_path=save_path)

            if args.validation_set_dir and steps % args.validation_steps == 0:
                vali_cost, vali_lm_cost, vali_acc, vali_steps, vali_speed = predict(
                )
                print("[validation_set] epoch: %d, step: %d, "
                      "loss: %f, global ppl: %f, batch-averged ppl: %f, "
                      "next_sent_acc: %f, speed: %f steps/s" %
                      (epoch, steps,
                       np.mean(np.array(vali_cost) / vali_steps),
                       np.exp(np.mean(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.exp(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.array(vali_acc) / vali_steps), vali_speed))
        except fluid.core.EOFException:
            train_data_loader.reset()
            break
def train(args):
    """Train and/or run SQuAD-style prediction for an XLNet model.

    Builds the fluid startup/train/test programs, optionally restores a
    checkpoint or pretrained params, runs the training loop with a
    ParallelExecutor, and finally runs `predict` on the test program.
    At least one of `args.do_train` / `args.do_predict` must be True.
    """
    if not (args.do_train or args.do_predict):
        raise ValueError("For args `do_train` and `do_predict`, at "
                         "least one of them must be True.")

    xlnet_config = XLNetConfig(args.model_config_path)
    xlnet_config.print_config()

    # Device selection: single GPU (device 0) or CPU; dev_count feeds the
    # data generator and the executor thread count.
    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    processor = DataProcessor(spiece_model_file=args.spiece_model_file,
                              uncased=args.uncased,
                              max_seq_length=args.max_seq_length,
                              doc_stride=args.doc_stride,
                              max_query_length=args.max_query_length)

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = processor.data_generator(
            data_path=args.train_file,
            batch_size=args.train_batch_size,
            phase='train',
            shuffle=True,
            dev_count=dev_count,
            epoch=args.epoch)

        num_train_examples = processor.get_num_examples(phase='train')
        print("Device count: %d" % dev_count)
        print("Max num of epoches: %d" % args.epoch)
        print("Num of train examples: %d" % num_train_examples)
        print("Num of train steps: %d" % args.train_steps)
        print("Num of warmup steps: %d" % args.warmup_steps)

        # Build the training graph and attach the optimizer inside the
        # program/unique-name guards so variable names stay consistent.
        train_program = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_data_loader, loss = create_model(
                    xlnet_config=xlnet_config, is_training=True)
                scheduled_lr = optimization(
                    loss=loss,
                    warmup_steps=args.warmup_steps,
                    num_train_steps=args.train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    lr_layer_decay_rate=args.lr_layer_decay_rate,
                    scheduler=args.lr_scheduler)

    if args.do_predict:
        # Inference graph shares parameters with training via startup_prog.
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_data_loader, predictions = create_model(
                    xlnet_config=xlnet_config, is_training=False)
        test_prog = test_prog.clone(for_test=True)

    exe.run(startup_prog)

    if args.do_train:
        # 'init_checkpoint' takes precedence when both init args are set.
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog)
        elif args.init_pretraining_params:
            init_pretraining_params(exe, args.init_pretraining_params,
                                    main_program=startup_prog)
    elif args.do_predict:
        if not args.init_checkpoint:
            # NOTE(review): adjacent string literals concatenate with no
            # separator, producing "...should be set ifonly doing prediction!"
            # — missing space between "if" and "only".
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing prediction!")
        init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        build_strategy = fluid.BuildStrategy()
        # These two flags must be set in this model for correctness
        build_strategy.fuse_all_optimizer_ops = True
        build_strategy.enable_inplace = False

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=loss.name,
                                           exec_strategy=exec_strategy,
                                           build_strategy=build_strategy,
                                           main_program=train_program)

        train_data_loader.set_batch_generator(train_data_generator, place)
        train_data_loader.start()

        steps = 0
        total_cost = []
        time_begin = time.time()
        print("Begin to train model ...")
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
        while steps < args.train_steps:
            try:
                steps += 1
                # Only fetch loss/lr every skip_steps steps; fetching is
                # expensive, so other steps run with an empty fetch list.
                if steps % args.skip_steps == 0:
                    fetch_list = [loss.name, scheduled_lr.name]
                else:
                    fetch_list = []

                outputs = train_exe.run(fetch_list=fetch_list)

                if steps % args.skip_steps == 0:
                    np_loss, np_lr = outputs
                    total_cost.extend(np_loss)
                    if args.verbose:
                        verbose = "train data_loader queue size: %d, " % train_data_loader.queue.size(
                        )
                        verbose += "learning rate: %f " % np_lr[0]
                        print(verbose)

                    time_end = time.time()
                    used_time = time_end - time_begin
                    current_example, epoch = processor.get_train_progress()
                    print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                          "speed: %f steps/s" %
                          (epoch, current_example, num_train_examples, steps,
                           np.mean(total_cost), args.skip_steps / used_time))
                    total_cost = []
                    time_begin = time.time()

                if steps % args.save_steps == 0 or steps == args.train_steps:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)
            except fluid.core.EOFException:
                # Data exhausted before train_steps reached: save and stop.
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps) + "_final")
                fluid.io.save_persistables(exe, save_path, train_program)
                train_data_loader.reset()
                break
        print("Finish model training ...")
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))

    if args.do_predict:
        print("Begin to do prediction ...")
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
        test_data_loader.set_batch_generator(
            processor.data_generator(data_path=args.predict_file,
                                     batch_size=args.predict_batch_size,
                                     phase='predict',
                                     shuffle=False,
                                     dev_count=1,
                                     epoch=1), place)
        predict(exe, test_prog, test_data_loader, [
            predictions['unique_ids'].name,
            predictions['start_top_log_probs'].name,
            predictions['start_top_index'].name,
            predictions['end_top_log_probs'].name,
            predictions['end_top_index'].name, predictions['cls_logits'].name
        ], processor, name='')
        print("Finish prediction ...")
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
def train(args):
    """Train a BERT SQuAD model with periodic adversarial-set evaluation.

    Builds train/test fluid programs, restores a checkpoint or pretrained
    params, then trains; every save interval (after 1/3 of training) it
    evaluates on `args.predict_file` and, when the adversarial F1 improves,
    saves a "step_best" checkpoint and also scores the matching test file.
    At least one of `args.do_train` / `args.do_predict` must be True.
    """
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if not (args.do_train or args.do_predict):
        raise ValueError("For args `do_train` and `do_predict`, at "
                         "least one of them must be True.")

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    processor = DataProcessor(
        vocab_path=args.vocab_path,
        do_lower_case=args.do_lower_case,
        max_seq_length=args.max_seq_len,
        in_tokens=args.in_tokens,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        adv_text_path=args.adv_text_path)

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = processor.data_generator(
            data_path=args.train_file,
            batch_size=args.batch_size,
            phase='train',
            shuffle=True,
            dev_count=dev_count,
            version_2_with_negative=args.version_2_with_negative,
            epoch=args.epoch)

        num_train_examples = processor.get_num_examples(phase='train')
        # In token mode, batch_size is a token budget, so the per-step example
        # count is batch_size // max_seq_len.
        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size) // dev_count
        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, loss, num_seqs = create_model(
                    pyreader_name='train_reader',
                    bert_config=bert_config,
                    is_training=True)

                scheduled_lr = optimization(
                    loss=loss,
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    loss_scaling=args.loss_scaling)

                # Deprecated fluid memory optimization pass; loss/num_seqs are
                # excluded so their buffers are not reused.
                fluid.memory_optimize(train_program,
                                      skip_opt_set=[loss.name, num_seqs.name])

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_predict:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, unique_ids, start_logits, end_logits, num_seqs = create_model(
                    pyreader_name='test_reader',
                    bert_config=bert_config,
                    is_training=False)

                fluid.memory_optimize(test_prog,
                                      skip_opt_set=[
                                          unique_ids.name, start_logits.name,
                                          end_logits.name, num_seqs.name
                                      ])

        test_prog = test_prog.clone(for_test=True)

    exe.run(startup_prog)

    if args.do_train:
        # 'init_checkpoint' wins when both init args are given.
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(
                exe,
                args.init_checkpoint,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(
                exe,
                args.init_pretraining_params,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
    elif args.do_predict:
        if not args.init_checkpoint:
            # NOTE(review): adjacent literals concatenate to
            # "...should be set ifonly doing prediction!" — missing space.
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing prediction!")
        init_checkpoint(
            exe,
            args.init_checkpoint,
            main_program=startup_prog,
            use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            loss_name=loss.name,
            exec_strategy=exec_strategy,
            main_program=train_program)

        train_pyreader.decorate_tensor_provider(train_data_generator)

        train_pyreader.start()
        steps = 0
        total_cost, total_num_seqs = [], []
        time_begin = time.time()
        best_f1 = -1
        while steps < max_train_steps:
            try:
                steps += 1
                # Fetch metrics only every skip_steps; without warmup the lr
                # is constant, so scheduled_lr is not fetched.
                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        fetch_list = [loss.name, num_seqs.name]
                    else:
                        fetch_list = [
                            loss.name, scheduled_lr.name, num_seqs.name
                        ]
                else:
                    fetch_list = []

                outputs = train_exe.run(fetch_list=fetch_list)

                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        np_loss, np_num_seqs = outputs
                    else:
                        np_loss, np_lr, np_num_seqs = outputs
                    # Weight loss by sequence count so the running average is
                    # per-example, not per-batch.
                    total_cost.extend(np_loss * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            np_lr[0] if warmup_steps > 0 else args.learning_rate)
                        print(verbose)

                    time_end = time.time()
                    used_time = time_end - time_begin
                    current_example, epoch = processor.get_train_progress()
                    print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                          "speed: %f steps/s" %
                          (epoch, current_example, num_train_examples, steps,
                           np.sum(total_cost) / np.sum(total_num_seqs),
                           args.skip_steps / used_time))
                    total_cost, total_num_seqs = [], []
                    time_begin = time.time()

                # Only evaluate/save after the first third of training.
                if (steps % args.save_steps == 0 or steps == max_train_steps) and steps > int(max_train_steps/3.0):
                #if (steps % args.save_steps == 0 or steps == max_train_steps):
                    if args.do_predict:
                        test_pyreader.decorate_tensor_provider(
                            processor.data_generator(
                                data_path=args.predict_file,
                                batch_size=args.batch_size,
                                phase='predict',
                                shuffle=False,
                                dev_count=1,
                                epoch=1))
                        adv_f1 = predict(exe, test_prog, test_pyreader, [
                            unique_ids.name, start_logits.name,
                            end_logits.name, num_seqs.name
                        ], processor)
                        # print(adv_f1)
                        # continue
                        # if steps != max_train_steps:
                        if adv_f1 > best_f1:
                            # New best adversarial F1: checkpoint and also
                            # score the corresponding test split.
                            best_f1 = adv_f1
                            save_path = os.path.join(args.checkpoints,
                                                     "step_best")
                            print("best adv model saved")
                            # else:
                            #     save_path = os.path.join(args.checkpoints,
                            #                              "step_last")
                            fluid.io.save_persistables(exe, save_path,
                                                       train_program)
                            # NOTE(review): assumes the dev file path contains
                            # "dev" to derive the test file — TODO confirm.
                            test_pyreader.decorate_tensor_provider(
                                processor.data_generator(
                                    data_path=args.predict_file.replace(
                                        "dev", "test"),
                                    batch_size=args.batch_size,
                                    phase='predict',
                                    shuffle=False,
                                    dev_count=1,
                                    epoch=1))
                            test_f1 = predict(exe, test_prog, test_pyreader, [
                                unique_ids.name, start_logits.name,
                                end_logits.name, num_seqs.name
                            ], processor, args.predict_file.replace(
                                "dev", "test"))
                            print("This is the test score.")
            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps) + "_final")
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # Standalone prediction (no training in this run).
    if args.do_predict and not args.do_train:
        test_pyreader.decorate_tensor_provider(
            processor.data_generator(
                data_path=args.predict_file,
                batch_size=args.batch_size,
                phase='predict',
                shuffle=False,
                dev_count=1,
                epoch=1))
        predict(exe, test_prog, test_pyreader, [
            unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
        ], processor)
def main(args): ernie_config = ErnieConfig(args.ernie_config_path) ernie_config.print_config() if args.use_cuda: dev_list = fluid.cuda_places() place = dev_list[0] dev_count = len(dev_list) else: place = fluid.CPUPlace() dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) exe = fluid.Executor(place) reader = reader_de_infer.ClassifyReader( vocab_path=args.vocab_path, label_map_config=args.label_map_config, q_max_seq_len=args.q_max_seq_len, p_max_seq_len=args.p_max_seq_len, do_lower_case=args.do_lower_case, in_tokens=args.in_tokens, random_seed=args.random_seed, tokenizer=args.tokenizer, for_cn=args.for_cn, task_id=args.task_id) assert args.test_save is not None startup_prog = fluid.Program() test_prog = fluid.Program() with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): test_pyreader, graph_vars = create_model( args, pyreader_name='test_reader', ernie_config=ernie_config, batch_size=args.batch_size, is_prediction=True) test_prog = test_prog.clone(for_test=True) exe = fluid.Executor(place) exe.run(startup_prog) if not args.init_checkpoint: raise ValueError("args 'init_checkpoint' should be set if" "only doing validation or testing!") init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog) test_sets = args.test_set.split(',') save_dirs = args.test_save.split(',') assert len(test_sets) == len(save_dirs) batch_size = args.batch_size if args.predict_batch_size is None else args.predict_batch_size for test_f, save_f in zip(test_sets, save_dirs): test_pyreader.decorate_tensor_provider( reader.data_generator(test_f, batch_size=batch_size, epoch=1, dev_count=1, shuffle=False)) save_path = save_f log.info("testing {}, save to {}".format(test_f, save_path)) predict(args, exe, test_prog, test_pyreader, graph_vars, output_item=args.output_item, output_file_name=args.output_file_name, hidden_size=ernie_config['hidden_size'])
def main(args):
    """Train / evaluate a UNIMO classification model, optionally distributed.

    Supports single-GPU, CPU, and NCCL2 multi-trainer training via
    DistributeTranspiler; evaluates dev/test/test_hard splits during and
    after training and reports the best validation result on trainer 0.
    At least one of do_train / do_val / do_test / do_test_hard must be True.
    """
    model_config = UNIMOConfig(args.unimo_config_path)
    model_config.print_config()

    # In distributed mode each process is pinned to the first GPU listed in
    # FLAGS_selected_gpus; otherwise all visible GPUs are counted.
    gpu_id = 0
    gpus = fluid.core.get_cuda_device_count()
    if args.is_distributed and os.getenv("FLAGS_selected_gpus") is not None:
        gpu_list = os.getenv("FLAGS_selected_gpus").split(",")
        gpus = len(gpu_list)
        gpu_id = int(gpu_list[0])

    if args.use_cuda:
        place = fluid.CUDAPlace(gpu_id)
        dev_count = gpus
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    tokenizer = GptBpeTokenizer(vocab_file=args.unimo_vocab_file,
                                encoder_json_file=args.encoder_json_file,
                                vocab_bpe_file=args.vocab_bpe_file,
                                do_lower_case=args.do_lower_case)

    if not (args.do_train or args.do_val or args.do_test or args.do_test_hard):
        raise ValueError(
            "For args `do_train`, `do_val`, `do_test`, `do_test_hard`, at "
            "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    trainers_num = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))

    if args.do_train:
        train_data_reader = ClassifyReader(args.train_filelist, args.max_seq_len,
                                           tokenizer)
        train_data_generator = train_data_reader.data_generator(
            batch_size=args.batch_size, epoch=args.epoch, phase="train")

        # Allow overriding the example count to avoid a full pass over data.
        if args.num_train_examples:
            num_train_examples = args.num_train_examples
        else:
            num_train_examples = train_data_reader.get_num_examples()

        step_num_per_epoch = num_train_examples // args.batch_size // trainers_num
        max_train_steps = args.epoch * step_num_per_epoch
        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d, gpu_id: %d" % (dev_count, gpu_id))
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    config=model_config,
                    pyreader_name="train_reader",
                    is_train=True)
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    beta1=args.beta1,
                    beta2=args.beta2,
                    epsilon=args.epsilon)

    if args.do_val or args.do_test or args.do_test_hard:
        # One shared inference program serves dev, test and test_hard.
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, test_graph_vars = create_model(
                    args,
                    config=model_config,
                    pyreader_name="dev_reader",
                    is_train=False)
        test_prog = test_prog.clone(for_test=True)

        if args.do_val:
            dev_data_reader = ClassifyReader(args.dev_filelist, args.max_seq_len,
                                             tokenizer)
            dev_data_generator = dev_data_reader.data_generator(
                batch_size=args.test_batch_size, epoch=1, phase="dev")

        if args.do_test:
            test_data_reader = ClassifyReader(args.test_filelist, args.max_seq_len,
                                              tokenizer)
            test_data_generator = test_data_reader.data_generator(
                batch_size=args.test_batch_size, epoch=1, phase="test")

        if args.do_test_hard:
            test_hard_data_reader = ClassifyReader(args.test_hard_filelist,
                                                   args.max_seq_len, tokenizer)
            test_hard_data_generator = test_hard_data_reader.data_generator(
                batch_size=args.test_batch_size, epoch=1, phase="test_hard")

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    print("args.is_distributed:", args.is_distributed)
    if args.is_distributed:
        # Multi-trainer topology comes from the Paddle cloud environment.
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)
        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
            trainer_id:{}".format(worker_endpoints, trainers_num,
                                  current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        if args.nccl_comm_num > 1:
            config.nccl_comm_num = args.nccl_comm_num
        if args.use_hierarchical_allreduce and trainers_num > args.hierarchical_allreduce_inter_nranks:
            config.use_hierarchical_allreduce = args.use_hierarchical_allreduce
            config.hierarchical_allreduce_inter_nranks = args.hierarchical_allreduce_inter_nranks
            assert config.hierarchical_allreduce_inter_nranks > 1
            assert trainers_num % config.hierarchical_allreduce_inter_nranks == 0
            # NOTE(review): true division yields a float here; an integer
            # rank count ("//") looks intended — TODO confirm transpiler
            # accepts/coerces a float.
            config.hierarchical_allreduce_exter_nranks = \
                trainers_num / config.hierarchical_allreduce_inter_nranks

        t = fluid.DistributeTranspiler(config=config)
        t.transpile(trainer_id,
                    trainers=worker_endpoints_env,
                    current_endpoint=current_endpoint,
                    program=train_program if args.do_train else test_prog,
                    startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe, args.init_checkpoint,
                            main_program=train_program)
        elif args.init_pretraining_params:
            init_pretraining_params(exe, args.init_pretraining_params,
                                    main_program=train_program)
    elif args.do_val or args.do_test or args.do_test_hard:
        # NOTE(review): this unconditionally overwrites args.init_checkpoint
        # with args.init_pretraining_params — a checkpoint passed via
        # 'init_checkpoint' alone is discarded (and triggers the ValueError
        # below). Looks unintentional — TODO confirm.
        args.init_checkpoint = args.init_pretraining_params
        if not args.init_checkpoint:
            # NOTE(review): adjacent literals concatenate to
            # "...should be set ifonly doing..." — missing space.
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = 4 if args.use_fp16 else 2
        exec_strategy.num_iteration_per_drop_scope = min(
            args.num_iteration_per_drop_scope, args.skip_steps)

        build_strategy = fluid.BuildStrategy()
        build_strategy.remove_unnecessary_lock = False

        if args.use_fuse:
            build_strategy.fuse_all_reduce_ops = True

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=graph_vars["loss"].name,
                                           build_strategy=build_strategy,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program,
                                           num_trainers=nccl2_num_trainers,
                                           trainer_id=nccl2_trainer_id)
        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    if args.do_val or args.do_test or args.do_test_hard:
        # Shares parameters with the training executor when one exists.
        test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                          main_program=test_prog,
                                          share_vars_from=train_exe)

    dev_ret_history = []  # (steps, key_eval, eval)
    test_ret_history = []  # (steps, key_eval, eval)
    test_hard_ret_history = []  # (steps, key_eval, eval)
    steps = 0
    if args.do_train:
        train_pyreader.start()
        time_begin = time.time()
        skip_steps = args.skip_steps
        while True:
            try:
                steps += 1
                # Fetch loss/lr only every skip_steps; other steps run with
                # an empty fetch list for speed.
                if steps % skip_steps == 0:
                    train_fetch_list = [
                        graph_vars["loss"].name, scheduled_lr.name
                    ]
                    res = train_exe.run(fetch_list=train_fetch_list)
                    outputs = {
                        "loss": np.mean(res[0]),
                        'learning_rate': float(res[1][0])
                    }
                    if args.verbose:
                        verbose = "train pyreader queue size: %d, learning_rate: %.10f" % \
                            (train_pyreader.queue.size(), outputs['learning_rate'])
                        print(verbose)

                    current_epoch, current_example, current_file_index, total_file, current_file = \
                        train_data_reader.get_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print("%s - epoch: %d, progress: %d/%d, %d/%d, step: %d, ave loss: %f, speed: %f steps/s" % \
                          (get_time(), current_epoch, current_example, num_train_examples, current_file_index, \
                           total_file, steps, outputs["loss"], args.skip_steps / used_time))
                    time_begin = time.time()
                else:
                    train_exe.run(fetch_list=[])

                # Only trainer 0 saves checkpoints.
                if nccl2_trainer_id == 0:
                    if steps % args.save_steps == 0 and args.save_checkpoints:
                        save_path = os.path.join(args.checkpoints,
                                                 "step_" + str(steps))
                        fluid.io.save_persistables(exe, save_path,
                                                   train_program)

                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        test_pyreader.decorate_tensor_provider(
                            dev_data_generator)
                        outputs = evaluate(args, test_exe, test_pyreader, test_graph_vars, \
                                           "dev", trainers_num, nccl2_trainer_id)
                        if nccl2_trainer_id == 0:
                            dev_ret_history.append(
                                (steps, outputs['key_eval'],
                                 outputs[outputs['key_eval']]))
                    # evaluate test set
                    if args.do_test:
                        test_pyreader.decorate_tensor_provider(
                            test_data_generator)
                        outputs = evaluate(args, test_exe, test_pyreader, test_graph_vars, \
                                           "test", trainers_num, nccl2_trainer_id)
                        if nccl2_trainer_id == 0:
                            test_ret_history.append(
                                (steps, outputs['key_eval'],
                                 outputs[outputs['key_eval']]))
                    # evaluate test set
                    if args.do_test_hard:
                        test_pyreader.decorate_tensor_provider(
                            test_hard_data_generator)
                        outputs = evaluate(args, test_exe, test_pyreader, test_graph_vars, \
                                           "test_hard", trainers_num, nccl2_trainer_id)
                        if nccl2_trainer_id == 0:
                            test_hard_ret_history.append(
                                (steps, outputs['key_eval'],
                                 outputs[outputs['key_eval']]))
            except fluid.core.EOFException:
                # End of training data: save final state and exit the loop.
                if args.save_checkpoints:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        test_pyreader.decorate_tensor_provider(dev_data_generator)
        outputs = evaluate(args, test_exe, test_pyreader, test_graph_vars,
                           "dev", trainers_num, nccl2_trainer_id)
        if nccl2_trainer_id == 0:
            dev_ret_history.append(
                (steps, outputs['key_eval'], outputs[outputs['key_eval']]))

    # final eval on test set
    if args.do_test:
        test_pyreader.decorate_tensor_provider(test_data_generator)
        outputs = evaluate(args, test_exe, test_pyreader, test_graph_vars,
                           "test", trainers_num, nccl2_trainer_id)
        if nccl2_trainer_id == 0:
            test_ret_history.append(
                (steps, outputs['key_eval'], outputs[outputs['key_eval']]))

    # final eval on test_hard set
    if args.do_test_hard:
        test_pyreader.decorate_tensor_provider(test_hard_data_generator)
        outputs = evaluate(args, test_exe, test_pyreader, test_graph_vars,
                           "test_hard", trainers_num, nccl2_trainer_id)
        if nccl2_trainer_id == 0:
            test_hard_ret_history.append(
                (steps, outputs['key_eval'], outputs[outputs['key_eval']]))

    if nccl2_trainer_id == 0:
        # Report the best dev-set score seen during/after training.
        if args.do_val:
            dev_ret_history = sorted(dev_ret_history,
                                     key=lambda a: a[2],
                                     reverse=True)
            print("Best validation result: step %d %s %f" % \
                  (dev_ret_history[0][0], dev_ret_history[0][1], dev_ret_history[0][2]))
def main(args):
    """Train / evaluate / test an ERNIE classifier using the fleet API.

    Collective (multi-worker) training is driven by PaddleCloudRoleMaker;
    only worker 0 fetches metrics, saves checkpoints and runs validation.
    Also supports a final "diagnostic" prediction pass (GLUE-AX hack).
    At least one of do_train / do_val / do_test must be True.
    """
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        dev_list = fluid.cuda_places()
        place = dev_list[0]
        dev_count = len(dev_list)
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    exe = fluid.Executor(place)

    reader = reader_ce.ClassifyReader(vocab_path=args.vocab_path,
                                      label_map_config=args.label_map_config,
                                      max_seq_len=args.max_seq_len,
                                      total_num=args.train_data_size,
                                      do_lower_case=args.do_lower_case,
                                      in_tokens=args.in_tokens,
                                      random_seed=args.random_seed,
                                      tokenizer=args.tokenizer,
                                      for_cn=args.for_cn,
                                      task_id=args.task_id)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    if args.do_test:
        assert args.test_save is not None

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    # NOTE(review): "== None" — PEP 8 prefers "is None".
    if args.predict_batch_size == None:
        args.predict_batch_size = args.batch_size

    if args.do_train:
        # Collective training: one device per worker, so dev_count becomes
        # the fleet worker count.
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        dev_count = fleet.worker_num()

        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            dev_count=1,
            trainer_id=fleet.worker_index(),
            trainer_num=fleet.worker_num(),
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        # Token-budget batches divide the budget by max_seq_len per example.
        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        log.info("Device count: %d" % dev_count)
        log.info("Num train examples: %d" % num_train_examples)
        log.info("Max train steps: %d" % max_train_steps)
        log.info("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        # use fleet api
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        if args.is_distributed:
            exec_strategy.num_threads = 3
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        dist_strategy = DistributedStrategy()
        dist_strategy.exec_strategy = exec_strategy
        dist_strategy.nccl_comm_num = 1
        if args.is_distributed:
            dist_strategy.nccl_comm_num = 2
            dist_strategy.use_hierarchical_allreduce = True

        if args.use_mix_precision:
            dist_strategy.use_amp = True

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config)
                scheduled_lr = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio,
                    dist_strategy=dist_strategy)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            log.info("Theoretical memory usage in training: %.3f - %.3f %s" %
                     (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config,
                    is_prediction=True)

        test_prog = test_prog.clone(for_test=True)

    # After fleet's optimizer pass, the runnable program is fleet's.
    # NOTE(review): unguarded — presumably valid only when do_train set up
    # fleet; verify the eval-only path.
    train_program = fleet.main_program

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            log.warning(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe, args.init_checkpoint,
                            main_program=startup_prog)
        elif args.init_pretraining_params:
            init_pretraining_params(exe, args.init_pretraining_params,
                                    main_program=startup_prog)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            # NOTE(review): adjacent literals concatenate to
            # "...should be set ifonly doing..." — missing space.
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog)

    if args.do_train:
        train_exe = exe
        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None
    test_exe = exe
    # if args.do_val or args.do_test:
    #     if args.use_multi_gpu_test:
    #         test_exe = fluid.ParallelExecutor(
    #             use_cuda=args.use_cuda,
    #             main_program=test_prog,
    #             share_vars_from=train_exe)

    current_epoch = 0
    steps = 0
    if args.do_train:
        train_pyreader.start()
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr
        ce_info = []
        time_begin = time.time()
        last_epoch = 0
        while True:
            try:
                steps += 1
                # log.info("step: %d" % steps)

                # Non-zero workers just run the program; worker 0 handles
                # metrics, checkpointing and validation below.
                if fleet.worker_index() != 0:
                    train_exe.run(fetch_list=[], program=train_program)
                    continue

                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[], program=train_program)

                else:
                    outputs = evaluate(train_exe,
                                       train_program,
                                       train_pyreader,
                                       graph_vars,
                                       "train",
                                       metric=args.metric)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            outputs["learning_rate"]
                            if warmup_steps > 0 else args.learning_rate)
                        log.info(verbose)

                    current_example, current_epoch = reader.get_train_progress(
                    )
                    time_end = time.time()
                    used_time = time_end - time_begin

                    log.info(
                        "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                        "ave acc: %f, speed: %f steps/s" %
                        (current_epoch, current_example * dev_count,
                         num_train_examples, steps, outputs["loss"],
                         outputs["accuracy"], args.skip_steps / used_time))
                    ce_info.append(
                        [outputs["loss"], outputs["accuracy"], used_time])
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    # Save the pre-transpile program so checkpoints are
                    # loadable outside fleet.
                    fluid.io.save_persistables(exe, save_path,
                                               fleet._origin_program)

                # if steps % args.validation_steps == 0 or last_epoch != current_epoch:
                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        evaluate_wrapper(args, reader, exe, test_prog,
                                         test_pyreader, graph_vars,
                                         current_epoch, steps)
                    if args.do_test:
                        predict_wrapper(args, reader, exe, test_prog,
                                        test_pyreader, graph_vars,
                                        current_epoch, steps)

                if last_epoch != current_epoch:
                    last_epoch = current_epoch

            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path,
                                           fleet._origin_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        evaluate_wrapper(args, reader, exe, test_prog, test_pyreader,
                         graph_vars, current_epoch, steps)

    # final eval on test set
    if args.do_test:
        predict_wrapper(args, reader, exe, test_prog, test_pyreader,
                        graph_vars, current_epoch, steps)

    # final eval on dianostic, hack for glue-ax
    if args.diagnostic:
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(args.diagnostic,
                                  batch_size=args.batch_size,
                                  epoch=1,
                                  dev_count=1,
                                  shuffle=False))

        log.info("Final diagnostic")
        qids, preds, probs = predict(test_exe, test_prog, test_pyreader,
                                     graph_vars)
        assert len(qids) == len(preds), '{} v.s. {}'.format(
            len(qids), len(preds))
        # NOTE(review): loop variable "id" shadows the builtin.
        with open(args.diagnostic_save, 'w') as f:
            for id, s, p in zip(qids, preds, probs):
                f.write('{}\t{}\t{}\n'.format(id, s, p))
        log.info("Done final diagnostic, saving to {}".format(
            args.diagnostic_save))
def train(args):
    """Fine-tune / evaluate a BERT reading-comprehension model enriched with
    WordNet and NELL concept embeddings (KT-NET-style SQuAD runner).

    Driven entirely by ``args`` flags: ``do_train`` builds and runs the
    training program; ``do_val`` / ``do_predict`` build a test program and run
    prediction. Raises ValueError if none of the three is set, or if only
    predicting without ``init_checkpoint``.
    """
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if not (args.do_train or args.do_predict or args.do_val):
        raise ValueError("For args `do_train` and `do_predict`, at "
                         "least one of them must be True.")

    # Device selection: one executor place; dev_count scales step accounting
    # and the ParallelExecutor thread count below.
    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    # Pre-trained KB concept embedding matrices (WordNet + NELL); the
    # matrices are copied into the program's parameters after startup below.
    wn_id2concept, wn_concept2id, wn_concept_embedding_mat = read_concept_embedding(
        args.wn_concept_embedding_path)
    nell_id2concept, nell_concept2id, nell_concept_embedding_mat = read_concept_embedding(
        args.nell_concept_embedding_path)

    processor = DataProcessor(vocab_path=args.vocab_path,
                              do_lower_case=args.do_lower_case,
                              max_seq_length=args.max_seq_len,
                              in_tokens=args.in_tokens,
                              doc_stride=args.doc_stride,
                              max_query_length=args.max_query_length)

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed
        random.seed(args.random_seed)
        np.random.seed(args.random_seed)

    if args.do_train:
        # Paths/settings for retrieving KB concepts per training token.
        train_concept_settings = {
            'tokenization_path':
            '../retrieve_concepts/tokenization_squad/tokens/train.tokenization.{}.data'
            .format('uncased' if args.do_lower_case else 'cased'),
            'wn_concept2id': wn_concept2id,
            'nell_concept2id': nell_concept2id,
            'use_wordnet': args.use_wordnet,
            'retrieved_synset_path': args.retrieved_synset_path,
            'use_nell': args.use_nell,
            'retrieved_nell_concept_path': args.train_retrieved_nell_concept_path,
        }
        train_data_generator = processor.data_generator(
            data_path=args.train_file,
            batch_size=args.batch_size,
            phase='train',
            shuffle=True,
            dev_count=dev_count,
            version_2_with_negative=args.version_2_with_negative,
            epoch=args.epoch,
            **train_concept_settings)

        num_train_examples = processor.get_num_examples(phase='train')
        # In token-budget mode, batch_size is a token count, so the effective
        # batch is batch_size // max_seq_len sequences.
        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size) // dev_count
        warmup_steps = int(max_train_steps * args.warmup_proportion)
        logger.info("Device count: %d" % dev_count)
        logger.info("Num train examples: %d" % num_train_examples)
        logger.info("Max train steps: %d" % max_train_steps)
        logger.info("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        # if args.random_seed is not None:
        #     train_program.random_seed = args.random_seed
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, loss, num_seqs = create_model(
                    pyreader_name='train_reader',
                    bert_config=bert_config,
                    max_wn_concept_length=processor.
                    train_wn_max_concept_length,
                    max_nell_concept_length=processor.
                    train_nell_max_concept_length,
                    wn_concept_embedding_mat=wn_concept_embedding_mat,
                    nell_concept_embedding_mat=nell_concept_embedding_mat,
                    is_training=True,
                    freeze=args.freeze)

                scheduled_lr = optimization(loss=loss,
                                            warmup_steps=warmup_steps,
                                            num_train_steps=max_train_steps,
                                            learning_rate=args.learning_rate,
                                            train_program=train_program,
                                            startup_prog=startup_prog,
                                            weight_decay=args.weight_decay,
                                            scheduler=args.lr_scheduler,
                                            use_fp16=args.use_fp16,
                                            loss_scaling=args.loss_scaling)

                if args.use_ema:
                    # EMA shadow variables are updated each step inside the
                    # train program; applied at predict time below.
                    ema = fluid.optimizer.ExponentialMovingAverage(
                        args.ema_decay)
                    ema.update()

                # Must not optimize away the fetched outputs.
                fluid.memory_optimize(train_program,
                                      skip_opt_set=[loss.name, num_seqs.name])

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            logger.info(
                "Theoretical memory usage in training: %.3f - %.3f %s" %
                (lower_mem, upper_mem, unit))

    if args.do_predict or args.do_val:
        eval_concept_settings = {
            'tokenization_path':
            '../retrieve_concepts/tokenization_squad/tokens/dev.tokenization.{}.data'
            .format('uncased' if args.do_lower_case else 'cased'),
            'wn_concept2id': wn_concept2id,
            'nell_concept2id': nell_concept2id,
            'use_wordnet': args.use_wordnet,
            'retrieved_synset_path': args.retrieved_synset_path,
            'use_nell': args.use_nell,
            'retrieved_nell_concept_path': args.dev_retrieved_nell_concept_path,
        }
        eval_data_generator = processor.data_generator(
            data_path=args.predict_file,
            batch_size=args.batch_size,
            phase='predict',
            shuffle=False,
            dev_count=1,
            epoch=1,
            **eval_concept_settings)

        test_prog = fluid.Program()
        # if args.random_seed is not None:
        #     test_prog.random_seed = args.random_seed
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, unique_ids, start_logits, end_logits, num_seqs = create_model(
                    pyreader_name='test_reader',
                    bert_config=bert_config,
                    max_wn_concept_length=processor.
                    predict_wn_max_concept_length,
                    max_nell_concept_length=processor.
                    predict_nell_max_concept_length,
                    wn_concept_embedding_mat=wn_concept_embedding_mat,
                    nell_concept_embedding_mat=nell_concept_embedding_mat,
                    is_training=False)

                # dir() with no args lists names in the *current local scope*;
                # this only creates the EMA object if the training branch
                # above did not already bind `ema` (predict-only run).
                if args.use_ema and 'ema' not in dir():
                    ema = fluid.optimizer.ExponentialMovingAverage(
                        args.ema_decay)

                fluid.memory_optimize(test_prog,
                                      skip_opt_set=[
                                          unique_ids.name, start_logits.name,
                                          end_logits.name, num_seqs.name
                                      ])

        test_prog = test_prog.clone(for_test=True)
        # if args.random_seed is not None:
        #     test_prog.random_seed = args.random_seed

    # Run parameter initializers before loading any checkpoint.
    exe.run(startup_prog)

    if args.do_train:
        # Overwrite the randomly-initialized concept embedding parameters
        # with the pre-trained KB matrices loaded above.
        logger.info('load pretrained concept embedding')
        fluid.global_scope().find_var('wn_concept_emb_mat').get_tensor().set(
            wn_concept_embedding_mat, place)
        fluid.global_scope().find_var('nell_concept_emb_mat').get_tensor().set(
            nell_concept_embedding_mat, place)

        if args.init_checkpoint and args.init_pretraining_params:
            logger.info(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_predict or args.do_val:
        if not args.init_checkpoint:
            # NOTE(review): adjacent literals concatenate without a space —
            # message reads "...set ifonly doing prediction!".
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing prediction!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=loss.name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program)

        train_pyreader.decorate_tensor_provider(train_data_generator)

        train_pyreader.start()
        steps = 0
        total_cost, total_num_seqs = [], []
        time_begin = time.time()
        # Main train loop: runs until max_train_steps or the reader raises
        # EOFException (data exhausted), whichever comes first.
        while steps < max_train_steps:
            try:
                steps += 1
                # Only fetch metrics every skip_steps steps; otherwise run
                # with an empty fetch list for speed.
                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        fetch_list = [loss.name, num_seqs.name]
                    else:
                        fetch_list = [
                            loss.name, scheduled_lr.name, num_seqs.name
                        ]
                else:
                    fetch_list = []

                outputs = train_exe.run(fetch_list=fetch_list)

                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        np_loss, np_num_seqs = outputs
                    else:
                        np_loss, np_lr, np_num_seqs = outputs
                    # Accumulate per-sequence-weighted loss for averaging.
                    total_cost.extend(np_loss * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            np_lr[0]
                            if warmup_steps > 0 else args.learning_rate)
                        logger.info(verbose)

                    time_end = time.time()
                    used_time = time_end - time_begin
                    current_example, epoch = processor.get_train_progress()
                    logger.info(
                        "epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                        "speed: %f steps/s" %
                        (epoch, current_example, num_train_examples, steps,
                         np.sum(total_cost) / np.sum(total_num_seqs),
                         args.skip_steps / used_time))
                    total_cost, total_num_seqs = [], []
                    time_begin = time.time()

                if steps % args.save_steps == 0 or steps == max_train_steps:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps % args.validation_steps == 0 or steps == max_train_steps:
                    if args.do_val:
                        # Re-decorate the test reader each validation pass so
                        # it restarts from the beginning of the dev set.
                        test_pyreader.decorate_tensor_provider(
                            processor.data_generator(
                                data_path=args.predict_file,
                                batch_size=args.batch_size,
                                phase='predict',
                                shuffle=False,
                                dev_count=1,
                                epoch=1,
                                **eval_concept_settings))
                        val_performance = predict(
                            exe, test_prog, test_pyreader, [
                                unique_ids.name, start_logits.name,
                                end_logits.name, num_seqs.name
                            ], processor, eval_concept_settings,
                            'validate_result_step_{}.json'.format(steps))
                        logger.info(
                            "Validation performance after step {}:\n* Exact_match: {}\n* F1: {}"
                            .format(steps, val_performance['exact_match'],
                                    val_performance['f1']))

            except fluid.core.EOFException:
                # Reader exhausted: save a final checkpoint and stop.
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps) + "_final")
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    if args.do_predict:
        test_pyreader.decorate_tensor_provider(eval_data_generator)

        if args.use_ema:
            # Temporarily swap in EMA-averaged weights for prediction.
            with ema.apply(exe):
                eval_performance = predict(exe, test_prog, test_pyreader, [
                    unique_ids.name, start_logits.name, end_logits.name,
                    num_seqs.name
                ], processor, eval_concept_settings)
        else:
            eval_performance = predict(exe, test_prog, test_pyreader, [
                unique_ids.name, start_logits.name, end_logits.name,
                num_seqs.name
            ], processor, eval_concept_settings)

        logger.info("Eval performance:\n* Exact_match: {}\n* F1: {}".format(
            eval_performance['exact_match'], eval_performance['f1']))
def main(args):
    """Fine-tune / evaluate an ERNIE sequence-labeling model for role
    (argument) extraction.

    Builds train/test fluid programs per ``args`` flags, initializes weights
    from a checkpoint or pretraining params, runs the training loop with
    periodic save/validation, and finishes with a final dev evaluation and/or
    test prediction dump.

    NOTE(review): `labels_map`, `dev_count`, `place` and `ernie_config` are
    read but never defined in this function — presumably module-level
    globals set up elsewhere in the script; verify before reuse.
    """
    reader = task_reader.RoleSequenceLabelReader(
        vocab_path=args.vocab_path,
        labels_map=labels_map,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed,
        task_id=args.task_id)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        # In token-budget mode, batch_size counts tokens, so it must hold at
        # least one max-length sequence.
        if args.in_tokens:
            if args.batch_size < args.max_seq_len:
                raise ValueError(
                    'if in_tokens=True, batch_size should greater than max_sqelen, got batch_size:%d seqlen:%d'
                    % (args.batch_size, args.max_seq_len))

            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                # create_model returns the feeding pyreader plus a dict of
                # graph variables (loss, counters, ...) fetched below.
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config)
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config)
        test_prog = test_prog.clone(for_test=True)

    # Single-trainer defaults (no NCCL2 distribution in this entry point).
    nccl2_num_trainers = 1
    nccl2_trainer_id = 0

    exe = fluid.Executor(place)
    # Run initializers before any checkpoint loading.
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            # NOTE(review): adjacent literals concatenate without a space —
            # message reads "...set ifonly doing validation or testing!".
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=graph_vars["loss"].name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program,
                                           num_trainers=nccl2_num_trainers,
                                           trainer_id=nccl2_trainer_id)
        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    if args.do_val or args.do_test:
        # Shares parameters with the training executor (train_exe must exist
        # in this configuration).
        test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                          main_program=test_prog,
                                          share_vars_from=train_exe)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        graph_vars["learning_rate"] = scheduled_lr
        time_begin = time.time()
        # Train loop: terminates on EOFException from the pyreader.
        while True:
            try:
                steps += 1
                # Only fetch and log metrics every skip_steps steps.
                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[])
                else:
                    fetch_list = [
                        graph_vars["num_infer"].name,
                        graph_vars["num_label"].name,
                        graph_vars["num_correct"].name,
                        graph_vars["loss"].name,
                        graph_vars['learning_rate'].name,
                    ]

                    out = train_exe.run(fetch_list=fetch_list)
                    num_infer, num_label, num_correct, np_loss, np_lr = out
                    lr = float(np_lr[0])
                    loss = np_loss.mean()
                    precision, recall, f1 = calculate_f1(
                        num_label, num_infer, num_correct)
                    if args.verbose:
                        print(
                            "train pyreader queue size: %d, learning rate: %f"
                            % (train_pyreader.queue.size(),
                               lr if warmup_steps > 0 else args.learning_rate))

                    current_example, current_epoch = reader.get_train_progress(
                    )
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print(
                        u"【train】epoch: {}, step: {}, loss: {:.6f}, "
                        "f1: {:.4f}, precision: {:.4f}, recall: {:.4f}, speed: {:.3f} steps/s"
                        .format(current_epoch, steps, float(loss), float(f1),
                                float(precision), float(recall),
                                args.skip_steps / used_time))
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        precision, recall, f1 = evaluate_wrapper(
                            reader, exe, test_prog, test_pyreader, graph_vars,
                            current_epoch, steps)
                        print(
                            u"【dev】precision {:.4f} , recall {:.4f}, f1-score {:.4f}"
                            .format(float(precision), float(recall),
                                    float(f1)))
                    # evaluate test set
                    if args.do_test:
                        precision, recall, f1 = evaluate_wrapper(
                            reader, exe, test_prog, test_pyreader, graph_vars,
                            current_epoch, steps)
                        print(
                            u"【test】precision {:.4f} , recall {:.4f}, f1-score {:.4f}"
                            .format(float(precision), float(recall),
                                    float(f1)))

            except fluid.core.EOFException:
                # Data exhausted: save final model and stop.
                save_path = os.path.join(args.checkpoints, "final_model")
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        precision, recall, f1 = evaluate_wrapper(reader, exe, test_prog,
                                                 test_pyreader, graph_vars, 1,
                                                 'final')
        print(u"【dev】precision {:.4f} , recall {:.4f}, f1-score {:.4f}".format(
            float(precision), float(recall), float(f1)))

    if args.do_test:
        # Dump test-set predictions for downstream scoring.
        test_ret = predict_wrapper(reader, exe, test_prog, test_pyreader,
                                   graph_vars, 1, 'final')
        utils.write_by_lines(args.trigger_pred_save_path, test_ret)
def main(args):
    """Fine-tune / evaluate a BERT classifier on a GLUE-style task
    (XNLI/CoLA/MRPC/MNLI).

    Selects a task-specific data processor, builds train/test fluid programs
    per ``args`` flags, loads weights, runs the training loop with periodic
    save/eval, then performs final dev/test evaluations.
    """
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    task_name = args.task_name.lower()
    # Dispatch table: task name -> reader/processor class.
    processors = {
        'xnli': reader.XnliProcessor,
        'cola': reader.ColaProcessor,
        'mrpc': reader.MrpcProcessor,
        'mnli': reader.MnliProcessor,
    }

    processor = processors[task_name](data_dir=args.data_dir,
                                      vocab_path=args.vocab_path,
                                      max_seq_len=args.max_seq_len,
                                      do_lower_case=args.do_lower_case,
                                      in_tokens=args.in_tokens,
                                      random_seed=args.random_seed)
    num_labels = len(processor.get_labels())

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = processor.data_generator(
            batch_size=args.batch_size,
            phase='train',
            epoch=args.epoch,
            shuffle=True)

        num_train_examples = processor.get_num_examples(phase='train')

        # Token-budget mode: batch_size counts tokens, not sequences.
        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, loss, probs, accuracy, num_seqs = create_model(
                    args,
                    pyreader_name='train_reader',
                    bert_config=bert_config,
                    num_labels=num_labels)
                scheduled_lr = optimization(loss=loss,
                                            warmup_steps=warmup_steps,
                                            num_train_steps=max_train_steps,
                                            learning_rate=args.learning_rate,
                                            train_program=train_program,
                                            startup_prog=startup_prog,
                                            weight_decay=args.weight_decay,
                                            scheduler=args.lr_scheduler,
                                            use_fp16=args.use_fp16,
                                            loss_scaling=args.loss_scaling)

                # Fetched variables must survive memory optimization.
                fluid.memory_optimize(input_program=train_program,
                                      skip_opt_set=[
                                          loss.name, probs.name,
                                          accuracy.name, num_seqs.name
                                      ])

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                # NOTE(review): rebinds loss/probs/accuracy/num_seqs to the
                # test program's variables; the train loop below relies on
                # variable *names* being shared between programs.
                test_pyreader, loss, probs, accuracy, num_seqs = create_model(
                    args,
                    pyreader_name='test_reader',
                    bert_config=bert_config,
                    num_labels=num_labels)

        test_prog = test_prog.clone(for_test=True)

    # Run initializers before loading a checkpoint.
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            # NOTE(review): adjacent literals concatenate without a space —
            # message reads "...set ifonly doing validation or testing!".
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=loss.name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program)

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    if args.do_val or args.do_test:
        test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                          main_program=test_prog,
                                          share_vars_from=train_exe)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        total_cost, total_acc, total_num_seqs = [], [], []
        time_begin = time.time()
        # Train loop: terminates on EOFException from the pyreader.
        while True:
            try:
                steps += 1
                # Fetch metrics only every skip_steps steps; the learning
                # rate is fetched only when a warmup scheduler is active.
                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        fetch_list = [loss.name, accuracy.name, num_seqs.name]
                    else:
                        fetch_list = [
                            loss.name, accuracy.name, scheduled_lr.name,
                            num_seqs.name
                        ]
                else:
                    fetch_list = []

                outputs = train_exe.run(fetch_list=fetch_list)

                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        np_loss, np_acc, np_num_seqs = outputs
                    else:
                        np_loss, np_acc, np_lr, np_num_seqs = outputs

                    # Sequence-count-weighted accumulation for averaging.
                    total_cost.extend(np_loss * np_num_seqs)
                    total_acc.extend(np_acc * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            np_lr[0]
                            if warmup_steps > 0 else args.learning_rate)
                        print(verbose)

                    current_example, current_epoch = processor.get_train_progress(
                    )
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print("epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                          "ave acc: %f, speed: %f steps/s" %
                          (current_epoch, current_example, num_train_examples,
                           steps, np.sum(total_cost) / np.sum(total_num_seqs),
                           np.sum(total_acc) / np.sum(total_num_seqs),
                           args.skip_steps / used_time))
                    total_cost, total_acc, total_num_seqs = [], [], []
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        test_pyreader.decorate_tensor_provider(
                            processor.data_generator(
                                batch_size=args.batch_size,
                                phase='dev',
                                epoch=1,
                                shuffle=False))
                        evaluate(exe, test_prog, test_pyreader,
                                 [loss.name, accuracy.name, num_seqs.name],
                                 "dev")
                    # evaluate test set
                    if args.do_test:
                        test_pyreader.decorate_tensor_provider(
                            processor.data_generator(
                                batch_size=args.batch_size,
                                phase='test',
                                epoch=1,
                                shuffle=False))
                        evaluate(exe, test_prog, test_pyreader,
                                 [loss.name, accuracy.name, num_seqs.name],
                                 "test")
            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        test_pyreader.decorate_tensor_provider(
            processor.data_generator(batch_size=args.batch_size,
                                     phase='dev',
                                     epoch=1,
                                     shuffle=False))
        print("Final validation result:")
        evaluate(exe, test_prog, test_pyreader,
                 [loss.name, accuracy.name, num_seqs.name], "dev")

    # final eval on test set
    if args.do_test:
        test_pyreader.decorate_tensor_provider(
            processor.data_generator(batch_size=args.batch_size,
                                     phase='test',
                                     epoch=1,
                                     shuffle=False))
        print("Final test result:")
        evaluate(exe, test_prog, test_pyreader,
                 [loss.name, accuracy.name, num_seqs.name], "test")
def main(args):
    """Fine-tune / evaluate a UNIMO model on a regression task, with optional
    multi-node NCCL2 distributed training.

    Per ``args`` flags: builds the train and/or test fluid programs,
    transpiles for NCCL2 if ``is_distributed``, loads weights, runs the
    training loop (only trainer 0 saves/evaluates), then performs final
    dev/test evaluation and prediction dumps on trainer 0.
    """
    model_config = UNIMOConfig(args.unimo_config_path)
    model_config.print_config()

    # Resolve GPU assignment: in distributed mode each process gets its own
    # GPU from FLAGS_selected_gpus; otherwise use all visible GPUs.
    gpu_id = 0
    gpus = fluid.core.get_cuda_device_count()
    if args.is_distributed and os.getenv("FLAGS_selected_gpus") is not None:
        gpu_list = os.getenv("FLAGS_selected_gpus").split(",")
        gpus = len(gpu_list)
        gpu_id = int(gpu_list[0])

    if args.use_cuda:
        place = fluid.CUDAPlace(gpu_id)
        dev_count = gpus
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    tokenizer = GptBpeTokenizer(vocab_file=args.unimo_vocab_file,
                                encoder_json_file=args.encoder_json_file,
                                vocab_bpe_file=args.vocab_bpe_file,
                                do_lower_case=args.do_lower_case)

    data_reader = RegressionReader(tokenizer, args)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        # Steps are scaled by the number of distributed trainers, not by
        # local device count.
        trainers_num = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
        train_data_generator = data_reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            dev_count=trainers_num,
            shuffle=True,
            phase="train")

        num_train_examples = data_reader.get_num_examples(args.train_set)

        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // trainers_num
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // trainers_num

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d, gpu_id: %d" % (dev_count, gpu_id))
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    config=model_config)
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    beta1=args.beta1,
                    beta2=args.beta2,
                    epsilon=args.epsilon)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test or args.do_pred:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    config=model_config)
        test_prog = test_prog.clone(for_test=True)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    print("args.is_distributed:", args.is_distributed)
    if args.is_distributed:
        # Discover this worker's identity from the Paddle launch env vars.
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)

        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
            trainer_id:{}".format(worker_endpoints, trainers_num,
                                  current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        if args.nccl_comm_num > 1:
            config.nccl_comm_num = args.nccl_comm_num
        if args.use_hierarchical_allreduce and trainers_num > args.hierarchical_allreduce_inter_nranks:
            config.use_hierarchical_allreduce = args.use_hierarchical_allreduce
            config.hierarchical_allreduce_inter_nranks = args.hierarchical_allreduce_inter_nranks

            assert config.hierarchical_allreduce_inter_nranks > 1
            assert trainers_num % config.hierarchical_allreduce_inter_nranks == 0

            # NOTE(review): true division — yields a float; presumably the
            # transpiler accepts it, but an int (//) would be the natural type.
            config.hierarchical_allreduce_exter_nranks = \
                trainers_num / config.hierarchical_allreduce_inter_nranks

        t = fluid.DistributeTranspiler(config=config)
        t.transpile(
            trainer_id,
            trainers=worker_endpoints_env,
            current_endpoint=current_endpoint,
            program=train_program if args.do_train else test_prog,
            startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    # Run initializers before any checkpoint loading.
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        # NOTE(review): unlike the sibling entry points, training loads into
        # train_program (not startup_prog) — confirm this matches the
        # project's init_checkpoint contract.
        if args.init_checkpoint:
            init_checkpoint(
                exe,
                args.init_checkpoint,
                main_program=train_program)
        elif args.init_pretraining_params:
            init_pretraining_params(
                exe,
                args.init_pretraining_params,
                main_program=train_program)
    elif args.do_val or args.do_test or args.do_pred:
        if not args.init_checkpoint:
            # NOTE(review): adjacent literals concatenate without a space —
            # message reads "...set ifonly doing validation or testing!".
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(
            exe,
            args.init_checkpoint,
            main_program=startup_prog)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            loss_name=graph_vars["loss"].name,
            exec_strategy=exec_strategy,
            main_program=train_program,
            num_trainers=nccl2_num_trainers,
            trainer_id=nccl2_trainer_id)

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    test_exe = exe
    if args.do_val or args.do_test or args.do_pred:
        if args.use_multi_gpu_test:
            test_exe = fluid.ParallelExecutor(
                use_cuda=args.use_cuda,
                main_program=test_prog,
                share_vars_from=train_exe)

    dev_ret_history = []  # (steps, key_eval, eval)
    if args.do_train:
        train_pyreader.start()
        steps = 0
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr

        time_begin = time.time()
        skip_steps = args.skip_steps
        # Train loop: terminates on EOFException from the pyreader.
        while True:
            try:
                steps += 1
                if steps % skip_steps == 0:
                    train_fetch_list = [
                        graph_vars["loss"].name,
                    ]
                    # learning_rate is present only when warmup is enabled.
                    if "learning_rate" in graph_vars:
                        train_fetch_list.append(
                            graph_vars["learning_rate"].name)

                    res = train_exe.run(fetch_list=train_fetch_list)

                    outputs = {"loss": np.mean(res[0])}
                    if "learning_rate" in graph_vars:
                        outputs["learning_rate"] = float(res[1][0])

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            outputs["learning_rate"]
                            if warmup_steps > 0 else args.learning_rate)
                        print(verbose)

                    current_example, current_epoch = data_reader.get_train_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print("%s - epoch: %d, progress: %d/%d, step: %d, ave loss: %f, speed: %f steps/s" % \
                        (get_time(), current_epoch, current_example, num_train_examples, steps, \
                        outputs["loss"], args.skip_steps / used_time))
                    time_begin = time.time()
                else:
                    # No fetches between logging steps, for throughput.
                    train_exe.run(fetch_list=[])

                # Only trainer 0 saves checkpoints and runs evaluations.
                if nccl2_trainer_id == 0:
                    if steps % args.save_steps == 0 and args.save_checkpoints:
                        save_path = os.path.join(args.checkpoints,
                                                 "step_" + str(steps))
                        fluid.io.save_persistables(exe, save_path,
                                                   train_program)

                    if steps % args.validation_steps == 0:
                        # evaluate dev set
                        if args.do_val:
                            test_pyreader.decorate_tensor_provider(
                                data_reader.data_generator(
                                    args.dev_set,
                                    batch_size=args.batch_size,
                                    epoch=1,
                                    dev_count=1,
                                    shuffle=False))
                            outputs = evaluate(args, test_exe, test_prog,
                                               test_pyreader, graph_vars,
                                               "dev")
                            # Track (step, metric name, metric value) so the
                            # best dev result can be reported at the end.
                            dev_ret_history.append(
                                (steps, outputs['key_eval'],
                                 outputs[outputs['key_eval']]))

                        # evaluate test set
                        if args.do_test:
                            test_pyreader.decorate_tensor_provider(
                                data_reader.data_generator(
                                    args.test_set,
                                    batch_size=args.batch_size,
                                    epoch=1,
                                    dev_count=1,
                                    shuffle=False))
                            outputs = evaluate(args, test_exe, test_prog,
                                               test_pyreader, graph_vars,
                                               "test")

                        if args.do_pred:
                            test_pyreader.decorate_tensor_provider(
                                data_reader.data_generator(
                                    args.test_set,
                                    batch_size=args.batch_size,
                                    epoch=1,
                                    dev_count=1,
                                    shuffle=False))
                            qids, preds, probs = predict(test_exe, test_prog,
                                                         test_pyreader,
                                                         graph_vars,
                                                         dev_count=1)
                            save_path = args.pred_save + '.test.' + str(steps) + '.txt'
                            print("testing {}, save to {}".format(
                                args.test_set, save_path))
                            with open(save_path, 'w') as f:
                                for id, s, p in zip(qids, preds, probs):
                                    f.write('{}\t{}\t{}\n'.format(id, s, p))

            except fluid.core.EOFException:
                if args.save_checkpoints:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    if nccl2_trainer_id == 0:
        # final eval on dev set
        if args.do_val:
            test_pyreader.decorate_tensor_provider(
                data_reader.data_generator(
                    args.dev_set,
                    batch_size=args.batch_size,
                    epoch=1,
                    dev_count=1,
                    shuffle=False))
            print("Final validation result:")
            outputs = evaluate(args, test_exe, test_prog, test_pyreader,
                               graph_vars, "dev")
            dev_ret_history.append((steps, outputs['key_eval'],
                                    outputs[outputs['key_eval']]))
            # Sort descending by metric value to report the best step.
            dev_ret_history = sorted(dev_ret_history,
                                     key=lambda a: a[2],
                                     reverse=True)
            print("Best validation result: step %d %s %f" \
                % (dev_ret_history[0][0], dev_ret_history[0][1], dev_ret_history[0][2]))

        # final eval on test set
        if args.do_test:
            test_pyreader.decorate_tensor_provider(
                data_reader.data_generator(
                    args.test_set,
                    batch_size=args.batch_size,
                    epoch=1,
                    dev_count=1,
                    shuffle=False))
            print("Final test result:")
            outputs = evaluate(args, test_exe, test_prog, test_pyreader,
                               graph_vars, "test")

        # final eval on test set
        if args.do_pred:
            test_pyreader.decorate_tensor_provider(
                data_reader.data_generator(
                    args.test_set,
                    batch_size=args.batch_size,
                    epoch=1,
                    dev_count=1,
                    shuffle=False))
            qids, preds, probs = predict(test_exe, test_prog, test_pyreader,
                                         graph_vars, dev_count=1)
            # NOTE(review): `steps` is only bound when do_train ran; a
            # pred-only invocation would raise NameError here — confirm
            # intended usage.
            save_path = args.pred_save + '.' + str(steps) + '.txt'
            print("testing {}, save to {}".format(args.test_set, save_path))
            with open(save_path, 'w') as f:
                for id, s, p in zip(qids, preds, probs):
                    f.write('{}\t{}\t{}\n'.format(id, s, p))
def main(args):
    """Driver for the ERNIE sequence-labeling fine-tune task.

    Depending on the ``args.do_train`` / ``args.do_val`` / ``args.do_test``
    flags this builds the train and/or test programs, restores a checkpoint
    or pretrained params, runs the training loop with periodic
    checkpointing/validation, and finishes with a final dev/test evaluation.
    """
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    # Pick the device; FLAGS_selected_gpus selects the card per process.
    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    reader = task_reader.SequenceLabelReader(
        vocab_path=args.vocab_path,
        label_map_config=args.label_map_config,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        # In token mode, batch_size is a token budget, so steps are derived
        # from batch_size // max_seq_len sentences per batch.
        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config)
                scheduled_lr = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    loss_scaling=args.loss_scaling)

                # Keep the fetched vars out of memory optimization so their
                # buffers are not reused before we read them.
                fluid.memory_optimize(
                    input_program=train_program,
                    skip_opt_set=[
                        graph_vars["loss"].name, graph_vars["labels"].name,
                        graph_vars["infers"].name, graph_vars["seq_lens"].name
                    ])

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        # Test graph shares parameters with train via the common startup_prog.
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config)
        test_prog = test_prog.clone(for_test=True)

    exe.run(startup_prog)

    # Parameter initialization: checkpoint wins over pretraining params.
    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(
                exe,
                args.init_checkpoint,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(
                exe,
                args.init_pretraining_params,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(
            exe,
            args.init_checkpoint,
            main_program=startup_prog,
            use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            loss_name=graph_vars["loss"].name,
            exec_strategy=exec_strategy,
            main_program=train_program)

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    if args.do_val or args.do_test:
        # share_vars_from=train_exe reuses the trained parameters for eval.
        test_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            main_program=test_prog,
            share_vars_from=train_exe)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr

        # Optional plain-text training log; refuses to clobber an existing file.
        if args.save_log and args.log_path:
            if os.path.exists(args.log_path):
                raise FileExistsError("Logging file already exists!")
            with open(args.log_path, 'w') as logfile:
                logfile.write('%s\n' % time.asctime())
            print('Writing logs into %s' % args.log_path)

        time_begin = time.time()
        while True:
            try:
                steps += 1
                # Only fetch/evaluate every skip_steps steps; otherwise run
                # a plain step with no fetches (cheaper).
                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[])
                else:
                    outputs = evaluate(train_exe, train_program,
                                       train_pyreader, graph_vars,
                                       args.num_labels, "train", dev_count)
                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            outputs["lr"]
                            if warmup_steps > 0 else args.learning_rate)
                        print(verbose)

                    current_example, current_epoch = reader.get_train_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                          "f1: %f, precision: %f, recall: %f, speed: %f steps/s"
                          % (current_epoch, current_example,
                             num_train_examples, steps, outputs["loss"],
                             outputs["f1"], outputs["precision"],
                             outputs["recall"], args.skip_steps / used_time))
                    if args.save_log and args.log_path:
                        with open(args.log_path, 'a') as logfile:
                            logfile.write(
                                "epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                                "f1: %f, precision: %f, recall: %f\n" % (
                                    current_epoch, current_example,
                                    num_train_examples, steps,
                                    outputs["loss"], outputs["f1"],
                                    outputs["precision"], outputs["recall"]))
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        test_pyreader.decorate_tensor_provider(
                            reader.data_generator(
                                args.dev_set,
                                batch_size=args.batch_size,
                                epoch=1,
                                shuffle=False))
                        evaluate(exe, test_prog, test_pyreader, graph_vars,
                                 args.num_labels, "dev")
                    # evaluate test set
                    if args.do_test:
                        test_pyreader.decorate_tensor_provider(
                            reader.data_generator(
                                args.test_set,
                                batch_size=args.batch_size,
                                epoch=1,
                                shuffle=False))
                        evaluate(exe, test_prog, test_pyreader, graph_vars,
                                 args.num_labels, "test")

            except fluid.core.EOFException:
                # Data exhausted: save a final checkpoint and stop training.
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(
                args.dev_set,
                batch_size=args.batch_size,
                epoch=1,
                shuffle=False))
        print("Final validation result:")
        evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels,
                 "dev")
        if args.do_predict:
            print("Saving predicted results...")
            # NOTE(review): the phase tag passed here is "test" even though the
            # pyreader was just decorated with the dev set, and output_dir is
            # hard-coded — confirm whether the tag/path are intentional.
            predict(exe, test_prog, test_pyreader, graph_vars,
                    args.label_map_config, "test",
                    output_dir="./predicted_results")

    # final eval on test set
    if args.do_test:
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(
                args.test_set,
                batch_size=args.batch_size,
                epoch=1,
                shuffle=False))
        print("Final test result:")
        evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels,
                 "test")
        if args.do_predict:
            print("Saving predicted results...")
            predict(exe, test_prog, test_pyreader, graph_vars,
                    args.label_map_config, "test",
                    output_dir="./predicted_results")
def main(args):
    """Driver for the ERNIE classification / regression fine-tune task.

    Builds train/test programs per the ``do_train`` / ``do_val`` / ``do_test``
    flags, optionally sets up NCCL2 distributed training, runs the training
    loop with periodic checkpointing and evaluation, emits CE (continuous
    evaluation) KPI lines when ``enable_ce`` is set, and finishes with final
    dev/test passes plus an optional GLUE-AX diagnostic pass.

    Fix over the previous revision: ``ce_info`` entries now store
    ``[loss, acc, used_time]``; previously only ``[loss, used_time]`` was
    appended, so the CE epilogue reported elapsed time as accuracy and always
    hit an IndexError reading index 2.
    """
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        dev_list = fluid.cuda_places()
        place = dev_list[0]
        dev_count = len(dev_list)
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    reader = task_reader.RankReader(
        vocab_path=args.vocab_path,
        label_map_config=args.label_map_config,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed,
        tokenizer=args.tokenizer,
        is_classify=args.is_classify,
        is_regression=args.is_regression,
        for_cn=args.for_cn,
        task_id=args.task_id,
    )

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError(
            "For args `do_train`, `do_val` and `do_test`, at "
            "least one of them must be True.",
        )

    if args.do_test:
        assert args.test_save is not None

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            dev_count=dev_count,
            shuffle=True,
            phase="train",
        )

        num_train_examples = reader.get_num_examples(args.train_set)

        # In token mode, batch_size is a token budget and must cover at least
        # one max-length sequence.
        if args.in_tokens:
            if args.batch_size < args.max_seq_len:
                raise ValueError(
                    'if in_tokens=True, batch_size should greater than max_sqelen, '
                    'got batch_size:%d seqlen:%d' %
                    (args.batch_size, args.max_seq_len))
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        log.info("Device count: %d" % dev_count)
        log.info("Num train examples: %d" % num_train_examples)
        log.info("Max train steps: %d" % max_train_steps)
        log.info("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        # Fixed program seed keeps CE (continuous evaluation) runs reproducible.
        if args.random_seed is not None and args.enable_ce:
            train_program.random_seed = args.random_seed

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config,
                    is_classify=args.is_classify,
                    is_regression=args.is_regression,
                )
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio,
                )

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len,
                )
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size,
                )
            log.info("Theoretical memory usage in training: %.3f - %.3f %s" %
                     (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        # Test graph shares parameters with train via the common startup_prog.
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config,
                    is_classify=args.is_classify,
                    is_regression=args.is_regression,
                )
        test_prog = test_prog.clone(for_test=True)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    if args.is_distributed:
        # NCCL2 multi-trainer setup driven by PADDLE_* environment variables.
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)

        log.info("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
trainer_id:{}".format(worker_endpoints, trainers_num, current_endpoint,
                      trainer_id))

        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(
            trainer_id,
            trainers=worker_endpoints_env,
            current_endpoint=current_endpoint,
            program=train_program if args.do_train else test_prog,
            startup_program=startup_prog,
        )
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    # Parameter initialization: checkpoint wins over pretraining params.
    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            log.warning(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.",
            )
        if args.init_checkpoint:
            init_checkpoint(
                exe,
                args.init_checkpoint,
                main_program=startup_prog,
                use_fp16=args.use_fp16,
            )
        elif args.init_pretraining_params:
            init_pretraining_params(
                exe,
                args.init_pretraining_params,
                main_program=startup_prog,
                use_fp16=args.use_fp16,
            )
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError(
                "args 'init_checkpoint' should be set if"
                "only doing validation or testing!",
            )
        init_checkpoint(
            exe,
            args.init_checkpoint,
            main_program=startup_prog,
            use_fp16=args.use_fp16,
        )

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            loss_name=graph_vars["loss"].name,
            exec_strategy=exec_strategy,
            main_program=train_program,
            num_trainers=nccl2_num_trainers,
            trainer_id=nccl2_trainer_id,
        )

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    test_exe = exe
    if args.do_val or args.do_test:
        if args.use_multi_gpu_test:
            test_exe = fluid.ParallelExecutor(
                use_cuda=args.use_cuda,
                main_program=test_prog,
                share_vars_from=train_exe,
            )

    if args.do_train:
        train_pyreader.start()
        steps = 0
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr
        ce_info = []
        time_begin = time.time()
        last_epoch = 0
        current_epoch = 0
        while True:
            try:
                steps += 1
                # Only fetch metrics every skip_steps steps; other steps run
                # with an empty fetch list (cheaper).
                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[])
                else:
                    outputs = evaluate(
                        train_exe,
                        train_program,
                        train_pyreader,
                        graph_vars,
                        "train",
                        metric=args.metric,
                        is_classify=args.is_classify,
                        is_regression=args.is_regression,
                    )

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            outputs["learning_rate"]
                            if warmup_steps > 0 else args.learning_rate)
                        log.info(verbose)

                    current_example, current_epoch = reader.get_train_progress(
                    )
                    time_end = time.time()
                    used_time = time_end - time_begin

                    if args.is_classify:
                        log.info(
                            "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                            "ave acc: %f, speed: %f steps/s" % (
                                current_epoch,
                                current_example,
                                num_train_examples,
                                steps,
                                outputs["loss"],
                                outputs['acc'],
                                args.skip_steps / used_time,
                            ),
                        )
                        # Record [loss, acc, used_time] so the enable_ce
                        # epilogue can read indices 0/1/2 as loss/acc/time.
                        ce_info.append(
                            [outputs["loss"], outputs['acc'], used_time], )
                    if args.is_regression:
                        log.info(
                            "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                            " speed: %f steps/s" % (
                                current_epoch,
                                current_example,
                                num_train_examples,
                                steps,
                                outputs["loss"],
                                args.skip_steps / used_time,
                            ),
                        )
                    time_begin = time.time()

                # Only trainer 0 checkpoints and evaluates.
                if nccl2_trainer_id == 0:
                    if steps % args.save_steps == 0:
                        save_path = os.path.join(
                            args.checkpoints,
                            "step_" + str(steps),
                        )
                        fluid.io.save_persistables(
                            exe,
                            save_path,
                            train_program,
                        )

                    if steps % args.validation_steps == 0 or last_epoch != current_epoch:
                        # evaluate dev set
                        if args.do_val:
                            evaluate_wrapper(
                                args,
                                reader,
                                exe,
                                test_prog,
                                test_pyreader,
                                graph_vars,
                                current_epoch,
                                steps,
                            )
                        if args.do_test:
                            predict_wrapper(
                                args,
                                reader,
                                exe,
                                test_prog,
                                test_pyreader,
                                graph_vars,
                                current_epoch,
                                steps,
                            )

                if last_epoch != current_epoch:
                    last_epoch = current_epoch

            except fluid.core.EOFException:
                # Data exhausted: save a final checkpoint and stop training.
                save_path = os.path.join(
                    args.checkpoints,
                    "step_" + str(steps),
                )
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    if args.enable_ce:
        card_num = get_cards()
        ce_loss = 0
        ce_acc = 0
        ce_time = 0
        # Use the second-to-last record: the last one may be a partial batch.
        try:
            ce_loss = ce_info[-2][0]
            ce_acc = ce_info[-2][1]
            ce_time = ce_info[-2][2]
        except IndexError:
            log.info("ce info error")
        log.info("kpis\ttrain_duration_card%s\t%s" % (card_num, ce_time))
        log.info("kpis\ttrain_loss_card%s\t%f" % (card_num, ce_loss))
        log.info("kpis\ttrain_acc_card%s\t%f" % (card_num, ce_acc))

    # final eval on dev set
    if args.do_val:
        evaluate_wrapper(
            args,
            reader,
            exe,
            test_prog,
            test_pyreader,
            graph_vars,
            current_epoch,
            steps,
        )

    # final eval on test set
    if args.do_test:
        predict_wrapper(
            args,
            reader,
            exe,
            test_prog,
            test_pyreader,
            graph_vars,
            current_epoch,
            steps,
        )

    # final eval on dianostic, hack for glue-ax
    if args.diagnostic:
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(
                args.diagnostic,
                batch_size=args.batch_size,
                epoch=1,
                dev_count=1,
                shuffle=False,
            ),
        )

        log.info("Final diagnostic")
        qids, preds, probs = predict(
            test_exe,
            test_prog,
            test_pyreader,
            graph_vars,
            is_classify=args.is_classify,
            is_regression=args.is_regression,
        )
        assert len(qids) == len(preds), '{} v.s. {}'.format(
            len(qids),
            len(preds),
        )
        with open(args.diagnostic_save, 'w') as f:
            for id, s, p in zip(qids, preds, probs):
                f.write('{}\t{}\t{}\n'.format(id, s, p))

        log.info("Done final diagnostic, saving to {}".format(
            args.diagnostic_save,
        ))
def train(args):
    """ERNIE pretraining entry point using fleet hybrid parallelism.

    Sets up dp/pp/mp/sharding topology via fleet's DistributedStrategy,
    builds the training program, optimizer (fused AdamW when the paddlenlp
    custom op compiles, plain Adam otherwise), then runs the training loop:
    only the last pipeline stage fetches/logs metrics; checkpoints are saved
    every ``save_steps`` and at ``num_train_steps``.

    Fix over the previous revision: the gradient-accumulation assert message
    referenced the non-existent ``args.bsz``, so a failing assert raised
    AttributeError instead of showing the diagnostic; it now reads
    ``args.global_bsz``. Also removed the unused locals ``profile`` and
    ``save_steps``.
    """
    log.info("pretraining start")
    place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0)))

    # set seed: local seed differs per worker so dropout etc. decorrelates.
    random.seed(args.seed)
    np.random.seed(args.seed)
    paddle.seed(args.seed)
    get_rng_state_tracker().add('global_seed', args.seed)
    get_rng_state_tracker().add('local_seed',
                                args.seed + fleet.worker_index() + 2021)

    # define execution strategy
    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = 2
    exec_strategy.num_iteration_per_drop_scope = 1

    # define distribution strategy
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.execution_strategy = exec_strategy
    dist_strategy.nccl_comm_num = 3
    if args.use_recompute:
        log.info("using recompute.")
    dist_strategy.recompute = args.use_recompute
    dist_strategy.sharding = args.use_sharding
    dist_strategy.pipeline = args.num_pp > 1

    # define topology structure for dp/pp/mp
    topo = Topology(rank=fleet.worker_index(),
                    world_size=fleet.worker_num(),
                    dp=args.num_dp,
                    pp=args.num_pp,
                    sharding=args.num_sharding,
                    mp=args.num_mp)

    # Only the last pipeline stage owns the loss outputs.
    is_last = False
    if topo.pp.rank == (topo.pp.size - 1):
        is_last = True

    dp_sharding_rank = topo.dp.rank * topo.sharding.size + topo.sharding.rank
    dp_worldsize = topo.dp.size * topo.sharding.size
    bsz_per_dp = args.global_bsz // dp_worldsize

    micro_bsz = args.micro_bsz
    # Note: the assert message is evaluated only when the assert fires.
    assert args.global_bsz % micro_bsz == 0, \
        f"cannot do gradient accumulate, globa_bsz: {args.global_bsz} micro_bsz: {micro_bsz}"
    acc_steps = bsz_per_dp // micro_bsz

    # sharding \ model parallel \ pipeline
    assert dist_strategy.sharding == True
    dist_strategy.sharding_configs = {
        "segment_broadcast_MB": 32,
        "sharding_degree": args.num_sharding,
        "mp_degree": args.num_mp,
        "pp_degree": args.num_pp,
        "dp_degree": args.num_dp,
        "optimize_offload": True,
    }
    dist_strategy.pipeline_configs = {
        "schedule_mode": "1F1B",
        "micro_batch_size": micro_bsz,
        "accumulate_steps": acc_steps,
    }
    log.info(
        f"using globa_bsz: {args.global_bsz} micro_bsz: {micro_bsz}, acc_steps: {acc_steps}"
    )

    dist_strategy.amp = args.use_amp
    dist_strategy.amp_configs = {
        "custom_white_list": ['softmax', 'layer_norm', 'gelu'],
        "init_loss_scaling": 32768,
        "decr_every_n_nan_or_inf": 2,
        "incr_every_n_steps": 1000,
        "incr_ratio": 2.0,
        "use_dynamic_loss_scaling": True,
        "decr_ratio": 0.5,
        "use_pure_fp16": False,
        "use_fp16_guard": False,
    }

    dist_strategy.lamb = args.use_lamb
    dist_strategy.lamb_configs = {
        'lamb_weight_decay': 0.01,
        'exclude_from_weight_decay':
        ['layer_norm_bias', 'layer_norm_scale', '.b_0']
    }

    train_program = fluid.Program()
    startup_program = fluid.Program()
    with fluid.program_guard(train_program, startup_program):
        with fluid.unique_name.guard():
            graph_vars = create_model(args, 'train', micro_bsz,
                                      dp_sharding_rank, dp_worldsize, topo)
            data_loader = graph_vars['data_loader']
            for op in train_program.global_block().ops:
                if op.type == 'fill_constant':
                    op._set_attr(
                        'op_device', "gpu:0"
                    )  # XXX: hack: https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/layers/tensor.py#L1376
            if args.use_recompute:
                dist_strategy.recompute_configs = {
                    "checkpoints": graph_vars['checkpoints'],
                    # "enable_offload": args.use_offload,
                    # "checkpoint_shape": [micro_bsz, args.max_seq_len, 4096],
                }

            log.debug("base lr: {}".format(args.learning_rate))
            scheduled_lr = linear_warmup_decay(
                learning_rate=args.learning_rate,
                warmup_steps=args.warmup_steps,
                num_train_steps=args.num_train_steps)

            clip_norm_thres = 1.0
            # Prefer the fused AdamW custom op when it can be JIT-compiled;
            # fall back to plain Adam otherwise (no decoupled weight decay).
            if paddlenlp.ops.optimizer._jit_compile():
                optimizer = paddlenlp.ops.optimizer.AdamwOptimizer(
                    learning_rate=scheduled_lr,
                    grad_clip=fluid.clip.GradientClipByGlobalNorm(
                        clip_norm=clip_norm_thres),
                    weight_decay=args.weight_decay,
                    apply_decay_param_fun=apply_weight_decay_fun)
            else:
                optimizer = fluid.optimizer.Adam(
                    learning_rate=scheduled_lr,
                    grad_clip=fluid.clip.GradientClipByGlobalNorm(
                        clip_norm=clip_norm_thres),
                    #multi_precision=True,
                    #weight_decay=args.weight_decay, # merge this pr to use weight_decay: https://github.com/PaddlePaddle/Paddle/pull/29248
                    #exclude_from_weight_decay_fn=exclude_from_weight_decay
                )

            optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
            log.info(f"using dist strategy: {dist_strategy}")
            optimizer.minimize(graph_vars['total_loss'])

    final_strategy = fleet._final_strategy()
    applied_meta_list = fleet._get_applied_meta_list()
    log.info("final strategy: {}".format(final_strategy))
    log.info("applied_meta_list: {}".format(applied_meta_list))

    # Dump program descs per GPU for offline debugging.
    program_desc_dir = os.path.join(args.output_dir, "program_desc")
    if not os.path.isdir(program_desc_dir):
        os.mkdir(program_desc_dir)

    with open(
            program_desc_dir + "/main_program.txt.%d" %
        (int(os.environ.get('FLAGS_selected_gpus', 0))), 'w') as f:
        f.write(str(train_program))

    with open(
            program_desc_dir + "/startup_program.txt.%d" %
        (int(os.environ.get('FLAGS_selected_gpus', 0))), 'w') as f:
        f.write(str(startup_program))

    exe = fluid.Executor(place)
    exe.run(startup_program)
    optimizer.amp_init(place)

    #save_path = os.path.join(args.output_dir, 'step_0')
    #log.debug("saving models to {}".format(save_path))
    #save_persistables(exe, save_path, train_program)

    if args.init_checkpoint and args.init_checkpoint != "":
        log.info(' ')
        log.info(
            '############################WARNING############################')
        log.info(
            '####### using ini_checkpoint, not init_pretraining_params ####')
        log.info(
            '## meaning hyper param e.g. lr will inherit from checkpoint ##')
        log.info(
            '###############################################################')
        init_checkpoint(exe, args.init_checkpoint, train_program)
        log.info(' ')

    output_dir = args.output_dir
    total_time = 0
    cost_vals, lm_losses, sop_accs = [], [], []
    # Kept for the (commented-out) resume-from-step logic below.
    global_steps = args.global_steps + 1
    steps = 0
    log_path = 'train_log/node-%d' % fleet.worker_index()
    start_time = time.time()

    with LogWriter(os.path.join(args.output_dir, log_path)) as swriter:
        data_loader.start()
        while True:
            #if steps < global_steps:
            #    steps += 1
            #    continue
            # Only the last pipeline stage fetches metrics; earlier stages
            # run with an empty fetch list.
            if not is_last:
                fetch_list = []
            else:
                fetch_list = [
                    graph_vars['total_loss'],
                    graph_vars['mean_mask_lm_loss'], scheduled_lr
                ]
                if args.use_sop:
                    fetch_list.extend(
                        [graph_vars['sop_acc'], graph_vars['sop_loss']])
                if args.use_amp:
                    loss_scaling = train_program.global_block(
                    ).vars['loss_scaling_0']
                    fetch_list.append(loss_scaling)

            ret = exe.run(train_program, fetch_list=fetch_list
                          )  # run one mini-batch(=acc_steps micro-batch)
            #use_program_cache=True)
            steps += 1

            if is_last:
                # Unpack in the same order the fetch_list was built.
                if args.use_sop and args.use_amp:
                    cost_val, lm_loss, lr, sop_acc, sop_loss, loss_scaling_0 = ret
                elif args.use_sop:
                    cost_val, lm_loss, lr, sop_acc, sop_loss = ret
                elif args.use_amp:
                    cost_val, lm_loss, lr, loss_scaling_0 = ret
                else:
                    cost_val, lm_loss, lr = ret
                cost_vals.append(cost_val[0])
                lm_losses.append(lm_loss[0])
                if args.use_sop:
                    sop_accs.append(sop_acc[0])

                if steps > 0 and (steps % args.log_steps) == 0:
                    end_time = time.time()
                    total_time = end_time - start_time
                    cost_val = np.mean(cost_vals)
                    lm_loss = np.mean(lm_losses)
                    swriter.add_scalar('loss/total_loss', cost_val, steps)
                    swriter.add_scalar('loss/mlm_loss', lm_loss, steps)
                    swriter.add_scalar('lr/scheduled_lr', lr[0], steps)

                    if args.use_sop:
                        sop_acc = np.mean(sop_accs)
                        swriter.add_scalar('loss/sop_loss', sop_loss, steps)
                        swriter.add_scalar('train/sop_acc', sop_acc, steps)
                    else:
                        sop_acc = 0.0

                    if args.use_amp:
                        swriter.add_scalar('lr/loss_scaling',
                                           loss_scaling_0[0], steps)
                    else:
                        loss_scaling_0 = [0.0]

                    log.info(
                        "worker_index: %d, step: %d, cost: %f, "
                        "mlm loss: %f, sentence order acc: %f, "
                        "speed: %f steps/s, "
                        "speed: %f samples/s, "
                        "speed: %f tokens/s, "
                        "learning rate: %.3e, loss_scalings: %f" %
                        (fleet.worker_index(), steps, cost_val, lm_loss,
                         sop_acc, args.log_steps / total_time,
                         args.log_steps * args.global_bsz / total_time,
                         args.log_steps * args.global_bsz * args.max_seq_len /
                         total_time, lr[0], loss_scaling_0[0]))

                    cost_vals, lm_losses, sop_accs = [], [], []
                    start_time = time.time()

            # TODO: add evaluation
            if steps > 0 and args.eval_steps > 0 and steps % args.eval_steps == 0:
                pass

            if steps > 0 and args.save_steps > 0 and steps % args.save_steps == 0:
                # NOTE(review): worker_index() > 8 looks like a hard-coded
                # "only first node saves" rule — confirm for other topologies.
                if args.use_hybrid_dp and fleet.worker_index() > 8:
                    continue
                save_path = os.path.join(output_dir, 'step_' + str(steps))
                log.debug("saving models to {}".format(save_path))
                save_persistables(exe, save_path, train_program)

            if steps == args.num_train_steps:
                if args.use_hybrid_dp and fleet.worker_index() > 8:
                    continue
                save_path = os.path.join(output_dir,
                                         'final_step_' + str(steps))
                save_persistables(exe, save_path, train_program)
                log.debug("saving final models to {}".format(save_path))
                log.debug("end of training, total steps: {}".format(steps))
def get_role_init_dict(args, suf):
    """Build the shared context for the argument-role labeling task.

    Loads the label map, constructs the reader and the (test) inference
    program, creates the executor, restores ``args.init_checkpoint`` when
    validation/testing is requested, and returns everything the caller
    needs in a single dict. ``suf`` is appended to the pyreader name so
    several instances can coexist in one process.
    """
    log = logging.getLogger(__name__)
    check_cuda(args.use_cuda)

    # Parse the "label<TAB>id" mapping file.
    rows = (line.split("\t")
            for line in utils.read_by_lines(args.label_map_config))
    labels_map = {fields[0]: int(fields[1]) for fields in rows}
    args.num_labels = len(labels_map)

    print("=========ERNIE CONFIG============")
    ernie_config = ErnieConfig(args.ernie_config_path)
    print("=========ERNIE CONFIG============")

    # Select the device; with CUDA the first visible card is used.
    if args.use_cuda:
        dev_list = fluid.cuda_places()
        place = dev_list[0]
        print("==============place==================", place)
        dev_count = len(dev_list)
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    print("==============place, dev_count==================", place, dev_count)

    reader = task_reader.RoleSequenceLabelReader(
        vocab_path=args.vocab_path,
        labels_map=labels_map,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed,
        task_id=args.task_id)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader_role' + suf,
                    ernie_config=ernie_config)
        test_prog = test_prog.clone(for_test=True)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    return {
        'log': log,
        'args': args,
        'labels_map': labels_map,
        'ernie_config': ernie_config,
        'place': place,
        'dev_count': dev_count,
        'reader': reader,
        'startup_prog': startup_prog,
        'test_prog': test_prog,
        'test_pyreader': test_pyreader,
        'graph_vars': graph_vars,
        'nccl2_num_trainers': nccl2_num_trainers,
        'nccl2_trainer_id': nccl2_trainer_id,
        'exe': exe,
    }
def train(args):
    """ERNIE pretraining entry point (transpiler-era NCCL2 distribution).

    Builds train and test programs sharing one startup program, optionally
    transpiles for multi-node NCCL2, then runs the training loop: trainer 0
    fetches metrics every ``skip_steps`` while other trainers run silently;
    checkpoints are saved every ``save_steps`` and validation runs every
    ``validation_steps`` when ``valid_filelist`` is set.
    """
    print("pretraining start")
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    train_program = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            train_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                pyreader_name='train_reader', ernie_config=ernie_config)
            scheduled_lr = optimization(loss=total_loss,
                                        warmup_steps=args.warmup_steps,
                                        num_train_steps=args.num_train_steps,
                                        learning_rate=args.learning_rate,
                                        train_program=train_program,
                                        startup_prog=startup_prog,
                                        weight_decay=args.weight_decay,
                                        scheduler=args.lr_scheduler,
                                        use_fp16=args.use_fp16,
                                        loss_scaling=args.loss_scaling)

            # Keep the fetched vars out of memory optimization so their
            # buffers are not reused before we read them.
            fluid.memory_optimize(input_program=train_program,
                                  skip_opt_set=[
                                      next_sent_acc.name, mask_lm_loss.name,
                                      total_loss.name
                                  ])

    # Test graph shares parameters with train via the common startup_prog.
    # NOTE: the create_model outputs deliberately rebind the same names;
    # the train-side tensors used later come from the ParallelExecutor fetch.
    test_prog = fluid.Program()
    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            test_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                pyreader_name='test_reader', ernie_config=ernie_config)

    test_prog = test_prog.clone(for_test=True)

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    print("Device count %d" % dev_count)
    print("theoretical memory usage: ")
    if args.in_tokens:
        print(
            fluid.contrib.memory_usage(program=train_program,
                                       batch_size=args.batch_size //
                                       args.max_seq_len))
    else:
        print(
            fluid.contrib.memory_usage(program=train_program,
                                       batch_size=args.batch_size))

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    print("args.is_distributed:", args.is_distributed)
    if args.is_distributed:
        # Endpoints come from environment variables; this worker's id is its
        # position in the endpoint list.
        worker_endpoints_env = os.getenv("worker_endpoints")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)
        current_endpoint = os.getenv("current_endpoint")
        trainer_id = worker_endpoints.index(current_endpoint)
        if trainer_id == 0:
            # Give the other workers time to come up before NCCL init.
            print("train_id == 0, sleep 60s")
            time.sleep(60)
        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
trainer_id:{}".format(worker_endpoints, trainers_num, current_endpoint,
                      trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(trainer_id,
                    trainers=worker_endpoints_env,
                    current_endpoint=current_endpoint,
                    program=train_program,
                    startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.init_checkpoint and args.init_checkpoint != "":
        init_checkpoint(exe, args.init_checkpoint, train_program,
                        args.use_fp16)

    data_reader = ErnieDataReader(filelist=args.train_filelist,
                                  batch_size=args.batch_size,
                                  vocab_path=args.vocab_path,
                                  voc_size=ernie_config['vocab_size'],
                                  epoch=args.epoch,
                                  max_seq_len=args.max_seq_len,
                                  generate_neg_sample=args.generate_neg_sample,
                                  in_tokens=args.in_tokens,
                                  is_bidirection=args.is_bidirection)

    exec_strategy = fluid.ExecutionStrategy()
    if args.use_fast_executor:
        exec_strategy.use_experimental_executor = True
    exec_strategy.num_threads = dev_count
    exec_strategy.num_iteration_per_drop_scope = min(10, args.skip_steps)

    build_strategy = fluid.BuildStrategy()
    build_strategy.remove_unnecessary_lock = False

    train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                       loss_name=total_loss.name,
                                       build_strategy=build_strategy,
                                       exec_strategy=exec_strategy,
                                       main_program=train_program,
                                       num_trainers=nccl2_num_trainers,
                                       trainer_id=nccl2_trainer_id)

    if args.valid_filelist and args.valid_filelist != "":
        # predict is a closure that runs one full validation pass.
        predict = predict_wrapper(args,
                                  exe,
                                  ernie_config,
                                  test_prog=test_prog,
                                  pyreader=test_pyreader,
                                  fetch_list=[
                                      next_sent_acc.name, mask_lm_loss.name,
                                      total_loss.name
                                  ])

    train_pyreader.decorate_tensor_provider(data_reader.data_generator())
    train_pyreader.start()
    steps = 0
    cost = []
    lm_cost = []
    acc = []
    time_begin = time.time()
    while steps < args.num_train_steps:
        try:
            # Each global step advances by the number of trainers so that
            # `steps` approximates total work across the cluster.
            steps += nccl2_num_trainers
            skip_steps = args.skip_steps * nccl2_num_trainers

            # Non-zero trainers never fetch; trainer 0 does all the logging.
            if nccl2_trainer_id != 0:
                train_exe.run(fetch_list=[])
                continue

            if steps % skip_steps != 0:
                train_exe.run(fetch_list=[])
            else:
                each_next_acc, each_mask_lm_cost, each_total_cost, np_lr = train_exe.run(
                    fetch_list=[
                        next_sent_acc.name, mask_lm_loss.name,
                        total_loss.name, scheduled_lr.name
                    ])
                acc.extend(each_next_acc)
                lm_cost.extend(each_mask_lm_cost)
                cost.extend(each_total_cost)

                print("feed_queue size", train_pyreader.queue.size())
                time_end = time.time()
                used_time = time_end - time_begin
                epoch, current_file_index, total_file, current_file, mask_type = data_reader.get_progress(
                )
                print("current learning_rate:%f" % np_lr[0])
                print(
                    "epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                    "ppl: %f, next_sent_acc: %f, speed: %f steps/s, file: %s, mask_type: %s"
                    % (epoch, current_file_index, total_file, steps,
                       np.mean(np.array(cost)),
                       np.mean(np.exp(np.array(lm_cost))),
                       np.mean(np.array(acc)), skip_steps / used_time,
                       current_file, mask_type))
                # Reset the metric accumulators for the next window.
                cost = []
                lm_cost = []
                acc = []
                time_begin = time.time()

            if steps % args.save_steps == 0:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)

            if args.valid_filelist and steps % args.validation_steps == 0:
                vali_cost, vali_lm_cost, vali_acc, vali_steps, vali_speed = predict(
                )
                # NOTE(review): `epoch` is only assigned in the fetch branch
                # above; if validation triggers before the first logged step,
                # this print raises NameError — confirm step alignment.
                print("[validation_set] epoch: %d, step: %d, "
                      "loss: %f, global ppl: %f, batch-averged ppl: %f, "
                      "next_sent_acc: %f, speed: %f steps/s" %
                      (epoch, steps,
                       np.mean(np.array(vali_cost) / vali_steps),
                       np.exp(np.mean(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.exp(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.array(vali_acc) / vali_steps), vali_speed))

        except fluid.core.EOFException:
            train_pyreader.reset()
            break
def main(args):
    """Driver for ERNIE machine-reading-comprehension fine-tuning.

    Depending on the ``do_train`` / ``do_val`` / ``do_test`` flags this
    builds the train and/or test programs, initializes parameters from a
    checkpoint or pretrained params, runs the training loop with periodic
    checkpointing and evaluation, and finishes with a final dev/test pass.

    Args:
        args: parsed command-line namespace; the attributes read below
            (paths, batch sizes, scheduler settings, ...) must all be set
            by the surrounding argument parser (not visible in this file).

    Raises:
        ValueError: if none of do_train/do_val/do_test is set, or if
            validation/testing is requested without an init_checkpoint.
    """
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    # Select device and degree of data parallelism. On CPU, CPU_NUM
    # (defaulting to the physical core count) plays the role of the
    # GPU-card count.
    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    # MRC-specific reader: handles doc striding and query truncation in
    # addition to the usual tokenization options.
    reader = task_reader.MRCReader(vocab_path=args.vocab_path,
                                   label_map_config=args.label_map_config,
                                   max_seq_len=args.max_seq_len,
                                   do_lower_case=args.do_lower_case,
                                   in_tokens=args.in_tokens,
                                   random_seed=args.random_seed,
                                   tokenizer=args.tokenizer,
                                   is_classify=args.is_classify,
                                   is_regression=args.is_regression,
                                   for_cn=args.for_cn,
                                   task_id=args.task_id,
                                   doc_stride=args.doc_stride,
                                   max_query_length=args.max_query_length)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    # NOTE(review): `== None` should be `is None` per PEP 8; behavior is
    # the same for argparse values, so left untouched here.
    if args.predict_batch_size == None:
        args.predict_batch_size = args.batch_size

    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            dev_count=dev_count,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples("train")

        # In token-budget mode each "batch" holds roughly
        # batch_size // max_seq_len examples, hence the different step math.
        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        # unique_name.guard keeps parameter names consistent between the
        # train and test programs so they share the same variables.
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config,
                    is_training=True)
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16)
                """
                fluid.memory_optimize(
                    input_program=train_program,
                    skip_opt_set=[
                        graph_vars["loss"].name,
                        graph_vars["num_seqs"].name,
                    ])
                """

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        # A single test program serves both dev and test phases; the
        # pyreader is re-pointed at the relevant data generator per phase.
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, test_graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config,
                    is_training=False)
        test_prog = test_prog.clone(for_test=True)

    # Single-process run: fixed trainer topology.
    nccl2_num_trainers = 1
    nccl2_trainer_id = 0

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        # Eval-only runs cannot proceed without trained weights.
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=graph_vars["loss"].name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program,
                                           num_trainers=nccl2_num_trainers,
                                           trainer_id=nccl2_trainer_id)

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    if args.do_train:
        train_pyreader.start()
        steps = 0
        if warmup_steps > 0:
            # Expose the scheduled LR so evaluate() can fetch/report it.
            graph_vars["learning_rate"] = scheduled_lr

        time_begin = time.time()
        while True:
            try:
                steps += 1
                if steps % args.skip_steps != 0:
                    # Non-logging step: run without fetching to avoid the
                    # fetch/synchronization overhead.
                    train_exe.run(fetch_list=[])
                else:
                    outputs = evaluate(train_exe, train_program,
                                       train_pyreader, graph_vars, "train")

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            outputs["learning_rate"]
                            if warmup_steps > 0 else args.learning_rate)
                        print(verbose)

                    current_example, current_epoch = reader.get_train_progress(
                    )
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print(
                        "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                        "speed: %f steps/s" %
                        (current_epoch, current_example, num_train_examples,
                         steps, outputs["loss"], args.skip_steps / used_time))
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps % args.validation_steps == 0:
                    # Mid-training evaluation; the step count is embedded in
                    # the phase tag so per-step prediction files don't clash.
                    if args.do_val:
                        test_pyreader.decorate_tensor_provider(
                            reader.data_generator(args.dev_set,
                                                  batch_size=args.batch_size,
                                                  epoch=1,
                                                  dev_count=1,
                                                  shuffle=False,
                                                  phase="dev"))
                        evaluate(exe,
                                 test_prog,
                                 test_pyreader,
                                 test_graph_vars,
                                 str(steps) + "_dev",
                                 examples=reader.get_examples("dev"),
                                 features=reader.get_features("dev"),
                                 args=args)

                    if args.do_test:
                        test_pyreader.decorate_tensor_provider(
                            reader.data_generator(args.test_set,
                                                  batch_size=args.batch_size,
                                                  epoch=1,
                                                  dev_count=1,
                                                  shuffle=False,
                                                  phase="test"))
                        evaluate(exe,
                                 test_prog,
                                 test_pyreader,
                                 test_graph_vars,
                                 str(steps) + "_test",
                                 examples=reader.get_examples("test"),
                                 features=reader.get_features("test"),
                                 args=args)

            except fluid.core.EOFException:
                # Data exhausted: save a final checkpoint and stop training.
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        print("Final validation result:")
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(args.dev_set,
                                  batch_size=args.batch_size,
                                  epoch=1,
                                  dev_count=1,
                                  shuffle=False,
                                  phase="dev"))
        evaluate(exe,
                 test_prog,
                 test_pyreader,
                 test_graph_vars,
                 "dev",
                 examples=reader.get_examples("dev"),
                 features=reader.get_features("dev"),
                 args=args)

    # final eval on test set
    if args.do_test:
        print("Final test result:")
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(args.test_set,
                                  batch_size=args.batch_size,
                                  epoch=1,
                                  dev_count=1,
                                  shuffle=False,
                                  phase="test"))
        evaluate(exe,
                 test_prog,
                 test_pyreader,
                 test_graph_vars,
                 "test",
                 examples=reader.get_examples("test"),
                 features=reader.get_features("test"),
                 args=args)
def main(args):
    """Driver for BERT fine-tuning on sentence-pair classification tasks.

    Supports XNLI/CoLA/MRPC/MNLI via a processor dispatch table. Builds the
    train/dev/test programs, restores weights from a checkpoint or
    pretrained params, runs the training loop (with optional fp16 loss
    scaling, profiling, and CE/benchmark reporting), and performs final
    dev/test evaluation.

    Args:
        args: parsed command-line namespace; attributes read below must be
            provided by the surrounding argument parser (not in this file).

    Raises:
        ValueError: if none of do_train/do_val/do_test is set, or if
            validation/testing is requested without an init_checkpoint.
        KeyError: if ``args.task_name`` is not in the processor table.
    """
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    # Device selection; get_device_num() is a project helper — presumably
    # the visible GPU count, but confirm against its definition.
    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = get_device_num()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    # Task name -> data processor class; lookup is case-insensitive.
    task_name = args.task_name.lower()
    processors = {
        'xnli': reader.XnliProcessor,
        'cola': reader.ColaProcessor,
        'mrpc': reader.MrpcProcessor,
        'mnli': reader.MnliProcessor,
    }

    processor = processors[task_name](data_dir=args.data_dir,
                                      vocab_path=args.vocab_path,
                                      max_seq_len=args.max_seq_len,
                                      do_lower_case=args.do_lower_case,
                                      in_tokens=args.in_tokens,
                                      random_seed=args.random_seed)
    num_labels = len(processor.get_labels())

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    train_program = fluid.Program()
    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed
        train_program.random_seed = args.random_seed

    if args.do_train:
        # NOTE: If num_trainers > 1, the shuffle_seed must be set, because
        # the order of batch data generated by reader
        # must be the same in the respective processes.
        # (num_trainers is a module-level global set outside this view.)
        shuffle_seed = 1 if num_trainers > 1 else None
        train_data_generator = processor.data_generator(
            batch_size=args.batch_size,
            phase='train',
            epoch=args.epoch,
            dev_count=dev_count,
            shuffle=args.shuffle,
            shuffle_seed=shuffle_seed)

        num_train_examples = processor.get_num_examples(phase='train')

        # Token-budget mode packs ~batch_size // max_seq_len examples per
        # batch, hence the different step arithmetic.
        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        # unique_name.guard keeps parameter names identical across the
        # train/dev/test programs so they share variables.
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_data_loader, loss, probs, accuracy, num_seqs = create_model(
                    args, bert_config=bert_config, num_labels=num_labels)
                scheduled_lr, loss_scaling = optimization(
                    loss=loss,
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio)

    if args.do_val:
        # NOTE(review): loss/probs/accuracy/num_seqs are rebound here; the
        # later train loop relies on the do_train branch having run last on
        # its own variables only via graph names, so order matters.
        dev_prog = fluid.Program()
        with fluid.program_guard(dev_prog, startup_prog):
            with fluid.unique_name.guard():
                dev_data_loader, loss, probs, accuracy, num_seqs = create_model(
                    args, bert_config=bert_config, num_labels=num_labels)

        dev_prog = dev_prog.clone(for_test=True)
        dev_data_loader.set_batch_generator(
            processor.data_generator(batch_size=args.batch_size,
                                     phase='dev',
                                     epoch=1,
                                     dev_count=1,
                                     shuffle=False), place)

    if args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_data_loader, loss, probs, accuracy, num_seqs = create_model(
                    args, bert_config=bert_config, num_labels=num_labels)

        test_prog = test_prog.clone(for_test=True)
        test_data_loader.set_batch_generator(
            processor.data_generator(batch_size=args.batch_size,
                                     phase='test',
                                     epoch=1,
                                     dev_count=1,
                                     shuffle=False), place)

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        # Eval-only runs cannot proceed without trained weights.
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        build_strategy = fluid.BuildStrategy()

        # Multi-process (one process per GPU) setup: wrap the generator so
        # each trainer consumes a disjoint shard of the shuffled stream.
        if args.use_cuda and num_trainers > 1:
            assert shuffle_seed is not None
            dist_utils.prepare_for_multi_process(exe, build_strategy,
                                                 train_program)
            train_data_generator = fluid.contrib.reader.distributed_batch_reader(
                train_data_generator)

        train_compiled_program = fluid.CompiledProgram(
            train_program).with_data_parallel(loss_name=loss.name,
                                              build_strategy=build_strategy)

        train_data_loader.set_batch_generator(train_data_generator, place)

    if args.do_train:
        train_data_loader.start()
        steps = 0
        total_cost, total_acc, total_num_seqs = [], [], []
        time_begin = time.time()
        throughput = []
        ce_info = []
        total_batch_num = 0  # used for benchmark
        while True:
            try:
                steps += 1
                total_batch_num += 1  # used for benchmark

                if args.max_iter and total_batch_num == args.max_iter:  # used for benchmark
                    return

                # Only fetch metrics every skip_steps steps; fetching forces
                # synchronization, so non-logging steps run with an empty
                # fetch list.
                if steps % args.skip_steps == 0:
                    if args.use_fp16:
                        fetch_list = [
                            loss.name, accuracy.name, scheduled_lr.name,
                            num_seqs.name, loss_scaling.name
                        ]
                    else:
                        fetch_list = [
                            loss.name, accuracy.name, scheduled_lr.name,
                            num_seqs.name
                        ]
                else:
                    fetch_list = []

                outputs = exe.run(train_compiled_program,
                                  fetch_list=fetch_list)

                if steps % args.skip_steps == 0:
                    if args.use_fp16:
                        np_loss, np_acc, np_lr, np_num_seqs, np_scaling = outputs
                    else:
                        np_loss, np_acc, np_lr, np_num_seqs = outputs

                    # Weight running sums by sequence count so the reported
                    # averages are per-example, not per-batch.
                    total_cost.extend(np_loss * np_num_seqs)
                    total_acc.extend(np_acc * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train data_loader queue size: %d, " % train_data_loader.queue.size(
                        )
                        verbose += "learning rate: %f" % np_lr[0]
                        if args.use_fp16:
                            verbose += ", loss scaling: %f" % np_scaling[0]
                        print(verbose)

                    current_example, current_epoch = processor.get_train_progress(
                    )
                    time_end = time.time()
                    used_time = time_end - time_begin

                    # profiler tools
                    if args.is_profiler and current_epoch == 0 and steps == args.skip_steps:
                        profiler.start_profiler("All")
                    elif args.is_profiler and current_epoch == 0 and steps == args.skip_steps * 2:
                        profiler.stop_profiler("total", args.profiler_path)
                        return

                    log_record = "epoch: {}, progress: {}/{}, step: {}, ave loss: {}, ave acc: {}".format(
                        current_epoch, current_example, num_train_examples,
                        steps,
                        np.sum(total_cost) / np.sum(total_num_seqs),
                        np.sum(total_acc) / np.sum(total_num_seqs))
                    ce_info.append([
                        np.sum(total_cost) / np.sum(total_num_seqs),
                        np.sum(total_acc) / np.sum(total_num_seqs), used_time
                    ])
                    # NOTE(review): steps is always > 0 here (incremented
                    # above), so the else branch appears unreachable.
                    if steps > 0:
                        throughput.append(args.skip_steps / used_time)
                        log_record = log_record + ", speed: %f steps/s" % (
                            args.skip_steps / used_time)
                        print(log_record)
                    else:
                        print(log_record)
                    total_cost, total_acc, total_num_seqs = [], [], []
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.save(program=train_program, model_path=save_path)

                if steps % args.validation_steps == 0:
                    print("Average throughtput: %s" % (np.average(throughput)))
                    throughput = []
                    # evaluate dev set
                    if args.do_val:
                        evaluate(exe, dev_prog, dev_data_loader,
                                 [loss.name, accuracy.name, num_seqs.name],
                                 "dev")
                    # evaluate test set
                    if args.do_test:
                        evaluate(exe, test_prog, test_data_loader,
                                 [loss.name, accuracy.name, num_seqs.name],
                                 "test")
            except fluid.core.EOFException:
                # Data exhausted: save a final checkpoint and stop training.
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.save(program=train_program, model_path=save_path)
                train_data_loader.reset()
                break

        # Continuous-evaluation (CE) reporting: emits kpi lines parsed by an
        # external benchmark harness. Uses the second-to-last record.
        if args.enable_ce:
            card_num = get_cards()
            ce_cost = 0
            ce_acc = 0
            ce_time = 0
            # NOTE(review): bare except silently hides any failure (e.g.
            # fewer than two logged records); consider except IndexError.
            try:
                ce_cost = ce_info[-2][0]
                ce_acc = ce_info[-2][1]
                ce_time = ce_info[-2][2]
            except:
                print("ce info error")
            print("kpis\ttrain_duration_%s_card%s\t%s" %
                  (args.task_name, card_num, ce_time))
            print("kpis\ttrain_cost_%s_card%s\t%f" %
                  (args.task_name, card_num, ce_cost))
            print("kpis\ttrain_acc_%s_card%s\t%f" %
                  (args.task_name, card_num, ce_acc))

    # final eval on dev set
    if args.do_val:
        print("Final validation result:")
        evaluate(exe, dev_prog, dev_data_loader,
                 [loss.name, accuracy.name, num_seqs.name], "dev")

    # final eval on test set
    if args.do_test:
        print("Final test result:")
        evaluate(exe, test_prog, test_data_loader,
                 [loss.name, accuracy.name, num_seqs.name], "test")