def test(args):
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    test_prog = fluid.Program()
    test_startup = fluid.Program()
    with fluid.program_guard(test_prog, test_startup):
        with fluid.unique_name.guard():
            test_data_loader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                bert_config=bert_config)
            test_prog = test_prog.clone(for_test=True)

    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(test_startup)

    predict = predict_wrapper(
        args,
        exe,
        bert_config,
        test_prog=test_prog,
        data_loader=test_data_loader,
        fetch_list=[next_sent_acc.name, mask_lm_loss.name, total_loss.name])

    print("test begin")
    loss, lm_loss, acc, steps, speed = predict()
    print(
        "[test_set] loss: %f, global ppl: %f, next_sent_acc: %f, speed: %f steps/s"
        % (np.mean(np.array(loss) / steps),
           np.exp(np.mean(np.array(lm_loss) / steps)),
           np.mean(np.array(acc) / steps), speed))
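
# The log line above folds the lists returned by the predict closure into
# scalar metrics: each entry is divided by `steps` before averaging, and the
# mean masked-LM loss is exponentiated to get a global perplexity. A tiny
# numeric sketch with hypothetical values:
#
#     loss, lm_loss, acc, steps = [2.4, 2.2], [2.0, 1.8], [0.9, 0.92], 2
#     np.mean(np.array(loss) / steps)             # average loss: 1.15
#     np.exp(np.mean(np.array(lm_loss) / steps))  # global ppl: exp(0.95)
#     np.mean(np.array(acc) / steps)              # average next-sentence acc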
def predict(args, cls_model=None):
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    task_name = args.task_name.lower()
    processors = {
        'xnli': reader.XnliProcessor,
        'cola': reader.ColaProcessor,
        'mrpc': reader.MrpcProcessor,
        'mnli': reader.MnliProcessor,
    }

    processor = processors[task_name](data_dir=args.data_dir,
                                      vocab_path=args.vocab_path,
                                      max_seq_len=args.max_seq_len,
                                      do_lower_case=args.do_lower_case,
                                      in_tokens=False)
    test_data_generator = processor.data_generator(
        batch_size=args.batch_size, phase='dev', epoch=1, shuffle=False)
    num_labels = len(processor.get_labels())

    # `place` is assumed to be provided by the surrounding script; a CUDA/CPU
    # fallback (mirroring the other entry points in this file) is used here so
    # the snippet stands alone.
    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
    with fluid.dygraph.guard(place):
        if cls_model is None:
            cls_model = ClsModelLayer(args,
                                      bert_config,
                                      num_labels,
                                      is_training=False,
                                      return_pooled_out=True)

            # restore the model
            save_path = os.path.join(args.checkpoints, "final")
            print("Load params from %s" % save_path)
            model_dict, _ = fluid.load_dygraph(save_path)
            cls_model.load_dict(model_dict)

        print('Do predicting ...... ')
        cls_model.eval()

        total_cost, total_acc, total_num_seqs = [], [], []
        for batch in test_data_generator():
            data_ids = create_data(batch)
            np_loss, np_acc, np_num_seqs = cls_model(data_ids)

            np_loss = np_loss.numpy()
            np_acc = np_acc.numpy()
            np_num_seqs = np_num_seqs.numpy()

            total_cost.extend(np_loss * np_num_seqs)
            total_acc.extend(np_acc * np_num_seqs)
            total_num_seqs.extend(np_num_seqs)

        print("[evaluation] average acc: %f" %
              (np.sum(total_acc) / np.sum(total_num_seqs)))
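
# The evaluation above is a sequence-weighted average: each batch contributes
# acc * num_seqs, and the sums are divided once at the end, so a small final
# batch does not skew the metric. A standalone sketch with made-up batch stats:
import numpy as np

batch_acc = np.array([0.50, 1.00])   # per-batch mean accuracy
batch_seqs = np.array([8, 2])        # sequences in each batch
weighted = np.sum(batch_acc * batch_seqs) / np.sum(batch_seqs)  # 0.6
unweighted = np.mean(batch_acc)                                 # 0.75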
def net(self, args=None):
    """
    BERT net struct.

    Args:
        args (ArgumentParser): run args to config dist fleet.

    Returns:
        Variable: the training loss (self.loss)
    """
    args = p_args()
    bert_config = BertConfig(DATA_DIR + "uncased_L-24_H-1024_A-16/bert_config.json")
    bert_config.print_config()
    place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
    exe = fluid.Executor(place)

    # init program
    train_program = fluid.Program()
    startup_prog = fluid.Program()

    if args.random_seed != 0:
        print("set program random seed as: ", args.random_seed)
        startup_prog.random_seed = args.random_seed
        train_program.random_seed = args.random_seed

    task_name = args.task_name.lower()
    processors = {
        'xnli': reader.XnliProcessor,
        'cola': reader.ColaProcessor,
        'mrpc': reader.MrpcProcessor,
        'mnli': reader.MnliProcessor,
    }
    processor = processors[task_name](data_dir=args.data_dir,
                                      vocab_path=args.vocab_path,
                                      max_seq_len=args.max_seq_len,
                                      do_lower_case=args.do_lower_case,
                                      in_tokens=args.in_tokens,
                                      random_seed=args.random_seed)
    num_labels = len(processor.get_labels())

    dev_count = 1
    self.train_data_generator = processor.data_generator(
        batch_size=args.batch_size,
        phase='train',
        epoch=args.epoch,
        dev_count=dev_count,
        dev_idx=0,
        shuffle=args.shuffle,
        shuffle_seed=args.shuffle_seed)

    num_train_examples = processor.get_num_examples(phase='train')
    max_train_steps = 5
    self.warmup_steps = 0.5

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.use_experimental_executor = args.use_fast_executor
    exec_strategy.num_threads = dev_count
    exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

    dist_strategy = DistributedStrategy()
    args.run_params = json.loads(args.run_params)
    dist_strategy.enable_inplace = args.run_params['enable_inplace']
    dist_strategy.fuse_all_reduce_ops = args.run_params['fuse_all_reduce_ops']
    dist_strategy.nccl_comm_num = args.run_params['nccl_comm_num']
    dist_strategy.use_local_sgd = args.run_params['use_local_sgd']
    dist_strategy.mode = args.run_params["mode"]
    dist_strategy.collective_mode = args.run_params["collective"]
    dist_strategy.exec_strategy = exec_strategy
    dist_strategy.use_hierarchical_allreduce = False

    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            self.train_pyreader, self.loss, probs, accuracy, num_seqs, checkpoints = create_model(
                args, bert_config=bert_config, num_labels=num_labels)
            scheduled_lr = optimization(loss=self.loss,
                                        warmup_steps=self.warmup_steps,
                                        num_train_steps=max_train_steps,
                                        learning_rate=args.learning_rate,
                                        train_program=train_program,
                                        startup_prog=startup_prog,
                                        weight_decay=args.weight_decay,
                                        scheduler=args.lr_scheduler,
                                        use_fp16=False,
                                        loss_scaling=args.loss_scaling,
                                        dist_strategy=dist_strategy)

    exe.run(startup_prog)
    with open("__model__", "wb") as f:
        f.write(fleet._origin_program.desc.serialize_to_string())
    with open("debug_program", "w") as f:
        f.write(str(fleet._origin_program))
    return self.loss
def train(args): print("pretraining start") bert_config = BertConfig(args.bert_config_path) bert_config.print_config() train_program = fluid.Program() startup_prog = fluid.Program() with fluid.program_guard(train_program, startup_prog): with fluid.unique_name.guard(): train_data_loader, next_sent_acc, mask_lm_loss, total_loss = create_model( bert_config=bert_config) scheduled_lr, loss_scaling = optimization( loss=total_loss, warmup_steps=args.warmup_steps, num_train_steps=args.num_train_steps, learning_rate=args.learning_rate, train_program=train_program, startup_prog=startup_prog, weight_decay=args.weight_decay, scheduler=args.lr_scheduler, use_fp16=args.use_fp16, use_dynamic_loss_scaling=args.use_dynamic_loss_scaling, init_loss_scaling=args.init_loss_scaling, incr_every_n_steps=args.incr_every_n_steps, decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf, incr_ratio=args.incr_ratio, decr_ratio=args.decr_ratio) test_prog = fluid.Program() with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): test_data_loader, next_sent_acc, mask_lm_loss, total_loss = create_model( bert_config=bert_config) test_prog = test_prog.clone(for_test=True) if args.use_cuda: place = fluid.CUDAPlace(0) dev_count = fluid.core.get_cuda_device_count() else: place = fluid.CPUPlace() dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) print("Device count %d" % dev_count) nccl2_num_trainers = 1 nccl2_trainer_id = 0 print("args.is_distributed:", args.is_distributed) if args.is_distributed: worker_endpoints_env = os.getenv("worker_endpoints") worker_endpoints = worker_endpoints_env.split(",") trainers_num = len(worker_endpoints) current_endpoint = os.getenv("current_endpoint") trainer_id = worker_endpoints.index(current_endpoint) if trainer_id == 0: print("train_id == 0, sleep 60s") time.sleep(60) print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \ trainer_id:{}".format(worker_endpoints, trainers_num, current_endpoint, trainer_id)) # prepare nccl2 env. 
config = fluid.DistributeTranspilerConfig() config.mode = "nccl2" t = fluid.DistributeTranspiler(config=config) t.transpile(trainer_id, trainers=worker_endpoints_env, current_endpoint=current_endpoint, program=train_program, startup_program=startup_prog) nccl2_num_trainers = trainers_num nccl2_trainer_id = trainer_id exe = fluid.Executor(place) exe.run(startup_prog) if args.init_checkpoint and args.init_checkpoint != "": init_checkpoint(exe, args.init_checkpoint, train_program, args.use_fp16) data_reader = DataReader(data_dir=args.data_dir, batch_size=args.batch_size, in_tokens=args.in_tokens, vocab_path=args.vocab_path, voc_size=bert_config['vocab_size'], epoch=args.epoch, max_seq_len=args.max_seq_len, generate_neg_sample=args.generate_neg_sample) exec_strategy = fluid.ExecutionStrategy() exec_strategy.use_experimental_executor = args.use_fast_executor exec_strategy.num_threads = dev_count exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope build_strategy = fluid.BuildStrategy() if not sys.platform == "win32": build_strategy.num_trainers = nccl2_num_trainers elif nccl2_num_trainers > 1: raise ValueError( "Windows platform doesn't support distributed training!") build_strategy.trainer_id = nccl2_trainer_id # use_ngraph is for CPU only, please refer to README_ngraph.md for details use_ngraph = os.getenv('FLAGS_use_ngraph') if not use_ngraph: train_compiled_program = fluid.CompiledProgram( train_program).with_data_parallel(loss_name=total_loss.name, exec_strategy=exec_strategy, build_strategy=build_strategy) if args.validation_set_dir and args.validation_set_dir != "": predict = predict_wrapper(args, exe, bert_config, test_prog=test_prog, data_loader=test_data_loader, fetch_list=[ next_sent_acc.name, mask_lm_loss.name, total_loss.name ]) train_data_loader.set_batch_generator(data_reader.data_generator()) train_data_loader.start() steps = 0 cost = [] lm_cost = [] acc = [] time_begin = time.time() while steps < args.num_train_steps: try: steps += 1 skip_steps = args.skip_steps * nccl2_num_trainers if nccl2_trainer_id != 0: if use_ngraph: exe.run(fetch_list=[], program=train_program) else: exe.run(fetch_list=[], program=train_compiled_program) continue if steps % args.skip_steps != 0: if use_ngraph: exe.run(fetch_list=[], program=train_program) else: exe.run(fetch_list=[], program=train_compiled_program) else: fetch_list = [ next_sent_acc.name, mask_lm_loss.name, total_loss.name, scheduled_lr.name ] if args.use_fp16: fetch_list.append(loss_scaling.name) if use_ngraph: outputs = exe.run(fetch_list=fetch_list, program=train_program) else: outputs = exe.run(fetch_list=fetch_list, program=train_compiled_program) if args.use_fp16: each_next_acc, each_mask_lm_cost, each_total_cost, np_lr, np_scaling = outputs else: each_next_acc, each_mask_lm_cost, each_total_cost, np_lr = outputs acc.extend(each_next_acc) lm_cost.extend(each_mask_lm_cost) cost.extend(each_total_cost) time_end = time.time() used_time = time_end - time_begin epoch, current_file_index, total_file, current_file = data_reader.get_progress( ) if args.verbose: verbose = "feed_queue size: %d, " % train_data_loader.queue.size( ) verbose += "current learning_rate: %f, " % np_lr[0] if args.use_fp16: verbose += "loss scaling: %f" % np_scaling[0] print(verbose) print( "epoch: %d, progress: %d/%d, step: %d, loss: %f, " "ppl: %f, next_sent_acc: %f, speed: %f steps/s, file: %s" % (epoch, current_file_index, total_file, steps, np.mean(np.array(cost)), np.mean(np.exp( np.array(lm_cost))), np.mean(np.array(acc)), skip_steps 
/ used_time, current_file)) cost = [] lm_cost = [] acc = [] time_begin = time.time() if steps % args.save_steps == 0: save_path = os.path.join(args.checkpoints, "step_" + str(steps)) fluid.save(program=train_program, model_path=save_path) if args.validation_set_dir and steps % args.validation_steps == 0: vali_cost, vali_lm_cost, vali_acc, vali_steps, vali_speed = predict( ) print("[validation_set] epoch: %d, step: %d, " "loss: %f, global ppl: %f, batch-averged ppl: %f, " "next_sent_acc: %f, speed: %f steps/s" % (epoch, steps, np.mean(np.array(vali_cost) / vali_steps), np.exp(np.mean(np.array(vali_lm_cost) / vali_steps)), np.mean(np.exp(np.array(vali_lm_cost) / vali_steps)), np.mean(np.array(vali_acc) / vali_steps), vali_speed)) except fluid.core.EOFException: train_data_loader.reset() break
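
# The pretraining loop above fetches metrics only every `skip_steps`
# iterations and otherwise runs with an empty fetch list, which avoids
# device-to-host copies on non-logging steps. A stripped-down sketch of that
# pattern; exe, program, and fetch_vars are stand-ins for the objects built
# in the function above:
def run_with_sparse_fetch(exe, program, fetch_vars, num_steps, skip_steps):
    for step in range(1, num_steps + 1):
        if step % skip_steps == 0:
            outputs = exe.run(program=program,
                              fetch_list=[v.name for v in fetch_vars])
            print("step %d:" % step, [o.mean() for o in outputs])
        else:
            exe.run(program=program, fetch_list=[])  # no metric sync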
def train(args):
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if not (args.do_train or args.do_test):
        raise ValueError("For args `do_train`, `do_test`, at "
                         "least one of them must be True.")

    trainer_count = fluid.dygraph.parallel.Env().nranks

    task_name = args.task_name.lower()
    processors = {
        'xnli': reader.XnliProcessor,
        'cola': reader.ColaProcessor,
        'mrpc': reader.MrpcProcessor,
        'mnli': reader.MnliProcessor,
    }

    processor = processors[task_name](data_dir=args.data_dir,
                                      vocab_path=args.vocab_path,
                                      max_seq_len=args.max_seq_len,
                                      do_lower_case=args.do_lower_case,
                                      in_tokens=args.in_tokens,
                                      random_seed=args.random_seed)
    num_labels = len(processor.get_labels())

    shuffle_seed = 1 if trainer_count > 1 else None
    train_data_generator = processor.data_generator(
        batch_size=args.batch_size,
        phase='train',
        epoch=args.epoch,
        dev_count=trainer_count,
        shuffle=args.shuffle,
        shuffle_seed=shuffle_seed)

    num_train_examples = processor.get_num_examples(phase='train')
    max_train_steps = args.epoch * num_train_examples // args.batch_size // trainer_count
    warmup_steps = int(max_train_steps * args.warmup_proportion)

    print("Trainer count: %d" % trainer_count)
    print("Num train examples: %d" % num_train_examples)
    print("Max train steps: %d" % max_train_steps)
    print("Num warmup steps: %d" % warmup_steps)

    # `place` is assumed to be set up by the surrounding script; the usual
    # dygraph choice (device assigned by the parallel env, else GPU 0) is
    # used here so the snippet stands alone.
    place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \
        if args.use_data_parallel else fluid.CUDAPlace(0)

    with fluid.dygraph.guard(place):
        if args.use_data_parallel:
            strategy = fluid.dygraph.parallel.prepare_context()

        cls_model = ClsModelLayer(args,
                                  bert_config,
                                  num_labels,
                                  is_training=True,
                                  return_pooled_out=True)

        optimizer = Optimizer(warmup_steps=warmup_steps,
                              num_train_steps=max_train_steps,
                              learning_rate=args.learning_rate,
                              model_cls=cls_model,
                              weight_decay=args.weight_decay,
                              scheduler=args.lr_scheduler,
                              loss_scaling=args.loss_scaling,
                              parameter_list=cls_model.parameters())

        if args.init_pretraining_params:
            print("Load pre-trained model from %s" %
                  args.init_pretraining_params)
            init_from_static_model(args.init_pretraining_params, cls_model,
                                   bert_config)

        if args.use_data_parallel:
            cls_model = fluid.dygraph.parallel.DataParallel(cls_model, strategy)
            train_data_generator = fluid.contrib.reader.distributed_batch_reader(
                train_data_generator)

        steps = 0
        time_begin = time.time()
        for batch in train_data_generator():
            data_ids = create_data(batch)
            loss, accuracy, num_seqs = cls_model(data_ids)

            optimizer.optimization(loss,
                                   use_data_parallel=args.use_data_parallel,
                                   model=cls_model)
            cls_model.clear_gradients()

            if steps != 0 and steps % args.skip_steps == 0:
                time_end = time.time()
                used_time = time_end - time_begin
                current_example, current_epoch = processor.get_train_progress()
                localtime = time.asctime(time.localtime(time.time()))
                print("%s, epoch: %s, steps: %s, dy_graph loss: %f, acc: %f, speed: %f steps/s"
                      % (localtime, current_epoch, steps, loss.numpy(),
                         accuracy.numpy(), args.skip_steps / used_time))
                time_begin = time.time()

            if steps != 0 and steps % args.save_steps == 0 and \
                    fluid.dygraph.parallel.Env().local_rank == 0:
                save_path = os.path.join(args.checkpoints,
                                         "steps" + "_" + str(steps))
                fluid.save_dygraph(cls_model.state_dict(), save_path)
                fluid.save_dygraph(optimizer.optimizer.state_dict(), save_path)
                print("Save model parameters and optimizer status at %s" %
                      save_path)

            steps += 1

        if fluid.dygraph.parallel.Env().local_rank == 0:
            save_path = os.path.join(args.checkpoints, "final")
            fluid.save_dygraph(cls_model.state_dict(), save_path)
            fluid.save_dygraph(optimizer.optimizer.state_dict(), save_path)
            print("Save model parameters and optimizer status at %s" %
                  save_path)

    return cls_model
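
# Worked example of the step budget computed above, with hypothetical sizes
# (an MNLI-scale set of 392,702 examples, batch size 32, 3 epochs, 1 trainer):
epoch, num_train_examples, batch_size, trainer_count = 3, 392702, 32, 1
warmup_proportion = 0.1
max_train_steps = epoch * num_train_examples // batch_size // trainer_count  # 36815
warmup_steps = int(max_train_steps * warmup_proportion)                      # 3681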
def main(args):
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if args.use_cuda and torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    processor = MnliDataProcessor(data_dir=args.data_dir,
                                  vocab_path=args.vocab_path,
                                  max_seq_len=args.max_seq_len,
                                  do_lower_case=args.do_lower_case,
                                  random_seed=args.random_seed)
    num_labels = len(processor.get_labels())

    if args.random_seed is not None:
        random.seed(args.random_seed)
        np.random.seed(args.random_seed)
        torch.manual_seed(args.random_seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    train_data_generator = processor.data_generator(
        batch_size=args.batch_size,
        phase='train',
        epoch=args.epoch,
        shuffle=True)

    num_train_examples = processor.get_num_examples(phase='train')
    max_train_steps = args.epoch * num_train_examples // args.batch_size
    warmup_steps = int(max_train_steps * args.warmup_proportion)

    classifier = Classifier(bert_config, num_labels).to(device)
    # optimizer = torch.optim.Adam(classifier.parameters(), lr=args.learning_rate)
    optimizer = Optimizer(classifier, warmup_steps, max_train_steps,
                          args.learning_rate, args.weight_decay)

    if args.init_pre_training_params:
        pre_training_params = load_pickle(args.init_pre_training_params)
        classifier.bert.load_state_dict(pre_training_params)

    # Temporal
    cls_ckp = load_pickle('/home/cvds_lab/maxim/transformer_investigation/notebooks/ckp/classifier_ckp.pkl')
    classifier.cls_out.weight.data = torch.tensor(
        cls_ckp['cls_out_w'], dtype=torch.float32).t().to(device)
    classifier.cls_out.bias.data = torch.tensor(
        cls_ckp['cls_out_b'], dtype=torch.float32).to(device)
    # Temporal

    logfile = args.log_to
    Logger().add_log(logfile, [
        'epoch', 'step', 'loss', 'accuracy', 'cls_w_mean', 'cls_w_std',
        'cls_w_min', 'cls_w_max', 'cls_b_mean', 'cls_b_std', 'cls_b_min',
        'cls_b_max'
    ])

    steps = 0
    total_loss, total_acc = [], []
    time_begin = time.time()
    for batch in train_data_generator():
        steps += 1
        src_ids = torch.tensor(batch[0], dtype=torch.long).to(device)
        position_ids = torch.tensor(batch[1], dtype=torch.long).to(device)
        sentence_ids = torch.tensor(batch[2], dtype=torch.long).to(device)
        input_mask = torch.tensor(batch[3], dtype=torch.float32).to(device)
        labels = torch.tensor(batch[4], dtype=torch.long).to(device)

        optimizer.zero_grad()
        loss, _, accuracy = classifier(src_ids, position_ids, sentence_ids,
                                       input_mask, labels)
        loss.backward()
        optimizer.step(steps)

        current_example, current_epoch = processor.get_train_progress()
        Logger()[logfile]['epoch'].append(current_epoch)
        Logger()[logfile]['step'].append(steps)
        Logger()[logfile]['loss'].append(loss.item())
        Logger()[logfile]['accuracy'].append(accuracy.item())
        with torch.no_grad():
            Logger()[logfile]['cls_w_mean'].append(classifier.cls_out.weight.mean().item())
            Logger()[logfile]['cls_w_std'].append(classifier.cls_out.weight.std().item())
            Logger()[logfile]['cls_w_min'].append(classifier.cls_out.weight.min().item())
            Logger()[logfile]['cls_w_max'].append(classifier.cls_out.weight.max().item())
            Logger()[logfile]['cls_b_mean'].append(classifier.cls_out.bias.mean().item())
            Logger()[logfile]['cls_b_std'].append(classifier.cls_out.bias.std().item())
            Logger()[logfile]['cls_b_min'].append(classifier.cls_out.bias.min().item())
            Logger()[logfile]['cls_b_max'].append(classifier.cls_out.bias.max().item())

        if steps % 1000 == 0:
            Logger().log_all()

        if steps % args.skip_steps == 0:
            total_loss.append(loss.item())
            total_acc.append(accuracy.item())
            current_example, current_epoch = processor.get_train_progress()
            time_end = time.time()
            used_time = time_end - time_begin
            print('epoch: %d, progress: %d/%d, step: %d, ave loss: %f, ave acc: %f, speed: %f steps/s'
                  % (current_epoch, current_example, num_train_examples, steps,
                     np.mean(total_loss).item(), np.mean(total_acc).item(),
                     args.skip_steps / used_time))
            total_loss, total_acc = [], []
            time_begin = time.time()
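
# The loop above unpacks reader batches as (src_ids, position_ids,
# sentence_ids, input_mask, labels) numpy arrays. The same conversion as a
# small helper, shown only to make the field order and dtypes explicit
# (the helper name is hypothetical):
import torch

def batch_to_device(batch, device):
    src_ids = torch.tensor(batch[0], dtype=torch.long).to(device)
    position_ids = torch.tensor(batch[1], dtype=torch.long).to(device)
    sentence_ids = torch.tensor(batch[2], dtype=torch.long).to(device)
    input_mask = torch.tensor(batch[3], dtype=torch.float32).to(device)
    labels = torch.tensor(batch[4], dtype=torch.long).to(device)
    return src_ids, position_ids, sentence_ids, input_mask, labels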
def main(args):
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = get_device_num()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    # `num_trainers` is assumed to come from the multi-process launch; the
    # standard Paddle environment variable is used here so the snippet
    # stands alone.
    num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))

    task_name = args.task_name.lower()
    processors = {
        'xnli': reader.XnliProcessor,
        'cola': reader.ColaProcessor,
        'mrpc': reader.MrpcProcessor,
        'mnli': reader.MnliProcessor,
    }

    processor = processors[task_name](data_dir=args.data_dir,
                                      vocab_path=args.vocab_path,
                                      max_seq_len=args.max_seq_len,
                                      do_lower_case=args.do_lower_case,
                                      in_tokens=args.in_tokens,
                                      random_seed=args.random_seed)
    num_labels = len(processor.get_labels())

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    train_program = fluid.Program()
    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed
        train_program.random_seed = args.random_seed

    if args.do_train:
        # NOTE: If num_trainers > 1, the shuffle_seed must be set, because
        # the order of batch data generated by reader
        # must be the same in the respective processes.
        shuffle_seed = 1 if num_trainers > 1 else None
        train_data_generator = processor.data_generator(
            batch_size=args.batch_size,
            phase='train',
            epoch=args.epoch,
            dev_count=dev_count,
            shuffle=args.shuffle,
            shuffle_seed=shuffle_seed)

        num_train_examples = processor.get_num_examples(phase='train')

        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_data_loader, loss, probs, accuracy, num_seqs = create_model(
                    args, bert_config=bert_config, num_labels=num_labels)
                scheduled_lr, loss_scaling = optimization(
                    loss=loss,
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio)

    if args.do_val:
        dev_prog = fluid.Program()
        with fluid.program_guard(dev_prog, startup_prog):
            with fluid.unique_name.guard():
                dev_data_loader, loss, probs, accuracy, num_seqs = create_model(
                    args, bert_config=bert_config, num_labels=num_labels)

        dev_prog = dev_prog.clone(for_test=True)
        dev_data_loader.set_batch_generator(
            processor.data_generator(batch_size=args.batch_size,
                                     phase='dev',
                                     epoch=1,
                                     dev_count=1,
                                     shuffle=False), place)

    if args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_data_loader, loss, probs, accuracy, num_seqs = create_model(
                    args, bert_config=bert_config, num_labels=num_labels)

        test_prog = test_prog.clone(for_test=True)
        test_data_loader.set_batch_generator(
            processor.data_generator(batch_size=args.batch_size,
                                     phase='test',
                                     epoch=1,
                                     dev_count=1,
                                     shuffle=False), place)

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        build_strategy = fluid.BuildStrategy()

        if args.use_cuda and num_trainers > 1:
            assert shuffle_seed is not None
            dist_utils.prepare_for_multi_process(exe, build_strategy,
                                                 train_program)
            train_data_generator = fluid.contrib.reader.distributed_batch_reader(
                train_data_generator)

        train_compiled_program = fluid.CompiledProgram(
            train_program).with_data_parallel(loss_name=loss.name,
                                              build_strategy=build_strategy)

        train_data_loader.set_batch_generator(train_data_generator, place)

    if args.do_train:
        train_data_loader.start()
        steps = 0
        total_cost, total_acc, total_num_seqs = [], [], []
        time_begin = time.time()
        throughput = []
        ce_info = []
        total_batch_num = 0  # used for benchmark
        while True:
            try:
                steps += 1
                total_batch_num += 1  # used for benchmark
                if args.max_iter and total_batch_num == args.max_iter:  # used for benchmark
                    return

                if steps % args.skip_steps == 0:
                    if args.use_fp16:
                        fetch_list = [
                            loss.name, accuracy.name, scheduled_lr.name,
                            num_seqs.name, loss_scaling.name
                        ]
                    else:
                        fetch_list = [
                            loss.name, accuracy.name, scheduled_lr.name,
                            num_seqs.name
                        ]
                else:
                    fetch_list = []

                outputs = exe.run(train_compiled_program,
                                  fetch_list=fetch_list)

                if steps % args.skip_steps == 0:
                    if args.use_fp16:
                        np_loss, np_acc, np_lr, np_num_seqs, np_scaling = outputs
                    else:
                        np_loss, np_acc, np_lr, np_num_seqs = outputs

                    total_cost.extend(np_loss * np_num_seqs)
                    total_acc.extend(np_acc * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train data_loader queue size: %d, " % train_data_loader.queue.size()
                        verbose += "learning rate: %f" % np_lr[0]
                        if args.use_fp16:
                            verbose += ", loss scaling: %f" % np_scaling[0]
                        print(verbose)

                    current_example, current_epoch = processor.get_train_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin

                    # profiler tools
                    if args.is_profiler and current_epoch == 0 and steps == args.skip_steps:
                        profiler.start_profiler("All")
                    elif args.is_profiler and current_epoch == 0 and steps == args.skip_steps * 2:
                        profiler.stop_profiler("total", args.profiler_path)
                        return

                    log_record = "epoch: {}, progress: {}/{}, step: {}, ave loss: {}, ave acc: {}".format(
                        current_epoch, current_example, num_train_examples,
                        steps,
                        np.sum(total_cost) / np.sum(total_num_seqs),
                        np.sum(total_acc) / np.sum(total_num_seqs))
                    ce_info.append([
                        np.sum(total_cost) / np.sum(total_num_seqs),
                        np.sum(total_acc) / np.sum(total_num_seqs), used_time
                    ])
                    if steps > 0:
                        throughput.append(args.skip_steps / used_time)
                        log_record = log_record + ", speed: %f steps/s" % (
                            args.skip_steps / used_time)
                        print(log_record)
                    else:
                        print(log_record)
                    total_cost, total_acc, total_num_seqs = [], [], []
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.save(program=train_program, model_path=save_path)

                if steps % args.validation_steps == 0:
                    print("Average throughput: %s" % (np.average(throughput)))
                    throughput = []
                    # evaluate dev set
                    if args.do_val:
                        evaluate(exe, dev_prog, dev_data_loader,
                                 [loss.name, accuracy.name, num_seqs.name],
                                 "dev")
                    # evaluate test set
                    if args.do_test:
                        evaluate(exe, test_prog, test_data_loader,
                                 [loss.name, accuracy.name, num_seqs.name],
                                 "test")
            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.save(program=train_program, model_path=save_path)
                train_data_loader.reset()
                break

    if args.enable_ce:
        card_num = get_cards()
        ce_cost = 0
        ce_acc = 0
        ce_time = 0
        try:
            ce_cost = ce_info[-2][0]
            ce_acc = ce_info[-2][1]
            ce_time = ce_info[-2][2]
        except:
            print("ce info error")
        print("kpis\ttrain_duration_%s_card%s\t%s" %
              (args.task_name, card_num, ce_time))
        print("kpis\ttrain_cost_%s_card%s\t%f" %
              (args.task_name, card_num, ce_cost))
        print("kpis\ttrain_acc_%s_card%s\t%f" %
              (args.task_name, card_num, ce_acc))

    # final eval on dev set
    if args.do_val:
        print("Final validation result:")
        evaluate(exe, dev_prog, dev_data_loader,
                 [loss.name, accuracy.name, num_seqs.name], "dev")

    # final eval on test set
    if args.do_test:
        print("Final test result:")
        evaluate(exe, test_prog, test_data_loader,
                 [loss.name, accuracy.name, num_seqs.name], "test")
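
# In `in_tokens` mode the batch size above counts tokens rather than
# sequences, so the effective sequence batch is batch_size // max_seq_len.
# Hypothetical numbers to make the two step formulas concrete:
epoch, num_train_examples, dev_count = 3, 392702, 4
batch_size, max_seq_len = 8192, 128       # tokens per batch / tokens per sequence
eff_batch = batch_size // max_seq_len     # 64 sequences per device batch
max_train_steps = epoch * num_train_examples // eff_batch // dev_count  # 4601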
def __init__(self, args=None, detect_entities=False):
    if args is None:
        self.args = load_pickle("args.pkl")
    else:
        self.args = args
    self.cuda = torch.cuda.is_available()

    self.detect_entities = detect_entities
    if self.detect_entities:
        self.nlp = spacy.load("en_core_web_lg")
    else:
        self.nlp = None
    self.entities_of_interest = [
        "PERSON", "NORP", "FAC", "ORG", "GPE", "LOC", "PRODUCT", "EVENT",
        "WORK_OF_ART", "LAW", "LANGUAGE", "PER",
    ]

    logger.info("Loading tokenizer and model...")
    from .train_funcs import load_state

    if self.args.model_no == 0:
        from model.bert import BertModel as Model
        model = args.model_size  # 'bert-base-uncased'
        model_name = "BERT"
        self.net = Model.from_pretrained(
            model,
            force_download=False,
            model_size=args.model_size,
            task="classification",
            n_classes_=self.args.num_classes,
        )
    elif self.args.model_no == 1:
        from model.albert.albert import AlbertModel as Model
        model = args.model_size  # 'albert-base-v2'
        model_name = "BERT"
        self.net = Model.from_pretrained(
            model,
            force_download=False,
            model_size=args.model_size,
            task="classification",
            n_classes_=self.args.num_classes,
        )
    elif args.model_no == 2:  # BioBert
        from model.bert import BertModel, BertConfig
        model = "bert-base-uncased"
        model_name = "BioBERT"
        config = BertConfig.from_pretrained(
            "./additional_models/biobert_v1.1_pubmed/bert_config.json"
        )
        self.net = BertModel.from_pretrained(
            pretrained_model_name_or_path="./additional_models/biobert_v1.1_pubmed/biobert_v1.1_pubmed.bin",
            config=config,
            force_download=False,
            model_size="bert-base-uncased",
            task="classification",
            n_classes_=self.args.num_classes,
        )

    self.tokenizer = load_pickle("%s_tokenizer.pkl" % model_name)
    self.net.resize_token_embeddings(len(self.tokenizer))
    if self.cuda:
        self.net.cuda()

    start_epoch, best_pred, amp_checkpoint = load_state(
        self.net, None, None, self.args, load_best=False
    )
    logger.info("Done!")

    self.e1_id = self.tokenizer.convert_tokens_to_ids("[E1]")
    self.e2_id = self.tokenizer.convert_tokens_to_ids("[E2]")
    self.pad_id = self.tokenizer.pad_token_id
    self.rm = load_pickle("relations.pkl")
def train(args):
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if not (args.do_train or args.do_predict):
        raise ValueError("For args `do_train` and `do_predict`, at "
                         "least one of them must be True.")

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    processor = DataProcessor(vocab_path=args.vocab_path,
                              do_lower_case=args.do_lower_case,
                              max_seq_length=args.max_seq_len,
                              in_tokens=args.in_tokens,
                              doc_stride=args.doc_stride,
                              max_query_length=args.max_query_length,
                              adv_text_path=args.adv_text_path)

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = processor.data_generator(
            data_path=args.train_file,
            batch_size=args.batch_size,
            phase='train',
            shuffle=True,
            dev_count=dev_count,
            version_2_with_negative=args.version_2_with_negative,
            epoch=args.epoch)

        num_train_examples = processor.get_num_examples(phase='train')
        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size) // dev_count
        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, loss, num_seqs = create_model(
                    pyreader_name='train_reader',
                    bert_config=bert_config,
                    is_training=True)

                scheduled_lr = optimization(loss=loss,
                                            warmup_steps=warmup_steps,
                                            num_train_steps=max_train_steps,
                                            learning_rate=args.learning_rate,
                                            train_program=train_program,
                                            startup_prog=startup_prog,
                                            weight_decay=args.weight_decay,
                                            scheduler=args.lr_scheduler,
                                            use_fp16=args.use_fp16,
                                            loss_scaling=args.loss_scaling)

                fluid.memory_optimize(train_program,
                                      skip_opt_set=[loss.name, num_seqs.name])

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_predict:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, unique_ids, start_logits, end_logits, num_seqs = create_model(
                    pyreader_name='test_reader',
                    bert_config=bert_config,
                    is_training=False)

                fluid.memory_optimize(test_prog,
                                      skip_opt_set=[
                                          unique_ids.name, start_logits.name,
                                          end_logits.name, num_seqs.name
                                      ])

        test_prog = test_prog.clone(for_test=True)

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_predict:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing prediction!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=loss.name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program)

        train_pyreader.decorate_tensor_provider(train_data_generator)

        train_pyreader.start()
        steps = 0
        total_cost, total_num_seqs = [], []
        time_begin = time.time()
        best_f1 = -1
        while steps < max_train_steps:
            try:
                steps += 1
                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        fetch_list = [loss.name, num_seqs.name]
                    else:
                        fetch_list = [
                            loss.name, scheduled_lr.name, num_seqs.name
                        ]
                else:
                    fetch_list = []

                outputs = train_exe.run(fetch_list=fetch_list)

                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        np_loss, np_num_seqs = outputs
                    else:
                        np_loss, np_lr, np_num_seqs = outputs
                    total_cost.extend(np_loss * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size()
                        verbose += "learning rate: %f" % (
                            np_lr[0] if warmup_steps > 0 else args.learning_rate)
                        print(verbose)

                    time_end = time.time()
                    used_time = time_end - time_begin
                    current_example, epoch = processor.get_train_progress()
                    print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                          "speed: %f steps/s" %
                          (epoch, current_example, num_train_examples, steps,
                           np.sum(total_cost) / np.sum(total_num_seqs),
                           args.skip_steps / used_time))
                    total_cost, total_num_seqs = [], []
                    time_begin = time.time()

                if (steps % args.save_steps == 0 or steps == max_train_steps) \
                        and steps > int(max_train_steps / 3.0):
                    # if (steps % args.save_steps == 0 or steps == max_train_steps):
                    if args.do_predict:
                        test_pyreader.decorate_tensor_provider(
                            processor.data_generator(
                                data_path=args.predict_file,
                                batch_size=args.batch_size,
                                phase='predict',
                                shuffle=False,
                                dev_count=1,
                                epoch=1))
                        adv_f1 = predict(exe, test_prog, test_pyreader, [
                            unique_ids.name, start_logits.name,
                            end_logits.name, num_seqs.name
                        ], processor)
                        # print(adv_f1)
                        # continue
                        # if steps != max_train_steps:
                        if adv_f1 > best_f1:
                            best_f1 = adv_f1
                            save_path = os.path.join(args.checkpoints,
                                                     "step_best")
                            print("best adv model saved")
                            # else:
                            #     save_path = os.path.join(args.checkpoints,
                            #                              "step_last")
                            fluid.io.save_persistables(exe, save_path,
                                                       train_program)

                            test_pyreader.decorate_tensor_provider(
                                processor.data_generator(
                                    data_path=args.predict_file.replace(
                                        "dev", "test"),
                                    batch_size=args.batch_size,
                                    phase='predict',
                                    shuffle=False,
                                    dev_count=1,
                                    epoch=1))
                            test_f1 = predict(
                                exe, test_prog, test_pyreader, [
                                    unique_ids.name, start_logits.name,
                                    end_logits.name, num_seqs.name
                                ], processor,
                                args.predict_file.replace("dev", "test"))
                            print("This is the test score.")
            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps) + "_final")
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    if args.do_predict and not args.do_train:
        test_pyreader.decorate_tensor_provider(
            processor.data_generator(data_path=args.predict_file,
                                     batch_size=args.batch_size,
                                     phase='predict',
                                     shuffle=False,
                                     dev_count=1,
                                     epoch=1))

        predict(exe, test_prog, test_pyreader, [
            unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
        ], processor)
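
# The save logic above keeps a single "step_best" checkpoint, overwritten
# whenever the adversarial-dev F1 improves. The pattern in isolation;
# evaluate_fn and save_fn are hypothetical stand-ins for the predict and
# save_persistables calls used above:
def keep_best(evaluate_fn, save_fn, eval_steps):
    best_f1 = -1
    for step in eval_steps:
        f1 = evaluate_fn(step)
        if f1 > best_f1:
            best_f1 = f1
            save_fn("step_best")
    return best_f1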
def train(args):
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if not (args.do_train or args.do_predict or args.do_val):
        raise ValueError("For args `do_train` and `do_predict`, at "
                         "least one of them must be True.")

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    wn_id2concept, wn_concept2id, wn_concept_embedding_mat = read_concept_embedding(
        args.wn_concept_embedding_path)
    nell_id2concept, nell_concept2id, nell_concept_embedding_mat = read_concept_embedding(
        args.nell_concept_embedding_path)

    processor = DataProcessor(vocab_path=args.vocab_path,
                              do_lower_case=args.do_lower_case,
                              max_seq_length=args.max_seq_len,
                              in_tokens=args.in_tokens,
                              doc_stride=args.doc_stride,
                              max_query_length=args.max_query_length)

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed
        random.seed(args.random_seed)
        np.random.seed(args.random_seed)

    if args.do_train:
        train_concept_settings = {
            'tokenization_path':
            '../retrieve_concepts/tokenization_squad/tokens/train.tokenization.{}.data'
            .format('uncased' if args.do_lower_case else 'cased'),
            'wn_concept2id': wn_concept2id,
            'nell_concept2id': nell_concept2id,
            'use_wordnet': args.use_wordnet,
            'retrieved_synset_path': args.retrieved_synset_path,
            'use_nell': args.use_nell,
            'retrieved_nell_concept_path': args.train_retrieved_nell_concept_path,
        }
        train_data_generator = processor.data_generator(
            data_path=args.train_file,
            batch_size=args.batch_size,
            phase='train',
            shuffle=True,
            dev_count=dev_count,
            version_2_with_negative=args.version_2_with_negative,
            epoch=args.epoch,
            **train_concept_settings)

        num_train_examples = processor.get_num_examples(phase='train')
        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size) // dev_count
        warmup_steps = int(max_train_steps * args.warmup_proportion)
        logger.info("Device count: %d" % dev_count)
        logger.info("Num train examples: %d" % num_train_examples)
        logger.info("Max train steps: %d" % max_train_steps)
        logger.info("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        # if args.random_seed is not None:
        #     train_program.random_seed = args.random_seed
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, loss, num_seqs = create_model(
                    pyreader_name='train_reader',
                    bert_config=bert_config,
                    max_wn_concept_length=processor.train_wn_max_concept_length,
                    max_nell_concept_length=processor.train_nell_max_concept_length,
                    wn_concept_embedding_mat=wn_concept_embedding_mat,
                    nell_concept_embedding_mat=nell_concept_embedding_mat,
                    is_training=True,
                    freeze=args.freeze)

                scheduled_lr = optimization(loss=loss,
                                            warmup_steps=warmup_steps,
                                            num_train_steps=max_train_steps,
                                            learning_rate=args.learning_rate,
                                            train_program=train_program,
                                            startup_prog=startup_prog,
                                            weight_decay=args.weight_decay,
                                            scheduler=args.lr_scheduler,
                                            use_fp16=args.use_fp16,
                                            loss_scaling=args.loss_scaling)

                if args.use_ema:
                    ema = fluid.optimizer.ExponentialMovingAverage(
                        args.ema_decay)
                    ema.update()

                fluid.memory_optimize(train_program,
                                      skip_opt_set=[loss.name, num_seqs.name])

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            logger.info("Theoretical memory usage in training: %.3f - %.3f %s"
                        % (lower_mem, upper_mem, unit))

    if args.do_predict or args.do_val:
        eval_concept_settings = {
            'tokenization_path':
            '../retrieve_concepts/tokenization_squad/tokens/dev.tokenization.{}.data'
            .format('uncased' if args.do_lower_case else 'cased'),
            'wn_concept2id': wn_concept2id,
            'nell_concept2id': nell_concept2id,
            'use_wordnet': args.use_wordnet,
            'retrieved_synset_path': args.retrieved_synset_path,
            'use_nell': args.use_nell,
            'retrieved_nell_concept_path': args.dev_retrieved_nell_concept_path,
        }
        eval_data_generator = processor.data_generator(
            data_path=args.predict_file,
            batch_size=args.batch_size,
            phase='predict',
            shuffle=False,
            dev_count=1,
            epoch=1,
            **eval_concept_settings)

        test_prog = fluid.Program()
        # if args.random_seed is not None:
        #     test_prog.random_seed = args.random_seed
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, unique_ids, start_logits, end_logits, num_seqs = create_model(
                    pyreader_name='test_reader',
                    bert_config=bert_config,
                    max_wn_concept_length=processor.predict_wn_max_concept_length,
                    max_nell_concept_length=processor.predict_nell_max_concept_length,
                    wn_concept_embedding_mat=wn_concept_embedding_mat,
                    nell_concept_embedding_mat=nell_concept_embedding_mat,
                    is_training=False)

                if args.use_ema and 'ema' not in dir():
                    ema = fluid.optimizer.ExponentialMovingAverage(
                        args.ema_decay)

                fluid.memory_optimize(test_prog,
                                      skip_opt_set=[
                                          unique_ids.name, start_logits.name,
                                          end_logits.name, num_seqs.name
                                      ])

        test_prog = test_prog.clone(for_test=True)
        # if args.random_seed is not None:
        #     test_prog.random_seed = args.random_seed

    exe.run(startup_prog)

    if args.do_train:
        logger.info('load pretrained concept embedding')
        fluid.global_scope().find_var('wn_concept_emb_mat').get_tensor().set(
            wn_concept_embedding_mat, place)
        fluid.global_scope().find_var('nell_concept_emb_mat').get_tensor().set(
            nell_concept_embedding_mat, place)

        if args.init_checkpoint and args.init_pretraining_params:
            logger.info(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_predict or args.do_val:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing prediction!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=loss.name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program)

        train_pyreader.decorate_tensor_provider(train_data_generator)

        train_pyreader.start()
        steps = 0
        total_cost, total_num_seqs = [], []
        time_begin = time.time()
        while steps < max_train_steps:
            try:
                steps += 1
                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        fetch_list = [loss.name, num_seqs.name]
                    else:
                        fetch_list = [
                            loss.name, scheduled_lr.name, num_seqs.name
                        ]
                else:
                    fetch_list = []

                outputs = train_exe.run(fetch_list=fetch_list)

                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        np_loss, np_num_seqs = outputs
                    else:
                        np_loss, np_lr, np_num_seqs = outputs
                    total_cost.extend(np_loss * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size()
                        verbose += "learning rate: %f" % (
                            np_lr[0] if warmup_steps > 0 else args.learning_rate)
                        logger.info(verbose)

                    time_end = time.time()
                    used_time = time_end - time_begin
                    current_example, epoch = processor.get_train_progress()
                    logger.info("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                                "speed: %f steps/s" %
                                (epoch, current_example, num_train_examples,
                                 steps,
                                 np.sum(total_cost) / np.sum(total_num_seqs),
                                 args.skip_steps / used_time))
                    total_cost, total_num_seqs = [], []
                    time_begin = time.time()

                if steps % args.save_steps == 0 or steps == max_train_steps:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps % args.validation_steps == 0 or steps == max_train_steps:
                    if args.do_val:
                        test_pyreader.decorate_tensor_provider(
                            processor.data_generator(
                                data_path=args.predict_file,
                                batch_size=args.batch_size,
                                phase='predict',
                                shuffle=False,
                                dev_count=1,
                                epoch=1,
                                **eval_concept_settings))
                        val_performance = predict(
                            exe, test_prog, test_pyreader, [
                                unique_ids.name, start_logits.name,
                                end_logits.name, num_seqs.name
                            ], processor, eval_concept_settings,
                            'validate_result_step_{}.json'.format(steps))
                        logger.info(
                            "Validation performance after step {}:\n* Exact_match: {}\n* F1: {}"
                            .format(steps, val_performance['exact_match'],
                                    val_performance['f1']))
            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps) + "_final")
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    if args.do_predict:
        test_pyreader.decorate_tensor_provider(eval_data_generator)

        if args.use_ema:
            with ema.apply(exe):
                eval_performance = predict(exe, test_prog, test_pyreader, [
                    unique_ids.name, start_logits.name, end_logits.name,
                    num_seqs.name
                ], processor, eval_concept_settings)
        else:
            eval_performance = predict(exe, test_prog, test_pyreader, [
                unique_ids.name, start_logits.name, end_logits.name,
                num_seqs.name
            ], processor, eval_concept_settings)

        logger.info("Eval performance:\n* Exact_match: {}\n* F1: {}".format(
            eval_performance['exact_match'], eval_performance['f1']))
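
# The ExponentialMovingAverage above follows the usual Paddle pattern:
# `ema.update()` is added inside the train program guard so shadow copies of
# the parameters are maintained each step, and `with ema.apply(exe):`
# temporarily swaps the averaged weights in for evaluation, restoring the raw
# weights on exit. Sketch, assuming `exe`, `test_prog`, and a `run_eval`
# helper exist:
#
#     ema = fluid.optimizer.ExponentialMovingAverage(args.ema_decay)
#     ema.update()               # inside the train program guard
#     ...
#     with ema.apply(exe):       # evaluation sees the averaged parameters
#         run_eval(exe, test_prog)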
def __init__(self, args=None):
    if args is None:
        self.args = load_pickle("args.pkl")
    else:
        self.args = args
    self.cuda = torch.cuda.is_available()

    if self.args.model_no == 0:
        from model.bert import BertModel as Model
        from model.bert_tokenizer import BertTokenizer as Tokenizer
        model = args.model_size  # 'bert-large-uncased' 'bert-base-uncased'
        model_name = "BERT"
        self.net = Model.from_pretrained(
            model,
            force_download=False,
            model_size=args.model_size,
            task="fewrel",
        )
    elif self.args.model_no == 1:
        from model.albert.albert import AlbertModel as Model
        from model.albert.albert_tokenizer import (
            AlbertTokenizer as Tokenizer,
        )
        model = args.model_size  # 'albert-base-v2'
        model_name = "BERT"
        self.net = Model.from_pretrained(
            model,
            force_download=False,
            model_size=args.model_size,
            task="fewrel",
        )
    elif args.model_no == 2:  # BioBert
        from model.bert import BertModel, BertConfig
        from model.bert_tokenizer import BertTokenizer as Tokenizer
        model = "bert-base-uncased"
        model_name = "BioBERT"
        config = BertConfig.from_pretrained(
            "./additional_models/biobert_v1.1_pubmed/bert_config.json"
        )
        self.net = BertModel.from_pretrained(
            pretrained_model_name_or_path="./additional_models/biobert_v1.1_pubmed/biobert_v1.1_pubmed.bin",
            config=config,
            force_download=False,
            model_size="bert-base-uncased",
            task="fewrel",
        )

    if os.path.isfile("./data/%s_tokenizer.pkl" % model_name):
        self.tokenizer = load_pickle("%s_tokenizer.pkl" % model_name)
        logger.info("Loaded tokenizer from saved file.")
    else:
        logger.info("Saved tokenizer not found, initializing new tokenizer...")
        if args.model_no == 2:
            self.tokenizer = Tokenizer(
                vocab_file="./additional_models/biobert_v1.1_pubmed/vocab.txt",
                do_lower_case=False,
            )
        else:
            self.tokenizer = Tokenizer.from_pretrained(model,
                                                       do_lower_case=False)
        self.tokenizer.add_tokens(["[E1]", "[/E1]", "[E2]", "[/E2]", "[BLANK]"])
        save_as_pickle("%s_tokenizer.pkl" % model_name, self.tokenizer)
        logger.info("Saved %s tokenizer at ./data/%s_tokenizer.pkl" %
                    (model_name, model_name))

    self.net.resize_token_embeddings(len(self.tokenizer))
    self.pad_id = self.tokenizer.pad_token_id
    if self.cuda:
        self.net.cuda()

    if self.args.use_pretrained_blanks == 1:
        logger.info(
            "Loading model pre-trained on blanks at ./data/test_checkpoint_%d.pth.tar..."
            % args.model_no)
        checkpoint_path = "./data/test_checkpoint_%d.pth.tar" % self.args.model_no
        checkpoint = torch.load(checkpoint_path)
        model_dict = self.net.state_dict()
        pretrained_dict = {
            k: v
            for k, v in checkpoint["state_dict"].items()
            if k in model_dict.keys()
        }
        model_dict.update(pretrained_dict)
        self.net.load_state_dict(pretrained_dict, strict=False)
        del checkpoint, pretrained_dict, model_dict

    logger.info("Loading Fewrel dataloaders...")
    self.train_loader, _, self.train_length, _ = load_dataloaders(args)
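
# The tokenizer block above lazily caches the extended tokenizer: the entity
# marker tokens are added once, the tokenizer is pickled, and later runs
# reload it so the added token ids stay stable across sessions. The pattern
# in isolation; `path`, `model`, and `net` stand in for the values used above:
#
#     if os.path.isfile(path):
#         tokenizer = load_pickle(path)
#     else:
#         tokenizer = Tokenizer.from_pretrained(model, do_lower_case=False)
#         tokenizer.add_tokens(["[E1]", "[/E1]", "[E2]", "[/E2]", "[BLANK]"])
#         save_as_pickle(path, tokenizer)
#     net.resize_token_embeddings(len(tokenizer))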
def net(self):
    args = self.p_args()
    bert_config = BertConfig("uncased_L-24_H-1024_A-16/bert_config.json")
    bert_config.print_config()
    place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
    dev_count = 1

    if args.do_train:
        my_dist_env = dist_env()
        worker_endpoints_env = my_dist_env["trainer_endpoints"]
        worker_endpoints = worker_endpoints_env.split(",")
        current_endpoint = my_dist_env["current_endpoint"]
        trainer_id = worker_endpoints.index(current_endpoint)
        # new rolemaker here
        print("current_id: ", trainer_id)
        print("worker_endpoints: ", worker_endpoints)
        role = role_maker.UserDefinedCollectiveRoleMaker(
            current_id=trainer_id, worker_endpoints=worker_endpoints)
        # Fleet get role of each worker
        fleet.init(role)

    exe = fluid.Executor(place)

    # init program
    train_program = fluid.Program()
    startup_prog = fluid.Program()

    if args.random_seed != 0:
        print("set program random seed as: ", args.random_seed)
        startup_prog.random_seed = args.random_seed
        train_program.random_seed = args.random_seed

    task_name = args.task_name.lower()
    processors = {
        'xnli': reader.XnliProcessor,
        'cola': reader.ColaProcessor,
        'mrpc': reader.MrpcProcessor,
        'mnli': reader.MnliProcessor,
    }
    processor = processors[task_name](data_dir=args.data_dir,
                                      vocab_path=args.vocab_path,
                                      max_seq_len=args.max_seq_len,
                                      do_lower_case=args.do_lower_case,
                                      in_tokens=args.in_tokens,
                                      random_seed=args.random_seed)
    num_labels = len(processor.get_labels())
    dev_count = len(worker_endpoints)

    # we need to keep every trainer of fleet the same shuffle_seed
    print("shuffle_seed: ", args.shuffle_seed)
    self.train_data_generator = processor.data_generator(
        batch_size=args.batch_size,
        phase='train',
        epoch=args.epoch,
        dev_count=dev_count,
        dev_idx=0,
        shuffle=args.shuffle,
        shuffle_seed=args.shuffle_seed)

    num_train_examples = processor.get_num_examples(phase='train')
    max_train_steps = 5
    self.warmup_steps = int(5 * 0.1)

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.use_experimental_executor = args.use_fast_executor
    exec_strategy.num_threads = dev_count
    exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

    dist_strategy = DistributedStrategy()
    dist_strategy.exec_strategy = exec_strategy
    dist_strategy.nccl_comm_num = 3
    dist_strategy.use_hierarchical_allreduce = True
    # dist_strategy.mode = "collective"
    # dist_strategy.collective_mode = "grad_allreduce"

    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            self.train_pyreader, self.loss, probs, accuracy, num_seqs, checkpoints = create_model(
                args, bert_config=bert_config, num_labels=num_labels)
            scheduled_lr = optimization(loss=self.loss,
                                        warmup_steps=self.warmup_steps,
                                        num_train_steps=max_train_steps,
                                        learning_rate=args.learning_rate,
                                        train_program=train_program,
                                        startup_prog=startup_prog,
                                        weight_decay=args.weight_decay,
                                        scheduler=args.lr_scheduler,
                                        use_fp16=False,
                                        loss_scaling=args.loss_scaling,
                                        dist_strategy=dist_strategy)

    exe.run(startup_prog)
    with open("__model__", "wb") as f:
        f.write(fleet._origin_program.desc.serialize_to_string())
    with open("debug_program", "w") as f:
        f.write(str(fleet._origin_program))
    return self.loss
def main(args):
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    task_name = args.task_name.lower()
    processors = {
        'xnli': reader.XnliProcessor,
        'cola': reader.ColaProcessor,
        'mrpc': reader.MrpcProcessor,
        'mnli': reader.MnliProcessor,
    }

    processor = processors[task_name](data_dir=args.data_dir,
                                      vocab_path=args.vocab_path,
                                      max_seq_len=args.max_seq_len,
                                      do_lower_case=args.do_lower_case,
                                      in_tokens=args.in_tokens,
                                      random_seed=args.random_seed)
    num_labels = len(processor.get_labels())

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = processor.data_generator(
            batch_size=args.batch_size,
            phase='train',
            epoch=args.epoch,
            shuffle=True)

        num_train_examples = processor.get_num_examples(phase='train')

        # In token-budget mode the effective batch size is batch_size // max_seq_len.
        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        # Build the training program.
        train_program = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, loss, probs, accuracy, num_seqs = create_model(
                    args,
                    pyreader_name='train_reader',
                    bert_config=bert_config,
                    num_labels=num_labels)
                scheduled_lr = optimization(loss=loss,
                                            warmup_steps=warmup_steps,
                                            num_train_steps=max_train_steps,
                                            learning_rate=args.learning_rate,
                                            train_program=train_program,
                                            startup_prog=startup_prog,
                                            weight_decay=args.weight_decay,
                                            scheduler=args.lr_scheduler,
                                            use_fp16=args.use_fp16,
                                            loss_scaling=args.loss_scaling)

                fluid.memory_optimize(input_program=train_program,
                                      skip_opt_set=[
                                          loss.name, probs.name,
                                          accuracy.name, num_seqs.name
                                      ])

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        # Build the evaluation program and freeze it for inference.
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, loss, probs, accuracy, num_seqs = create_model(
                    args,
                    pyreader_name='test_reader',
                    bert_config=bert_config,
                    num_labels=num_labels)

        test_prog = test_prog.clone(for_test=True)

    exe.run(startup_prog)

    # Initialize parameters: a full checkpoint takes precedence over
    # pretraining-only parameters.
    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print("WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                  "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=loss.name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program)

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    if args.do_val or args.do_test:
        test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                          main_program=test_prog,
                                          share_vars_from=train_exe)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        total_cost, total_acc, total_num_seqs = [], [], []
        time_begin = time.time()
        while True:
            try:
                steps += 1
                # Only fetch metrics every skip_steps steps to reduce overhead.
                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        fetch_list = [loss.name, accuracy.name, num_seqs.name]
                    else:
                        fetch_list = [
                            loss.name, accuracy.name, scheduled_lr.name,
                            num_seqs.name
                        ]
                else:
                    fetch_list = []

                outputs = train_exe.run(fetch_list=fetch_list)

                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        np_loss, np_acc, np_num_seqs = outputs
                    else:
                        np_loss, np_acc, np_lr, np_num_seqs = outputs

                    total_cost.extend(np_loss * np_num_seqs)
                    total_acc.extend(np_acc * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size()
                        verbose += "learning rate: %f" % (
                            np_lr[0] if warmup_steps > 0 else args.learning_rate)
                        print(verbose)

                    current_example, current_epoch = processor.get_train_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print("epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                          "ave acc: %f, speed: %f steps/s" %
                          (current_epoch, current_example, num_train_examples,
                           steps, np.sum(total_cost) / np.sum(total_num_seqs),
                           np.sum(total_acc) / np.sum(total_num_seqs),
                           args.skip_steps / used_time))
                    total_cost, total_acc, total_num_seqs = [], [], []
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        test_pyreader.decorate_tensor_provider(
                            processor.data_generator(batch_size=args.batch_size,
                                                     phase='dev',
                                                     epoch=1,
                                                     shuffle=False))
                        evaluate(exe, test_prog, test_pyreader,
                                 [loss.name, accuracy.name, num_seqs.name],
                                 "dev")
                    # evaluate test set
                    if args.do_test:
                        test_pyreader.decorate_tensor_provider(
                            processor.data_generator(batch_size=args.batch_size,
                                                     phase='test',
                                                     epoch=1,
                                                     shuffle=False))
                        evaluate(exe, test_prog, test_pyreader,
                                 [loss.name, accuracy.name, num_seqs.name],
                                 "test")
            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        test_pyreader.decorate_tensor_provider(
            processor.data_generator(batch_size=args.batch_size,
                                     phase='dev',
                                     epoch=1,
                                     shuffle=False))
        print("Final validation result:")
        evaluate(exe, test_prog, test_pyreader,
                 [loss.name, accuracy.name, num_seqs.name], "dev")

    # final eval on test set
    if args.do_test:
        test_pyreader.decorate_tensor_provider(
            processor.data_generator(batch_size=args.batch_size,
                                     phase='test',
                                     epoch=1,
                                     shuffle=False))
        print("Final test result:")
        evaluate(exe, test_prog, test_pyreader,
                 [loss.name, accuracy.name, num_seqs.name], "test")
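# The evaluate() helper called above is defined elsewhere in the repo. A minimal
# sketch of one plausible implementation, assuming the fetch_list order
# [loss, accuracy, num_seqs] used at the call sites and the same module-level
# fluid/np/time imports; named evaluate_sketch to avoid claiming it is the original.
def evaluate_sketch(exe, test_program, test_pyreader, fetch_list, eval_phase):
    test_pyreader.start()
    total_cost, total_acc, total_num_seqs = [], [], []
    time_begin = time.time()
    while True:
        try:
            np_loss, np_acc, np_num_seqs = exe.run(program=test_program,
                                                   fetch_list=fetch_list)
            total_cost.extend(np_loss * np_num_seqs)  # sequence-weighted loss
            total_acc.extend(np_acc * np_num_seqs)    # sequence-weighted accuracy
            total_num_seqs.extend(np_num_seqs)
        except fluid.core.EOFException:
            test_pyreader.reset()
            break
    time_end = time.time()
    print("[%s evaluation] ave loss: %f, ave acc: %f, elapsed time: %f s" %
          (eval_phase, np.sum(total_cost) / np.sum(total_num_seqs),
           np.sum(total_acc) / np.sum(total_num_seqs), time_end - time_begin))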
def train(args):
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if not (args.do_train or args.do_predict):
        raise ValueError("For args `do_train` and `do_predict`, at "
                         "least one of them must be True.")

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    processor = DataProcessor(vocab_path=args.vocab_path,
                              do_lower_case=args.do_lower_case,
                              max_seq_length=args.max_seq_len,
                              in_tokens=args.in_tokens,
                              doc_stride=args.doc_stride,
                              max_query_length=args.max_query_length)

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = processor.data_generator(
            data_path=args.train_file,
            batch_size=args.batch_size,
            phase='train',
            shuffle=True,
            dev_count=dev_count,
            version_2_with_negative=args.version_2_with_negative,
            epoch=args.epoch)

        num_train_examples = processor.get_num_examples(phase='train')
        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size) // dev_count
        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        # Build the training program with dynamic loss scaling for fp16.
        train_program = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_data_loader, loss, num_seqs = create_model(
                    bert_config=bert_config, is_training=True)
                scheduled_lr, loss_scaling = optimization(
                    loss=loss,
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio)

    if args.do_predict:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_data_loader, unique_ids, start_logits, end_logits, num_seqs = create_model(
                    bert_config=bert_config, is_training=False)

        test_prog = test_prog.clone(for_test=True)

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print("WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                  "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_predict:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing prediction!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_compiled_program = fluid.CompiledProgram(
            train_program).with_data_parallel(loss_name=loss.name,
                                              exec_strategy=exec_strategy)
        train_data_loader.set_batch_generator(train_data_generator, place)

        train_data_loader.start()
        steps = 0
        total_cost, total_num_seqs = [], []
        time_begin = time.time()
        while steps < max_train_steps:
            try:
                steps += 1
                # Only fetch metrics every skip_steps steps to reduce overhead.
                if steps % args.skip_steps == 0:
                    if args.use_fp16:
                        fetch_list = [
                            loss.name, scheduled_lr.name, num_seqs.name,
                            loss_scaling.name
                        ]
                    else:
                        fetch_list = [loss.name, scheduled_lr.name, num_seqs.name]
                else:
                    fetch_list = []

                outputs = exe.run(train_compiled_program, fetch_list=fetch_list)

                if steps % args.skip_steps == 0:
                    if args.use_fp16:
                        np_loss, np_lr, np_num_seqs, np_scaling = outputs
                    else:
                        np_loss, np_lr, np_num_seqs = outputs
                    total_cost.extend(np_loss * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train data_loader queue size: %d, " % train_data_loader.queue.size()
                        verbose += "learning rate: %f" % np_lr[0]
                        if args.use_fp16:
                            verbose += ", loss scaling: %f" % np_scaling[0]
                        print(verbose)

                    time_end = time.time()
                    used_time = time_end - time_begin
                    current_example, epoch = processor.get_train_progress()
                    print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                          "speed: %f steps/s" %
                          (epoch, current_example, num_train_examples, steps,
                           np.sum(total_cost) / np.sum(total_num_seqs),
                           args.skip_steps / used_time))
                    total_cost, total_num_seqs = [], []
                    time_begin = time.time()

                if steps % args.save_steps == 0 or steps == max_train_steps:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.save(program=train_program, model_path=save_path)
            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps) + "_final")
                fluid.save(program=train_program, model_path=save_path)
                train_data_loader.reset()
                break

    if args.do_predict:
        test_data_loader.set_batch_generator(
            processor.data_generator(data_path=args.predict_file,
                                     batch_size=args.batch_size,
                                     phase='predict',
                                     shuffle=False,
                                     dev_count=1,
                                     epoch=1), place)

        predict(exe, test_prog, test_data_loader, [
            unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
        ], processor)
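# The checkpoints above are written with fluid.save(), so a later prediction-only
# run can restore them with the matching fluid.load() API instead of init_checkpoint.
# A minimal sketch, assuming the same test_prog/exe as built in train() and a
# hypothetical step number in the checkpoint path:
#
#     restore_path = os.path.join(args.checkpoints, "step_1000")  # hypothetical
#     fluid.load(program=test_prog, model_path=restore_path, executor=exe)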
def main(args):
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    task_name = args.task_name.lower()
    processors = {
        'xnli': reader.XnliProcessor,
        'cola': reader.ColaProcessor,
        'mrpc': reader.MrpcProcessor,
        'mnli': reader.MnliProcessor,
    }

    processor = processors[task_name](data_dir=args.data_dir,
                                      vocab_path=args.vocab_path,
                                      max_seq_len=args.max_seq_len,
                                      do_lower_case=args.do_lower_case,
                                      in_tokens=False)
    num_labels = len(processor.get_labels())

    # Build the prediction program and freeze it for inference.
    predict_prog = fluid.Program()
    predict_startup = fluid.Program()
    with fluid.program_guard(predict_prog, predict_startup):
        with fluid.unique_name.guard():
            predict_pyreader, probs, feed_target_names = create_model(
                args,
                bert_config=bert_config,
                num_labels=num_labels,
                is_prediction=True)

    predict_prog = predict_prog.clone(for_test=True)

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    exe = fluid.Executor(place)
    exe.run(predict_startup)

    if args.init_checkpoint:
        init_pretraining_params(exe, args.init_checkpoint, predict_prog,
                                args.use_fp16)
    else:
        raise ValueError("args 'init_checkpoint' should be set for prediction!")

    # ParallelExecutor drops small batches (usually the last one) by design, so it
    # may leave some examples unpredicted. If a prediction is needed for each and
    # every example, use Executor instead.
    predict_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                         main_program=predict_prog)

    predict_pyreader.decorate_batch_generator(
        processor.data_generator(batch_size=args.batch_size,
                                 phase='test',
                                 epoch=1,
                                 shuffle=False))

    predict_pyreader.start()
    all_results = []
    time_begin = time.time()
    while True:
        try:
            results = predict_exe.run(fetch_list=[probs.name])
            all_results.extend(results[0])
        except fluid.core.EOFException:
            predict_pyreader.reset()
            break
    time_end = time.time()

    np.set_printoptions(precision=4, suppress=True)
    print("-------------- prediction results --------------")
    print("example_id\t" + ' '.join(processor.get_labels()))
    for index, result in enumerate(all_results):
        print(str(index) + '\t{}'.format(result))

    if args.save_inference_model_path:
        _, ckpt_dir = os.path.split(args.init_checkpoint.rstrip('/'))
        dir_name = ckpt_dir + '_inference_model'
        model_path = os.path.join(args.save_inference_model_path, dir_name)
        print("save inference model to %s" % model_path)
        fluid.io.save_inference_model(model_path,
                                      feed_target_names, [probs],
                                      exe,
                                      main_program=predict_prog)
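# The directory written by fluid.io.save_inference_model() above can later be
# reloaded without rebuilding the network, via the matching
# fluid.io.load_inference_model() API. A minimal sketch (the feed dict contents
# are task-specific and omitted here):
infer_exe = fluid.Executor(fluid.CPUPlace())
[infer_prog, feed_names, fetch_targets] = fluid.io.load_inference_model(
    model_path, infer_exe)
# results = infer_exe.run(infer_prog, feed={...}, fetch_list=fetch_targets)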
def train(args):
    print("pretraining start")
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    # Build the training program.
    train_program = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            train_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                pyreader_name='train_reader', bert_config=bert_config)
            scheduled_lr = optimization(loss=total_loss,
                                        warmup_steps=args.warmup_steps,
                                        num_train_steps=args.num_train_steps,
                                        learning_rate=args.learning_rate,
                                        train_program=train_program,
                                        startup_prog=startup_prog,
                                        weight_decay=args.weight_decay,
                                        scheduler=args.lr_scheduler,
                                        use_fp16=args.use_fp16,
                                        loss_scaling=args.loss_scaling)

            fluid.memory_optimize(input_program=train_program,
                                  skip_opt_set=[
                                      next_sent_acc.name, mask_lm_loss.name,
                                      total_loss.name
                                  ])

    # Build the validation program and freeze it for inference.
    test_prog = fluid.Program()
    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            test_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                pyreader_name='test_reader', bert_config=bert_config)

    test_prog = test_prog.clone(for_test=True)

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    print("Device count %d" % dev_count)
    if args.verbose:
        if args.in_tokens:
            lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                program=train_program,
                batch_size=args.batch_size // args.max_seq_len)
        else:
            lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                program=train_program, batch_size=args.batch_size)
        print("Theoretical memory usage in training: %.3f - %.3f %s" %
              (lower_mem, upper_mem, unit))

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    print("args.is_distributed:", args.is_distributed)
    if args.is_distributed:
        # Derive the cluster topology from environment variables.
        worker_endpoints_env = os.getenv("worker_endpoints")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)
        current_endpoint = os.getenv("current_endpoint")
        trainer_id = worker_endpoints.index(current_endpoint)
        if trainer_id == 0:
            print("trainer_id == 0, sleep 60s")
            time.sleep(60)
        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} "
              "trainer_id:{}".format(worker_endpoints, trainers_num,
                                     current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(trainer_id,
                    trainers=worker_endpoints_env,
                    current_endpoint=current_endpoint,
                    program=train_program,
                    startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.init_checkpoint and args.init_checkpoint != "":
        init_checkpoint(exe, args.init_checkpoint, train_program, args.use_fp16)

    data_reader = DataReader(data_dir=args.data_dir,
                             batch_size=args.batch_size,
                             in_tokens=args.in_tokens,
                             vocab_path=args.vocab_path,
                             voc_size=bert_config['vocab_size'],
                             epoch=args.epoch,
                             max_seq_len=args.max_seq_len,
                             generate_neg_sample=args.generate_neg_sample)

    exec_strategy = fluid.ExecutionStrategy()
    if args.use_fast_executor:
        exec_strategy.use_experimental_executor = True
    exec_strategy.num_threads = dev_count

    build_strategy = fluid.BuildStrategy()
    build_strategy.remove_unnecessary_lock = False

    train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                       loss_name=total_loss.name,
                                       build_strategy=build_strategy,
                                       exec_strategy=exec_strategy,
                                       main_program=train_program,
                                       num_trainers=nccl2_num_trainers,
                                       trainer_id=nccl2_trainer_id)

    if args.validation_set_dir and args.validation_set_dir != "":
        predict = predict_wrapper(args,
                                  exe,
                                  bert_config,
                                  test_prog=test_prog,
                                  pyreader=test_pyreader,
                                  fetch_list=[
                                      next_sent_acc.name, mask_lm_loss.name,
                                      total_loss.name
                                  ])

    train_pyreader.decorate_tensor_provider(data_reader.data_generator())
    train_pyreader.start()
    steps = 0
    cost = []
    lm_cost = []
    acc = []
    time_begin = time.time()
    while steps < args.num_train_steps:
        try:
            # Each iteration advances all trainers by one step.
            steps += nccl2_num_trainers
            skip_steps = args.skip_steps * nccl2_num_trainers

            # Only trainer 0 fetches and logs metrics.
            if nccl2_trainer_id != 0:
                train_exe.run(fetch_list=[])
                continue

            if steps % skip_steps != 0:
                train_exe.run(fetch_list=[])
            else:
                each_next_acc, each_mask_lm_cost, each_total_cost, np_lr = train_exe.run(
                    fetch_list=[
                        next_sent_acc.name, mask_lm_loss.name,
                        total_loss.name, scheduled_lr.name
                    ])
                acc.extend(each_next_acc)
                lm_cost.extend(each_mask_lm_cost)
                cost.extend(each_total_cost)

                print("feed_queue size", train_pyreader.queue.size())
                time_end = time.time()
                used_time = time_end - time_begin
                epoch, current_file_index, total_file, current_file = data_reader.get_progress()
                print("current learning_rate:%f" % np_lr[0])
                print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                      "ppl: %f, next_sent_acc: %f, speed: %f steps/s, file: %s" %
                      (epoch, current_file_index, total_file, steps,
                       np.mean(np.array(cost)),
                       np.mean(np.exp(np.array(lm_cost))),
                       np.mean(np.array(acc)), skip_steps / used_time,
                       current_file))
                cost = []
                lm_cost = []
                acc = []
                time_begin = time.time()

            if steps % args.save_steps == 0:
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)

            if args.validation_set_dir and steps % args.validation_steps == 0:
                vali_cost, vali_lm_cost, vali_acc, vali_steps, vali_speed = predict()
                print("[validation_set] epoch: %d, step: %d, "
                      "loss: %f, global ppl: %f, batch-averaged ppl: %f, "
                      "next_sent_acc: %f, speed: %f steps/s" %
                      (epoch, steps,
                       np.mean(np.array(vali_cost) / vali_steps),
                       np.exp(np.mean(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.exp(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.array(vali_acc) / vali_steps), vali_speed))
        except fluid.core.EOFException:
            train_pyreader.reset()
            break
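# The is_distributed branch above derives the nccl2 topology purely from the
# worker_endpoints / current_endpoint environment variables. A minimal
# single-node, two-GPU launch sketch (script name, ports, and flag spelling are
# illustrative placeholders, not taken from the original):
#
#   export worker_endpoints=127.0.0.1:9184,127.0.0.1:9185
#   CUDA_VISIBLE_DEVICES=0 current_endpoint=127.0.0.1:9184 python train.py --is_distributed true &
#   CUDA_VISIBLE_DEVICES=1 current_endpoint=127.0.0.1:9185 python train.py --is_distributed true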
from collections import defaultdict
# import pickle
from tensorflow.contrib import learn
from sklearn import metrics
import jieba
from hanziconv import HanziConv
# from evaluation import *
import code  # stdlib interactive console, presumably kept for debugging

# Parameters
# ==================================================

# Data loading params
# config_parm = RNNConfig()
# config_parm = ATTConfig()
config_parm = BertConfig()

# Load data (prepare and load the dataset)
print("Loading data...")
# x_text, y = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)
x_text, y, y_onehot = data_helpers.load_data_and_labels(config_parm,
                                                        is_training=True)

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])  # length of the longest document, in tokens
# print("max length:", max_document_length)
words = " ".join(x_text).split()
vocab_size = len(list(set(" ".join(x_text).split())))
data, count, dictionary, rev_dictionary = data_helpers.build_dataset(words, vocab_size)
GO = dictionary['GO']
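# build_dataset() comes from the data_helpers module; a minimal sketch of the
# conventional word2vec-style implementation it appears to follow (rank tokens by
# frequency and map each one to an integer id). This is an assumption about its
# behavior, and the real helper must additionally insert special tokens such as
# 'GO', since the code above indexes dictionary['GO'].
from collections import Counter

def build_dataset_sketch(words, n_words):
    count = Counter(words).most_common(n_words)          # [(word, freq), ...]
    dictionary = {w: i for i, (w, _) in enumerate(count)}  # word -> id
    data = [dictionary[w] for w in words]                # corpus as id sequence
    rev_dictionary = {i: w for w, i in dictionary.items()}
    return data, count, dictionary, rev_dictionary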
def train(args):
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if not (args.do_train or args.do_predict):
        raise ValueError("For args `do_train` and `do_predict`, at "
                         "least one of them must be True.")

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    processor = DataProcessor(vocab_path=args.vocab_path,
                              do_lower_case=args.do_lower_case,
                              max_seq_length=args.max_seq_len,
                              in_tokens=args.in_tokens,
                              doc_stride=args.doc_stride,
                              max_query_length=args.max_query_length)

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        build_strategy = fluid.BuildStrategy()
        print("estimating runtime number of examples...")
        num_train_examples = processor.estimate_runtime_examples(
            args.train_file, sample_rate=args.sample_rate)
        print("runtime number of examples:")
        print(num_train_examples)

        train_data_generator = processor.data_generator(
            data_path=args.train_file,
            batch_size=args.batch_size,
            max_len=args.max_seq_len,
            phase='train',
            shuffle=True,
            dev_count=dev_count,
            with_negative=args.with_negative,
            epoch=args.epoch)

        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size) // dev_count
        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        # Build the training program, with an exponential moving average (EMA)
        # of the parameters updated every step.
        train_program = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, loss, num_seqs = create_model(
                    pyreader_name='train_reader',
                    bert_config=bert_config,
                    is_training=True)
                train_pyreader.decorate_tensor_provider(train_data_generator)
                scheduled_lr = optimization(loss=loss,
                                            warmup_steps=warmup_steps,
                                            num_train_steps=max_train_steps,
                                            learning_rate=args.learning_rate,
                                            train_program=train_program,
                                            startup_prog=startup_prog,
                                            weight_decay=args.weight_decay,
                                            scheduler=args.lr_scheduler,
                                            use_fp16=args.use_fp16,
                                            loss_scaling=args.loss_scaling)

                loss.persistable = True
                num_seqs.persistable = True

                ema = fluid.optimizer.ExponentialMovingAverage(args.ema_decay)
                ema.update()

        train_compiled_program = fluid.CompiledProgram(
            train_program).with_data_parallel(loss_name=loss.name,
                                              build_strategy=build_strategy)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_predict:
        build_strategy = fluid.BuildStrategy()
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, unique_ids, start_logits, end_logits, num_seqs = create_model(
                    pyreader_name='test_reader',
                    bert_config=bert_config,
                    is_training=False)

                # Reuse the EMA object from the training branch if it was built
                # in this scope; otherwise create one for prediction only.
                if 'ema' not in dir():
                    ema = fluid.optimizer.ExponentialMovingAverage(
                        args.ema_decay)

                unique_ids.persistable = True
                start_logits.persistable = True
                end_logits.persistable = True
                num_seqs.persistable = True

        test_prog = test_prog.clone(for_test=True)
        test_compiled_program = fluid.CompiledProgram(
            test_prog).with_data_parallel(build_strategy=build_strategy)

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print("WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                  "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_predict:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing prediction!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        total_cost, total_num_seqs = [], []
        time_begin = time.time()
        while True:
            try:
                steps += 1
                # Only fetch metrics every skip_steps steps to reduce overhead.
                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        fetch_list = [loss.name, num_seqs.name]
                    else:
                        fetch_list = [loss.name, scheduled_lr.name, num_seqs.name]
                else:
                    fetch_list = []

                outputs = exe.run(train_compiled_program, fetch_list=fetch_list)

                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        np_loss, np_num_seqs = outputs
                    else:
                        np_loss, np_lr, np_num_seqs = outputs
                    total_cost.extend(np_loss * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size()
                        verbose += "learning rate: %f" % (
                            np_lr[0] if warmup_steps > 0 else args.learning_rate)
                        print(verbose)

                    time_end = time.time()
                    used_time = time_end - time_begin
                    current_example, epoch = processor.get_train_progress()
                    print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                          "speed: %f steps/s" %
                          (epoch, current_example, num_train_examples, steps,
                           np.sum(total_cost) / np.sum(total_num_seqs),
                           args.skip_steps / used_time))
                    total_cost, total_num_seqs = [], []
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps == max_train_steps:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps) + "_final")
                    fluid.io.save_persistables(exe, save_path, train_program)
                    break
            # Broad catch: this also covers the pyreader's EOF, so a final
            # checkpoint is saved on any interruption of the training loop.
            except Exception:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps) + "_final")
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    if args.do_predict:
        test_pyreader.decorate_tensor_provider(
            processor.data_generator(data_path=args.predict_file,
                                     batch_size=args.batch_size,
                                     max_len=args.max_seq_len,
                                     phase='predict',
                                     shuffle=False,
                                     dev_count=dev_count,
                                     epoch=1))

        if args.use_ema:
            # Temporarily swap in the moving-average parameters for prediction.
            with ema.apply(exe):
                predict(exe, test_compiled_program, test_pyreader, [
                    unique_ids.name, start_logits.name, end_logits.name,
                    num_seqs.name
                ], processor, prefix='ema_')
        else:
            predict(exe, test_compiled_program, test_pyreader, [
                unique_ids.name, start_logits.name, end_logits.name,
                num_seqs.name
            ], processor)
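# Note on the EMA flow above: ema.update() inserts ops into train_program so that
# a shadow moving-average copy of each parameter is refreshed every training step,
# and at prediction time `with ema.apply(exe):` temporarily swaps the averaged
# values in, restoring the raw parameters on exit. A minimal sketch of the same
# pattern in isolation (run_eval is a hypothetical placeholder):
#
#     ema = fluid.optimizer.ExponentialMovingAverage(0.9999)
#     ema.update()           # after optimizer.minimize(loss), inside the program guard
#     ...
#     with ema.apply(exe):   # evaluation sees the averaged weights
#         run_eval()
#     # raw weights are restored here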