def eval_all():
    # output_model_file = "../../output/best_model"
    output_model_file = MODEL_PATH
    output_config_file = os.path.join('../model_dir/', args.config_name)
    config = BertConfig(output_config_file)
    model = BertForQuestionAnswering(config)
    if not args.no_pai:
        try:
            model.load_state_dict(torch.load(output_model_file))
        except RuntimeError:
            # Checkpoint was saved from nn.DataParallel; wrap the model and retry.
            model = nn.DataParallel(model)
            model.load_state_dict(torch.load(output_model_file))
    else:
        try:
            model.load_state_dict(torch.load(output_model_file, map_location='cpu'))
        except RuntimeError:
            model = nn.DataParallel(model)
            model.load_state_dict(torch.load(output_model_file, map_location='cpu'))
    result_file_path = os.path.join('../metric', args.result_file_name)
    evaluate(model, result_file=result_file_path)
    if not args.no_pai:
        print(os.getcwd())
        pai_file_output = "/Container/thsi_yicui/dureader-bert/Dureader/output"
        client.upload(pai_file_output, result_file_path, overwrite=True)
def eval_all():
    output_model_file = "../model_dir/best_model"
    output_config_file = "../model_dir/bert_config.json"
    config = BertConfig(output_config_file)
    model = BertForQuestionAnswering(config)
    model.load_state_dict(torch.load(output_model_file))  # pass map_location='cpu' when loading on a CPU-only machine
    evaluate(model.cpu(), result_file="../metric/predicts.json")
def __init__(self, model_path, lower_case=True):
    self.model_path = model_path
    self.tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=lower_case)
    self.model = BertForQuestionAnswering.from_pretrained(model_path)
    # self.model.cuda()
    self.model.eval()
def __init__(self, model_path, lower_case=True):
    self.model_path = model_path
    self.tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=lower_case)
    self.model = BertForQuestionAnswering.from_pretrained(model_path)
    self.device = torch.device("cuda")  # assumes a CUDA device is available
    self.model.to(self.device)
    self.model.eval()
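A minimal inference sketch to accompany the two initializers above, written against the same pytorch-pretrained-bert style API the surrounding snippets use. The method name `predict`, the greedy argmax decoding, and the character join are illustrative assumptions, not part of the original code.

def predict(self, question, context):
    # Hypothetical companion method to the __init__ above (assumed, not from the original).
    q_tokens = self.tokenizer.tokenize(question)
    c_tokens = self.tokenizer.tokenize(context)
    tokens = ['[CLS]'] + q_tokens + ['[SEP]'] + c_tokens + ['[SEP]']
    segment_ids = [0] * (len(q_tokens) + 2) + [1] * (len(c_tokens) + 1)
    input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    input_ids = torch.tensor([input_ids], dtype=torch.long, device=self.device)
    segment_ids = torch.tensor([segment_ids], dtype=torch.long, device=self.device)
    input_mask = torch.tensor([input_mask], dtype=torch.long, device=self.device)
    with torch.no_grad():
        start_logits, end_logits = self.model(input_ids, segment_ids, input_mask)
    start = int(start_logits[0].argmax())
    end = int(end_logits[0].argmax())
    # Greedy span decode; production code should enforce start <= end and skip special tokens.
    return ''.join(tokens[start:end + 1])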
def eval_all():
    # output_model_file = "../../output/best_model"
    output_model_file = MODEL_PATH
    output_config_file = CONFIG_PATH
    config = BertConfig(output_config_file)
    model = BertForQuestionAnswering(config)
    # A freshly constructed model always lives on CPU, so branch on CUDA availability
    # rather than on next(model.parameters()).is_cuda, which would always be False here.
    if torch.cuda.is_available():
        try:
            model.load_state_dict(torch.load(output_model_file))
        except RuntimeError:
            model = nn.DataParallel(model)
            model.load_state_dict(torch.load(output_model_file))
    else:
        try:
            model.load_state_dict(torch.load(output_model_file, map_location='cpu'))
        except RuntimeError:
            model = nn.DataParallel(model)
            model.load_state_dict(torch.load(output_model_file, map_location='cpu'))
    result_file_path = os.path.join('../metric', args.result_file_name)
    evaluate(model, result_file=result_file_path)
def initialize_model(args):
    '''Return model, ready to trace.'''
    config = BertConfig.from_json_file(args.config_file)
    # Pad the vocabulary so its size is divisible by 8 (friendlier to tensor-core kernels).
    if config.vocab_size % 8 != 0:
        config.vocab_size += 8 - (config.vocab_size % 8)
    model = BertForQuestionAnswering(config)
    model.enable_apex(False)
    state_dict = torch.load(args.checkpoint, map_location='cpu')["model"]
    model.load_state_dict(state_dict)
    if args.fp16:
        model.half()
    return model
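Since initialize_model documents itself as returning a model "ready to trace", a tracing sketch may be useful. The sequence length, dummy-input construction, and output path below are assumptions, not part of the original:

# Hypothetical tracing sketch for initialize_model above; shapes and paths are assumed.
model = initialize_model(args)
model.eval()
seq_len = 384  # assumed; match the sequence length used at inference time
input_ids = torch.zeros(1, seq_len, dtype=torch.long)
segment_ids = torch.zeros(1, seq_len, dtype=torch.long)
input_mask = torch.ones(1, seq_len, dtype=torch.long)
traced = torch.jit.trace(model, (input_ids, segment_ids, input_mask))
torch.jit.save(traced, "bert_qa_traced.pt")  # assumed output path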
from collections import OrderedDict

def eval_all():
    output_model_file = "../model_dir/best_model"
    output_config_file = "../model_dir/bert_configbase.json"
    config = BertConfig(output_config_file)
    model = BertForQuestionAnswering(config)
    # Loading a checkpoint saved from multi-GPU (DataParallel) training:
    state_dict = torch.load(output_model_file, map_location='cuda:0')
    # Build a new dict, stripping the 'module.' prefix from keys when present.
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        new_state_dict[k.replace('module.', '')] = v
    model.load_state_dict(new_state_dict)
    evaluate(model.cpu(), result_file="../metric/predicts_dev.json")
def get_predictor_model(cls):
    config = BertConfig.from_json_file(config_file)
    model = BertForQuestionAnswering(config)
    model.load_state_dict(torch.load(MODEL_PATH, map_location='cpu')["model"])
    model.to(device)
    cls.model = model
    return cls.model
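Note that get_predictor_model expects the checkpoint to be a dict with a "model" key rather than a bare state_dict. A sketch of the writer side of that format; the extra "step" field is an assumption:

# Hypothetical save call producing the {"model": ...} checkpoint loaded above.
torch.save({"model": model.state_dict(), "step": global_step}, MODEL_PATH)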
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--bert_config_file", default=None, type=str, required=True,
                        help="The config json file corresponding to the pre-trained BERT model. "
                             "This specifies the model architecture.")
    parser.add_argument("--vocab_file", default=None, type=str, required=True,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")

    # Other parameters
    parser.add_argument("--train_file", default=None, type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument("--predict_file", default=None, type=str,
                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument("--init_checkpoint", default=None, type=str,
                        help="Init from checkpoint")
    parser.add_argument("--init_full_model", default=None, type=str,
                        help="Initial full model")
    parser.add_argument("--do_lower_case", default=True, action='store_true',
                        help="Whether to lower case the input text. Should be True for uncased "
                             "models and False for cased models.")
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. "
                             "Sequences longer than this will be truncated, and sequences shorter "
                             "than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to "
                             "take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than "
                             "this will be truncated to this length.")
    parser.add_argument("--do_train", default=False, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict", default=False, action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=8, type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.0, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--save_checkpoints_steps", default=2000, type=int,
                        help="How often to save the model checkpoint")
    parser.add_argument("--iterations_per_loop", default=1000, type=int,
                        help="How many steps to make in each estimator call.")
    parser.add_argument("--n_best_size", default=3, type=int,
                        help="The total number of n-best predictions to generate in the "
                             "nbest_predictions.json output file.")
    parser.add_argument("--max_answer_length", default=100, type=int,
                        help="The maximum length of an answer that can be generated. This is needed "
                             "because the start and end predictions are not conditioned on one another.")
    parser.add_argument("--use_history", default=False, action='store_true',
                        help="Use history features")
    parser.add_argument("--verbose_logging", default=False, action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--seed", type=int, default=1,
                        help="random seed for initialization")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                        help="Number of updates steps to accumulate before performing a "
                             "backward/update pass.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--optimize_on_cpu", default=False, action='store_true',
                        help="Whether to perform optimization and keep the optimizer averages on CPU")
    parser.add_argument("--fp16", default=False, action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument("--loss_scale", type=float, default=128,
                        help="Loss scaling, positive power of 2 values can improve fp16 convergence.")
    parser.add_argument("--dry_run", action='store_true', default=False,
                        help="Don't load model, just load data")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        if args.fp16:
            logger.info("16-bits training currently not supported in distributed training")
            args.fp16 = False  # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))
    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError("At least one of `do_train` or `do_predict` must be True.")
    if args.do_train and not args.train_file:
        raise ValueError("If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict and not args.predict_file:
        raise ValueError("If `do_predict` is True, then `predict_file` must be specified.")

    bert_config = BertConfig.from_json_file(args.bert_config_file)
    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        print("Warning: output directory {} already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = bdu.read_coqa_examples(input_file=args.train_file, is_training=True)
        real_train_example_len = sum(len(ex['questions']) for ex in train_examples)
        num_train_steps = int(real_train_example_len / args.train_batch_size /
                              args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    if not args.dry_run:
        if args.init_full_model is not None:
            model_state_dict = torch.load(args.init_full_model)
            model = BertForQuestionAnswering.from_pretrained(args.bert_model, state_dict=model_state_dict)
        else:
            model = BertForQuestionAnswering(bert_config, use_history=args.use_history)
            if args.init_checkpoint is not None:
                model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
        if args.fp16:
            model.half()
        model.to(device)
        if args.local_rank != -1:
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                              output_device=args.local_rank)
        elif n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # Prepare optimizer
        if args.fp16:
            param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_())
                               for n, param in model.named_parameters()]
        elif args.optimize_on_cpu:
            param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_())
                               for n, param in model.named_parameters()]
        else:
            param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0}
        ]
        optimizer = BERTAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps)

    global_step = 0
    if args.do_train:
        train_features = bdu.convert_coqa_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True)
        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", real_train_example_len)
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
        all_f_history = torch.tensor([f.f_history for f in train_features], dtype=torch.uint8)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                   all_start_positions, all_end_positions, all_f_history)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for epoch_i in trange(int(args.num_train_epochs), desc="Epoch"):
            running_loss = []
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                if n_gpu == 1:
                    batch = tuple(t.to(device) for t in batch)  # multi-gpu does scattering itself
                input_ids, input_mask, segment_ids, start_positions, end_positions, f_history = batch
                f_history_32 = f_history.float()  # convert to float here
                loss = model(input_ids, segment_ids, input_mask,
                             start_positions=start_positions, end_positions=end_positions,
                             f_history=f_history_32, debug=True)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu
                if args.fp16 and args.loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * args.loss_scale
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                running_loss.append(loss.item())
                if step % 40 == 0:
                    logger.info("epoch {} step {}: avg loss {}".format(
                        epoch_i, step, sum(running_loss) / len(running_loss)))
                    running_loss = []
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16 or args.optimize_on_cpu:
                        if args.fp16 and args.loss_scale != 1.0:
                            # scale down gradients for fp16 training
                            for param in model.parameters():
                                param.grad.data = param.grad.data / args.loss_scale
                        is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
                        if is_nan:
                            logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
                            args.loss_scale = args.loss_scale / 2
                            model.zero_grad()
                            continue
                        optimizer.step()
                        copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
                    else:
                        optimizer.step()
                    model.zero_grad()
                    global_step += 1
                    if global_step % (args.save_checkpoints_steps // args.train_batch_size) == 0:
                        # Save a trained model checkpoint
                        model_name = os.path.join(args.output_dir, 'model-{}.pth'.format(global_step))
                        model_to_save = model.module if hasattr(model, 'module') else model  # only save the model itself
                        print("Step {}: saving model to {}".format(global_step, model_name))
                        torch.save(model_to_save.state_dict(), model_name)
        # Save the final model
        model_name = os.path.join(args.output_dir, 'model-{}.pth'.format(global_step))
        print("Step {}: saving model to {}".format(global_step, model_name))
        model_to_save = model.module if hasattr(model, 'module') else model  # only save the model itself
        torch.save(model_to_save.state_dict(), model_name)

    if args.do_predict:
        eval_examples = bdu.read_coqa_examples(input_file=args.predict_file, is_training=False)
        eval_features = bdu.convert_coqa_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)
        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.predict_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        all_f_history = torch.tensor([f.f_history for f in eval_features], dtype=torch.uint8)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                  all_example_index, all_f_history)
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices, f_history in tqdm(eval_dataloader, desc="Evaluating"):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            f_history = f_history.to(device)
            with torch.no_grad():
                f_history_32 = f_history.float()
                batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask,
                                                             f_history=f_history_32)
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(RawResult(unique_id=unique_id,
                                             coqa_id=eval_feature.coqa_id,
                                             turn_id=eval_feature.turn_id,
                                             start_logits=start_logits,
                                             end_logits=end_logits))
        output_prediction_file = os.path.join(args.output_dir, "predictions.json")
        output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
        bdu.write_predictions(eval_examples, eval_features, all_results,
                              args.n_best_size, args.max_answer_length, args.do_lower_case,
                              output_prediction_file, output_nbest_file, args.verbose_logging)
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--bert_model", default="bert-base-cased", type=str,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--output_dir", default="/Users/lifuh/Documents/Research/squad2.0/output/", type=str,
                        help="The output directory where the model checkpoints and predictions will be written.")

    # Other parameters
    parser.add_argument("--train_file", default="/Users/lifuh/Documents/Research/squad2.0/train-v2.0.json",
                        type=str, help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument("--predict_file", default="/Users/lifuh/Documents/Research/squad2.0/dev-v2.0.json",
                        type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument("--do_train", default=False, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict", default=True, action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. "
                             "Sequences longer than this will be truncated, and sequences shorter "
                             "than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to "
                             "take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than "
                             "this will be truncated to this length.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=8, type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the "
                             "nbest_predictions.json output file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is needed "
                             "because the start and end predictions are not conditioned on one another.")
    parser.add_argument("--verbose_logging", default=False, action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--seed", type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                        help="Number of updates steps to accumulate before performing a "
                             "backward/update pass.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Whether to lower case the input text. True for uncased models, "
                             "False for cased models.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--fp16", default=False, action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument("--loss_scale", type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--null_score_diff_threshold", type=float, default=0.0,
                        help="If null_score - best_non_null is greater than the threshold predict null.")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))
    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError("At least one of `do_train` or `do_predict` must be True.")
    if args.do_train and not args.train_file:
        raise ValueError("If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict and not args.predict_file:
        raise ValueError("If `do_predict` is True, then `predict_file` must be specified.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory {} already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    # initialize a tokenizer from a pretrained model
    tokenizer = BertTokenizer.from_pretrained(args.bert_model)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = read_squad_examples(input_file=args.train_file, is_training=True)
        num_train_steps = int(len(train_examples) / args.train_batch_size /
                              args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = BertForQuestionAnswering.from_pretrained(
        args.bert_model,
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank))
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    # hack to remove the pooler, which is not used and would otherwise
    # produce None grads that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    global_step = 0
    if args.do_train:
        cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format(
            args.bert_model, str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length))
        train_features = None
        try:
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        except FileNotFoundError:
            train_features = convert_examples_to_features(
                examples=train_examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride,
                max_query_length=args.max_query_length,
                is_training=True)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving train features into cached file %s", cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)
        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
        all_is_impossibles = torch.tensor([int(f.is_impossible) for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                   all_start_positions, all_end_positions, all_is_impossibles)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                if n_gpu == 1:
                    batch = tuple(t.to(device) for t in batch)  # multi-gpu does scattering itself
                input_ids, input_mask, segment_ids, start_positions, end_positions, _ = batch
                loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    # Save a trained model
    model_to_save = model.module if hasattr(model, 'module') else model  # only save the model itself
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    torch.save(model_to_save.state_dict(), output_model_file)

    # Load a trained model that you have fine-tuned
    model_state_dict = torch.load(output_model_file)
    model = BertForQuestionAnswering.from_pretrained(args.bert_model, state_dict=model_state_dict)
    model.to(device)

    if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = read_squad_examples(input_file=args.predict_file, is_training=False)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)
        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.predict_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating"):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(RawResult(unique_id=unique_id,
                                             start_logits=start_logits,
                                             end_logits=end_logits))
        output_prediction_file = os.path.join(args.output_dir, "predictions.json")
        output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
        write_predictions(eval_examples, eval_features, all_results,
                          args.n_best_size, args.max_answer_length, args.do_lower_case,
                          output_prediction_file, output_nbest_file, output_null_log_odds_file,
                          args.verbose_logging, True, args.null_score_diff_threshold)
def function_main_bert(q_id, context_ans, question):
    bert_config_file = 'bert_config.json'
    vocab_file = 'vocab.txt'
    # output_dir = 'output'
    # processed_data = 'processed'
    # predict_file = 'ASQdev.json'
    finetuned_checkpoint = 'ft_model_bert.bin'
    max_seq_length = 500
    do_lower_case = True
    local_rank = -1
    seed = 42
    n_best_size = 20
    predict_batch_size = 8
    max_answer_length = 500
    max_query_length = 64
    doc_stride = 128
    final_answer = ""
    probs = 0.0
    no_cuda = False

    if local_rank == -1 or no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

    bert_config = BertConfig.from_json_file(bert_config_file)
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
    eval_examples = read_squad_examples(id=q_id, paragraph=context_ans, question=question,
                                        tokenizer=tokenizer)
    eval_features = convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length)

    model = BertForQuestionAnswering(bert_config)
    state_dict = torch.load(finetuned_checkpoint, map_location='cpu')
    model.load_state_dict(state_dict)
    del state_dict
    model.to(device)

    if local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank],
                                                          output_device=local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    final_answer, probs, start_logits, end_logits, eval_feature = run_evaluate(
        local_rank, predict_batch_size, n_best_size, max_answer_length, do_lower_case,
        model, eval_features, device, eval_examples, tokenizer)
    return final_answer, probs, start_logits, end_logits, eval_feature
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, "
                             "bert-base-multilingual-uncased, bert-base-multilingual-cased, "
                             "bert-base-chinese.")
    parser.add_argument("--init_checkpoint", default=None, type=str, required=True,
                        help="The checkpoint file from pretraining")

    # Other parameters
    parser.add_argument("--verbose_logging", action='store_true',
                        help="If true, all of the warnings related to data processing will be printed.")
    parser.add_argument("--seed", default=1, type=int)
    parser.add_argument("--question",
                        default="Most antibiotics target bacteria and don't affect what class of organisms? ",
                        type=str, help="question")
    parser.add_argument("--context",
                        default="Within the genitourinary and gastrointestinal tracts, commensal flora serve as "
                                "biological barriers by competing with pathogenic bacteria for food and space and, "
                                "in some cases, by changing the conditions in their environment, such as pH or "
                                "available iron. This reduces the probability that pathogens will reach sufficient "
                                "numbers to cause illness. However, since most antibiotics non-specifically target "
                                "bacteria and do not affect fungi, oral antibiotics can lead to an overgrowth of "
                                "fungi and cause conditions such as a vaginal candidiasis (a yeast infection). "
                                "There is good evidence that re-introduction of probiotic flora, such as pure "
                                "cultures of the lactobacilli normally found in unpasteurized yogurt, helps restore "
                                "a healthy balance of microbial populations in intestinal infections in children "
                                "and encouraging preliminary data in studies on bacterial gastroenteritis, "
                                "inflammatory bowel diseases, urinary tract infection and post-surgical infections. ",
                        type=str, help="context")
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. "
                             "Sequences longer than this will be truncated, and sequences shorter "
                             "than this will be padded.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than "
                             "this will be truncated to this length.")
    parser.add_argument("--n_best_size", default=1, type=int,
                        help="The total number of n-best predictions to generate.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is needed "
                             "because the start and end predictions are not conditioned on one another.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Whether to lower case the input text. True for uncased models, "
                             "False for cased models.")
    parser.add_argument("--version_2_with_negative", action='store_true',
                        help='If true, then the model can reply with "unknown".')
    parser.add_argument("--null_score_diff_threshold", type=float, default=-11.0,
                        help="If null_score - best_non_null is greater than the threshold predict 'unknown'.")
    parser.add_argument("--vocab_file", type=str, default=None, required=True,
                        help="Vocabulary mapping/file BERT was pretrained on")
    parser.add_argument("--config_file", default=None, type=str, required=True,
                        help="The BERT model config")
    parser.add_argument("--fp16", action='store_true',
                        help="use mixed-precision")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="ordinal of the GPU to use")
    args = parser.parse_args()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)

    tokenizer = BertTokenizer(args.vocab_file, do_lower_case=args.do_lower_case, max_len=512)  # for bert large

    # Prepare model
    config = BertConfig.from_json_file(args.config_file)
    # Padding for divisibility by 8
    if config.vocab_size % 8 != 0:
        config.vocab_size += 8 - (config.vocab_size % 8)

    # initialize model
    model = BertForQuestionAnswering(config)
    model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')["model"])
    model.to(device)
    if args.fp16:
        model.half()
    model.eval()

    print("question: ", args.question)
    print("context: ", args.context)
    print()

    # preprocessing
    doc_tokens = args.context.split()
    query_tokens = tokenizer.tokenize(args.question)
    feature = preprocess_tokenized_text(doc_tokens, query_tokens, tokenizer,
                                        max_seq_length=args.max_seq_length,
                                        max_query_length=args.max_query_length)
    tensors_for_inference, tokens_for_postprocessing = feature

    input_ids = torch.tensor(tensors_for_inference.input_ids, dtype=torch.long).unsqueeze(0)
    segment_ids = torch.tensor(tensors_for_inference.segment_ids, dtype=torch.long).unsqueeze(0)
    input_mask = torch.tensor(tensors_for_inference.input_mask, dtype=torch.long).unsqueeze(0)

    # load tensors to device
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)

    # run prediction
    with torch.no_grad():
        start_logits, end_logits = model(input_ids, segment_ids, input_mask)

    # post-processing
    start_logits = start_logits[0].detach().cpu().tolist()
    end_logits = end_logits[0].detach().cpu().tolist()
    answer, answers = get_answer(doc_tokens, tokens_for_postprocessing, start_logits, end_logits, args)

    # print result
    print()
    print(answer)
    print()
    print(json.dumps(answers, indent=4))
def main():
    parser = argparse.ArgumentParser()
    BERT_DIR = "uncased_L-12_H-768_A-12/"

    # Required parameters
    parser.add_argument("--bert_config_file", default=BERT_DIR + "bert_config.json", type=str,
                        help="The config json file corresponding to the pre-trained BERT model. "
                             "This specifies the model architecture.")
    parser.add_argument("--vocab_file", default=BERT_DIR + "vocab.txt", type=str,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--output_dir", default="out", type=str,
                        help="The output directory where the model checkpoints will be written.")

    # Other parameters
    parser.add_argument("--load", default=False, action='store_true')
    parser.add_argument("--train_file", type=str,
                        default="/home/sewon/data/squad/train-v1.1.json",
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument("--predict_file", type=str,
                        default="/home/sewon/data/squad/dev-v1.1.json",
                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument("--init_checkpoint", type=str,
                        default=BERT_DIR + "pytorch_model.bin",
                        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument("--do_lower_case", default=True, action='store_true',
                        help="Whether to lower case the input text. Should be True for uncased "
                             "models and False for cased models.")
    parser.add_argument("--max_seq_length", default=300, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. "
                             "Sequences longer than this will be truncated, and sequences shorter "
                             "than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to "
                             "take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than "
                             "this will be truncated to this length.")
    parser.add_argument("--do_train", default=False, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict", default=False, action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=39, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=300, type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=1000.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--save_checkpoints_steps", default=1000, type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--iterations_per_loop", default=1000, type=int,
                        help="How many steps to make in each estimator call.")
    parser.add_argument("--n_best_size", default=3, type=int,
                        help="The total number of n-best predictions to generate in the "
                             "nbest_predictions.json output file.")
    parser.add_argument("--verbose_logging", default=False, action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--accumulate_gradients", type=int, default=1,
                        help="Number of steps to accumulate gradient on (divide the batch_size and accumulate)")
    parser.add_argument("--seed", type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--eval_period", type=int, default=500)
    parser.add_argument("--max_n_answers", type=int, default=20)
    parser.add_argument("--n_paragraphs", type=str, default='40')
    parser.add_argument("--verbose", action="store_true", default=False)
    parser.add_argument("--wait_step", type=int, default=12)
    # Learning method variation
    parser.add_argument("--loss_type", type=str, default="mml")
    parser.add_argument("--tau", type=float, default=12000.0)
    # For evaluation
    parser.add_argument("--prefix", type=str, default="")
    parser.add_argument("--debug", action="store_true", default=False)
    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        print("Output directory {} already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir, exist_ok=True)

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO,
                        handlers=[logging.FileHandler(os.path.join(args.output_dir, "log.txt")),
                                  logging.StreamHandler()])
    logger = logging.getLogger(__name__)
    logger.info(args)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError("Invalid accumulate_gradients parameter: {}, should be >= 1".format(
            args.accumulate_gradients))
    args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError("At least one of `do_train` or `do_predict` must be True.")
    if args.do_train:
        if not args.train_file:
            raise ValueError("If `do_train` is True, then `train_file` must be specified.")
        if not args.predict_file:
            raise ValueError("If `do_train` is True, then `predict_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError("If `do_predict` is True, then `predict_file` must be specified.")

    bert_config = BertConfig.from_json_file(args.bert_config_file)
    if args.do_train and args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    model = BertForQuestionAnswering(bert_config, device, 4, loss_type=args.loss_type, tau=args.tau)
    metric_name = "EM"

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    train_split = ',' in args.train_file
    if train_split:
        n_train_files = len(args.train_file.split(','))

    eval_dataloader, eval_examples, eval_features, _ = get_dataloader(
        logger=logger, args=args,
        input_file=args.predict_file,
        is_training=False,
        batch_size=args.predict_batch_size,
        num_epochs=1,
        tokenizer=tokenizer)

    if args.do_train:
        train_file = args.train_file
        if train_split:
            train_file = args.train_file.split(',')[0]
        train_dataloader, _, _, num_train_steps = get_dataloader(
            logger=logger, args=args,
            input_file=train_file,
            is_training=True,
            batch_size=args.train_batch_size,
            num_epochs=args.num_train_epochs,
            tokenizer=tokenizer)

    if args.init_checkpoint is not None:
        logger.info("Loading from {}".format(args.init_checkpoint))
        state_dict = torch.load(args.init_checkpoint, map_location='cpu')
        if args.do_train and args.init_checkpoint.endswith('pytorch_model.bin'):
            model.bert.load_state_dict(state_dict)
        else:
            # strip the 'module.' prefix left by DataParallel checkpoints
            strip_prefix = lambda x: x[7:] if x.startswith('module.') else x
            state_dict = {strip_prefix(k): v for (k, v) in state_dict.items()}
            model.load_state_dict(state_dict)
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.do_train:
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_parameters = [
            {'params': [p for n, p in model.named_parameters()
                        if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in model.named_parameters()
                        if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0}
        ]
        optimizer = BERTAdam(optimizer_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps)

        global_step = 0
        best_f1 = (-1, -1)
        wait_step = 0
        stop_training = False
        train_losses = []
        model.train()
        for epoch in range(int(args.num_train_epochs)):
            if epoch > 0 and train_split:
                train_file = args.train_file.split(',')[epoch % n_train_files]
                train_dataloader = get_dataloader(
                    logger=logger, args=args,
                    input_file=train_file,
                    is_training=True,
                    batch_size=args.train_batch_size,
                    num_epochs=args.num_train_epochs,
                    tokenizer=tokenizer)[0]
            for step, batch in enumerate(train_dataloader):
                global_step += 1
                batch = [t.to(device) for t in batch]
                loss = model(batch, global_step)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                train_losses.append(loss.detach().cpu())
                loss.backward()
                if global_step % args.gradient_accumulation_steps == 0:
                    optimizer.step()  # we have accumulated enough gradients
                    model.zero_grad()
                if global_step % args.eval_period == 0:
                    model.eval()
                    f1 = predict(logger, args, model, eval_dataloader, eval_examples, eval_features,
                                 device, write_prediction=False)
                    logger.info("Step %d Train loss %.2f EM %.2f F1 %.2f on epoch=%d" %
                                (global_step, np.mean(train_losses), f1[0] * 100, f1[1] * 100, epoch))
                    train_losses = []
                    if best_f1 < f1:
                        logger.info("Saving model with best %s: %.2f (F1 %.2f) -> %.2f (F1 %.2f) on epoch=%d" %
                                    (metric_name, best_f1[0] * 100, best_f1[1] * 100,
                                     f1[0] * 100, f1[1] * 100, epoch))
                        model_state_dict = {k: v.cpu() for (k, v) in model.state_dict().items()}
                        torch.save(model_state_dict, os.path.join(args.output_dir, "best-model.pt"))
                        model = model.to(device)
                        best_f1 = f1
                        wait_step = 0
                        stop_training = False
                    else:
                        wait_step += 1
                        if wait_step == args.wait_step:
                            stop_training = True
                    model.train()
            if stop_training:
                break
        logger.info("Training finished!")

    elif args.do_predict:
        if type(model) == list:
            model = [m.eval() for m in model]
        else:
            model.eval()
        f1 = predict(logger, args, model, eval_dataloader, eval_examples, eval_features, device,
                     varying_n_paragraphs=len(args.n_paragraphs) > 1)
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--task", default='multi', type=str,
                        help="Task affecting load data and vectorize feature")
    parser.add_argument("--loss_type", default='double', type=str,
                        help="Select loss double or single, only for multi task")  # only effective for the 'multi' task
    parser.add_argument("--bert_model", default="bert-base-uncased", type=str,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, "
                             "bert-base-multilingual-uncased, bert-base-chinese, "
                             "bert-base-multilingual-cased.")  # choose the pre-trained model
    parser.add_argument("--debug", default=False,
                        help="Whether run on small dataset")  # should normally be False
    parser.add_argument("--output_dir", default="./SQuAD/output/", type=str,
                        help="The output directory where the model checkpoints and predictions will be written.")

    # Other parameters
    parser.add_argument("--train_file", default="./SQuAD/version/train.json", type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument("--predict_file", default="./SQuAD/version/prediction.json", type=str,
                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. "
                             "Sequences longer than this will be truncated, and sequences shorter "
                             "than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to "
                             "take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than "
                             "this will be truncated to this length.")

    # Control parameters
    parser.add_argument("--do_train", default=True,
                        help="Whether to run training.")
    parser.add_argument("--do_predict", default=True,
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=18, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=18, type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=3e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for.")
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the "
                             "nbest_predictions.json file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is needed "
                             "because the start and end predictions are not conditioned on one another.")
    parser.add_argument("--verbose_logging", default=False,
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda", default=False,
                        help="Whether not to use CUDA when available")
    parser.add_argument("--seed", type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--do_lower_case", default=True,
                        help="Whether to lower case the input text. True for uncased models, "
                             "False for cased models.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--fp16", default=False,
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument("--loss_scale", type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling. Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--version_2_with_negative", default=False,
                        help="If true, the SQuAD examples contain some that do not have an answer.")
    parser.add_argument("--null_score_diff_threshold", type=float, default=0.0,
                        help="If null_score - best_non_null is greater than the threshold predict null.")
    args = parser.parse_args()

    # The if-branch is single-machine (possibly multi-GPU); the else-branch is distributed.
    # Without a cluster, we train single-machine with multiple GPUs.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    # Standard logging setup.
    logging.basicConfig(format='%(asctime)s-%(levelname)s-%(name)s-%(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.info("device:{}, n_gpu:{}, distributed training:{}, 16-bits training:{}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    # Adjust the batch size and set the random seeds.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)  # seed the CPU RNG so results are deterministic
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)  # with multiple GPUs, seed all of them

    # Basic argument validation.
    if not args.do_train and not args.do_predict:
        raise ValueError("At least one of `do_train` or `do_predict` must be True.")
    if args.do_train and not args.train_file:
        raise ValueError("If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict and not args.predict_file:
        raise ValueError("If `do_predict` is True, then `predict_file` must be specified.")

    # Create output_dir if it does not exist (the exists-and-not-empty check is left
    # disabled because it forces an empty directory).
    # if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
    #     raise ValueError("Output directory already exists and is not empty.")
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Initialize the tokenizer from the pre-trained model.
    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    # Choose the example reader: plain SQuAD or the multi-passage variant.
    if args.task == 'squad':
        read_examples = read_squad_examples
    elif args.task == 'multi':
        read_examples = read_multi_examples

    # Load training examples and compute the number of optimization steps.
    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = read_examples(input_file=args.train_file, is_training=True,
                                       version_2_with_negative=args.version_2_with_negative)
        if args.debug:
            train_examples = train_examples[:100]
        num_train_optimization_steps = \
            int(len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare the model.
    model = BertForQuestionAnswering.from_pretrained(
        args.bert_model,
        cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)))
    # model = torch.nn.DataParallel(model).cuda()

    # Optionally use fp16.
    if args.fp16:
        # model.half().cuda()
        model.half()
    # Move the model to the CPU or GPU.
    model.to(device)

    # Configure the optimizer.
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        # hack to remove the pooler, which is not used and would otherwise
        # produce None grads that break apex
        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        if args.fp16:
            try:
                # from apex.optimizers import FP16_Optimizer
                from apex.fp16_utils import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                                  "to use distributed and fp16 training.")
            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=True)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
                                                 t_total=num_train_optimization_steps)
        else:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)

    # Train the model.
    global_step = 0
    if args.do_train:
        # Extract features from the training examples.
        train_features = convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True)
        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
        all_start_vector = torch.tensor([f.start_vector for f in train_features], dtype=torch.float)
        all_end_vector = torch.tensor([f.end_vector for f in train_features], dtype=torch.float)
        all_content_vector = torch.tensor([f.content_vector for f in train_features], dtype=torch.float)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                   all_start_positions, all_end_positions,
                                   all_start_vector, all_end_vector, all_content_vector)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)  # random sampler
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        # Wrap once for multi-GPU training (wrapping inside the epoch loop would nest DataParallel).
        model = torch.nn.DataParallel(model).cuda()
        for ep in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration",
                                              disable=args.local_rank not in [-1, 0])):
                if n_gpu == 1:
                    batch = tuple(t.to(device) for t in batch)  # multi-gpu does scattering itself
                input_ids, input_mask, segment_ids, start_positions, end_positions, \
                    start_vector, end_vector, content_vector = batch
                loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions,
                             start_vector, end_vector, content_vector, args.loss_type)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu
multi-gpu. print("loss率为:{}".format(loss)) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used and handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 print("\n") print(ep) output_model_file = os.path.join(args.output_dir, str(ep) + WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, str(ep) + CONFIG_NAME) torch.save(model.state_dict(), output_model_file) if isinstance(model, torch.nn.DataParallel): model = model.module model.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # 这个是用来加载进行微调调好后的代码以方便进行预测10.25 if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = BertForQuestionAnswering.from_pretrained(args.output_dir) tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) else: model = BertForQuestionAnswering.from_pretrained(args.output_dir) tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) # 再次将GPU加入10.25 model.to(device) # 这部分就是进行相应的预测(用于生成预测文件) if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = \ read_examples(input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative) if args.debug: eval_examples = eval_examples[:100] eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = [] logger.info("Start evaluating") for input_ids, input_mask, segment_ids, example_indices in tqdm( eval_dataloader, desc="Evaluating", 
disable=args.local_rank not in [-1, 0]): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model( input_ids, segment_ids, input_mask) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append( RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) middle_result = os.path.join(args.output_dir, 'middle_result.pkl') pickle.dump([eval_examples, eval_features, all_results], open(middle_result, 'wb')) output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json") if (args.loss_type == 'double'): write_predictions_couple_labeling( eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold) elif (args.loss_type == 'single'): write_predictions_single_labeling( eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold) elif (args.loss_type == 'origin') or (args.task == 'multi' and args.loss_type == 'squad'): write_predictions(eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold) else: raise ValueError('{} dataset and {} loss is not support'.format( args.task, args.loss_type))
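# The checkpoint handling above (and the 'module.'-stripping loaders elsewhere in this
# file) keep re-solving the same problem: nn.DataParallel prefixes every key in
# state_dict() with 'module.'. A minimal, self-contained pair of helpers sketching the
# round trip; the names save_unwrapped/load_flexible are hypothetical, not part of the
# original scripts:
import torch
import torch.nn as nn


def save_unwrapped(model, path):
    # Save weights without the 'module.' prefix that DataParallel adds.
    to_save = model.module if isinstance(model, nn.DataParallel) else model
    torch.save(to_save.state_dict(), path)


def load_flexible(model, path):
    # Load a state dict whether or not it was saved from a DataParallel wrapper.
    state_dict = torch.load(path, map_location='cpu')
    state_dict = {(k[7:] if k.startswith('module.') else k): v for k, v in state_dict.items()}
    model.load_state_dict(state_dict)
    return model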
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", default='./data', type=str,
                        help="Data dir containing model dir, etc.")
    parser.add_argument("--tfidf_file",
                        default='wiki_first_paras-tfidf-ngram=2-hash=16777216-tokenizer=spacy.npz',
                        type=str, help="tf-idf .npz file placed inside the data_dir.")
    parser.add_argument("--wiki_jsonl", default='wiki_firstpara_sents.jsonl', type=str,
                        help="Processed wikipedia .jsonl placed inside data_dir.")
    parser.add_argument("--qdmr_jsonl", default='./data/qdmr_data/qdmrs_hotpotqa_gold.jsonl',
                        type=str, help="Path to processed qdmr .jsonl file.")
    parser.add_argument("--predict_batch_size", default=128, type=int,
                        help="Batch size for predictions in eval mode.")
    parser.add_argument("--tasks", default='break_rc,ques_ir,break_ir', type=str,
                        help="The IR, RC tasks to perform.")
    parser.add_argument("--suffix", default='gold', type=str,
                        help="Suffix to add to the output files.")
    parser.add_argument("--debug", action='store_true',
                        help="If on, only keep a small number of qdmrs.")
    parser.add_argument("--input_results_file", default='', type=str,
                        help="File containing results of the task to be reused.")
    args = parser.parse_args()

    # we use an already finetuned single-hop RC ensemble by
    # Min et al (https://github.com/shmsw25/DecompRC/tree/master/DecompRC)
    rc_args = {
        'bert_config_file': 'data/onehop_rc/uncased_L-12_H-768_A-12/bert_config.json',
        'do_lower_case': True,
        'doc_stride': 128,
        'init_checkpoint': f'{args.data_dir}/onehop_rc/uncased_L-12_H-768_A-12/model1.pt,'
                           f'{args.data_dir}/onehop_rc/uncased_L-12_H-768_A-12/model2.pt,'
                           f'{args.data_dir}/onehop_rc/uncased_L-12_H-768_A-12/model3.pt',
        'iterations_per_loop': 1000,
        'local_rank': -1,
        'max_answer_length': 30,
        'max_n_answers': 5,
        'max_query_length': 64,
        'max_seq_length': 300,
        'model': 'qa',
        'n_best_size': 4,
        'no_cuda': False,
        'output_dropout_prob': 0,
        'pooling': 'max',
        'seed': 42,
        'verbose_logging': False,
        'vocab_file': 'data/onehop_rc/uncased_L-12_H-768_A-12/vocab.txt',
        'with_key': False
    }
    rc_args = SimpleNamespace(**rc_args)

    # load hotpotQA
    logging.info(f'loading datasets from {args.data_dir}/hotpot_data/ ...')
    data = read_file(f'{args.data_dir}/hotpot_data/hotpot_train_v1.json')
    # data += read_file(f'{args.data_dir}/hotpot_data/hotpot_dev_distractor_v1.json')
    data += read_file(f'{args.data_dir}/hotpot_data/hotpot_dev_fullwiki_v1.json')
    for d in data:
        d['gold_titles'] = {x[0] for x in d['supporting_facts']}
    hotpot = {d['_id']: d for d in data}

    # load qdmr data processed using prepare_break.jsonl
    qdmr_path = args.qdmr_jsonl
    logging.info(f'loading processed qdmr data from {qdmr_path} ...')
    qdmrs = read_file(qdmr_path)

    # load spacy
    nlp = en_core_web_sm.load()
    tokenize = lambda s: [x.text for x in nlp.tokenizer(s)]  # spacy tokenizer

    # load IR
    logging.info('loading IR ...')
    ranker = IR(tfidf_path=f'{args.data_dir}/{args.tfidf_file}')

    # load wikipedia
    wiki_path = f'{args.data_dir}/{args.wiki_jsonl}'
    logging.info(f'loading wikipedia from {wiki_path} ...')
    with jsonlines.open(wiki_path, 'r') as reader:
        wiki = {d['title']: d['para'] for d in tqdm(reader.iter())}

    # prepare and load the RC for inference
    device = torch.device("cuda")
    n_gpu = torch.cuda.device_count()
    logging.info(f'{n_gpu} cuda devices available.')
    logging.info('loading 1-hop RC ensemble ...')
    random.seed(rc_args.seed)
    np.random.seed(rc_args.seed)
    torch.manual_seed(rc_args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(rc_args.seed)
    tokenizer = tokenization.FullTokenizer(vocab_file=rc_args.vocab_file,
                                           do_lower_case=rc_args.do_lower_case)
    bert_config = BertConfig.from_json_file(rc_args.bert_config_file)
    model = BertForQuestionAnswering(bert_config, 4)
    if rc_args.init_checkpoint is not None:
        model = [model]
        for i, checkpoint in enumerate(rc_args.init_checkpoint.split(',')):
            if i > 0:
                model.append(BertForQuestionAnswering(bert_config, 4))
            logging.info(f"Loading from {checkpoint}")
            state_dict = torch.load(checkpoint, map_location='cpu')
            # strip any 'module.' prefix left over from DataParallel checkpoints
            strip_module = lambda x: x[7:] if x.startswith('module.') else x
            state_dict = {strip_module(k): v for (k, v) in state_dict.items()}
            model[-1].load_state_dict(state_dict)
            model[-1].to(device)
    if type(model) == list:
        model = [m.eval() for m in model]
    else:
        model.eval()

    # 1hop RC wrapper
    simpQA = partial(_simpQA, args=args, rc_args=rc_args, tokenizer=tokenizer,
                     model=model, device=device)

    if args.input_results_file:
        logging.info(f'Reading the supplied results file {args.input_results_file} ...')
        all_results = read_file(args.input_results_file)
    else:
        all_results = {}
        for i_d, d in enumerate(qdmrs):
            [_, data_split, _id] = d['question_id'].split('_')
            # assert data_split == 'dev'
            assert _id in hotpot
            all_results[_id] = d
            assert d['steps'] and d['op_types'], 'QDMRs must be pre-processed and non-empty.'
    if args.debug:
        all_results = {key: val for key, val in all_results.items() if random.random() < 0.01}
        logging.info(f'\nTruncating to only {len(all_results)} samples!!!\n')

    tasks = [x.strip() for x in args.tasks.split(',')]

    if 'break_rc' in tasks:
        logging.info(f'Running BREAK IR+RC on {len(all_results)} samples ...')
        max_n_parts = max([len(v['steps']) for v in all_results.values()])
        for i_p in range(max_n_parts):
            logging.info(f'Processing qdmr step #{i_p} ...')
            # process the i_p'th part of all samples
            articles = []  # hotpot articles corresponding to queries to the RC
            for _id, v in tqdm(all_results.items()):
                parts = v['steps']
                if i_p >= (len(parts) - int(v['op_types'][-1] in ['COMPARISON', 'INTERSECTION'])):
                    # the last discrete comparison, intersection step is processed later
                    continue
                rc_outputs = v['rc_outputs'] if 'rc_outputs' in v else {}
                nbest_outputs = v['nbest_outputs'] if 'nbest_outputs' in v else {}
                l_top = v['titles'] if 'titles' in v else []
                part = parts[i_p]
                # replace placeholders with the respective RC outputs of previous parts
                for j in range(i_p):
                    ph = '#' + str(j + 1)  # 1...i_p
                    if ph in part:
                        part = part.replace(ph, rc_outputs[ph])
                # get top 10 titles from IR
                top_titles = ranker.closest_docs(part, k=10)[0]
                l_top.append(top_titles)
                v.update({'titles': l_top, 'rc_outputs': rc_outputs, 'nbest_outputs': nbest_outputs})
                context = []
                # use all retrieved para for the sample instead of just the current 10
                # & sort them acc to similarity wrt part
                set_l_top = set(sum(l_top, []))
                scores = ranker.rank_titles(part, set_l_top)
                sorted_l_top = sorted(scores.keys(), key=lambda title: scores[title], reverse=True)
                for title in sorted_l_top:
                    context.append([title, wiki[title]['sents']])  # get para from wiki
                if not sorted_l_top:  # rare case of no valid titles
                    context = [['Random Title 1', 'Random Text 1'], ['Random Title 2', 'Random Text 2']]
                d, article = hotpot[_id], {}
                article['question'], article['context'] = part + ' ?', context  # appending '?'
to part query article.update({ k: d[k] for k in ['_id', 'type', 'answer'] }) # '_id', 'type', 'context', 'question', 'answer' articles.append(article) if not articles: continue # querying the 1-hop RC all_nbest_out = simpQA([to_squad(article) for article in articles])[1] for _id, v in all_results.items(): if _id not in all_nbest_out: continue nbest_i_p = all_nbest_out[_id] op = v['op_types'][i_p] nbest_id = v['nbest_outputs'] # handle filter steps if 'FILTER' in op: ref_ph = op.split('_')[1] nbest_ref = Counter(nbest_id[ref_ph]) # accumulating the logits of nbest of the part and the ref part nbest_ref.update(nbest_i_p) nbest_i_p = dict(nbest_ref) rc_out = max(nbest_i_p.keys(), key=lambda key: nbest_i_p[key]) v['rc_outputs'][f'#{i_p+1}'] = rc_out v['nbest_outputs'][f'#{i_p+1}'] = nbest_i_p # discrete processing of the last comparison step logging.info( f'Discrete processing of the last comparison/intersection steps ...' ) for _id, v in all_results.items(): if v['op_types'][-1] not in ['COMPARISON', 'INTERSECTION']: continue question, answer, gold_titles = hotpot[_id]['question'], hotpot[ _id]['answer'], hotpot[_id]['gold_titles'] parts, rc_outputs = v['steps'], v['rc_outputs'] if v['op_types'][-1] == 'COMPARISON': ents, rc_outs = [], [] for i_p, part in enumerate(parts[:-1]): # get named entity in the part part_without_phs = part for x in ['#' + str(j) for j in range(1, 8)]: part_without_phs = part_without_phs.replace(x, '') ent = get_ent(part_without_phs, nlp, only_longest=True) ent = '' if ent is None else ent ents.append(ent) rc_outs.append( normalize_answer(rc_outputs['#' + str(i_p + 1)])) if 'same as' in parts[-1]: pred_ans = 'yes' if rc_outs[-2] == rc_outs[-1] else 'no' else: pred_ans = ents[compare(parts[-1], rc_outs[-2], rc_outs[-1])] v['rc_outputs'][f'#{len(parts)}'] = pred_ans elif v['op_types'][-1] == 'INTERSECTION': part = parts[-1] phs = [ '#' + str(j) for j in range(1, 10) if '#' + str(j) in part ] phs = list(set(phs)) # accumulate logits of the parts and take the argmax nbest_id = v['nbest_outputs'] nbest = Counter(nbest_id[phs[0]]) # accumulate logits for ph in phs[1:]: if ph in nbest_id: nbest.update(nbest_id[ph]) nbest = dict(nbest) pred_ans = max(nbest.keys(), key=lambda key: nbest[key]) v['rc_outputs'][f'#{len(parts)}'] = pred_ans v['nbest_outputs'][f'#{len(parts)}'] = nbest for v in all_results.values(): assert len(v['rc_outputs']) == len(v['steps']) if 'break_ir' in tasks: # this can only be run after break_rc task & requires all_results dict logging.info( f'Forming context using the titles used by Break RC for {len(all_results)} samples ...' ) # prepare hotpot-like data for Bert RC new_hotpot = [] for _id, v in tqdm(all_results.items()): d = hotpot[_id] d_new = deepcopy(d) used_titles = sum(v['titles'], []) # sort wrt similarity to ques scores = ranker.rank_titles(d['question'], set(used_titles)) titles = sorted(scores.keys(), key=lambda title: scores[title], reverse=True) context = [] for title in titles: context.append([title, wiki[title]['sents']]) d_new['context'] = context if 'gold_titles' in d_new: del d_new['gold_titles'] new_hotpot.append(d_new) out_break_ir_file = f'{args.data_dir}/hotpot_data/hotpot_after_break_ir_{args.suffix}.json' logging.info( f'Writing hotpot version with the Break IR context to {out_break_ir_file} ...' 
) write_file(new_hotpot, out_break_ir_file) # store the retrieved titles for d in new_hotpot: all_results[d['_id']]['titles_found_by_break_rc'] = list( set([x[0] for x in d['context']])) if 'ques_ir' in tasks: # this can only be run after break_rc task & requires all_results dict formed # to determine the number of titles to be retrieved for each sample logging.info( f'Running baseline IR using the whole question for {len(all_results)} samples ...' ) # prepare hotpot-like data for Bert RC new_hotpot = [] for _id in tqdm(all_results.keys()): d = hotpot[_id] d_new = deepcopy(d) # for fair comparison with Break RC retrieve the same number of titles n_titles = len(sum(all_results[_id]['titles'], [])) titles = ranker.closest_docs(d['question'], k=n_titles)[0] context = [] for title in titles: context.append([title, wiki[title]['sents']]) d_new['context'] = context if 'gold_titles' in d_new: del d_new['gold_titles'] new_hotpot.append(d_new) out_ques_ir_file = f'{args.data_dir}/hotpot_data/hotpot_after_ques_ir_{args.suffix}.json' logging.info( f'Writing hotpot version with the baseline IR context to {out_ques_ir_file} ...' ) write_file(new_hotpot, out_ques_ir_file) # store the retrieved titles for d in new_hotpot: all_results[d['_id']]['titles_found_using_whole_ques'] = list( set([x[0] for x in d['context']])) # save the Break RC outputs out_break_rc_file = f'{args.data_dir}/predictions/break_rc_results_{args.suffix}.json' logging.info(f'Writing the break RC results to {out_break_rc_file}...') os.makedirs(dirname(out_break_rc_file), exist_ok=True) write_file(all_results, out_break_rc_file)
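# The FILTER/INTERSECTION handling above leans on a property of collections.Counter:
# update() with a mapping *adds* values for shared keys, so per-step n-best logit
# tables can be merged and then argmax'ed. A tiny self-contained illustration with
# made-up scores:
from collections import Counter

nbest_ref = {'Paris': 3.2, 'Lyon': 1.1}   # hypothetical answer -> summed logit
nbest_step = {'Paris': 2.7, 'Nice': 0.4}

merged = Counter(nbest_ref)
merged.update(nbest_step)                 # 'Paris' becomes 3.2 + 2.7 = 5.9

best = max(merged.keys(), key=lambda key: merged[key])
print(best)                               # -> Paris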
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--bert_config_file", default=None, type=str, required=True,
                        help="The config json file corresponding to the pre-trained BERT model. "
                             "This specifies the model architecture.")
    parser.add_argument("--vocab_file", default=None, type=str, required=True,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--train_file", default=None, type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument("--predict_file", default=None, type=str,
                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument("--init_checkpoint", default=None, type=str,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")
    # note: with action='store_true' and default=True this flag can never be False
    parser.add_argument("--do_lower_case", default=True, action='store_true',
                        help="Whether to lower case the input text. Should be True for uncased "
                             "models and False for cased models.")
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than this will "
                             "be truncated to this length.")
    parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.")
    parser.add_argument("--do_predict", default=False, action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
                             "of training.")
    parser.add_argument("--save_checkpoints_steps", default=1000, type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--iterations_per_loop", default=1000, type=int,
                        help="How many steps to make in each estimator call.")
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the nbest_predictions.json "
                             "output file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is needed because the start "
                             "and end predictions are not conditioned on one another.")
    parser.add_argument("--verbose_logging", default=False, action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--optimize_on_cpu', default=False, action='store_true',
                        help="Whether to perform optimization and keep the optimizer averages on CPU")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError("At least one of `do_train` or `do_predict` must be True.")
    if args.do_train:
        if not args.train_file:
            raise ValueError("If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError("If `do_predict` is True, then `predict_file` must be specified.")

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = read_squad_examples(input_file=args.train_file, is_training=True)
        num_train_steps = int(len(train_examples) / args.train_batch_size * args.num_train_epochs)

    model = BertForQuestionAnswering(bert_config)
    if args.init_checkpoint is not None:
        model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
    if not args.optimize_on_cpu:
        model.to(device)

    no_decay = ['bias', 'gamma', 'beta']
    optimizer_parameters = [
        {'params': [p for n, p in model.named_parameters() if n not in no_decay],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in model.named_parameters() if n in no_decay],
         'weight_decay_rate': 0.0}
    ]
    optimizer = BERTAdam(optimizer_parameters, lr=args.learning_rate,
                         warmup=args.warmup_proportion, t_total=num_train_steps)
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(
            examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True)
        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                   all_start_positions, all_end_positions)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
                loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.optimize_on_cpu:
                        model.to('cpu')
                    optimizer.step()  # We have accumulated enough gradients
                    model.zero_grad()
                    if args.optimize_on_cpu:
                        model.to(device)
                    global_step += 1

    if args.do_predict:
        eval_examples = read_squad_examples(input_file=args.predict_file, is_training=False)
        eval_features = convert_examples_to_features(
            examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False)
        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.predict_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating"):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(RawResult(unique_id=unique_id,
                                             start_logits=start_logits,
                                             end_logits=end_logits))
        output_prediction_file = os.path.join(args.output_dir, "predictions.json")
        output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
        write_predictions(eval_examples, eval_features, all_results, args.n_best_size,
                          args.max_answer_length, args.do_lower_case, output_prediction_file,
                          output_nbest_file, args.verbose_logging)
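# All of the training loops in this file use the same gradient-accumulation idiom:
# divide the loss by the accumulation factor, call backward() every step, and only
# step the optimizer every N steps. A runnable toy-model sketch of just that pattern:
import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
accumulation_steps = 4  # plays the role of args.gradient_accumulation_steps

for step in range(16):
    x, y = torch.randn(2, 4), torch.randn(2, 1)
    loss = torch.nn.functional.mse_loss(model(x), y)
    (loss / accumulation_steps).backward()  # scale so summed grads match one big batch
    if (step + 1) % accumulation_steps == 0:
        optimizer.step()       # apply the accumulated gradients
        optimizer.zero_grad()  # reset for the next accumulation window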
device = "cpu" model.to(device) def question_answering(doc_text, question_text, model, tokenizer): example = compose_question(doc_text, question_text) predictions = predict_answer(example, model, tokenizer, top_n=3) results = [] for p in predictions: results.append((p.text, str((p.start_logit + p.end_logit) / 2))) return results def question_answering_terminal(): doc_text = input('Input document String: ') question_text = input('Input question String: ') example = compose_question(doc_text, question_text) predictions = predict_answer(example, top_n=3) results = [] for p in predictions: results.append((p.text, str((p.start_logit + p.end_logit) / 2))) return results if __name__ == "__main__": model = BertForQuestionAnswering.from_pretrained("./bert_squad/") tokenizer = BertTokenizer.from_pretrained("./bert_squad/", do_lower_case=True) device = "cpu" model.to(device) print(question_answering_terminal())
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--bert_config_file", default='model_repo/uncased_L-12_H-768_A-12/bert_config.json',
                        type=str, help="The config json file corresponding to the pre-trained BERT model. "
                                       "This specifies the model architecture.")
    parser.add_argument("--vocab_file", default='model_repo/uncased_L-12_H-768_A-12/vocab.txt', type=str,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--output_dir", default='output', type=str,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--processed_data", default='processed', type=str,
                        help="The directory where preprocessed train/dev pickles are cached.")
    parser.add_argument("--do_train", default=True, action='store_true', help="Whether to run training.")
    parser.add_argument("--do_predict", default=True, action='store_true',
                        help="Whether to run eval on the dev set.")

    ## Other parameters
    parser.add_argument("--train_file", default='BioASQ-train-factoid-4b.json', type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument("--predict_file", default='ASQdev.json', type=str,
                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument("--init_checkpoint", default='model_repo/uncased_L-12_H-768_A-12/pytorch_model.bin',
                        type=str, help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument("--finetuned_checkpoint", default='ft_dir/ft_model.bin', type=str,
                        help="Fine-tuned checkpoint to load for prediction.")
    parser.add_argument("--do_lower_case", default=True, action='store_true',
                        help="Whether to lower case the input text. Should be True for uncased "
                             "models and False for cased models.")
    parser.add_argument("--max_seq_length", default=500, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than this will "
                             "be truncated to this length.")
    parser.add_argument("--max_answer_length", default=500, type=int,
                        help="The maximum length of an answer that can be generated. This is needed because the start "
                             "and end predictions are not conditioned on one another.")
    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=5.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
                             "of training.")
    parser.add_argument("--save_checkpoints_steps", default=1000, type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--iterations_per_loop", default=1000, type=int,
                        help="How many steps to make in each estimator call.")
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the nbest_predictions.json "
                             "output file.")
    parser.add_argument("--verbose_logging", default=False, action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--accumulate_gradients", type=int, default=1,
                        help="Number of steps to accumulate gradient on (divide the batch_size and accumulate)")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError("Invalid accumulate_gradients parameter: {}, should be >= 1".format(
            args.accumulate_gradients))

    args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError("At least one of `do_train` or `do_predict` must be True.")
    if args.do_train:
        if not args.train_file:
            raise ValueError("If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError("If `do_predict` is True, then `predict_file` must be specified.")

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)
    # finetuned_checkpoint is a file path, so create its parent directory
    # (the original created a directory named after the .bin file itself).
    os.makedirs(os.path.dirname(args.finetuned_checkpoint), exist_ok=True)

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    num_train_steps = None
    if args.do_train:
        logger.info('Load and process train examples')
        # Cache the expensive feature conversion to a pickle and reuse it on later runs.
        if os.path.exists(os.path.join(args.processed_data, 'processed_train.pkl')):
            with open(os.path.join(args.processed_data, 'processed_train.pkl'), 'rb') as f:
                train_features, train_examples, num_train_steps = pickle.load(f)
        else:
            train_examples = read_squad_examples(input_file=args.train_file, tokenizer=tokenizer,
                                                 is_training=True)
            num_train_steps = int(len(train_examples) / args.train_batch_size * args.num_train_epochs)
            train_features = convert_examples_to_features(
                examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True)
            train_path = os.path.join(args.processed_data, 'processed_train.pkl')
            with open(train_path, 'wb') as f:
                pickle.dump([train_features, train_examples, num_train_steps], f)

    if args.do_predict:
        logger.info('Load and process dev examples')
        if os.path.exists(os.path.join(args.processed_data, 'processed_dev.pkl')):
            with open(os.path.join(args.processed_data, 'processed_dev.pkl'), 'rb') as f:
                eval_features, eval_examples = pickle.load(f)
        else:
            eval_examples = read_squad_examples(input_file=args.predict_file, tokenizer=tokenizer,
                                                is_training=False)
            eval_features = convert_examples_to_features(
                examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False)
            eval_path = os.path.join(args.processed_data, 'processed_dev.pkl')
            with open(eval_path, 'wb') as f:
                pickle.dump([eval_features, eval_examples], f)

    model = BertForQuestionAnswering(bert_config)
    if args.do_train and args.init_checkpoint is not None:
        logger.info('Loading init checkpoint')
        model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
        logger.info('Loaded init checkpoint')
    elif args.do_predict:
        logger.info('Loading fine-tuned checkpoint')
        state_dict = torch.load(args.finetuned_checkpoint, map_location='cpu')
        # Strip the 'module.' prefix that DataParallel adds when saving.
        new_state_dict = collections.OrderedDict()
        for key, value in state_dict.items():
            new_state_dict[key[7:] if key.startswith('module.') else key] = value
        model.load_state_dict(new_state_dict)
        del state_dict
        del new_state_dict
        logger.info('Loaded fine-tuned checkpoint')
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    no_decay = ['bias', 'gamma', 'beta']
    optimizer_parameters = [
        {'params': [p for n, p in model.named_parameters() if n not in no_decay],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in model.named_parameters() if n in no_decay],
         'weight_decay_rate': 0.0}
    ]
    optimizer = BERTAdam(optimizer_parameters, lr=args.learning_rate,
                         warmup=args.warmup_proportion, t_total=num_train_steps)

    global_step = 0
    best_dev_score = 0  # defined up front so a predict-only run can still use it
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                   all_start_positions, all_end_positions)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
                loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()  # We have accumulated enough gradients
                    model.zero_grad()
                    global_step += 1
                if (step + 1) % args.save_checkpoints_steps == 0:
                    best_dev_score = run_evaluate(args, model, eval_features, device,
                                                  eval_examples, tokenizer, best_dev_score)
                    logger.info('Best dev score {} in steps {}:'.format(best_dev_score, step))

    if args.do_predict:
        run_evaluate(args, model, eval_features, device, eval_examples, tokenizer, best_dev_score)
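# The processed_train.pkl / processed_dev.pkl logic above is a load-or-build-and-cache
# pattern that can be factored into one helper. A sketch: cached() and build_fn are
# hypothetical names, not part of the original script.
import os
import pickle


def cached(path, build_fn):
    # Load a pickled object from `path`, or build it with build_fn() and cache it.
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
    obj = build_fn()
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f)
    return obj

# e.g. train_features, train_examples, num_train_steps = cached(
#          'processed/processed_train.pkl', lambda: build_train_artifacts(args, tokenizer))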
def load_model(bert_model, model_file, **kwargs): # Load a trained model that you have fine-tuned model_state_dict = torch.load(model_file) model = BertForQuestionAnswering.from_pretrained( bert_model, state_dict=model_state_dict, **kwargs) return model
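# Hypothetical usage of load_model(); the checkpoint path is illustrative. Note that
# torch.load() without map_location requires the checkpoint's original device to be
# available, so adding map_location='cpu' inside load_model is a common safeguard
# for CPU-only machines.
model = load_model("bert-base-uncased", "out/best-model.pt")
model.eval()  # switch to inference mode before predicting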
def init_params():
    model = BertForQuestionAnswering.from_pretrained("./bert_squad/")
    tokenizer = BertTokenizer.from_pretrained("./bert_squad/", do_lower_case=True)
    device = "cpu"
    model.to(device)
    # Return the initialized objects; the original version built them and discarded them.
    return model, tokenizer
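# With init_params() returning its objects, callers can simply unpack them:
model, tokenizer = init_params()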
def main(): parser = argparse.ArgumentParser() BERT_DIR = "./model/uncased_L-12_H-768_A-12/" ## Required parameters parser.add_argument("--bert_config_file", default=BERT_DIR+"bert_config.json", \ type=str, help="The config json file corresponding to the pre-trained BERT model. " "This specifies the model architecture.") parser.add_argument("--vocab_file", default=BERT_DIR+"vocab.txt", type=str, \ help="The vocabulary file that the BERT model was trained on.") parser.add_argument("--output_dir", default="out", type=str, \ help="The output directory where the model checkpoints will be written.") ## Other parameters parser.add_argument("--train_file", type=str, \ help="SQuAD json for training. E.g., train-v1.1.json", \ default="") parser.add_argument("--predict_file", type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json", \ default="") parser.add_argument("--init_checkpoint", type=str, help="Initial checkpoint (usually from a pre-trained BERT model).", \ default=BERT_DIR+"pytorch_model.bin") parser.add_argument( "--do_lower_case", default=True, action='store_true', help="Whether to lower case the input text. Should be True for uncased " "models and False for cased models.") parser.add_argument( "--max_seq_length", default=300, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument( "--max_query_length", default=64, type=int, help= "The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=128, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=10.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% " "of training.") parser.add_argument("--save_checkpoints_steps", default=1000, type=int, help="How often to save the model checkpoint.") parser.add_argument("--iterations_per_loop", default=1000, type=int, help="How many steps to make in each estimator call.") parser.add_argument( "--n_best_size", default=3, type=int, help= "The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument( "--max_answer_length", default=30, type=int, help= "The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument( "--verbose_logging", default=False, action='store_true', help= "If true, all of the warnings related to data processing will be printed. 
" "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( "--accumulate_gradients", type=int, default=1, help= "Number of steps to accumulate gradient on (divide the batch_size and accumulate)" ) parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." ) parser.add_argument('--eval_period', type=int, default=2000) parser.add_argument('--max_n_answers', type=int, default=5) parser.add_argument('--merge_query', type=int, default=-1) parser.add_argument('--reduce_layers', type=int, default=-1) parser.add_argument('--reduce_layers_to_tune', type=int, default=-1) parser.add_argument('--only_comp', action="store_true", default=False) parser.add_argument('--train_subqueries_file', type=str, default="") #500 parser.add_argument('--predict_subqueries_file', type=str, default="") #500 parser.add_argument('--prefix', type=str, default="") #500 parser.add_argument('--model', type=str, default="qa") #500 parser.add_argument('--pooling', type=str, default="max") parser.add_argument('--debug', action="store_true", default=False) parser.add_argument('--output_dropout_prob', type=float, default=0) parser.add_argument('--wait_step', type=int, default=30) parser.add_argument('--with_key', action="store_true", default=False) parser.add_argument('--add_noise', action="store_true", default=False) args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.accumulate_gradients < 1: raise ValueError( "Invalid accumulate_gradients parameter: {}, should be >= 1". format(args.accumulate_gradients)) args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if not args.predict_file: raise ValueError( "If `do_train` is True, then `predict_file` must be specified." ) if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." 
) bert_config = BertConfig.from_json_file(args.bert_config_file) if args.do_train and args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (args.max_seq_length, bert_config.max_position_embeddings)) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): logger.info("Output directory () already exists and is not empty.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir, exist_ok=True) tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None eval_dataloader, eval_examples, eval_features, _ = get_dataloader( logger=logger, args=args, input_file=args.predict_file, subqueries_file=args.predict_subqueries_file, is_training=False, batch_size=args.predict_batch_size, num_epochs=1, tokenizer=tokenizer) if args.do_train: train_dataloader, train_examples, _, num_train_steps = get_dataloader( logger=logger, args=args, \ input_file=args.train_file, \ subqueries_file=args.train_subqueries_file, \ is_training=True, batch_size=args.train_batch_size, num_epochs=args.num_train_epochs, tokenizer=tokenizer) #a = input() if args.model == 'qa': model = BertForQuestionAnswering(bert_config, 4) metric_name = "F1" elif args.model == 'classifier': if args.reduce_layers != -1: bert_config.num_hidden_layers = args.reduce_layers model = BertClassifier(bert_config, 2, args.pooling) metric_name = "F1" elif args.model == "span-predictor": if args.reduce_layers != -1: bert_config.num_hidden_layers = args.reduce_layers if args.with_key: Model = BertForQuestionAnsweringWithKeyword else: Model = BertForQuestionAnswering model = Model(bert_config, 2) metric_name = "Accuracy" else: raise NotImplementedError() if args.init_checkpoint is not None and args.do_predict and \ len(args.init_checkpoint.split(','))>1: assert args.model == "qa" model = [model] for i, checkpoint in enumerate(args.init_checkpoint.split(',')): if i > 0: model.append(BertForQuestionAnswering(bert_config, 4)) print("Loading from", checkpoint) state_dict = torch.load(checkpoint, map_location='cpu') filter = lambda x: x[7:] if x.startswith('module.') else x state_dict = {filter(k): v for (k, v) in state_dict.items()} model[-1].load_state_dict(state_dict) model[-1].to(device) else: if args.init_checkpoint is not None: print("Loading from", args.init_checkpoint) state_dict = torch.load(args.init_checkpoint, map_location='cpu') if args.reduce_layers != -1: state_dict = {k:v for k, v in state_dict.items() \ if not '.'.join(k.split('.')[:3]) in \ ['encoder.layer.{}'.format(i) for i in range(args.reduce_layers, 12)]} if args.do_predict: filter = lambda x: x[7:] if x.startswith('module.') else x state_dict = {filter(k): v for (k, v) in state_dict.items()} model.load_state_dict(state_dict) else: model.bert.load_state_dict(state_dict) if args.reduce_layers_to_tune != -1: model.bert.embeddings.required_grad = False n_layers = 12 if args.reduce_layers == -1 else args.reduce_layers for i in range(n_layers - args.reduce_layers_to_tune): model.bert.encoder.layer[i].require_grad = False model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) if args.do_train: no_decay = ['bias', 'gamma', 'beta'] optimizer_parameters = [{ 'params': [p for n, p in model.named_parameters() if n 
        optimizer_parameters = [
            {'params': [p for n, p in model.named_parameters()
                        if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in model.named_parameters()
                        if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0},
        ]
        optimizer = BERTAdam(optimizer_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps)
        best_f1 = 0
        wait_step = 0
        global_step = 0
        stop_training = False
        model.train()
        for epoch in range(int(args.num_train_epochs)):
            for step, batch in tqdm(enumerate(train_dataloader)):
                global_step += 1
                batch = [t.to(device) for t in batch]
                loss = model(batch, global_step)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if global_step % args.gradient_accumulation_steps == 0:
                    # We have accumulated enough gradients.
                    optimizer.step()
                    model.zero_grad()
                if global_step % args.eval_period == 0:
                    model.eval()
                    f1 = predict(args, model, eval_dataloader, eval_examples,
                                 eval_features, device, write_prediction=False)
                    logger.info("%s: %.3f on epoch=%d" %
                                (metric_name, f1 * 100.0, epoch))
                    if best_f1 < f1:
                        logger.info("Saving model with best %s: %.3f -> %.3f on epoch=%d" %
                                    (metric_name, best_f1 * 100.0, f1 * 100.0, epoch))
                        # Save a CPU copy of the weights, then keep the live
                        # model on its device (works on CPU-only runs as well).
                        model_state_dict = {k: v.cpu()
                                            for (k, v) in model.state_dict().items()}
                        torch.save(model_state_dict,
                                   os.path.join(args.output_dir, "best-model.pt"))
                        model.to(device)
                        best_f1 = f1
                        wait_step = 0
                        stop_training = False
                    else:
                        wait_step += 1
                        if best_f1 > 0.1 and wait_step == args.wait_step:
                            stop_training = True
                    model.train()
                if stop_training:
                    break
            if stop_training:
                break
    elif args.do_predict:
        if type(model) == list:
            model = [m.eval() for m in model]
        else:
            model.eval()
        f1 = predict(args, model, eval_dataloader, eval_examples,
                     eval_features, device)
        logger.info("Final %s score: %.3f%%" % (metric_name, f1 * 100.0))
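# Sketch of the early-stopping rule used above (illustrative; the helper name is
# hypothetical): training stops once the dev metric has failed to improve for
# `patience` consecutive evaluations, but only after it has cleared a minimum
# bar, so a model that has not started learning yet is not stopped on its first
# plateau.
def _should_stop(best_f1, wait_step, patience, min_bar=0.1):
    return best_f1 > min_bar and wait_step >= patience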
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-large-cased, "
                        "bert-base-multilingual-uncased, bert-base-multilingual-cased, "
                        "bert-base-chinese.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints and "
                        "predictions will be written.")

    ## Other parameters
    parser.add_argument("--train_file", default=None, type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument("--predict_file", default=None, type=str,
                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece "
                        "tokenization. Sequences longer than this will be truncated, and "
                        "sequences shorter than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride "
                        "to take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer "
                        "than this will be truncated to this length.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_evaluate", action='store_true',
                        help="Whether to eval after training.")
    parser.add_argument("--train_batch_size", default=8, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=32, type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=3e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup "
                        "for. E.g., 0.1 = 10%% of training.")
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the "
                        "nbest_predictions.json output file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is "
                        "needed because the start and end predictions are not conditioned "
                        "on one another.")
    parser.add_argument("--verbose_logging", action='store_true',
                        help="If true, all of the warnings related to data processing will be "
                        "printed. A number of warnings are expected for a normal SQuAD "
                        "evaluation.")
    parser.add_argument("--num_train_samples", default=-1, type=int,
                        help="Total number of training samples used.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a "
                        "backward/update pass.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Whether to lower case the input text. True for uncased models, "
                        "False for cased models.")
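    # Illustrative usage (not executed; the script name and file paths are
    # hypothetical): a typical SQuAD fine-tuning run combines the flags above
    # roughly as
    #   python run_squad.py --bert_model bert-base-uncased --do_train --do_predict \
    #       --do_lower_case --train_file train-v1.1.json --predict_file dev-v1.1.json \
    #       --output_dir /tmp/squad_out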
) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--overwrite_output_dir', action='store_true', help="Overwrite the content of the output directory") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument( '--version_2_with_negative', action='store_true', help= 'If true, the SQuAD examples contain some that do not have an answer.') parser.add_argument( '--null_score_diff_threshold', type=float, default=0.0, help= "If null_score - best_non_null is greater than the threshold predict null." ) parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--eval_period', type=int, default=2000) parser.add_argument('--wait_step', type=int, default=7) parser.add_argument('--load_from_cache', action='store_true', help="Load train features from cache.") parser.add_argument('--indiv_digits', action='store_true', help="Tokenize numbers into indiv digits.") parser.add_argument('--use_segment_ids', action='store_true', help="Use segment ids.") args = parser.parse_args() #print(args) if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: make_output_dir( args, scripts_to_save=[sys.argv[0], 'run_squad_dataset_utils.py']) if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." 
    with open(args.predict_file, encoding='utf-8') as pf:
        dev_data = json.load(pf)["data"]

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) \
            and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not "
                         "empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will
        # download model & vocab.
        torch.distributed.barrier()
    if args.do_train:
        tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                                  do_lower_case=args.do_lower_case)
        model = BertForQuestionAnswering.from_pretrained(args.bert_model)
    elif args.do_evaluate:
        # Load a trained model and vocabulary that you have fine-tuned.
        model = BertForQuestionAnswering.from_pretrained(args.output_dir)
        tokenizer = BertTokenizer.from_pretrained(args.output_dir,
                                                  do_lower_case=args.do_lower_case)
    if args.local_rank == 0:
        torch.distributed.barrier()

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.do_train:
        if args.local_rank in [-1, 0]:
            tb_writer = SummaryWriter(os.path.join(args.output_dir, 'log'))  # tensorboard

        # Prepare data loader
        train_examples = read_squad_examples(
            input_file=args.train_file, is_training=True,
            version_2_with_negative=args.version_2_with_negative,
            num_train_samples=args.num_train_samples)
        cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}_{4}'.format(
            list(filter(None, args.bert_model.split('/'))).pop(),
            str(args.max_seq_length), str(args.doc_stride),
            str(args.max_query_length), str(args.num_train_samples))
        try:
            if args.load_from_cache:
                with open(cached_train_features_file, "rb") as reader:
                    train_features = pickle.load(reader)
            else:
                raise Exception  # fall through to recomputing the features
        except Exception:
            train_features = convert_examples_to_features(
                examples=train_examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride,
                max_query_length=args.max_query_length,
                is_training=True,
                indiv_digits=args.indiv_digits)
            if args.local_rank == -1:
                logger.info("  Saving train features into cached file %s",
                            cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)

        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_start_positions = torch.tensor([f.start_position for f in train_features],
                                           dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in train_features],
                                         dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                   all_start_positions, all_end_positions)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        num_train_optimization_steps = len(train_dataloader) \
            // args.gradient_accumulation_steps * args.num_train_epochs

        # Prepare optimizer.
        # Hack to remove the pooler, which is not used and would otherwise
        # produce None grads that break apex.
        param_optimizer = list(model.named_parameters())
        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
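        # Sketch (illustrative; the helper name is hypothetical): the cache file
        # name built above keys the pickled features on everything that affects
        # feature extraction, so a stale cache is never reused after a
        # tokenization setting changes.
        def _cache_key(train_file, bert_model, max_seq_length, doc_stride,
                       max_query_length, num_train_samples):
            model_name = list(filter(None, bert_model.split('/'))).pop()
            return '{}_{}_{}_{}_{}_{}'.format(train_file, model_name, max_seq_length,
                                              doc_stride, max_query_length,
                                              num_train_samples)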
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
        ]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

    if args.do_predict and args.local_rank == -1:
        eval_examples = read_squad_examples(
            input_file=args.predict_file, is_training=False,
            version_2_with_negative=args.version_2_with_negative)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False,
            indiv_digits=args.indiv_digits)
        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                  all_example_index)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                     batch_size=args.predict_batch_size)

    if args.do_train:
        best_f1 = 0
        wait_step = 0
        global_step = 0
        stop_training = False
        do_eval = False
        model.train()
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            if stop_training:
                break
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration",
                                              disable=args.local_rank not in [-1, 0])):
                if n_gpu == 1:
                    batch = tuple(t.to(device) for t in batch)  # multi-gpu does scattering itself
                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
                segment_ids = segment_ids if args.use_segment_ids else None
                loss = model(input_ids, segment_ids, input_mask,
                             start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                    if global_step % args.eval_period == 0:
                        do_eval = True
                if do_eval:
                    do_eval = False
                    model.eval()
                    scores = predict(args, model, eval_examples, eval_features,
                                     eval_dataloader, dev_data, device)
                    em, f1 = scores['exact_match'], scores['f1']
                    logger.info("f1: %.3f, em: %.3f on epoch=%d" % (f1, em, epoch))
                    print("f1: %.3f, em: %.3f on epoch=%d" % (f1, em, epoch))
                    if best_f1 < f1:
                        logger.info("Saving model with best f1: %.3f -> %.3f on epoch=%d" %
                                    (best_f1, f1, epoch))
                        save_model(args, model, tokenizer)
                        model.to(device)
                        best_f1 = f1
                        wait_step = 0
                        stop_training = False
                    else:
                        wait_step += 1
                        if best_f1 > 86 and wait_step == args.wait_step:
                            stop_training = True
                            break
                    model.train()
            # End of epoch: force one more evaluation when the next epoch starts.
            do_eval = True
    elif args.do_evaluate:
        model.eval()
        scores = predict(args, model, eval_examples, eval_features,
                         eval_dataloader, dev_data, device, True)
        em, f1 = scores['exact_match'], scores['f1']
        print("f1: %.3f, em: %.3f" % (f1, em))
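# Sketch (illustrative; not the project's own scorer, which also normalizes by
# lowercasing and stripping articles/punctuation): the f1 consumed above is the
# standard SQuAD token-overlap F1 between a predicted and a gold answer string.
def _token_f1(prediction, ground_truth):
    from collections import Counter
    pred_tokens = prediction.split()
    gold_tokens = ground_truth.split()
    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)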