Example No. 1
def eval_all():

#    output_model_file = "../../output/best_model"
    output_model_file = MODEL_PATH
    output_config_file = os.path.join('../model_dir/', args.config_name)

    config = BertConfig(output_config_file)
    model = BertForQuestionAnswering(config)
    if not args.no_pai:
        try:
            model.load_state_dict(torch.load(output_model_file))
        except RuntimeError:
            # Checkpoints saved from an nn.DataParallel model prefix every
            # key with "module."; retry with the model wrapped the same way.
            model = nn.DataParallel(model)
            model.load_state_dict(torch.load(output_model_file))
    else:
        try:
            model.load_state_dict(torch.load(output_model_file, map_location='cpu'))
        except RuntimeError:
            model = nn.DataParallel(model)
            model.load_state_dict(torch.load(output_model_file, map_location='cpu'))

    result_file_path = os.path.join('../metric', args.result_file_name)
    evaluate(model, result_file=result_file_path)
    if not args.no_pai:
        print(os.getcwd())
        pai_file_output = "/Container/thsi_yicui/dureader-bert/Dureader/output"
        client.upload(pai_file_output, result_file_path, overwrite=True)
Example No. 2
def eval_all():

    output_model_file = "../model_dir/best_model"
    output_config_file = "../model_dir/bert_config.json"

    config = BertConfig(output_config_file)
    model = BertForQuestionAnswering(config)
    # Load on CPU, since evaluate() runs the model on CPU below.
    model.load_state_dict(
        torch.load(output_model_file, map_location='cpu'))
    evaluate(model.cpu(), result_file="../metric/predicts.json")
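
Examples 1 and 2 handle checkpoints saved from an nn.DataParallel model by retrying with a wrapped copy. A lighter alternative, sketched below on the assumption that the only mismatch is the "module." key prefix, is to normalize the keys before calling load_state_dict (Example No. 7 applies the same idea inline):

from collections import OrderedDict

import torch

def load_checkpoint(model, path, map_location='cpu'):
    """Load a state dict, stripping the 'module.' prefix that
    nn.DataParallel adds to every key when a wrapped model is saved."""
    state_dict = torch.load(path, map_location=map_location)
    state_dict = OrderedDict(
        (k[len('module.'):] if k.startswith('module.') else k, v)
        for k, v in state_dict.items())
    model.load_state_dict(state_dict)
    return model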
Example No. 3
    def __init__(self, model_path, lower_case=True):
        self.model_path = model_path
        self.tokenizer = BertTokenizer.from_pretrained(
            model_path, do_lower_case=lower_case)
        self.model = BertForQuestionAnswering.from_pretrained(model_path)
        #self.model.cuda()
        self.model.eval()
Example No. 4
	def __init__(self, model_path, lower_case = True):
		self.model_path = model_path
		self.tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=lower_case)
		self.model = BertForQuestionAnswering.from_pretrained(model_path)
		self.device = torch.device("cuda")
		self.model.to(self.device)
		self.model.eval()
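
Examples 3 and 4 show only the constructor of a small predictor wrapper; neither the class name nor a prediction method appears here. A hypothetical usage sketch, with QAPredictor standing in for the class and a transformers-style tokenizer API assumed:

import torch

predictor = QAPredictor("bert-base-uncased", lower_case=True)  # hypothetical name
inputs = predictor.tokenizer.encode_plus(
    "Who wrote Hamlet?",
    "Hamlet is a tragedy written by William Shakespeare.",
    return_tensors="pt")
with torch.no_grad():
    outputs = predictor.model(**inputs)  # per-token start/end logits

From the start and end logits one would then pick the highest-scoring valid span, much as the write_predictions helpers in the later examples do.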
Example No. 5
def eval_all():

    #    output_model_file = "../../output/best_model"
    output_model_file = MODEL_PATH
    output_config_file = CONFIG_PATH

    config = BertConfig(output_config_file)
    model = BertForQuestionAnswering(config)
    # Decide by CUDA availability; the freshly built model always starts
    # out on the CPU.
    if torch.cuda.is_available():
        try:
            model.load_state_dict(torch.load(output_model_file))
        except RuntimeError:
            model = nn.DataParallel(model)
            model.load_state_dict(torch.load(output_model_file))
    else:
        try:
            model.load_state_dict(
                torch.load(output_model_file, map_location='cpu'))
        except RuntimeError:
            model = nn.DataParallel(model)
            model.load_state_dict(
                torch.load(output_model_file, map_location='cpu'))

    result_file_path = os.path.join('../metric', args.result_file_name)
    evaluate(model, result_file=result_file_path)
Example No. 6
def initialize_model(args):
    ''' return model, ready to trace '''
    config = BertConfig.from_json_file(args.config_file)
    if config.vocab_size % 8 != 0:
        config.vocab_size += 8 - (config.vocab_size % 8)
    model = BertForQuestionAnswering(config)
    model.enable_apex(False)
    state_dict = torch.load(args.checkpoint, map_location='cpu')["model"]
    model.load_state_dict(state_dict)
    if args.fp16:
        model.half()
    return model
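
The docstring above promises a model "ready to trace". A minimal TorchScript tracing sketch, assuming the forward signature takes (input_ids, token_type_ids, attention_mask) as in the other examples on this page:

import torch

model = initialize_model(args)
model.eval()
# Dummy batch of one all-zeros sequence; shapes must match real inputs.
dummy = torch.zeros(1, 384, dtype=torch.long)
traced = torch.jit.trace(model, (dummy, dummy, dummy))
traced.save("bert_qa_traced.pt")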
Example No. 7
def eval_all():
    output_model_file = "../model_dir/best_model"
    output_config_file = "../model_dir/bert_configbase.json"

    config = BertConfig(output_config_file)
    model = BertForQuestionAnswering(config)
    # Load a checkpoint saved from a multi-GPU (nn.DataParallel) run:
    state_dict = torch.load(output_model_file, map_location='cuda:0')
    # Rebuild the state dict with the "module." prefix stripped from each
    # key; keys without the prefix are kept unchanged.
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        new_state_dict[k.replace('module.', '')] = v
    model.load_state_dict(new_state_dict)
    evaluate(model.cpu(), result_file="../metric/predicts_dev.json")
Example No. 8
    def get_predictor_model(cls):

        config = BertConfig.from_json_file(config_file)
        model = BertForQuestionAnswering(config)
        model.load_state_dict(
            torch.load(MODEL_PATH, map_location='cpu')["model"])
        model.to(device)
        cls.model = model

        return cls.model
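
Note that get_predictor_model rebuilds and reloads the model on every call even though it stores the result on the class. A guarded variant, sketched with the same assumed globals (config_file, MODEL_PATH, device), turns it into a true lazy singleton:

    @classmethod
    def get_predictor_model(cls):
        # Reuse the cached model if a previous call already built it.
        if getattr(cls, "model", None) is None:
            config = BertConfig.from_json_file(config_file)
            model = BertForQuestionAnswering(config)
            model.load_state_dict(
                torch.load(MODEL_PATH, map_location='cpu')["model"])
            model.to(device)
            cls.model = model
        return cls.model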
Example No. 9
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--bert_config_file", default=None, type=str, required=True,
                        help="The config json file corresponding to the pre-trained BERT model. "
                             "This specifies the model architecture.")
    parser.add_argument("--vocab_file", default=None, type=str, required=True,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument("--predict_file", default=None, type=str,
                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument("--init_checkpoint", default=None, type=str,
                        help="Init from checkpoint")
    parser.add_argument("--init_full_model", default=None, type=str,
                        help="Initial full model")
    parser.add_argument("--do_lower_case", default=True, action='store_true',
                        help="Whether to lower case the input text. Should be True for uncased "
                             "models and False for cased models.")
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than this will "
                             "be truncated to this length.")
    parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.")
    parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.0, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% "
                             "of training.")
    parser.add_argument("--save_checkpoints_steps", default=2000, type=int,
                        help="How often to save the model checkpoint")
    parser.add_argument("--iterations_per_loop", default=1000, type=int,
                        help="How many steps to make in each estimator call.")
    parser.add_argument("--n_best_size", default=3, type=int,
                        help="The total number of n-best predictions to generate in the nbest_predictions.json "
                             "output file.")
    parser.add_argument("--max_answer_length", default=100, type=int,
                        help="The maximum length of an answer that can be generated. This is needed because the start "
                             "and end predictions are not conditioned on one another.")
    parser.add_argument("--use_history", default=False, action='store_true',
                        help="Use history features")
    parser.add_argument("--verbose_logging", default=False, action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--optimize_on_cpu',
                        default=False,
                        action='store_true',
                        help="Whether to perform optimization and keep the optimizer averages on CPU")
    parser.add_argument('--fp16',
                        default=False,
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=128,
                        help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
    parser.add_argument('--dry_run',
                        action='store_true', default=False,
                        help='Don\'t load model, just load data')

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        if args.fp16:
            logger.info("16-bits training currently not supported in distributed training")
            args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits trainiing: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError("At least one of `do_train` or `do_predict` must be True.")

    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified.")

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        print("Warning: output directory () already exists and is not empty.")
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = bdu.read_coqa_examples(
            input_file=args.train_file, is_training=True)
        real_train_example_len = sum(len(ex['questions']) for ex in train_examples)
        num_train_steps = int(
            real_train_example_len / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    if not args.dry_run:
        if args.init_full_model is not None:
            model_state_dict = torch.load(args.init_full_model)
            model = BertForQuestionAnswering.from_pretrained(args.bert_model, state_dict=model_state_dict)
        else:
            model = BertForQuestionAnswering(bert_config, use_history=args.use_history)
            if args.init_checkpoint is not None:
                model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
        if args.fp16:
            model.half()
        model.to(device)
        if args.local_rank != -1:
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                              output_device=args.local_rank)
        elif n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # Prepare optimizer
        if args.fp16:
            param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \
                                for n, param in model.named_parameters()]
        elif args.optimize_on_cpu:
            param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \
                                for n, param in model.named_parameters()]
        else:
            param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
            ]
        optimizer = BERTAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps)

    global_step = 0
    if args.do_train:
        train_features = bdu.convert_coqa_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True)
        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", real_train_example_len)
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
        all_f_history = torch.tensor([f.f_history for f in train_features], dtype=torch.uint8)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                   all_start_positions, all_end_positions, all_f_history)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for epoch_i in trange(int(args.num_train_epochs), desc="Epoch"):
            running_loss = []
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                if n_gpu == 1:
                    batch = tuple(t.to(device) for t in batch)  # multi-GPU does the scattering itself
                input_ids, input_mask, segment_ids, start_positions, end_positions, f_history = batch
                # Convert to float here.
                f_history_32 = f_history.float()
                loss = model(input_ids, segment_ids, input_mask,
                             start_positions=start_positions, end_positions=end_positions,
                             f_history=f_history_32,
                             debug=True)
                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if args.fp16 and args.loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * args.loss_scale
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                running_loss.append(loss.item())
                if step % 40 == 0:
                    logger.info("epoch {} step {}: avg loss {}".format(epoch_i, step, sum(running_loss) / len(running_loss)))
                    running_loss = []
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16 or args.optimize_on_cpu:
                        if args.fp16 and args.loss_scale != 1.0:
                            # scale down gradients for fp16 training
                            for param in model.parameters():
                                param.grad.data = param.grad.data / args.loss_scale
                        is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
                        if is_nan:
                            logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
                            args.loss_scale = args.loss_scale / 2
                            model.zero_grad()
                            continue
                        optimizer.step()
                        copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
                    else:
                        optimizer.step()
                    model.zero_grad()
                    global_step += 1
                if global_step % (args.save_checkpoints_steps // args.train_batch_size) == 0:
                    model_name = os.path.join(args.output_dir, 'model-{}.pth'.format(global_step))
                    # Save a trained model
                    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
                    print("Step {}: saving model to {}".format(global_step, model_name))
                    torch.save(model_to_save.state_dict(), model_name)

        model_name = os.path.join(args.output_dir, 'model-{}.pth'.format(global_step))
        print("Step {}: saving model to {}".format(global_step, model_name))
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        torch.save(model_to_save.state_dict(), model_name)

    if args.do_predict:
        eval_examples = bdu.read_coqa_examples(
            input_file=args.predict_file, is_training=False)
        eval_features = bdu.convert_coqa_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)

        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        all_f_history = torch.tensor([f.f_history for f in eval_features], dtype=torch.uint8)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index, all_f_history)
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices, f_history in tqdm(eval_dataloader, desc="Evaluating"):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            f_history = f_history.to(device)
            with torch.no_grad():
                f_history_32 = f_history.float()
                batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask, f_history=f_history_32)
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(RawResult(unique_id=unique_id,
                                             coqa_id=eval_feature.coqa_id,
                                             turn_id=eval_feature.turn_id,
                                             start_logits=start_logits,
                                             end_logits=end_logits))
        output_prediction_file = os.path.join(args.output_dir, "predictions.json")
        output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
        bdu.write_predictions(eval_examples, eval_features, all_results,
                              args.n_best_size, args.max_answer_length,
                              args.do_lower_case, output_prediction_file,
                              output_nbest_file, args.verbose_logging)
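
The fp16 path in this example leans on two helpers that are not shown, set_optimizer_params_grad and copy_optimizer_params_to_model. The underlying idea is the "master weights" pattern: keep an fp32 copy of the parameters (here on CPU), copy de-scaled gradients into it, step the optimizer there, and write the result back into the fp16 model. A minimal sketch with hypothetical names:

import torch

def sync_grads_to_master(model_named_params, master_named_params, loss_scale=1.0):
    """Copy (de-scaled) fp16 gradients into the fp32 master copies."""
    for (_, p_model), (_, p_master) in zip(model_named_params, master_named_params):
        if p_model.grad is None:
            p_master.grad = None
            continue
        if p_master.grad is None:
            p_master.grad = torch.zeros_like(p_master)
        p_master.grad.copy_(p_model.grad.float() / loss_scale)

def copy_master_to_model(model_named_params, master_named_params):
    """Write the updated fp32 master weights back into the live model."""
    with torch.no_grad():
        for (_, p_model), (_, p_master) in zip(model_named_params, master_named_params):
            p_model.copy_(p_master.to(p_model.dtype))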
Example No. 10
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--bert_model", default="bert-base-cased", type=str,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--output_dir", default="/Users/lifuh/Documents/Research/squad2.0/output/", type=str,
                        help="The output directory where the model checkpoints and predictions will be written.")

    ## Other parameters
    parser.add_argument("--train_file", default="/Users/lifuh/Documents/Research/squad2.0/train-v2.0.json",
                        type=str, help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument("--predict_file", default="/Users/lifuh/Documents/Research/squad2.0/dev-v2.0.json", type=str,
                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.")
    parser.add_argument("--do_predict", default=True, action='store_true', help="Whether to run eval on the dev set.")

    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than this will "
                             "be truncated to this length.")
    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% "
                             "of training.")
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the nbest_predictions.json "
                             "output file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is needed because the start "
                             "and end predictions are not conditioned on one another.")
    parser.add_argument("--verbose_logging", default=False, action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--fp16',
                        default=False,
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--null_score_diff_threshold',
                        type=float, default=0.0,
                        help="If null_score - best_non_null is greater than the threshold predict null.")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError("At least one of `do_train` or `do_predict` must be True.")

    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory () already exists and is not empty.")
    os.makedirs(args.output_dir, exist_ok=True)

    # initialize a tokenizer from a pretrained model
    tokenizer = BertTokenizer.from_pretrained(args.bert_model)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = read_squad_examples(
            input_file=args.train_file, is_training=True)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = BertForQuestionAnswering.from_pretrained(args.bert_model,
                cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank))

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove the pooler, which is not used and would otherwise
    # produce None grads that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    global_step = 0
    if args.do_train:
        cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format(
            args.bert_model, str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length))
        train_features = None
        try:
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        except (OSError, EOFError, pickle.UnpicklingError):
            # cached features missing or unreadable; rebuild them below
            train_features = convert_examples_to_features(
                examples=train_examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride,
                max_query_length=args.max_query_length,
                is_training=True)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving train features into cached file %s", cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)
        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
        all_is_impossibles = torch.tensor([int(f.is_impossible) for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                   all_start_positions, all_end_positions, all_is_impossibles)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                if n_gpu == 1:
                    batch = tuple(t.to(device) for t in batch)  # multi-GPU does the scattering itself
                input_ids, input_mask, segment_ids, start_positions, end_positions, _ = batch
                loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(global_step/t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    # Save a trained model
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    torch.save(model_to_save.state_dict(), output_model_file)

    # Load a trained model that you have fine-tuned
    model_state_dict = torch.load(output_model_file)
    model = BertForQuestionAnswering.from_pretrained(args.bert_model, state_dict=model_state_dict)
    model.to(device)

    if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = read_squad_examples(
            input_file=args.predict_file, is_training=False)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)

        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating"):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(RawResult(unique_id=unique_id,
                                             start_logits=start_logits,
                                             end_logits=end_logits))
        output_prediction_file = os.path.join(args.output_dir, "predictions.json")
        output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
        write_predictions(eval_examples, eval_features, all_results,
                          args.n_best_size, args.max_answer_length,
                          args.do_lower_case, output_prediction_file,
                          output_nbest_file, output_null_log_odds_file, args.verbose_logging, True, args.null_score_diff_threshold)
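
The training loop above sets the learning rate by hand through warmup_linear rather than letting BertAdam schedule it. In early pytorch-pretrained-bert releases that schedule ramped up linearly over the warmup fraction and then decayed linearly; a sketch consistent with that behavior:

def warmup_linear(x, warmup=0.002):
    """x = global_step / t_total, the fraction of training completed."""
    if x < warmup:
        return x / warmup  # linear ramp-up during warmup
    return 1.0 - x         # linear decay toward zero afterwards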
Example No. 11
def function_main_bert(q_id, context_ans, question):
    bert_config_file = 'bert_config.json'
    vocab_file = 'vocab.txt'
    #    output_dir='output'
    #    processed_data = 'processed'
    #predict_file='ASQdev.json'
    finetuned_checkpoint = 'ft_model_bert.bin'
    max_seq_length = 500
    do_lower_case = True
    local_rank = -1
    seed = 42
    n_best_size = 20
    predict_batch_size = 8
    max_answer_length = 500
    max_query_length = 64
    doc_stride = 128
    final_answer = ""
    probs = 0.0
    no_cuda = False

    if local_rank == -1 or no_cuda:
        device = torch.device(
            "cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

    bert_config = BertConfig.from_json_file(bert_config_file)
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)

    eval_examples = read_squad_examples(id=q_id,
                                        paragraph=context_ans,
                                        question=question,
                                        tokenizer=tokenizer)
    eval_features = convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length)

    model = BertForQuestionAnswering(bert_config)

    state_dict = torch.load(finetuned_checkpoint, map_location='cpu')
    # Copy into a fresh OrderedDict; keys are kept unchanged here (adjust
    # the key if the checkpoint carries a prefix such as "module.").
    new_state_dict = collections.OrderedDict()
    for key, value in state_dict.items():
        new_state_dict[key] = value
    model.load_state_dict(new_state_dict)
    del state_dict
    del new_state_dict
    model.to(device)

    if local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)
    final_answer, probs, start_logits, end_logits, eval_feature = run_evaluate(
        local_rank, predict_batch_size, n_best_size, max_answer_length,
        do_lower_case, model, eval_features, device, eval_examples, tokenizer)
    return final_answer, probs, start_logits, end_logits, eval_feature
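
function_main_bert bundles the whole load-and-predict pipeline into a single call. A hypothetical invocation (the id, passage, and question are made-up values):

answer, prob, start_logits, end_logits, feature = function_main_bert(
    q_id="q-001",
    context_ans="BERT was introduced by researchers at Google in 2018.",
    question="Who introduced BERT?")
print(answer, prob)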
Example No. 12
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--init_checkpoint",
                        default=None,
                        type=str,
                        required=True,
                        help="The checkpoint file from pretraining")

    ## Other parameters
    parser.add_argument(
        "--verbose_logging",
        action='store_true',
        help=
        "If true, all of the warnings related to data processing will be printed. "
    )
    parser.add_argument("--seed", default=1, type=int)
    parser.add_argument(
        "--question",
        default=
        "Most antibiotics target bacteria and don't affect what class of organisms? ",
        type=str,
        help="question")
    parser.add_argument(
        "--context",
        default=
        "Within the genitourinary and gastrointestinal tracts, commensal flora serve as biological barriers by competing with pathogenic bacteria for food and space and, in some cases, by changing the conditions in their environment, such as pH or available iron. This reduces the probability that pathogens will reach sufficient numbers to cause illness. However, since most antibiotics non-specifically target bacteria and do not affect fungi, oral antibiotics can lead to an overgrowth of fungi and cause conditions such as a vaginal candidiasis (a yeast infection). There is good evidence that re-introduction of probiotic flora, such as pure cultures of the lactobacilli normally found in unpasteurized yogurt, helps restore a healthy balance of microbial populations in intestinal infections in children and encouraging preliminary data in studies on bacterial gastroenteritis, inflammatory bowel diseases, urinary tract infection and post-surgical infections. ",
        type=str,
        help="context")
    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.")
    parser.add_argument(
        "--n_best_size",
        default=1,
        type=int,
        help="The total number of n-best predictions to generate. ")
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument(
        '--version_2_with_negative',
        action='store_true',
        help='If true, then the model can reply with "unknown". ')
    parser.add_argument(
        '--null_score_diff_threshold',
        type=float,
        default=-11.0,
        help=
        "If null_score - best_non_null is greater than the threshold predict 'unknown'. "
    )
    parser.add_argument(
        '--vocab_file',
        type=str,
        default=None,
        required=True,
        help="Vocabulary mapping/file BERT was pretrainined on")
    parser.add_argument("--config_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The BERT model config")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="use mixed-precision")
    parser.add_argument("--local_rank",
                        default=-1,
                        help="ordinal of the GPU to use")

    args = parser.parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)

    tokenizer = BertTokenizer(args.vocab_file,
                              do_lower_case=args.do_lower_case,
                              max_len=512)  # for bert large

    # Prepare model
    config = BertConfig.from_json_file(args.config_file)

    # Padding for divisibility by 8
    if config.vocab_size % 8 != 0:
        config.vocab_size += 8 - (config.vocab_size % 8)

    # initialize model
    model = BertForQuestionAnswering(config)
    model.load_state_dict(
        torch.load(args.init_checkpoint, map_location='cpu')["model"])
    model.to(device)
    if args.fp16:
        model.half()
    model.eval()

    print("question: ", args.question)
    print("context: ", args.context)
    print()

    # preprocessing
    doc_tokens = args.context.split()
    query_tokens = tokenizer.tokenize(args.question)
    feature = preprocess_tokenized_text(doc_tokens,
                                        query_tokens,
                                        tokenizer,
                                        max_seq_length=args.max_seq_length,
                                        max_query_length=args.max_query_length)

    tensors_for_inference, tokens_for_postprocessing = feature

    input_ids = torch.tensor(tensors_for_inference.input_ids,
                             dtype=torch.long).unsqueeze(0)
    segment_ids = torch.tensor(tensors_for_inference.segment_ids,
                               dtype=torch.long).unsqueeze(0)
    input_mask = torch.tensor(tensors_for_inference.input_mask,
                              dtype=torch.long).unsqueeze(0)

    # load tensors to device
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)

    # run prediction
    with torch.no_grad():
        start_logits, end_logits = model(input_ids, segment_ids, input_mask)

    # post-processing
    start_logits = start_logits[0].detach().cpu().tolist()
    end_logits = end_logits[0].detach().cpu().tolist()
    answer, answers = get_answer(doc_tokens, tokens_for_postprocessing,
                                 start_logits, end_logits, args)

    # print result
    print()
    print(answer)
    print()
    print(json.dumps(answers, indent=4))
Example No. 13
def main():
    parser = argparse.ArgumentParser()
    BERT_DIR = "uncased_L-12_H-768_A-12/"
    ## Required parameters
    parser.add_argument("--bert_config_file", default=BERT_DIR+"bert_config.json", \
                        type=str, help="The config json file corresponding to the pre-trained BERT model. "
                             "This specifies the model architecture.")
    parser.add_argument("--vocab_file", default=BERT_DIR+"vocab.txt", type=str, \
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--output_dir", default="out", type=str, \
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--load", default=False, action='store_true')
    parser.add_argument("--train_file", type=str, \
                        help="SQuAD json for training. E.g., train-v1.1.json", \
                        default="/home/sewon/data/squad/train-v1.1.json")
    parser.add_argument("--predict_file", type=str,
                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json", \
                        default="/home/sewon/data/squad/dev-v1.1.json")
    parser.add_argument("--init_checkpoint", type=str,
                        help="Initial checkpoint (usually from a pre-trained BERT model).", \
                        default=BERT_DIR+"pytorch_model.bin")
    parser.add_argument(
        "--do_lower_case",
        default=True,
        action='store_true',
        help="Whether to lower case the input text. Should be True for uncased "
        "models and False for cased models.")
    parser.add_argument(
        "--max_seq_length",
        default=300,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=39,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size",
                        default=300,
                        type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=1000.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% "
        "of training.")
    parser.add_argument("--save_checkpoints_steps",
                        default=1000,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--iterations_per_loop",
                        default=1000,
                        type=int,
                        help="How many steps to make in each estimator call.")
    parser.add_argument(
        "--n_best_size",
        default=3,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json "
        "output file.")
    parser.add_argument(
        "--verbose_logging",
        default=False,
        action='store_true',
        help=
        "If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        "--accumulate_gradients",
        type=int,
        default=1,
        help=
        "Number of steps to accumulate gradient on (divide the batch_size and accumulate)"
    )
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument('--eval_period', type=int, default=500)
    parser.add_argument('--max_n_answers', type=int, default=20)
    parser.add_argument('--n_paragraphs', type=str, default='40')
    parser.add_argument('--verbose', action="store_true", default=False)
    parser.add_argument('--wait_step', type=int, default=12)

    # Learning method variation
    parser.add_argument('--loss_type', type=str, default="mml")
    parser.add_argument('--tau', type=float, default=12000.0)
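    # Assumption from the names: "mml" presumably selects a maximum marginal
    # likelihood loss over multiple annotated answer spans, and tau looks like
    # a step-count/temperature hyperparameter consumed by that loss; the loss
    # itself lives inside BertForQuestionAnswering, not in this file.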

    # For evaluation
    parser.add_argument('--prefix', type=str, default="")
    parser.add_argument('--debug', action="store_true", default=False)

    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        print("Output directory ({}) already exists and is not empty.".format(
            args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir, exist_ok=True)

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO,
        handlers=[
            logging.FileHandler(os.path.join(args.output_dir, "log.txt")),
            logging.StreamHandler()
        ])
    logger = logging.getLogger(__name__)
    logger.info(args)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError(
            "Invalid accumulate_gradients parameter: {}, should be >= 1".
            format(args.accumulate_gradients))

    args.train_batch_size = int(args.train_batch_size /
                                args.accumulate_gradients)
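    # After this division, each forward/backward pass sees train_batch_size /
    # accumulate_gradients examples, so the effective batch per optimizer step
    # is still the originally requested --train_batch_size.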

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")

    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
        if not args.predict_file:
            raise ValueError(
                "If `do_train` is True, then `predict_file` must be specified."
            )

    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified."
            )

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.do_train and args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    model = BertForQuestionAnswering(bert_config,
                                     device,
                                     4,
                                     loss_type=args.loss_type,
                                     tau=args.tau)
    metric_name = "EM"

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    train_split = args.train_file is not None and ',' in args.train_file
    if train_split:
        n_train_files = len(args.train_file.split(','))

    eval_dataloader, eval_examples, eval_features, _ = get_dataloader(
        logger=logger,
        args=args,
        input_file=args.predict_file,
        is_training=False,
        batch_size=args.predict_batch_size,
        num_epochs=1,
        tokenizer=tokenizer)

    if args.do_train:
        train_file = args.train_file
        if train_split:
            train_file = args.train_file.split(',')[0]
        train_dataloader, _, _, num_train_steps = get_dataloader(
                logger=logger, args=args, \
                input_file=train_file, \
                is_training=True,
                batch_size=args.train_batch_size,
                num_epochs=args.num_train_epochs,
                tokenizer=tokenizer)

    if args.init_checkpoint is not None:
        logger.info("Loading from {}".format(args.init_checkpoint))
        state_dict = torch.load(args.init_checkpoint, map_location='cpu')
        if args.do_train and args.init_checkpoint.endswith(
                'pytorch_model.bin'):
            model.bert.load_state_dict(state_dict)
        else:
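            # Checkpoints saved from a (Distributed)DataParallel-wrapped model
            # prefix every key with 'module.'; strip it so the keys match the
            # bare model.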
            filter = lambda x: x[7:] if x.startswith('module.') else x
            state_dict = {filter(k): v for (k, v) in state_dict.items()}
            model.load_state_dict(state_dict)
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.do_train:
        # Note: match by substring, not exact name. Parameter names are full
        # dotted paths (e.g. 'bert.encoder.layer.0.output.dense.bias'), so an
        # exact `n not in no_decay` test would never exclude anything.
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate': 0.01
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate': 0.0
        }]

        optimizer = BERTAdam(optimizer_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps)
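        # BERTAdam warms the learning rate up linearly for the first
        # warmup_proportion * t_total steps and decays it afterwards, so no
        # external scheduler is needed.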

        global_step = 0

        best_f1 = (-1, -1)
        wait_step = 0
        model.train()
        stop_training = False
        train_losses = []

        for epoch in range(int(args.num_train_epochs)):
            if epoch > 0 and train_split:
                train_file = args.train_file.split(',')[epoch % n_train_files]
                train_dataloader = get_dataloader(
                        logger=logger, args=args, \
                        input_file=train_file, \
                        is_training=True,
                        batch_size=args.train_batch_size,
                        num_epochs=args.num_train_epochs,
                        tokenizer=tokenizer)[0]

            for step, batch in enumerate(train_dataloader):
                global_step += 1
                batch = [t.to(device) for t in batch]
                loss = model(batch, global_step)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                train_losses.append(loss.detach().cpu())
                loss.backward()
                if global_step % args.gradient_accumulation_steps == 0:
                    optimizer.step()  # We have accumulated enough gradients
                    model.zero_grad()
                if global_step % args.eval_period == 0:
                    model.eval()
                    f1 =  predict(logger, args, model, eval_dataloader, eval_examples, eval_features, \
                                  device, write_prediction=False)
                    logger.info(
                        "Step %d Train loss %.2f EM %.2f F1 %.2f on epoch=%d" %
                        (global_step, np.mean(train_losses), f1[0] * 100,
                         f1[1] * 100, epoch))
                    train_losses = []
                    if best_f1 < f1:
                        logger.info("Saving model with best %s: %.2f (F1 %.2f) -> %.2f (F1 %.2f) on epoch=%d" % \
                                (metric_name, best_f1[0]*100, best_f1[1]*100, f1[0]*100, f1[1]*100, epoch))
                        model_state_dict = {
                            k: v.cpu()
                            for (k, v) in model.state_dict().items()
                        }
                        torch.save(
                            model_state_dict,
                            os.path.join(args.output_dir, "best-model.pt"))
                        model = model.to(device)
                        best_f1 = f1
                        wait_step = 0
                        stop_training = False
                    else:
                        wait_step += 1
                        if wait_step == args.wait_step:
                            stop_training = True
                    model.train()
            if stop_training:
                break

        logger.info("Training finished!")

    elif args.do_predict:
        if isinstance(model, list):
            model = [m.eval() for m in model]
        else:
            model.eval()
        f1 = predict(logger,
                     args,
                     model,
                     eval_dataloader,
                     eval_examples,
                     eval_features,
                     device,
                     varying_n_paragraphs=len(args.n_paragraphs) > 1)
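
# A hedged usage sketch for the script above (the file name run_decomprc.py is
# an assumption; the flags come from the argument parser above and from the
# truncated required arguments such as --bert_config_file and --vocab_file):
#
#   python run_decomprc.py --do_train \
#       --bert_config_file bert_config.json --vocab_file vocab.txt \
#       --output_dir out --train_file train.json --predict_file dev.json \
#       --init_checkpoint pytorch_model.bin --loss_type mml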
Exemplo n.º 14
0
def main():
    parser = argparse.ArgumentParser()
    # # Required parameters
    parser.add_argument('--task',
                        default='multi',
                        type=str,
                        help='Task affecting load data and vectorize feature')
    parser.add_argument(
        '--loss_type',
        default='double',
        type=str,
        help='Select loss double or single, only for multi task'
    )  # only effective for the multi task
    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help=
        "Bert pre-trained model selected in the list: bert-base-uncased,bert-large-uncased, "
        "bert-base-cased, bert-large-cased, bert-base-multilingual-uncased,bert-base-chinese,"
        "bert-base-multilingual-cased.")  # 选择预训练模型参数
    parser.add_argument("--debug",
                        default=False,
                        help="Whether run on small dataset")  # 正常情况下都应该选择false
    parser.add_argument(
        "--output_dir",
        default="./SQuAD/output/",
        type=str,
        help=
        "The output directory where the model checkpoints and predictions will be written."
    )

    # # Other parameters
    parser.add_argument("--train_file",
                        default="./SQuAD/version/train.json",
                        type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument(
        "--predict_file",
        default="./SQuAD/version/prediction.json",
        type=str,
        help=
        "SQuAD json for predictio ns. E.g., dev-v1.1.json or test-v1.1.json")

    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will be "
        "truncated to this length.")

    # # Control parameters
    parser.add_argument("--do_train",
                        default=True,
                        help="Whether to run training.")
    parser.add_argument("--do_predict",
                        default=True,
                        help="Whether to run eval on the dev set.")

    parser.add_argument("--train_batch_size",
                        default=18,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size",
                        default=18,
                        type=int,
                        help="Total batch size for predictions.")

    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json file."
    )
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated.This is needed because the start "
        "and end predictions are not conditioned on one another.")
    parser.add_argument(
        "--verbose_logging",
        default=False,
        help=
        "If true, all of the warnings related to data processing will be printed.A number of "
        "warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda",
                        default=False,
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        "--do_lower_case",
        default=True,
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--fp16',
        default=False,
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.Positive power of 2: static loss scaling value.\n"
    )
    parser.add_argument(
        '--version_2_with_negative',
        default=False,
        help=
        'If true, the SQuAD examples contain some that do not have an answer.')
    parser.add_argument(
        '--null_score_diff_threshold',
        type=float,
        default=0.0,
        help=
        "If null_score - best_non_null is greater than the threshold predict null."
    )
    args = parser.parse_args()

    # The if branch is the single-machine setup; the else branch is the distributed one.
    # Without a cluster, we train on a single machine with multiple GPUs.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    # Boilerplate: configure the logging output format
    logging.basicConfig(
        format='%(asctime)s-%(levelname)s-%(name)s-%(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.info(
        "device:{}, n_gpu:{}, distributed training:{}, 16-bits training:{}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))
    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # The following lines configure the run: batch size and random seeds
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
    random.seed(args.seed)  # set the Python random seed
    np.random.seed(args.seed)  # set the NumPy random seed
    torch.manual_seed(args.seed)  # seed the CPU RNG so results are reproducible
    if n_gpu > 0:  # with multiple GPUs, torch.cuda.manual_seed_all() seeds them all
        torch.cuda.manual_seed_all(args.seed)

    # Basic sanity checks on the requested run mode
    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")
    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified."
            )

    # Create output_dir if it does not exist (the emptiness check below is
    # commented out, since it would force you to start from an empty folder)
    # if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
    #     raise ValueError("Output directory () already exists and is not empty.")
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Initialize the tokenizer from the pre-trained model's vocabulary
    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    # Two ways to read the data: the plain SQuAD reader or the multi-task reader
    if args.task == 'squad':
        read_examples = read_squad_examples
    elif args.task == 'multi':
        read_examples = read_multi_examples

    # Load the training examples and compute the number of optimization steps
    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = read_examples(
            input_file=args.train_file,
            is_training=True,
            version_2_with_negative=args.version_2_with_negative)
        if args.debug:
            train_examples = train_examples[:100]
        num_train_optimization_steps = \
            int(len(train_examples)/args.train_batch_size/args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # Prepare the model
    model = BertForQuestionAnswering.from_pretrained(
        args.bert_model,
        cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE),
                               'distributed_{}'.format(args.local_rank)))

    # model = torch.nn.DataParallel(model).cuda()
    # Decide whether to use float16 precision
    if args.fp16:
        # model.half().cuda()
        model.half()
    # Move the model to the appropriate CPU or GPU
    model.to(device)

    # Configure the optimizer
    if args.do_train:
        param_optimizer = list(model.named_parameters())

        # hack to remove pooler, which is not used
        # thus it produces None grads that break apex
        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
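        # Substring match: any parameter whose name contains 'bias',
        # 'LayerNorm.bias' or 'LayerNorm.weight' is excluded from weight
        # decay, following the original BERT training recipe.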

        if args.fp16:
            try:
                # from apex.optimizers import FP16_Optimizer
                from apex.fp16_utils import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=True)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps)
        else:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)
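        # BertAdam applies the warmup schedule internally; in the fp16 branch
        # above, the schedule is instead applied manually in the training loop
        # (see the lr_this_step computation below).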

    # Train the model
    global_step = 0
    if args.do_train:
        # Extract features from the training corpus
        train_features = convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True)

        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_start_positions = torch.tensor(
            [f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor(
            [f.end_position for f in train_features], dtype=torch.long)
        all_start_vector = torch.tensor(
            [f.start_vector for f in train_features], dtype=torch.float)
        all_end_vector = torch.tensor([f.end_vector for f in train_features],
                                      dtype=torch.float)
        all_content_vector = torch.tensor(
            [f.content_vector for f in train_features], dtype=torch.float)

        # # Alternative construction of all_start_positions and all_end_positions
        # all1_start_positions = []
        # for i in range(len(train_features)):
        #     for j in range(len(train_features[i].start_position)):
        #         all1_start_positions.append(train_features[i].start_position[j])
        # all_start_positions = torch.tensor([k for k in all1_start_positions], dtype=torch.long)
        # all1_end_positions = []
        # for i in range(len(train_features)):
        #     for j in range(len(train_features[i].end_position)):
        #         all1_end_positions.append(train_features[i].end_position[j])
        # all_end_positions = torch.tensor([k for k in all1_end_positions], dtype=torch.long)
        # ####################################################################

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_start_positions,
                                   all_end_positions, all_start_vector,
                                   all_end_vector, all_content_vector)
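        # The tensor order in this TensorDataset must match the unpacking of
        # `batch` inside the training loop below.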
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)  # random sampler
        else:
            train_sampler = DistributedSampler(train_data)

        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for ep in trange(int(args.num_train_epochs), desc="Epoch"):
            # Re-wrap the model with DataParallel every epoch so that training uses all GPUs
            model = torch.nn.DataParallel(model).cuda()
            for step, batch in enumerate(
                    tqdm(train_dataloader,
                         desc="Iteration",
                         disable=args.local_rank not in [-1, 0])):

                if n_gpu == 1:
                    batch = tuple(
                        t.to(device)
                        for t in batch)  # multi-gpu does scattering it-self
                input_ids, input_mask, segment_ids, start_positions, end_positions, start_vector, end_vector, content_vector = batch

                loss = model(input_ids, segment_ids, input_mask,
                             start_positions, end_positions, start_vector,
                             end_vector, content_vector, args.loss_type)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                    print("loss率为:{}".format(loss))
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used and handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

            print("\n")
            print(ep)
            output_model_file = os.path.join(args.output_dir,
                                             str(ep) + WEIGHTS_NAME)
            output_config_file = os.path.join(args.output_dir,
                                              str(ep) + CONFIG_NAME)

            torch.save(model.state_dict(), output_model_file)
            if isinstance(model, torch.nn.DataParallel):
                model = model.module
            model.config.to_json_file(output_config_file)
            tokenizer.save_vocabulary(args.output_dir)

    # Reload the fine-tuned model so it can be used for prediction
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = BertForQuestionAnswering.from_pretrained(args.output_dir)
        tokenizer = BertTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
    else:
        model = BertForQuestionAnswering.from_pretrained(args.output_dir)
        tokenizer = BertTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)

    # Move the model onto the device again
    model.to(device)

    # Run prediction (generates the prediction files)
    if args.do_predict and (args.local_rank == -1
                            or torch.distributed.get_rank() == 0):
        eval_examples = \
            read_examples(input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative)
        if args.debug:
            eval_examples = eval_examples[:100]
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)

        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0),
                                         dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_example_index)

        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices in tqdm(
                eval_dataloader,
                desc="Evaluating",
                disable=args.local_rank not in [-1, 0]):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                batch_start_logits, batch_end_logits = model(
                    input_ids, segment_ids, input_mask)
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(
                    RawResult(unique_id=unique_id,
                              start_logits=start_logits,
                              end_logits=end_logits))

        middle_result = os.path.join(args.output_dir, 'middle_result.pkl')
        pickle.dump([eval_examples, eval_features, all_results],
                    open(middle_result, 'wb'))

        output_prediction_file = os.path.join(args.output_dir,
                                              "predictions.json")
        output_nbest_file = os.path.join(args.output_dir,
                                         "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(args.output_dir,
                                                 "null_odds.json")

        if (args.loss_type == 'double'):
            write_predictions_couple_labeling(
                eval_examples, eval_features, all_results, args.n_best_size,
                args.max_answer_length, args.do_lower_case,
                output_prediction_file, output_nbest_file,
                output_null_log_odds_file, args.verbose_logging,
                args.version_2_with_negative, args.null_score_diff_threshold)
        elif (args.loss_type == 'single'):
            write_predictions_single_labeling(
                eval_examples, eval_features, all_results, args.n_best_size,
                args.max_answer_length, args.do_lower_case,
                output_prediction_file, output_nbest_file,
                output_null_log_odds_file, args.verbose_logging,
                args.version_2_with_negative, args.null_score_diff_threshold)
        elif (args.loss_type == 'origin') or (args.task == 'multi'
                                              and args.loss_type == 'squad'):
            write_predictions(eval_examples, eval_features, all_results,
                              args.n_best_size, args.max_answer_length,
                              args.do_lower_case, output_prediction_file,
                              output_nbest_file, output_null_log_odds_file,
                              args.verbose_logging,
                              args.version_2_with_negative,
                              args.null_score_diff_threshold)
        else:
            raise ValueError('{} dataset and {} loss is not supported'.format(
                args.task, args.loss_type))
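
# A hedged usage sketch for the script above (the file name run_multi_qa.py is
# an assumption; all flags and defaults come from the parser above):
#
#   python run_multi_qa.py --task multi --loss_type double \
#       --train_file ./SQuAD/version/train.json \
#       --predict_file ./SQuAD/version/prediction.json \
#       --output_dir ./SQuAD/output/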
Exemplo n.º 15
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir",
                        default='./data',
                        type=str,
                        help="Data dir containing model dir, etc.")
    parser.add_argument(
        "--tfidf_file",
        default=
        'wiki_first_paras-tfidf-ngram=2-hash=16777216-tokenizer=spacy.npz',
        type=str,
        help="td-idf .npz file placed inside the data_dir.")
    parser.add_argument(
        "--wiki_jsonl",
        default='wiki_firstpara_sents.jsonl',
        type=str,
        help="Processed wikipedia .jsonl placed inside data_dir.")
    parser.add_argument("--qdmr_jsonl",
                        default='./data/qdmr_data/qdmrs_hotpotqa_gold.jsonl',
                        type=str,
                        help="Path to processed qdmr .jsonl file.")
    parser.add_argument("--predict_batch_size",
                        default=128,
                        type=int,
                        help="Batch size for predictions in eval mode.")
    parser.add_argument("--tasks",
                        default='break_rc,ques_ir,break_ir',
                        type=str,
                        help="The IR, RC tasks to perform.")
    parser.add_argument("--suffix",
                        default='gold',
                        type=str,
                        help="Suffix to add to the output files.")
    parser.add_argument("--debug",
                        action='store_true',
                        help="If on, only keep a small number of qdmrs.")
    parser.add_argument(
        "--input_results_file",
        default='',
        type=str,
        help="File containing results of the task to be reused.")
    args = parser.parse_args()

    # we use an already finetuned single-hop RC ensemble by
    # Min et al (https://github.com/shmsw25/DecompRC/tree/master/DecompRC)
    rc_args = {
        'bert_config_file':
        'data/onehop_rc/uncased_L-12_H-768_A-12/bert_config.json',
        'do_lower_case': True,
        'doc_stride': 128,
        'init_checkpoint':
        f'{args.data_dir}/onehop_rc/uncased_L-12_H-768_A-12/model1.pt,{args.data_dir}/onehop_rc/uncased_L-12_H-768_A-12/model2.pt,{args.data_dir}/onehop_rc/uncased_L-12_H-768_A-12/model3.pt',
        'iterations_per_loop': 1000,
        'local_rank': -1,
        'max_answer_length': 30,
        'max_n_answers': 5,
        'max_query_length': 64,
        'max_seq_length': 300,
        'model': 'qa',
        'n_best_size': 4,
        'no_cuda': False,
        'output_dropout_prob': 0,
        'pooling': 'max',
        'seed': 42,
        'verbose_logging': False,
        'vocab_file': 'data/onehop_rc/uncased_L-12_H-768_A-12/vocab.txt',
        'with_key': False
    }
    rc_args = SimpleNamespace(**rc_args)
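    # SimpleNamespace turns the dict into an object with attribute access
    # (rc_args.seed, rc_args.vocab_file, ...), mimicking the argparse namespace
    # the single-hop RC code expects.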

    # load hotpotQA
    logging.info(f'loading datasets from {args.data_dir}/hotpot_data/ ...')
    data = read_file(f'{args.data_dir}/hotpot_data/hotpot_train_v1.json')
    #data += read_file(f'{args.data_dir}/hotpot_data/hotpot_dev_distractor_v1.json')
    data += read_file(
        f'{args.data_dir}/hotpot_data/hotpot_dev_fullwiki_v1.json')
    for d in data:
        d['gold_titles'] = {x[0] for x in d['supporting_facts']}
    hotpot = {d['_id']: d for d in data}

    # load qdmr data processed using prepare_break.jsonl
    qdmr_path = args.qdmr_jsonl
    logging.info(f'loading processed qdmr data from {qdmr_path} ...')
    qdmrs = read_file(qdmr_path)

    # load spacy
    nlp = en_core_web_sm.load()  # spacy
    tokenize = lambda s: [x.text for x in nlp.tokenizer(s)]

    # load IR
    logging.info('loading IR ...')
    ranker = IR(tfidf_path=f'{args.data_dir}/{args.tfidf_file}')

    # load wikipedia
    wiki_path = f'{args.data_dir}/{args.wiki_jsonl}'
    logging.info(f'loading wikipedia from {wiki_path} ...')
    with jsonlines.open(wiki_path, 'r') as reader:
        wiki = {d['title']: d['para'] for d in tqdm(reader.iter())}

    # prepare and load the RC for inference
    device = torch.device("cuda")
    n_gpu = torch.cuda.device_count()
    logging.info(f'{n_gpu} cuda devices available.')
    logging.info('loading 1-hop RC ensemble ...')

    random.seed(rc_args.seed)
    np.random.seed(rc_args.seed)
    torch.manual_seed(rc_args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(rc_args.seed)

    tokenizer = tokenization.FullTokenizer(vocab_file=rc_args.vocab_file,
                                           do_lower_case=rc_args.do_lower_case)

    bert_config = BertConfig.from_json_file(rc_args.bert_config_file)
    model = BertForQuestionAnswering(bert_config, 4)

    if rc_args.init_checkpoint is not None:
        model = [model]
        for i, checkpoint in enumerate(rc_args.init_checkpoint.split(',')):
            if i > 0:
                model.append(BertForQuestionAnswering(bert_config, 4))
            logging.info(f"Loading from {checkpoint}")
            state_dict = torch.load(checkpoint, map_location='cpu')
            filter = lambda x: x[7:] if x.startswith('module.') else x
            state_dict = {filter(k): v for (k, v) in state_dict.items()}
            model[-1].load_state_dict(state_dict)
            model[-1].to(device)

    if isinstance(model, list):
        model = [m.eval() for m in model]
    else:
        model.eval()

    # 1hop RC wrapper
    simpQA = partial(_simpQA,
                     args=args,
                     rc_args=rc_args,
                     tokenizer=tokenizer,
                     model=model,
                     device=device)
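    # functools.partial freezes the shared arguments, so later code can call
    # simpQA(squad_articles) with only the SQuAD-formatted inputs; element [1]
    # of the result is used below as the per-question n-best outputs.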

    if args.input_results_file:
        logging.info(
            f'Reading the supplied results file {args.input_results_file} ...')
        all_results = read_file(args.input_results_file)
    else:
        all_results = {}
        for i_d, d in enumerate(qdmrs):
            [_, data_split, _id] = d['question_id'].split('_')
            #assert data_split == 'dev'
            assert _id in hotpot
            all_results[_id] = d
            assert d['steps'] and d['op_types'], \
                'QDMRs must be pre-processed and non-empty.'

    if args.debug:
        all_results = {
            key: val
            for key, val in all_results.items() if random.random() < 0.01
        }
        logging.info(f'\nTruncating to only {len(all_results)} samples!!!\n')

    tasks = [x.strip() for x in args.tasks.split(',')]

    if 'break_rc' in tasks:
        logging.info(f'Running BREAK IR+RC on {len(all_results)} samples ...')
        max_n_parts = max([len(v['steps']) for v in all_results.values()])

        for i_p in range(max_n_parts):
            logging.info(f'Processing qdmr step #{i_p} ...')
            # process the i_p'th part of all samples
            articles = []  # hotpot articles corresponding to queries to the RC
            for _id, v in tqdm(all_results.items()):
                parts = v['steps']
                if i_p >= (len(parts) - int(
                        v['op_types'][-1] in ['COMPARISON', 'INTERSECTION'])):
                    # the last discrete comparison, intersection step is processed later
                    continue
                rc_outputs = v.get('rc_outputs', {})
                nbest_outputs = v.get('nbest_outputs', {})
                l_top = v.get('titles', [])
                part = parts[i_p]
                # replace placeholders with the respective RC outputs of previous parts
                for j in range(i_p):
                    ph = '#' + str(j + 1)  # 1...i_p
                    if ph in part:
                        part = part.replace(ph, rc_outputs[ph])
                # get top 10 titles from IR
                top_titles = ranker.closest_docs(part, k=10)[0]
                l_top.append(top_titles)
                v.update({
                    'titles': l_top,
                    'rc_outputs': rc_outputs,
                    'nbest_outputs': nbest_outputs
                })
                context = []
                # use all paragraphs retrieved so far for this sample (not just the current 10) and sort them by similarity to the current part
                set_l_top = set(sum(l_top, []))
                scores = ranker.rank_titles(part, set_l_top)
                sorted_l_top = sorted(scores.keys(),
                                      key=lambda title: scores[title],
                                      reverse=True)
                for title in sorted_l_top:
                    context.append([title, wiki[title]['sents']
                                    ])  # get para from wiki
                if not sorted_l_top:
                    # rare case of no valid titles
                    context = [['Random Title 1', 'Random Text 1'],
                               ['Random Title 2', 'Random Text 2']]
                d, article = hotpot[_id], {}
                article['question'], article[
                    'context'] = part + ' ?', context  # appending '?' to part query
                article.update({
                    k: d[k]
                    for k in ['_id', 'type', 'answer']
                })  # '_id', 'type', 'context', 'question', 'answer'
                articles.append(article)
            if not articles:
                continue
            # querying the 1-hop RC
            all_nbest_out = simpQA([to_squad(article)
                                    for article in articles])[1]
            for _id, v in all_results.items():
                if _id not in all_nbest_out:
                    continue
                nbest_i_p = all_nbest_out[_id]
                op = v['op_types'][i_p]
                nbest_id = v['nbest_outputs']
                # handle filter steps
                if 'FILTER' in op:
                    ref_ph = op.split('_')[1]
                    nbest_ref = Counter(nbest_id[ref_ph])
                    # accumulating the logits of nbest of the part and the ref part
                    nbest_ref.update(nbest_i_p)
                    nbest_i_p = dict(nbest_ref)
                rc_out = max(nbest_i_p.keys(), key=lambda key: nbest_i_p[key])
                v['rc_outputs'][f'#{i_p+1}'] = rc_out
                v['nbest_outputs'][f'#{i_p+1}'] = nbest_i_p

        # discrete processing of the last comparison step
        logging.info(
            f'Discrete processing of the last comparison/intersection steps ...'
        )
        for _id, v in all_results.items():
            if v['op_types'][-1] not in ['COMPARISON', 'INTERSECTION']:
                continue
            question, answer, gold_titles = hotpot[_id]['question'], hotpot[
                _id]['answer'], hotpot[_id]['gold_titles']
            parts, rc_outputs = v['steps'], v['rc_outputs']
            if v['op_types'][-1] == 'COMPARISON':
                ents, rc_outs = [], []
                for i_p, part in enumerate(parts[:-1]):
                    # get named entity in the part
                    part_without_phs = part
                    for x in ['#' + str(j) for j in range(1, 8)]:
                        part_without_phs = part_without_phs.replace(x, '')
                    ent = get_ent(part_without_phs, nlp, only_longest=True)
                    ent = '' if ent is None else ent
                    ents.append(ent)
                    rc_outs.append(
                        normalize_answer(rc_outputs['#' + str(i_p + 1)]))
                if 'same as' in parts[-1]:
                    pred_ans = 'yes' if rc_outs[-2] == rc_outs[-1] else 'no'
                else:
                    pred_ans = ents[compare(parts[-1], rc_outs[-2],
                                            rc_outs[-1])]
                    v['rc_outputs'][f'#{len(parts)}'] = pred_ans
            elif v['op_types'][-1] == 'INTERSECTION':
                part = parts[-1]
                phs = [
                    '#' + str(j) for j in range(1, 10) if '#' + str(j) in part
                ]
                phs = list(set(phs))
                # accumulate logits of the parts and take the argmax
                nbest_id = v['nbest_outputs']
                nbest = Counter(nbest_id[phs[0]])
                # accumulate logits
                for ph in phs[1:]:
                    if ph in nbest_id:
                        nbest.update(nbest_id[ph])
                nbest = dict(nbest)
                pred_ans = max(nbest.keys(), key=lambda key: nbest[key])
                v['rc_outputs'][f'#{len(parts)}'] = pred_ans
                v['nbest_outputs'][f'#{len(parts)}'] = nbest

        for v in all_results.values():
            assert len(v['rc_outputs']) == len(v['steps'])

    if 'break_ir' in tasks:
        # this can only be run after break_rc task & requires all_results dict
        logging.info(
            f'Forming context using the titles used by Break RC for {len(all_results)} samples ...'
        )
        # prepare hotpot-like data for Bert RC
        new_hotpot = []
        for _id, v in tqdm(all_results.items()):
            d = hotpot[_id]
            d_new = deepcopy(d)
            used_titles = sum(v['titles'], [])
            # sort wrt similarity to ques
            scores = ranker.rank_titles(d['question'], set(used_titles))
            titles = sorted(scores.keys(),
                            key=lambda title: scores[title],
                            reverse=True)
            context = []
            for title in titles:
                context.append([title, wiki[title]['sents']])
            d_new['context'] = context
            if 'gold_titles' in d_new:
                del d_new['gold_titles']
            new_hotpot.append(d_new)
        out_break_ir_file = f'{args.data_dir}/hotpot_data/hotpot_after_break_ir_{args.suffix}.json'
        logging.info(
            f'Writing hotpot version with the Break IR context to {out_break_ir_file} ...'
        )
        write_file(new_hotpot, out_break_ir_file)

        # store the retrieved titles
        for d in new_hotpot:
            all_results[d['_id']]['titles_found_by_break_rc'] = list(
                set([x[0] for x in d['context']]))

    if 'ques_ir' in tasks:
        # this can only be run after break_rc task & requires all_results dict formed
        # to determine the number of titles to be retrieved for each sample
        logging.info(
            f'Running baseline IR using the whole question for {len(all_results)} samples ...'
        )
        # prepare hotpot-like data for Bert RC
        new_hotpot = []
        for _id in tqdm(all_results.keys()):
            d = hotpot[_id]
            d_new = deepcopy(d)
            # for fair comparison with Break RC retrieve the same number of titles
            n_titles = len(sum(all_results[_id]['titles'], []))
            titles = ranker.closest_docs(d['question'], k=n_titles)[0]
            context = []
            for title in titles:
                context.append([title, wiki[title]['sents']])
            d_new['context'] = context
            if 'gold_titles' in d_new:
                del d_new['gold_titles']
            new_hotpot.append(d_new)
        out_ques_ir_file = f'{args.data_dir}/hotpot_data/hotpot_after_ques_ir_{args.suffix}.json'
        logging.info(
            f'Writing hotpot version with the baseline IR context to {out_ques_ir_file} ...'
        )
        write_file(new_hotpot, out_ques_ir_file)

        # store the retrieved titles
        for d in new_hotpot:
            all_results[d['_id']]['titles_found_using_whole_ques'] = list(
                set([x[0] for x in d['context']]))

    # save the Break RC outputs
    out_break_rc_file = f'{args.data_dir}/predictions/break_rc_results_{args.suffix}.json'
    logging.info(f'Writing the break RC results to {out_break_rc_file}...')
    os.makedirs(dirname(out_break_rc_file), exist_ok=True)
    write_file(all_results, out_break_rc_file)
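
# A hedged usage sketch for the script above (the file name run_break_rc.py is
# an assumption; the flags come from the parser above):
#
#   python run_break_rc.py --data_dir ./data --tasks break_rc,break_ir,ques_ir \
#       --qdmr_jsonl ./data/qdmr_data/qdmrs_hotpotqa_gold.jsonl --suffix gold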
Exemplo n.º 16
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--bert_config_file",
        default=None,
        type=str,
        required=True,
        help="The config json file corresponding to the pre-trained BERT model. "
        "This specifies the model architecture.")
    parser.add_argument(
        "--vocab_file",
        default=None,
        type=str,
        required=True,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument("--train_file",
                        default=None,
                        type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument(
        "--predict_file",
        default=None,
        type=str,
        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json"
    )
    parser.add_argument(
        "--init_checkpoint",
        default=None,
        type=str,
        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument(
        "--do_lower_case",
        default=True,
        action='store_true',
        help="Whether to lower case the input text. Should be True for uncased "
        "models and False for cased models.")
    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% "
        "of training.")
    parser.add_argument("--save_checkpoints_steps",
                        default=1000,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--iterations_per_loop",
                        default=1000,
                        type=int,
                        help="How many steps to make in each estimator call.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json "
        "output file.")
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.")
    parser.add_argument(
        "--verbose_logging",
        default=False,
        action='store_true',
        help=
        "If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--optimize_on_cpu',
        default=False,
        action='store_true',
        help=
        "Whether to perform optimization and keep the optimizer averages on CPU"
    )

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")

    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified."
            )

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = read_squad_examples(input_file=args.train_file,
                                             is_training=True)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size *
            args.num_train_epochs)

    model = BertForQuestionAnswering(bert_config)
    if args.init_checkpoint is not None:
        model.bert.load_state_dict(
            torch.load(args.init_checkpoint, map_location='cpu'))

    # Exclude biases and LayerNorm parameters (named gamma/beta in this BERT
    # port) from weight decay; substring matching is required because
    # named_parameters() yields full dotted names.
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay_rate': 0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay_rate': 0.0
    }]
    optimizer = BERTAdam(optimizer_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)

    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True)
        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_start_positions = torch.tensor(
            [f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor(
            [f.end_position for f in train_features], dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_start_positions,
                                   all_end_positions)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
                loss = model(input_ids, segment_ids, input_mask,
                             start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
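                    # With optimize_on_cpu, optimizer state (e.g., BERTAdam
                    # moments) stays in CPU memory: the model is moved to the
                    # CPU for the update, then back to the GPU afterwards.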
                    if args.optimize_on_cpu:
                        model.to('cpu')
                    optimizer.step()  # We have accumulated enough gradients
                    model.zero_grad()
                    if args.optimize_on_cpu:
                        model.to(device)
                    global_step += 1

    if args.do_predict:
        eval_examples = read_squad_examples(input_file=args.predict_file,
                                            is_training=False)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)

        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0),
                                         dtype=torch.long)

        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_example_index)
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices in tqdm(
                eval_dataloader, desc="Evaluating"):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                batch_start_logits, batch_end_logits = model(
                    input_ids, segment_ids, input_mask)
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(
                    RawResult(unique_id=unique_id,
                              start_logits=start_logits,
                              end_logits=end_logits))
        output_prediction_file = os.path.join(args.output_dir,
                                              "predictions.json")
        output_nbest_file = os.path.join(args.output_dir,
                                         "nbest_predictions.json")
        write_predictions(eval_examples, eval_features, all_results,
                          args.n_best_size, args.max_answer_length,
                          args.do_lower_case, output_prediction_file,
                          output_nbest_file, args.verbose_logging)
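
# A minimal follow-up sketch: read the n-best file written by
# write_predictions() above and print the top candidate per question.
# It assumes the nbest_predictions.json layout used by this script family
# (a dict mapping each question id to a best-first list of candidates,
# each with a "text" field).
import json
import os

def show_top_answers(output_dir):
    with open(os.path.join(output_dir, "nbest_predictions.json")) as f:
        nbest = json.load(f)
    for qid, candidates in nbest.items():
        print(qid, "->", candidates[0]["text"])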
Example No. 17
0
def init_params():
    model = BertForQuestionAnswering.from_pretrained("./bert_squad/")
    tokenizer = BertTokenizer.from_pretrained("./bert_squad/",
                                              do_lower_case=True)
    device = "cpu"
    model.to(device)
    return model, tokenizer


def question_answering(doc_text, question_text, model, tokenizer):
    example = compose_question(doc_text, question_text)
    predictions = predict_answer(example, model, tokenizer, top_n=3)
    results = []
    for p in predictions:
        results.append((p.text, str((p.start_logit + p.end_logit) / 2)))
    return results


def question_answering_terminal(model, tokenizer):
    doc_text = input('Input document String: ')
    question_text = input('Input question String: ')
    return question_answering(doc_text, question_text, model, tokenizer)


if __name__ == "__main__":
    model = BertForQuestionAnswering.from_pretrained("./bert_squad/")
    tokenizer = BertTokenizer.from_pretrained("./bert_squad/",
                                              do_lower_case=True)
    device = "cpu"
    model.to(device)
    print(question_answering_terminal(model, tokenizer))
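

# A short usage sketch for question_answering() above; the document and
# question strings are hypothetical, and "./bert_squad/" is the checkpoint
# directory this example already assumes:
def demo_question_answering():
    model = BertForQuestionAnswering.from_pretrained("./bert_squad/")
    tokenizer = BertTokenizer.from_pretrained("./bert_squad/",
                                              do_lower_case=True)
    model.eval()
    doc_text = "BERT was released by Google in 2018."
    question_text = "When was BERT released?"
    # question_answering returns (answer_text, score) pairs, where the score
    # is the mean of the start and end logits formatted as a string
    for text, score in question_answering(doc_text, question_text,
                                          model, tokenizer):
        print(text, score)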
Example No. 18
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--bert_config_file", default='model_repo/uncased_L-12_H-768_A-12/bert_config.json', type=str,
                        help="The config json file corresponding to the pre-trained BERT model. "
                             "This specifies the model architecture.")
    parser.add_argument("--vocab_file", default='model_repo/uncased_L-12_H-768_A-12/vocab.txt', type=str,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--output_dir", default='output', type=str,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--processed_data", default='processed', type=str,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--do_train", default=True, action='store_true', help="Whether to run training.")
    parser.add_argument("--do_predict", default=True, action='store_true',
                        help="Whether to run eval on the dev set.")

    ## Other parameters
    parser.add_argument("--train_file", default='BioASQ-train-factoid-4b.json', type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument("--predict_file", default='ASQdev.json', type=str,
                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument("--init_checkpoint", default='model_repo/uncased_L-12_H-768_A-12/pytorch_model.bin', type=str,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument("--finetuned_checkpoint", default='ft_dir/ft_model.bin', type=str,
                        help="finetuned checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument("--do_lower_case", default=True, action='store_true',
                        help="Whether to lower case the input text. Should be True for uncased "
                             "models and False for cased models.")
    parser.add_argument("--max_seq_length", default=500, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than this will "
                             "be truncated to this length.")
    parser.add_argument("--max_answer_length", default=500, type=int,
                        help="The maximum length of an answer that can be generated. This is needed because the start "
                             "and end predictions are not conditioned on one another.")
    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=5.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% "
                             "of training.")
    parser.add_argument("--save_checkpoints_steps", default=1000, type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--iterations_per_loop", default=1000, type=int,
                        help="How many steps to make in each estimator call.")
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the nbest_predictions.json "
                             "output file.")
    parser.add_argument("--verbose_logging", default=False, action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--accumulate_gradients",
                        type=int,
                        default=1,
                        help="Number of steps to accumulate gradient on (divide the batch_size and accumulate)")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumualte before performing a backward/update pass.")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError("Invalid accumulate_gradients parameter: {}, should be >= 1".format(
                            args.accumulate_gradients))

    args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError("At least one of `do_train` or `do_predict` must be True.")

    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified.")

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory () already exists and is not empty.")
    os.makedirs(args.output_dir, exist_ok=True)

    # finetuned_checkpoint is a file path; create its parent directory rather
    # than a directory named after the checkpoint file itself.
    finetuned_dir = os.path.dirname(args.finetuned_checkpoint)
    if finetuned_dir:
        os.makedirs(finetuned_dir, exist_ok=True)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    num_train_steps = None
    if args.do_train:
        logger.info('Load and process train examples')
        if os.path.exists(os.path.join(args.processed_data, 'processed_train.pkl')):
            with open(os.path.join(args.processed_data, 'processed_train.pkl'), 'rb') as f:
                train_features, train_examples, num_train_steps = pickle.load(f)
        else:
            train_examples = read_squad_examples(
                input_file=args.train_file, tokenizer=tokenizer, is_training=True)
            num_train_steps = int(
                len(train_examples) / args.train_batch_size * args.num_train_epochs)
            train_features = convert_examples_to_features(
                examples=train_examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride,
                max_query_length=args.max_query_length,
                is_training=True)
            os.makedirs(args.processed_data, exist_ok=True)
            train_path = os.path.join(args.processed_data, 'processed_train.pkl')
            with open(train_path, 'wb') as f:
                pickle.dump([train_features, train_examples, num_train_steps], f)

    if args.do_predict:
        logger.info('Load and process dev examples')
        if os.path.exists(os.path.join(args.processed_data, 'processed_dev.pkl')):
            with open(os.path.join(args.processed_data, 'processed_dev.pkl'), 'rb') as f:
                eval_features, eval_examples = pickle.load(f)
        else:
            eval_examples = read_squad_examples(
                input_file=args.predict_file, tokenizer=tokenizer, is_training=False)
            eval_features = convert_examples_to_features(
                examples=eval_examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride,
                max_query_length=args.max_query_length,
                is_training=False)
            os.makedirs(args.processed_data, exist_ok=True)
            eval_path = os.path.join(args.processed_data, 'processed_dev.pkl')
            with open(eval_path, 'wb') as f:
                pickle.dump([eval_features, eval_examples], f)

    model = BertForQuestionAnswering(bert_config)
    if args.do_train and args.init_checkpoint is not None:
        logger.info('Loading init checkpoint')
        model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
        logger.info('Loaded init checkpoint')
    elif args.do_predict:
        logger.info('Loading fine-tuned checkpoint')
        state_dict = torch.load(args.finetuned_checkpoint, map_location='cpu')
        new_state_dict = collections.OrderedDict()
        for key, value in state_dict.items():
            # strip the 'module.' prefix that DataParallel adds, if present
            new_key = key[7:] if key.startswith('module.') else key
            new_state_dict[new_key] = value
        model.load_state_dict(new_state_dict)
        del state_dict
        del new_state_dict
        logger.info('Loaded fine-tuned checkpoint')
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    no_decay = ['bias', 'gamma', 'beta']
    optimizer_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
        ]

    optimizer = BERTAdam(optimizer_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)

    global_step = 0
    best_dev_score = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                   all_start_positions, all_end_positions)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
                loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()    # We have accumulated enough gradients
                    model.zero_grad()
                    global_step += 1

                if (step + 1) % args.save_checkpoints_steps == 0:
                    best_dev_score = run_evaluate(args, model, eval_features, device, eval_examples, tokenizer,
                                                  best_dev_score)
                    logger.info('Best dev score {} in steps {}:'.format(best_dev_score, step))

    if args.do_predict:
        run_evaluate(args, model, eval_features, device, eval_examples, tokenizer, best_dev_score)
Example No. 19
0
def load_model(bert_model, model_file, **kwargs):
    # Load a trained model that you have fine-tuned
    model_state_dict = torch.load(model_file)
    model = BertForQuestionAnswering.from_pretrained(
        bert_model, state_dict=model_state_dict, **kwargs)
    return model
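
# A brief usage sketch for load_model() above; the model name and checkpoint
# path are hypothetical, and extra **kwargs are forwarded to from_pretrained:
#
#   model = load_model("bert-base-uncased", "ft_dir/ft_model.bin")
#   model.eval()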
Example No. 20
0
def init_params():
    model = BertForQuestionAnswering.from_pretrained("./bert_squad/")
    tokenizer = BertTokenizer.from_pretrained("./bert_squad/",
                                              do_lower_case=True)
    device = "cpu"
    model.to(device)
    return model, tokenizer
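
# A one-line usage sketch for init_params() above:
#
#   model, tokenizer = init_params()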
Example No. 21
0
def main():
    parser = argparse.ArgumentParser()
    BERT_DIR = "./model/uncased_L-12_H-768_A-12/"
    ## Required parameters
    parser.add_argument("--bert_config_file", default=BERT_DIR+"bert_config.json", \
                        type=str, help="The config json file corresponding to the pre-trained BERT model. "
                             "This specifies the model architecture.")
    parser.add_argument("--vocab_file", default=BERT_DIR+"vocab.txt", type=str, \
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--output_dir", default="out", type=str, \
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--train_file", type=str, \
                        help="SQuAD json for training. E.g., train-v1.1.json", \
                        default="")
    parser.add_argument("--predict_file", type=str,
                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json", \
                        default="")
    parser.add_argument("--init_checkpoint", type=str,
                        help="Initial checkpoint (usually from a pre-trained BERT model).", \
                        default=BERT_DIR+"pytorch_model.bin")
    parser.add_argument(
        "--do_lower_case",
        default=True,
        action='store_true',
        help="Whether to lower case the input text. Should be True for uncased "
        "models and False for cased models.")
    parser.add_argument(
        "--max_seq_length",
        default=300,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size",
                        default=128,
                        type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=10.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% "
        "of training.")
    parser.add_argument("--save_checkpoints_steps",
                        default=1000,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--iterations_per_loop",
                        default=1000,
                        type=int,
                        help="How many steps to make in each estimator call.")
    parser.add_argument(
        "--n_best_size",
        default=3,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json "
        "output file.")
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.")

    parser.add_argument(
        "--verbose_logging",
        default=False,
        action='store_true',
        help=
        "If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        "--accumulate_gradients",
        type=int,
        default=1,
        help=
        "Number of steps to accumulate gradient on (divide the batch_size and accumulate)"
    )
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument('--eval_period', type=int, default=2000)

    parser.add_argument('--max_n_answers', type=int, default=5)
    parser.add_argument('--merge_query', type=int, default=-1)
    parser.add_argument('--reduce_layers', type=int, default=-1)
    parser.add_argument('--reduce_layers_to_tune', type=int, default=-1)

    parser.add_argument('--only_comp', action="store_true", default=False)

    parser.add_argument('--train_subqueries_file', type=str, default="")
    parser.add_argument('--predict_subqueries_file', type=str, default="")
    parser.add_argument('--prefix', type=str, default="")

    parser.add_argument('--model', type=str, default="qa")
    parser.add_argument('--pooling', type=str, default="max")
    parser.add_argument('--debug', action="store_true", default=False)
    parser.add_argument('--output_dropout_prob', type=float, default=0)
    parser.add_argument('--wait_step', type=int, default=30)
    parser.add_argument('--with_key', action="store_true", default=False)
    parser.add_argument('--add_noise', action="store_true", default=False)

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError(
            "Invalid accumulate_gradients parameter: {}, should be >= 1".
            format(args.accumulate_gradients))

    args.train_batch_size = int(args.train_batch_size /
                                args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")

    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
        if not args.predict_file:
            raise ValueError(
                "If `do_train` is True, then `predict_file` must be specified."
            )

    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified."
            )

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.do_train and args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        logger.info("Output directory () already exists and is not empty.")
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None

    eval_dataloader, eval_examples, eval_features, _ = get_dataloader(
        logger=logger,
        args=args,
        input_file=args.predict_file,
        subqueries_file=args.predict_subqueries_file,
        is_training=False,
        batch_size=args.predict_batch_size,
        num_epochs=1,
        tokenizer=tokenizer)
    if args.do_train:
        train_dataloader, train_examples, _, num_train_steps = get_dataloader(
                logger=logger, args=args, \
                input_file=args.train_file, \
                subqueries_file=args.train_subqueries_file, \
                is_training=True,
                batch_size=args.train_batch_size,
                num_epochs=args.num_train_epochs,
                tokenizer=tokenizer)

    if args.model == 'qa':
        model = BertForQuestionAnswering(bert_config, 4)
        metric_name = "F1"
    elif args.model == 'classifier':
        if args.reduce_layers != -1:
            bert_config.num_hidden_layers = args.reduce_layers
        model = BertClassifier(bert_config, 2, args.pooling)
        metric_name = "F1"
    elif args.model == "span-predictor":
        if args.reduce_layers != -1:
            bert_config.num_hidden_layers = args.reduce_layers
        if args.with_key:
            Model = BertForQuestionAnsweringWithKeyword
        else:
            Model = BertForQuestionAnswering
        model = Model(bert_config, 2)
        metric_name = "Accuracy"
    else:
        raise NotImplementedError()

    if args.init_checkpoint is not None and args.do_predict and \
                len(args.init_checkpoint.split(','))>1:
        assert args.model == "qa"
        model = [model]
        for i, checkpoint in enumerate(args.init_checkpoint.split(',')):
            if i > 0:
                model.append(BertForQuestionAnswering(bert_config, 4))
            print("Loading from", checkpoint)
            state_dict = torch.load(checkpoint, map_location='cpu')
            # strip DataParallel's 'module.' prefix if present
            strip_module = lambda x: x[7:] if x.startswith('module.') else x
            state_dict = {strip_module(k): v for (k, v) in state_dict.items()}
            model[-1].load_state_dict(state_dict)
            model[-1].to(device)

    else:
        if args.init_checkpoint is not None:
            print("Loading from", args.init_checkpoint)
            state_dict = torch.load(args.init_checkpoint, map_location='cpu')
            if args.reduce_layers != -1:
                state_dict = {k:v for k, v in state_dict.items() \
                    if not '.'.join(k.split('.')[:3]) in \
                    ['encoder.layer.{}'.format(i) for i in range(args.reduce_layers, 12)]}
            if args.do_predict:
                # strip DataParallel's 'module.' prefix if present
                strip_module = lambda x: x[7:] if x.startswith('module.') else x
                state_dict = {strip_module(k): v for (k, v) in state_dict.items()}
                model.load_state_dict(state_dict)
            else:
                model.bert.load_state_dict(state_dict)
                if args.reduce_layers_to_tune != -1:
                    # freeze the embeddings and all but the top
                    # reduce_layers_to_tune encoder layers
                    for p in model.bert.embeddings.parameters():
                        p.requires_grad = False
                    n_layers = 12 if args.reduce_layers == -1 else args.reduce_layers
                    for i in range(n_layers - args.reduce_layers_to_tune):
                        for p in model.bert.encoder.layer[i].parameters():
                            p.requires_grad = False

        model.to(device)

        if args.local_rank != -1:
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=[args.local_rank],
                output_device=args.local_rank)
        elif n_gpu > 1:
            model = torch.nn.DataParallel(model)

    if args.do_train:
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate': 0.01
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate': 0.0
        }]

        optimizer = BERTAdam(optimizer_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps)

        global_step = 0
        best_f1 = 0
        wait_step = 0
        stop_training = False
        model.train()

        for epoch in range(int(args.num_train_epochs)):
            for step, batch in enumerate(tqdm(train_dataloader)):
                global_step += 1
                batch = [t.to(device) for t in batch]
                loss = model(batch, global_step)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if global_step % args.gradient_accumulation_steps == 0:
                    optimizer.step()  # We have accumulated enough gradients
                    model.zero_grad()
                if global_step % args.eval_period == 0:
                    model.eval()
                    f1 =  predict(args, model, eval_dataloader, eval_examples, eval_features, \
                                  device, write_prediction=False)
                    logger.info("%s: %.3f on epoch=%d" %
                                (metric_name, f1 * 100.0, epoch))
                    if best_f1 < f1:
                        logger.info("Saving model with best %s: %.3f -> %.3f on epoch=%d" % \
                                (metric_name, best_f1*100.0, f1*100.0, epoch))
                        model_state_dict = {
                            k: v.cpu()
                            for (k, v) in model.state_dict().items()
                        }
                        torch.save(
                            model_state_dict,
                            os.path.join(args.output_dir, "best-model.pt"))
                        model = model.to(device)
                        best_f1 = f1
                        wait_step = 0
                        stop_training = False
                    else:
                        wait_step += 1
                        if best_f1 > 0.1 and wait_step == args.wait_step:
                            stop_training = True
                    model.train()
            if stop_training:
                break

    elif args.do_predict:
        if type(model) == list:
            model = [m.eval() for m in model]
        else:
            model.eval()
        f1 = predict(args, model, eval_dataloader, eval_examples,
                     eval_features, device)
        logger.info("Final %s score: %.3f%%" % (metric_name, f1 * 100.0))
Example No. 22
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model checkpoints and predictions will be written."
    )

    ## Other parameters
    parser.add_argument("--train_file",
                        default=None,
                        type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument(
        "--predict_file",
        default=None,
        type=str,
        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json"
    )
    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_evaluate",
                        action='store_true',
                        help="Whether to eval after training.")
    parser.add_argument("--train_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
        "of training.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json "
        "output file.")
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.")
    parser.add_argument(
        "--verbose_logging",
        action='store_true',
        help=
        "If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--num_train_samples",
                        default=-1,
                        type=int,
                        help="Total number of training samples used.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument(
        '--version_2_with_negative',
        action='store_true',
        help=
        'If true, the SQuAD examples contain some that do not have an answer.')
    parser.add_argument(
        '--null_score_diff_threshold',
        type=float,
        default=0.0,
        help=
        "If null_score - best_non_null is greater than the threshold predict null."
    )
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--eval_period', type=int, default=2000)
    parser.add_argument('--wait_step', type=int, default=7)
    parser.add_argument('--load_from_cache',
                        action='store_true',
                        help="Load train features from cache.")
    parser.add_argument('--indiv_digits',
                        action='store_true',
                        help="Tokenize numbers into indiv digits.")
    parser.add_argument('--use_segment_ids',
                        action='store_true',
                        help="Use segment ids.")

    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")

    if args.do_train:
        make_output_dir(
            args, scripts_to_save=[sys.argv[0], 'run_squad_dataset_utils.py'])
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified."
            )
        with open(args.predict_file, encoding='utf-8') as pf:
            dev_data = json.load(pf)["data"]

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab
    if args.do_train:
        tokenizer = BertTokenizer.from_pretrained(
            args.bert_model, do_lower_case=args.do_lower_case)
        model = BertForQuestionAnswering.from_pretrained(args.bert_model)
    elif args.do_evaluate:
        # Load a trained model and vocabulary that you have fine-tuned
        model = BertForQuestionAnswering.from_pretrained(args.output_dir)
        tokenizer = BertTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)

    if args.local_rank == 0:
        torch.distributed.barrier()

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.do_train:
        if args.local_rank in [-1, 0]:
            tb_writer = SummaryWriter(os.path.join(args.output_dir,
                                                   'log'))  # tensorboard
        # Prepare data loader
        train_examples = read_squad_examples(
            input_file=args.train_file,
            is_training=True,
            version_2_with_negative=args.version_2_with_negative,
            num_train_samples=args.num_train_samples)
        cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}_{4}'.format(
            list(filter(None, args.bert_model.split('/'))).pop(),
            str(args.max_seq_length), str(args.doc_stride),
            str(args.max_query_length), str(args.num_train_samples))
        try:
            if args.load_from_cache:
                with open(cached_train_features_file, "rb") as reader:
                    train_features = pickle.load(reader)
            else:
                raise Exception
        except:
            train_features = convert_examples_to_features(
                examples=train_examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride,
                max_query_length=args.max_query_length,
                is_training=True,
                indiv_digits=args.indiv_digits)
            if args.local_rank == -1:
                logger.info("  Saving train features into cached file %s",
                            cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)

        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_start_positions = torch.tensor(
            [f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor(
            [f.end_position for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_start_positions,
                                   all_end_positions)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
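        # len(train_dataloader) counts per-forward batches, so dividing by
        # gradient_accumulation_steps gives the number of optimizer updates
        # per epoch.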
        num_train_optimization_steps = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

        # Prepare optimizer
        param_optimizer = list(model.named_parameters())

        # hack to remove the pooler, which is not used here and would
        # otherwise produce None grads that break apex
        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

    if args.do_predict and args.local_rank == -1:
        eval_examples = read_squad_examples(
            input_file=args.predict_file,
            is_training=False,
            version_2_with_negative=args.version_2_with_negative)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False,
            indiv_digits=args.indiv_digits)

        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0),
                                         dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_example_index)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.predict_batch_size)

    if args.do_train:
        best_f1 = 0
        wait_step = 0
        global_step = 0
        stop_training = False
        do_eval = False
        model.train()
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            if stop_training:
                break
            for step, batch in enumerate(
                    tqdm(train_dataloader,
                         desc="Iteration",
                         disable=args.local_rank not in [-1, 0])):
                if n_gpu == 1:
                    batch = tuple(
                        t.to(device)
                        for t in batch)  # multi-gpu does scattering it-self
                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
                segment_ids = segment_ids if args.use_segment_ids else None
                loss = model(input_ids, segment_ids, input_mask,
                             start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                if global_step % args.eval_period == 0:
                    do_eval = True

                if do_eval:
                    do_eval = False
                    model.eval()
                    scores = predict(args, model, eval_examples, eval_features,
                                     eval_dataloader, dev_data, device)
                    em, f1 = scores['exact_match'], scores['f1']
                    logger.info("f1: %.3f, em: %.3f on epoch=%d" %
                                (f1, em, epoch))
                    print("f1: %.3f, em: %.3f on epoch=%d" % (f1, em, epoch))
                    if best_f1 < f1:
                        logger.info("Saving model with best f1: %.3f -> %.3f on epoch=%d" % \
                                (best_f1, f1, epoch))
                        save_model(args, model, tokenizer)
                        model.to(device)
                        best_f1 = f1
                        wait_step = 0
                        stop_training = False
                    else:
                        wait_step += 1
                        if best_f1 > 86 and wait_step == args.wait_step:
                            stop_training = True
                            break
                    model.train()
            # end of epoch
            do_eval = True

    elif args.do_evaluate:
        model.eval()
        scores = predict(args, model, eval_examples, eval_features,
                         eval_dataloader, dev_data, device, True)
        em, f1 = scores['exact_match'], scores['f1']
        print("f1: %.3f, em: %.3f" % (f1, em))