Example #1
def process_inputs(input_data):
    bert_config = modeling.BertConfig.from_json_file(rs.FLAGS.bert_config_file)

    eval_examples = read_squad_data(input_data, is_training=False)
    eval_features = []

    eval_writer = rs.FeatureWriter(filename=os.path.join(
        "./colab_output", "train.tf_record"),
                                   is_training=False)

    def append_feature(feature):
        eval_features.append(feature)
        eval_writer.process_feature(feature)

    # Set up the tokenizer with the vocabulary file and the do_lower_case flag
    tokenizer = tokenization.FullTokenizer(
        vocab_file=rs.FLAGS.vocab_file, do_lower_case=rs.FLAGS.do_lower_case)

    rs.convert_examples_to_features(examples=eval_examples,
                                    tokenizer=tokenizer,
                                    max_seq_length=rs.FLAGS.max_seq_length,
                                    doc_stride=rs.FLAGS.doc_stride,
                                    max_query_length=rs.FLAGS.max_query_length,
                                    is_training=False,
                                    output_fn=append_feature)
    eval_writer.close()

    return eval_examples, eval_features
Example #2
def get_answer(data):

    eval_examples = get_squad_examples(data, is_training=False)
    eval_features = []

    def append_feature(feature):
        eval_features.append(feature)

    run_squad.convert_examples_to_features(examples=eval_examples,
                                           tokenizer=tokenizer,
                                           max_seq_length=max_seq_length,
                                           doc_stride=DOC_STRIDE,
                                           max_query_length=MAX_QUERY_LENGTH,
                                           is_training=False,
                                           output_fn=append_feature)

    global answer_model
    if answer_model is None:
        answer_model = Model(export_dir)

    all_results = answer_model.predict(eval_features=eval_features)
    pred = get_predicted_answer(eval_examples, eval_features, all_results,
                                N_BEST_SIZE, MAX_ANSWER_LENGTH, DO_LOWER_CASE)
    # predictions are keyed by qas_id; return the answer for id '1'
    return pred['1']
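
Example #2 assumes a `Model` wrapper around a TensorFlow SavedModel export. Purely as a sketch (not taken from the example), such a wrapper could be built on TF 1.x's `tf.contrib.predictor`; the tensor names `unique_ids`, `input_ids`, `input_mask`, `segment_ids`, `start_logits`, and `end_logits` below are assumptions about the export signature.

# Sketch (assumption): thin wrapper over a SavedModel export, TF 1.x style.
import tensorflow as tf
import run_squad

class Model(object):
    def __init__(self, export_dir):
        # Load the SavedModel and get a callable predictor.
        self._predict_fn = tf.contrib.predictor.from_saved_model(export_dir)

    def predict(self, eval_features):
        all_results = []
        for feature in eval_features:
            # Input/output names are assumed to match the export signature.
            outputs = self._predict_fn({
                "unique_ids": [feature.unique_id],
                "input_ids": [feature.input_ids],
                "input_mask": [feature.input_mask],
                "segment_ids": [feature.segment_ids],
            })
            all_results.append(
                run_squad.RawResult(
                    unique_id=feature.unique_id,
                    start_logits=[float(x) for x in outputs["start_logits"].flat],
                    end_logits=[float(x) for x in outputs["end_logits"].flat]))
        return all_results
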
Example #3
    def my_create_examples(self, data_object):
        """
        Modified version of read_squad_examples from run_squad.
        Note that this returns feature objects, not example objects. The feature TENSORS themselves are made elsewhere.
        :param data_object: equivalent object to the 'data' section of the SQuAD JSON scheme
        :return: a list of `SquadExample`s
        """
        def is_whitespace(c):
            return c in " \t\r\n" or ord(c) == 0x202F

        examples = []
        for entry in data_object:
            for paragraph in entry["paragraphs"]:
                paragraph_text = paragraph["context"]
                doc_tokens = []
                char_to_word_offset = []
                prev_is_whitespace = True
                for c in paragraph_text:
                    if is_whitespace(c):
                        prev_is_whitespace = True
                    else:
                        if prev_is_whitespace:
                            doc_tokens.append(c)
                        else:
                            doc_tokens[-1] += c
                        prev_is_whitespace = False
                    char_to_word_offset.append(len(doc_tokens) - 1)

                for qa in paragraph["qas"]:
                    examples.append(run_squad.SquadExample(
                        qas_id=qa["id"],
                        question_text=qa["question"],
                        doc_tokens=doc_tokens,
                        orig_answer_text=None,
                        start_position=None,
                        end_position=None,
                        is_impossible=False)
                    )

        feature_objects = []
        run_squad.convert_examples_to_features(
            examples=examples,
            tokenizer=self.tokenizer,
            max_seq_length=self.flags.max_seq_length,
            doc_stride=self.flags.doc_stride,
            max_query_length=self.flags.max_query_length,
            is_training=False,
            output_fn=feature_objects.append)
        return feature_objects
Example #4
    def response(self, data):
        # data = [[context, question], ...]
        eval_examples = self.process_example(data)

        eval_writer = FeatureWriter(filename=os.path.join(
            self.output_dir, "eval.tf_record"),
                                    is_training=False)
        eval_features = []

        def append_feature(feature):
            eval_features.append(feature)
            eval_writer.process_feature(feature)

        convert_examples_to_features(examples=eval_examples,
                                     tokenizer=self.tokenizer,
                                     max_seq_length=FLAGS.max_seq_length,
                                     doc_stride=FLAGS.doc_stride,
                                     max_query_length=FLAGS.max_query_length,
                                     is_training=False,
                                     output_fn=append_feature)
        eval_writer.close()

        all_results = []

        predict_input_fn = input_fn_builder(input_file=eval_writer.filename,
                                            seq_length=FLAGS.max_seq_length,
                                            is_training=False,
                                            drop_remainder=False)

        # If running eval on the TPU, you will need to specify the number of steps.
        for result in self.estimator.predict(predict_input_fn,
                                             yield_single_examples=True):
            if len(all_results) % 1000 == 0:
                tf.logging.info("Processing example: %d" % (len(all_results)))
            unique_id = int(result["unique_ids"])
            start_logits = [float(x) for x in result["start_logits"].flat]
            end_logits = [float(x) for x in result["end_logits"].flat]
            all_results.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))

        predictions = self.predict(eval_examples, eval_features, all_results,
                                   FLAGS.n_best_size, FLAGS.max_answer_length,
                                   FLAGS.do_lower_case)

        return predictions
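
Example #4 relies on a `self.process_example(data)` helper that is not shown. A minimal sketch, assuming each element of `data` is a `[context, question]` pair (per the comment above) and that `SquadExample` is imported from BERT's run_squad, could reuse the whitespace-splitting logic from Example #3:

    # Sketch (assumption): build SquadExample objects from [context, question] pairs.
    def process_example(self, data):
        examples = []
        for idx, (context, question) in enumerate(data):
            doc_tokens = []
            prev_is_whitespace = True
            for c in context:
                if c in " \t\r\n" or ord(c) == 0x202F:
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
            examples.append(
                SquadExample(qas_id=str(idx),
                             question_text=question,
                             doc_tokens=doc_tokens,
                             orig_answer_text=None,
                             start_position=None,
                             end_position=None,
                             is_impossible=False))
        return examples
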
Example #5
def mrc():
    data_from_post = getData()
    data = preprocess_data(data_from_post)

    eval_writer = mainfile.FeatureWriter(filename=os.path.join(
        mrc_inference_config["output_dir"], "eval.tf_record"),
                                         is_training=False)
    eval_features = []

    def append_feature(feature):
        eval_features.append(feature)
        eval_writer.process_feature(feature)

    mainfile.convert_examples_to_features(
        examples=data,
        tokenizer=tokenizer,
        max_seq_length=mrc_inference_config["max_seq_length"],
        doc_stride=mrc_inference_config["doc_stride"],
        max_query_length=mrc_inference_config["max_query_length"],
        is_training=False,
        output_fn=append_feature)
    eval_writer.close()

    all_results = []

    predict_input_fn = mainfile.input_fn_builder(
        input_file=eval_writer.filename,
        seq_length=mrc_inference_config["max_seq_length"],
        is_training=False,
        drop_remainder=False)

    for result in estimator.predict(predict_input_fn,
                                    yield_single_examples=True):
        unique_id = int(result["unique_ids"])
        start_logits = [float(x) for x in result["start_logits"].flat]
        end_logits = [float(x) for x in result["end_logits"].flat]
        all_results.append(
            mainfile.RawResult(unique_id=unique_id,
                               start_logits=start_logits,
                               end_logits=end_logits))

    answer = mainfile.write_predictions(
        data, eval_features, all_results, 20,
        mrc_inference_config["max_answer_length"], True, None, None, None)
    return sendResponse({"Answer": answer.get(data_from_post.get("qas_id"))})
Example #6
def process_data_and_get_input_max_min(data_list,
                                       fixer,
                                       input_tensor_names,
                                       num_runs,
                                       vocab_file,
                                       do_lower_case,
                                       seq_length,
                                       doc_stride=128,
                                       max_query_length=64,
                                       batch_size=8,
                                       preprocess_fn="default_preprocess"):
    """Precess input data and get input max and min.
  """
    eval_features = []

    def append_feature(feature):
        eval_features.append(feature)

    eval_examples = read_squad_examples(input_file=data_list,
                                        is_training=False)
    eval_examples = eval_examples[0:batch_size * num_runs]
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)

    convert_examples_to_features(examples=eval_examples,
                                 tokenizer=tokenizer,
                                 max_seq_length=seq_length,
                                 doc_stride=doc_stride,
                                 max_query_length=max_query_length,
                                 is_training=False,
                                 output_fn=append_feature)
    input_dicts = []
    input_node_names = [
        node_name.split(':')[0] for node_name in input_tensor_names
    ]
    for i in range(num_runs):
        inputs = process_feature_batch(eval_features, batch_size, i)
        input_dict = dict(zip(input_node_names, inputs))
        input_dicts.append(input_dict)
    fixer.get_input_max_min(input_dicts, batch_size)

    print("quantize input end")
Example #7
    def __init__(
        self,
        eval_script: str = "data/squad/v1.1/evaluate-v1.1.py",
        predict_file: str = "",
        output_dir: str = "./",
        n_best_size: int = 20,
        max_answer_length: int = 30,
        version_2_with_negative: bool = False,
        max_seq_length: int = 384,
        doc_stride: int = 128,
        max_query_length: int = 64,
        vocab_file: str = "",
        do_lower_case: bool = True,
        max_len: int = 512,
    ):

        tokenizer = BertTokenizer(vocab_file,
                                  do_lower_case=do_lower_case,
                                  max_len=max_len)  # for bert large

        self.eval_examples = read_squad_examples(
            input_file=predict_file,
            is_training=False,
            version_2_with_negative=version_2_with_negative)

        self.eval_features = convert_examples_to_features(
            examples=self.eval_examples,
            tokenizer=tokenizer,
            max_seq_length=max_seq_length,
            doc_stride=doc_stride,
            max_query_length=max_query_length,
            is_training=False,
        )

        self.output_dir = output_dir
        self.eval_script = eval_script
        self.predict_file = predict_file

        args = Namespace(
            version_2_with_negative=version_2_with_negative,
            n_best_size=n_best_size,
            max_answer_length=max_answer_length,
            verbose_logging=False,
            do_lower_case=do_lower_case,
        )

        self.args = args

        self.all_results: List[RawResult] = []
def get_dataloader(args):
    ''' return dataloader for inference '''
    
    # Preprocess input data
    tokenizer = BertTokenizer(args.vocab_file, do_lower_case=args.do_lower_case, max_len=512) # for bert large
    
    cached_features_file = args.predict_file + '_{}_{}.bin'.format(args.max_seq_length, args.doc_stride)
    try:
        with open(cached_features_file, "rb") as reader:
            eval_features = pickle.load(reader)
    except Exception:  # no usable cache; rebuild the features
        eval_examples = read_squad_examples(
            input_file=args.predict_file,
            is_training=False,
            version_2_with_negative=args.version_2_with_negative)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)
        with open(cached_features_file, "wb") as writer:
            pickle.dump(eval_features, writer)
    
    data = []
    for feature in eval_features:
        input_ids = torch.tensor(feature.input_ids, dtype=torch.int64)
        input_mask = torch.tensor(feature.input_mask, dtype=torch.int64)
        segment_ids = torch.tensor(feature.segment_ids, dtype=torch.int64)
        inp = (input_ids, segment_ids, input_mask)
        data.append(inp)
    
    if args.nbatches > 0:
        data = data[:args.nbatches*args.batch_size]
    
    test_loader = torch.utils.data.DataLoader(
        data, 
        batch_size=args.batch_size, 
        shuffle=False, 
        num_workers=1, 
        pin_memory=True)
    
    return test_loader
def _validate_squad(args, model, tokenizer):
    eval_examples = run_squad.read_squad_examples(
        input_file=args.predict_file,
        is_training=False,
        version_2_with_negative=args.version_2_with_negative)

    eval_features = run_squad.convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=False)

    run_squad.logger.info("***** Running predictions *****")
    run_squad.logger.info("  Num orig examples = %d", len(eval_examples))
    run_squad.logger.info("  Num split examples = %d", len(eval_features))
    run_squad.logger.info("  Batch size = %d", args.predict_batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                   dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = run_squad.TensorDataset(all_input_ids, all_input_mask,
                                        all_segment_ids, all_example_index)
    # Run prediction for full data
    eval_sampler = run_squad.SequentialSampler(eval_data)
    eval_dataloader = run_squad.DataLoader(eval_data,
                                           sampler=eval_sampler,
                                           batch_size=args.predict_batch_size)

    model.eval()
    all_results = []
    run_squad.logger.info("Start evaluating")
    for input_ids, input_mask, segment_ids, example_indices in run_squad.tqdm(
            eval_dataloader, desc="Evaluating"):
        if len(all_results) % 1000 == 0:
            run_squad.logger.info("Processing example: %d" %
                                  (len(all_results)))
        input_ids = input_ids.cuda()
        input_mask = input_mask.cuda()
        segment_ids = segment_ids.cuda()
        with torch.no_grad():
            batch_start_logits, batch_end_logits = model(
                input_ids, segment_ids, input_mask)
        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()
            eval_feature = eval_features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            all_results.append(
                run_squad.RawResult(unique_id=unique_id,
                                    start_logits=start_logits,
                                    end_logits=end_logits))
    output_prediction_file = "predictions.json"
    output_nbest_file = "nbest_predictions.json"
    output_null_log_odds_file = "null_odds.json"
    run_squad.write_predictions(
        eval_examples, eval_features, all_results, args.n_best_size,
        args.max_answer_length, args.do_lower_case, output_prediction_file,
        output_nbest_file, output_null_log_odds_file, args.verbose_logging,
        args.version_2_with_negative, args.null_score_diff_threshold)

    result = _calc_metric_squad(args.predict_file, output_prediction_file)
    os.remove(output_prediction_file)
    os.remove(output_nbest_file)
    os.remove(output_null_log_odds_file)
    return result  # {'exact_match': exact_match, 'f1': f1}
def _train_squad(args, stage):
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    _set_seed(args.seed)

    tokenizer = run_squad.BertTokenizer(args.vocab_file,
                                        do_lower_case=args.do_lower_case,
                                        max_len=512)  # for bert large
    # tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None

    train_examples = run_squad.read_squad_examples(
        input_file=args.train_file,
        is_training=True,
        version_2_with_negative=args.version_2_with_negative)
    num_train_optimization_steps = int(
        len(train_examples) / args.train_batch_size /
        args.gradient_accumulation_steps) * args.num_train_epochs

    config = run_squad.BertConfig.from_json_file(args.config_file)
    model: nn.Module = run_squad.BertForQuestionAnswering(config)
    _load_checkpoint(model, args.init_checkpoint)

    if stage == PruningPhase.admm:
        _hard_mask(model, args.sparsity_config)

    model.cuda()
    if args.fp16 and args.old:
        model.half()

    with open(args.sparsity_config, 'r') as f:
        raw_dict = yaml.load(f, Loader=yaml.SafeLoader)
        masks = dict.fromkeys(raw_dict['prune_ratios'].keys())

    plain_model = getattr(model, 'module', model)

    for param_name in masks:
        param = get_parameter_by_name(plain_model, param_name)
        if param is None: raise Exception(f'Cannot find {param_name}')
        non_zero_mask = torch.ne(param, 0).to(param.dtype)
        masks[param_name] = non_zero_mask

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove pooler, which is not used
    # thus it produces None grads that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    if args.fp16:
        try:
            # from fused_adam_local import FusedAdamBert as FusedAdam
            from apex.optimizers import FusedAdam
            from apex.fp16_utils.fp16_optimizer import FP16_Optimizer
            # from apex.contrib.optimizers import FP16_Optimizer
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        # import ipdb; ipdb.set_trace()
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)

        if args.loss_scale == 0:
            if args.old:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                model, optimizer = amp.initialize(model,
                                                  optimizer,
                                                  opt_level="O2",
                                                  keep_batchnorm_fp32=False,
                                                  loss_scale="dynamic")
        else:
            if args.old:
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=args.loss_scale)
            else:
                model, optimizer = amp.initialize(model,
                                                  optimizer,
                                                  opt_level="O2",
                                                  keep_batchnorm_fp32=False,
                                                  loss_scale=args.loss_scale)
        if not args.old and args.do_train:
            scheduler = run_squad.LinearWarmUpScheduler(
                optimizer,
                warmup=args.warmup_proportion,
                total_steps=num_train_optimization_steps)

    else:
        optimizer = run_squad.BertAdam(optimizer_grouped_parameters,
                                       lr=args.learning_rate,
                                       warmup=args.warmup_proportion,
                                       t_total=num_train_optimization_steps)

    model = torch.nn.DataParallel(model)

    global_step = 0
    cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format(
        list(filter(None, args.bert_model.split('/'))).pop(),
        str(args.max_seq_length), str(args.doc_stride),
        str(args.max_query_length))
    # train_features = None
    try:
        with open(cached_train_features_file, "rb") as reader:
            train_features = pickle.load(reader)
    except Exception:  # no usable cache; rebuild the features
        train_features = run_squad.convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True)
        if args.local_rank == -1 or torch.distributed.get_rank() == 0:
            run_squad.logger.info(
                "  Saving train features into cached file %s",
                cached_train_features_file)
            with open(cached_train_features_file, "wb") as writer:
                pickle.dump(train_features, writer)

    run_squad.logger.info("***** Running training *****")
    run_squad.logger.info("  Num orig examples = %d", len(train_examples))
    run_squad.logger.info("  Num split examples = %d", len(train_features))
    run_squad.logger.info("  Batch size = %d", args.train_batch_size)
    run_squad.logger.info("  Num steps = %d", num_train_optimization_steps)
    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)
    all_start_positions = torch.tensor(
        [f.start_position for f in train_features], dtype=torch.long)
    all_end_positions = torch.tensor([f.end_position for f in train_features],
                                     dtype=torch.long)
    train_data = run_squad.TensorDataset(all_input_ids, all_input_mask,
                                         all_segment_ids, all_start_positions,
                                         all_end_positions)
    train_sampler = run_squad.RandomSampler(train_data)
    train_dataloader = run_squad.DataLoader(train_data,
                                            sampler=train_sampler,
                                            batch_size=args.train_batch_size)

    model.train()
    for _ in run_squad.trange(int(args.num_train_epochs), desc="Epoch"):
        for step, batch in enumerate(
                run_squad.tqdm(train_dataloader, desc="Iteration")):
            # Terminate early for benchmarking

            if args.max_steps > 0 and global_step > args.max_steps:
                break

            if torch.cuda.device_count() == 1:
                batch = tuple(
                    t.cuda()
                    for t in batch)  # multi-gpu does the scattering itself
            input_ids, input_mask, segment_ids, start_positions, end_positions = batch
            loss = model(input_ids, segment_ids, input_mask, start_positions,
                         end_positions)
            if torch.cuda.device_count() > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                if args.old:
                    # noinspection PyUnboundLocalVariable
                    optimizer.backward(loss)
                else:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
            else:
                loss.backward()
            # if args.fp16:
            #    optimizer.backward(loss)
            # else:
            #    loss.backward()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    # modify learning rate with special warm up for BERT which FusedAdam doesn't do
                    if not args.old:
                        # noinspection PyUnboundLocalVariable
                        scheduler.step()
                    else:
                        lr_this_step = args.learning_rate * run_squad.warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step

                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

                plain_model = getattr(model, 'module', model)
                for param_name, mask in masks.items():
                    param = get_parameter_by_name(plain_model, param_name)
                    param.data *= mask.to(param.dtype)

            if step % args.log_freq == 0:
                # logger.info("Step {}: Loss {}, LR {} ".format(global_step, loss.item(), lr_this_step))
                run_squad.logger.info("Step {}: Loss {}, LR {} ".format(
                    global_step, loss.item(), optimizer.param_groups[0]['lr']))

    return model, tokenizer
    def do_predict(self, json_data):
        eval_examples = self.read_squad_examples(input_data=json_data,
                                                 is_training=False)

        eval_writer = run_squad.FeatureWriter(filename=os.path.join(
            run_squad.FLAGS.output_dir, "eval.tf_record"),
                                              is_training=False)

        eval_features = []

        def append_feature(feature):
            eval_features.append(feature)
            eval_writer.process_feature(feature)

        run_squad.convert_examples_to_features(
            examples=eval_examples,
            tokenizer=self.tokenizer,
            max_seq_length=run_squad.FLAGS.max_seq_length,
            doc_stride=run_squad.FLAGS.doc_stride,
            max_query_length=run_squad.FLAGS.max_query_length,
            is_training=False,
            output_fn=append_feature)
        eval_writer.close()

        tf.logging.info("***** Running predictions *****")
        tf.logging.info("  Num orig examples = %d", len(eval_examples))
        tf.logging.info("  Num split examples = %d", len(eval_features))
        tf.logging.info("  Batch size = %d",
                        run_squad.FLAGS.predict_batch_size)

        all_results = []

        predict_input_fn = run_squad.input_fn_builder(
            input_file=eval_writer.filename,
            seq_length=run_squad.FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)

        # If running eval on the TPU, you will need to specify the number of
        # steps.
        for result in self.estimator.predict(predict_input_fn,
                                             yield_single_examples=True):
            if len(all_results) % 1000 == 0:
                tf.logging.info("Processing example: %d" % (len(all_results)))
            unique_id = int(result["unique_ids"])
            start_logits = [float(x) for x in result["start_logits"].flat]
            end_logits = [float(x) for x in result["end_logits"].flat]
            all_results.append(
                run_squad.RawResult(unique_id=unique_id,
                                    start_logits=start_logits,
                                    end_logits=end_logits))

        output_prediction_file = os.path.join(run_squad.FLAGS.output_dir,
                                              "predictions.json")
        output_nbest_file = os.path.join(run_squad.FLAGS.output_dir,
                                         "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(run_squad.FLAGS.output_dir,
                                                 "null_odds.json")

        return self.write_predictions(
            eval_examples, eval_features, all_results,
            run_squad.FLAGS.n_best_size, run_squad.FLAGS.max_answer_length,
            run_squad.FLAGS.do_lower_case, output_prediction_file,
            output_nbest_file, output_null_log_odds_file)
    cached_features_file = args.predict_file + '_{}_{}.bin'.format(
        args.max_seq_length, args.doc_stride)

    eval_examples = read_squad_examples(
        input_file=args.predict_file,
        is_training=False,
        version_2_with_negative=args.version_2_with_negative)

    try:
        with open(cached_features_file, "rb") as reader:
            eval_features = pickle.load(reader)
    except Exception:  # no usable cache; rebuild the features
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)
        with open(cached_features_file, "wb") as writer:
            pickle.dump(eval_features, writer)

    dtype = np.int64

    def batch(iterable, n=1):
        l = len(iterable)
        for ndx in range(0, l, n):
            unique_ids = ()
            example_indices = ()
            input_ids_data = ()
            input_mask_data = ()
Example #13
def get_dataloader_fn(
    precision: str = 'fp32',
    batch_size: int = 8,
    vocab_file: str = "",
    do_lower_case: bool = True,
    predict_file: str = "",
    max_len: int = 512,
    max_seq_length: int = 384,
    doc_stride: int = 128,
    max_query_length: int = 64,
    version_2_with_negative: bool = False,
    pad_to_batch_size: bool = True,
):

    # Preprocess input data
    tokenizer = BertTokenizer(vocab_file,
                              do_lower_case=do_lower_case,
                              max_len=max_len)

    eval_examples = read_squad_examples(
        input_file=predict_file,
        is_training=False,
        version_2_with_negative=version_2_with_negative)
    eval_features = convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=False,
    )

    # get inputs
    all_unique_ids = [f.unique_id for f in eval_features]
    all_input_ids = [f.input_ids for f in eval_features]
    all_input_mask = [f.input_mask for f in eval_features]
    all_segment_ids = [f.segment_ids for f in eval_features]

    if pad_to_batch_size:
        # each batch should have a fixed size
        f = eval_features[-1]
        padding = batch_size - (len(all_unique_ids) % batch_size)
        all_unique_ids += [f.unique_id for _ in range(padding)]
        all_input_ids += [f.input_ids for _ in range(padding)]
        all_input_mask += [f.input_mask for _ in range(padding)]
        all_segment_ids += [f.segment_ids for _ in range(padding)]

    all_unique_ids = torch.tensor(all_unique_ids,
                                  dtype=torch.int32,
                                  requires_grad=False)
    all_input_ids = torch.tensor(all_input_ids,
                                 dtype=torch.int32,
                                 requires_grad=False)
    all_input_mask = torch.tensor(all_input_mask,
                                  dtype=torch.int32,
                                  requires_grad=False)
    all_segment_ids = torch.tensor(all_segment_ids,
                                   dtype=torch.int32,
                                   requires_grad=False)
    eval_data = torch.utils.data.TensorDataset(all_unique_ids, all_input_ids,
                                               all_input_mask, all_segment_ids)
    eval_sampler = torch.utils.data.SequentialSampler(eval_data)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_data,
        sampler=eval_sampler,
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,
    )

    dtype = {'fp32': np.float32, 'fp16': np.float16}
    dtype = dtype[precision]

    def _get_dataloader():
        """return dataloader for inference"""
        for unique_id, input_ids, input_mask, segment_ids in eval_dataloader:
            unique_id = unique_id.cpu().numpy()
            input_ids = input_ids.cpu().numpy()
            input_mask = input_mask.cpu().numpy()
            segment_ids = segment_ids.cpu().numpy()
            x = {
                "input__0": input_ids,
                "input__1": segment_ids,
                "input__2": input_mask
            }
            y_real = {
                "output__0": np.zeros([batch_size, max_seq_length],
                                      dtype=dtype),
                "output__1": np.zeros([batch_size, max_seq_length],
                                      dtype=dtype),
            }
            yield (unique_id, x, y_real)

    return _get_dataloader
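
Usage sketch for the factory above; the file paths and the `run_inference` call are placeholders, not part of the example. The returned callable yields one tuple of unique ids, model inputs, and dummy targets per batch.

dataloader_fn = get_dataloader_fn(precision="fp16",
                                  batch_size=8,
                                  vocab_file="vocab.txt",        # placeholder path
                                  predict_file="dev-v1.1.json")  # placeholder path
for unique_ids, inputs, targets in dataloader_fn():
    # inputs["input__0"] holds input_ids, "input__1" segment_ids, "input__2" input_mask
    run_inference(unique_ids, inputs)  # run_inference is a placeholder
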
Example #14
#    is_training=False)
eval_features = []


def append_feature(feature):
    eval_features.append(feature)
    #eval_writer.process_feature(feature)


tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                       do_lower_case=True)

run_squad.convert_examples_to_features(examples=eval_examples,
                                       tokenizer=tokenizer,
                                       max_seq_length=seq_length,
                                       doc_stride=doc_stride,
                                       max_query_length=query_length,
                                       is_training=False,
                                       output_fn=append_feature)

########### Re-load model from saved checkpoint ###########
#unique_ids = tf.placeholder([], tf.int64)
input_ids = tf.placeholder(tf.int64, [None, seq_length])
input_mask = tf.placeholder(tf.int64, [None, seq_length])
segment_ids = tf.placeholder(tf.int64, [None, seq_length])

(start_logits,
 end_logits) = run_squad.create_model(bert_config=bert_config,
                                      is_training=False,
                                      input_ids=input_ids,
                                      input_mask=input_mask,
Example #15
    def get_dataset(self,
                    dataset_path,
                    is_training,
                    context_truncated_len=400,
                    utterance_truncated_len=100):
        examples = read_squad_examples(dataset_path, is_training)

        if self.ctx_emb == 'bert':
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        elif self.ctx_emb == 'xlnet':
            tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

        features = convert_examples_to_features(examples,
                                                tokenizer,
                                                max_seq_length=2500,
                                                doc_stride=2500,
                                                max_query_length=2500,
                                                is_training=is_training)

        with open(dataset_path) as f:
            raw_examples = json.load(f)

        # since problems are flattened by convert_examples_to_features
        index_feature = 0

        for example in tqdm(raw_examples['data']):
            for paragraph in example['paragraphs']:
                paragraph['context_raw'] = paragraph['context']

                # Since only `qa_feature.token_to_orig_map` (below) maps tokens
                # to space-split word-level indices in the context,
                # `word_offsets` is required to map space-split word-level
                # indices to char-level indices.
                word_offsets = [0]
                for word in paragraph['context'].split(' '):
                    word_offsets.append(len(word) + 1 + word_offsets[-1])

                for index_q, qa in enumerate(paragraph['qas']):
                    qa_feature = features[index_feature]
                    index_feature += 1
                    # in `features[index_feature].segment_ids`, question and
                    # context are concatenated. To separate them, the 0/1 values
                    # stored in `segment_ids` are used.
                    question_len = qa_feature.segment_ids.index(1)
                    question = qa_feature.input_ids[:question_len]

                    if index_q == 0:  # do only once for a paragraph
                        context_len = \
                            qa_feature.segment_ids[question_len:].index(0)
                        context = (
                            # [question[0]]  # [CLS] token
                            qa_feature.input_ids[question_len:question_len +
                                                 context_len])
                        paragraph['context_offset'] = (
                            # [0]
                            [
                                word_offsets[qa_feature.token_to_orig_map[i]]
                                for i in range(question_len, question_len +
                                               context_len - 1)
                            ] + [len(paragraph['context'])])
                        paragraph['context_tokenized'] = qa_feature.input_ids
                        paragraph['context'] = context

                    qa['question_tokenized'] = tokenizer.tokenize(
                        qa['question'])
                    qa['question'] = question
                    qa['orig_answer_raw'] = qa['orig_answer']['text']
                    qa['orig_answer_text'] = tokenizer.tokenize(
                        qa['orig_answer_raw'])
                    qa['orig_answer_start'] = qa_feature.start_position - question_len
                    qa['orig_answer_end'] = qa_feature.end_position - question_len
                    assert qa['orig_answer_end'] < len(paragraph['context'])

                    # answer indicator for previous questions
                    qa['answer_indicator'] = [0] * context_len
                    for offset in range(1, min(3 + 1, index_q + 1)):
                        index_prev = index_q - offset
                        start, end = (
                            paragraph['qas'][index_prev]['orig_answer_start'],
                            paragraph['qas'][index_prev]['orig_answer_end'] +
                            1)
                        qa['answer_indicator'][start:end] = ([offset] *
                                                             (end - start))

                    if is_training:
                        for answer in qa['answers']:
                            answer['raw'] = answer['text']
                            answer['text'] = tokenizer.tokenize(answer['text'])

        return QuACDataset(raw_examples['data'],
                           context_truncated_len=context_truncated_len,
                           utterance_truncated_len=utterance_truncated_len,
                           padding=0)
Example #16
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  bert_config = rs.modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  rs.validate_flags_or_throw(bert_config)

  tf.gfile.MakeDirs(FLAGS.output_dir)

  tokenizer = rs.tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  train_examples = None
  num_train_steps = None
  num_warmup_steps = None
  if FLAGS.do_train:
    train_examples = rs.read_squad_examples(
        input_file=FLAGS.train_file, is_training=True)
    num_train_steps = int(
        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    # Pre-shuffle the input to avoid having to make a very large shuffle
    # buffer in the `input_fn`.
    rng = random.Random(12345)
    rng.shuffle(train_examples)

  model_fn = rs.model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_tpu)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      predict_batch_size=FLAGS.predict_batch_size)

  if FLAGS.do_train:
    # We write to a temporary file to avoid storing very large constant tensors
    # in memory.
    train_writer = rs.FeatureWriter(
        filename=os.path.join(FLAGS.output_dir, "train.tf_record"),
        is_training=True)
    rs.convert_examples_to_features(
        examples=train_examples,
        tokenizer=tokenizer,
        max_seq_length=FLAGS.max_seq_length,
        doc_stride=FLAGS.doc_stride,
        max_query_length=FLAGS.max_query_length,
        is_training=True,
        output_fn=train_writer.process_feature)
    train_writer.close()

    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num orig examples = %d", len(train_examples))
    tf.logging.info("  Num split examples = %d", train_writer.num_features)
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info("  Num steps = %d", num_train_steps)
    del train_examples

    train_input_fn = rs.input_fn_builder(
        input_file=train_writer.filename,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

  if FLAGS.do_predict:
    eval_examples = rs.read_squad_examples(
        input_file=FLAGS.predict_file, is_training=False)

    act_seq_len = get_act_seq_len(eval_examples, tokenizer, FLAGS.max_seq_length,
                    FLAGS.doc_stride, FLAGS.max_query_length)

    eval_writer = rs.FeatureWriter(
        filename=os.path.join(FLAGS.output_dir, "eval.tf_record"),
        is_training=False)
    eval_features = []

    def append_feature(feature):
      eval_features.append(feature)
      eval_writer.process_feature(feature)

    rs.convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=FLAGS.max_seq_length,
        doc_stride=FLAGS.doc_stride,
        max_query_length=FLAGS.max_query_length,
        is_training=False,
        output_fn=append_feature)
    eval_writer.close()

    tf.logging.info("***** Running predictions *****")
    tf.logging.info("  Num orig examples = %d", len(eval_examples))
    tf.logging.info("  Num split examples = %d", len(eval_features))
    tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

    all_results = []

    predict_input_fn = rs.input_fn_builder(
        input_file=eval_writer.filename,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=False)
    
    # If running eval on the TPU, you will need to specify the number of
    # steps.
    for idx, result in enumerate(estimator.predict(
        predict_input_fn, yield_single_examples=True)):
      if len(all_results) % 1000 == 0:
        tf.logging.info("Processing example: %d" % (len(all_results)))
      unique_id = int(result["unique_ids"])
      start_logits = [float(x) for x in result["start_logits"].flat]
      end_logits = [float(x) for x in result["end_logits"].flat]
      all_results.append(
          rs.RawResult(
              unique_id=unique_id,
              start_logits=start_logits[:act_seq_len[idx]],
              end_logits=end_logits[:act_seq_len[idx]]))

    output_prediction_file = os.path.join(FLAGS.output_dir, "predictions.json")
    output_nbest_file = os.path.join(FLAGS.output_dir, "nbest_predictions.json")
    output_null_log_odds_file = os.path.join(FLAGS.output_dir, "null_odds.json")

    rs.write_predictions(eval_examples, eval_features, all_results,
                         FLAGS.n_best_size, FLAGS.max_answer_length,
                         FLAGS.do_lower_case, output_prediction_file,
                         output_nbest_file, output_null_log_odds_file)
Example #17
# Predict all tokens
start_logits, end_logits = model(tokens_tensor, segments_tensors,input_mask)
start_ind=torch.argmax(start_logits).item()
end_ind=torch.argmax(end_logits).item()

print(all_tokens[start_ind:end_ind+1])




#
#Messing around, trying to recreate what happened in run_squad.py

predict_file='/data/squad/dev-v1.1.json'
#eval_examples is a list of 10570 'SquadExample' objects
#each object contains fields for qas_id, question_text, and doc_tokens, 
eval_examples = run_squad.read_squad_examples(input_file=predict_file, is_training=False)

eval_features = run_squad.convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=384,
            doc_stride=128,
            max_query_length=64,
            is_training=False)

#write_predictions(eval_examples, eval_features, all_results,
#                          args.n_best_size, args.max_answer_length,
#                          args.do_lower_case, output_prediction_file,
#                          output_nbest_file, args.verbose_logging)
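
The commented-out `write_predictions` call above needs `all_results` populated from model outputs. A sketch of how it could be filled, mirroring the loop in the `_validate_squad` example earlier on this page but simplified to one feature at a time:

# Sketch: fill all_results so the write_predictions call above can be uncommented.
model.eval()
all_results = []
for feature in eval_features:
    input_ids = torch.tensor([feature.input_ids], dtype=torch.long)
    segment_ids = torch.tensor([feature.segment_ids], dtype=torch.long)
    input_mask = torch.tensor([feature.input_mask], dtype=torch.long)
    with torch.no_grad():
        start_logits, end_logits = model(input_ids, segment_ids, input_mask)
    all_results.append(
        run_squad.RawResult(unique_id=int(feature.unique_id),
                            start_logits=start_logits[0].tolist(),
                            end_logits=end_logits[0].tolist()))
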
Example #18
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--json_file",
        default=None,
        type=str,
        help=
        "predictions jsonfile location (output of run_squad). E.g., train-v1.1.json"
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        help=
        "The output directory where the model checkpoints and predictions will be written."
    )
    parser.add_argument("--OG", action='store_true', help="test")

    args = parser.parse_args()

    with open(args.json_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)


#    if not os.path.exists(args.output_dir):
#        os.makedirs(args.output_dir)
    train_examples = run_squad.read_squad_examples(
        args.json_file, is_training=True, version_2_with_negative=True)
    max_seq_len = 384
    max_query_len = 64
    max_answer_len = 30

    exceed_seq_lens = []
    exceed_query_lens = []
    exceed_answer_lens = []

    exceed_seq_len_counter = 0
    exceed_query_len_counter = 0
    exceed_answer_len_counter = 0
    overall_counter = 0

    max_s = 0
    max_q = 0
    max_a = 0

    tokenizer = BertTokenizer.from_pretrained(
        'bert-large-uncased',
        do_lower_case=True)  # added_flag, currently hardcoded

    train_features = run_squad.convert_examples_to_features(
        examples=train_examples,
        tokenizer=tokenizer,
        max_seq_length=512,
        doc_stride=128,
        max_query_length=512,
        is_training=True)

    for example in train_features:
        overall_counter += 1
        if sum(example.input_mask) > max_seq_len:
            exceed_seq_lens.append(example.tokens)
            exceed_seq_len_counter += 1
        if sum(example.input_mask) > max_s:
            max_s = sum(example.input_mask)
        if sum(example.segment_ids_flipped) > max_query_len:
            exceed_query_lens.append(example.tokens)
            exceed_query_len_counter += 1
        if sum(example.segment_ids_flipped) > max_q:
            max_q = sum(example.segment_ids_flipped)
        if (example.end_position - example.start_position) > max_answer_len:
            exceed_answer_len_counter += 1
            exceed_answer_lens.append(example.tokens)
        if (example.end_position - example.start_position) > max_a:
            max_a = (example.end_position - example.start_position)

    print("Number of examples: %d." % overall_counter)
    print("Number of sequences that exceeded max_seq_len of %d is %d." %
          (max_seq_len, exceed_seq_len_counter))
    print("Number of queries that exceeded max_query_len of %d is %d." %
          (max_query_len, exceed_query_len_counter))
    print("Number of answers that exceeded max_answer_len of %d is %d." %
          (max_answer_len, exceed_answer_len_counter))
    print("Max seq length found was %d." % max_s)
    print("Max query length found was %d." % max_q)
    print("Max answer length found was %d." % max_a)