def create_batch_iter(mode):
    """构造迭代器"""
    processor, tokenizer = init_params()
    if mode == "train":
        examples = processor.get_train_examples(args.data_dir)

        num_train_steps = int(
            len(examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)
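        # e.g. (illustrative numbers) 1,000 examples, batch size 32,
        # grad-accum 2, 3 epochs: int(1000 / 32 / 2 * 3) = int(46.875) = 46 steps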

        batch_size = args.train_batch_size

        logger.info("  Num steps = %d", num_train_steps)

    elif mode == "dev":
        examples = processor.get_dev_examples(args.data_dir)
        batch_size = args.eval_batch_size
    else:
        raise ValueError("Invalid mode %s" % mode)

    label_list = processor.get_labels()

    # Features
    features = convert_examples_to_features(examples, label_list,
                                            args.max_seq_length, tokenizer)

    logger.info("  Num examples = %d", len(examples))
    logger.info("  Batch size = %d", batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features],
                                 dtype=torch.long)
    all_output_mask = torch.tensor([f.output_mask for f in features],
                                   dtype=torch.long)

    # Dataset
    data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                         all_label_ids, all_output_mask)

    if mode == "train":
        sampler = RandomSampler(data)
    elif mode == "dev":
        sampler = SequentialSampler(data)
    else:
        raise ValueError("Invalid mode %s" % mode)

    # Iterator
    iterator = DataLoader(data, sampler=sampler, batch_size=batch_size)

    if mode == "train":
        return iterator, num_train_steps
    elif mode == "dev":
        return iterator
    else:
        raise ValueError("Invalid mode %s" % mode)
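A minimal usage sketch, not part of the example itself: it assumes the repo's module-level args, logger and init_params are already configured. Note the asymmetry of the return value: "train" mode returns an (iterator, num_train_steps) tuple, while "dev" returns only the iterator.

# Hypothetical usage; assumes the surrounding repo's globals are set up.
train_iter, num_train_steps = create_batch_iter("train")
dev_iter = create_batch_iter("dev")
for input_ids, input_mask, segment_ids, label_ids, output_mask in train_iter:
    # each batch unpacks into the five tensors assembled above
    break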
Example #2
def create_batch_iter(mode):
    """Construct the data iterator (QA version)."""
    tokenizer = init_params()
    if mode == "train":
        examples = read_qa_examples(args.data_dir, "train")
        batch_size = args.train_batch_size
    elif mode == "dev":
        examples = read_qa_examples(args.data_dir, "dev")
        batch_size = args.eval_batch_size
    else:
        raise ValueError("Invalid mode %s" % mode)

    # Features (is_training=True so start/end positions are built for the
    # dev split too; the TensorDataset below needs them in both modes)
    features = convert_examples_to_features(examples,
                                            tokenizer,
                                            args.max_seq_length,
                                            args.doc_stride,
                                            args.max_query_length,
                                            is_training=True)

    logger.info("  Num Features = %d", len(features))
    logger.info("  Batch size = %d", batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    start_positions = torch.tensor([f.start_position for f in features],
                                   dtype=torch.long)
    end_positions = torch.tensor([f.end_position for f in features],
                                 dtype=torch.long)
    answer_types = torch.tensor([f.answer_type for f in features],
                                dtype=torch.long)

    # Dataset
    data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                         start_positions, end_positions, answer_types)

    if mode == "train":
        num_train_steps = int(
            len(features) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

        batch_size = args.train_batch_size

        logger.info("  Num steps = %d", num_train_steps)
        if args.local_rank == -1:
            sampler = RandomSampler(data)
        else:
            sampler = DistributedSampler(data)
    elif mode == "dev":
        sampler = SequentialSampler(data)
    else:
        raise ValueError("Invalid mode %s" % mode)

    # Iterator
    iterator = DataLoader(data, sampler=sampler, batch_size=batch_size)

    if mode == "train":
        return iterator, num_train_steps
    elif mode == "dev":
        return iterator
    else:
        raise ValueError("Invalid mode %s" % mode)
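A minimal usage sketch for this QA variant, under the same assumptions about the repo's globals. Each batch unpacks into the six tensors placed in the TensorDataset above:

# Hypothetical usage; assumes args.data_dir holds prepared train/dev files.
train_iter, num_train_steps = create_batch_iter("train")
for (input_ids, input_mask, segment_ids,
     start_positions, end_positions, answer_types) in train_iter:
    # feed to a model with start/end span heads and an answer-type head
    break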
Example #3
def make_predict(model, tokenizer, data_dir):
    # read_squad_data(test_raw_data, data_dir, is_training=False)
    eval_examples = read_qa_examples(data_dir, corpus_type="test")
    eval_features = convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=False)

    logger.info("***** Running predictions *****")
    logger.info("  Num orig examples = %d", len(eval_examples))
    logger.info("  Num split examples = %d", len(eval_features))
    logger.info("  Batch size = %d", args.predict_batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)
    # Run the model. Note that one example can turn into more than one feature
    # in convert_examples_to_features (sliding-window splitting), so one
    # example index may map to several features and therefore several
    # prediction results; see the windowing sketch after this function.
    model.eval()
    all_results = []
    logger.info("Start evaluating")
    RawResult = collections.namedtuple("RawResult",
                                       ["unique_id", "start_logits", "end_logits", "answer_type_logits"])
    for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating",
                                                                    disable=args.local_rank not in [-1, 0]):
        if len(all_results) % 1000 == 0:
            logger.info("Processing example: %d" % (len(all_results)))
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        with torch.no_grad():
            batch_start_logits, batch_end_logits, batch_answer_type_logits = model(input_ids, segment_ids, input_mask)
        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()
            answer_type_logits = batch_answer_type_logits[i].detach().cpu().tolist()
            eval_feature = eval_features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            all_results.append(RawResult(unique_id=unique_id,
                                         start_logits=start_logits,
                                         end_logits=end_logits,
                                         answer_type_logits=answer_type_logits))

    output_prediction_file = os.path.join(args.output_dir, "predictions.json")
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
    output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
    # post process
    write_predictions(eval_examples, eval_features, all_results,
                      args.n_best_size, args.max_answer_length,
                      args.do_lower_case, output_prediction_file,
                      output_nbest_file, output_null_log_odds_file, args.verbose_logging,
                      args.version_2_with_negative, args.null_score_diff_threshold)
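A standalone sketch, not from the repo, of the sliding-window behavior mentioned in the comment inside make_predict: with doc_stride, one long example is split into overlapping windows, each of which becomes its own feature, so a single example_index can map to several RawResults. The numbers below are illustrative assumptions.

# Illustrative only: the real splitting lives in convert_examples_to_features.
doc_len = 1000            # tokens in one long document
max_tokens_for_doc = 300  # space left after [CLS] query [SEP] ... [SEP]
doc_stride = 128

spans, start = [], 0
while start < doc_len:
    length = min(max_tokens_for_doc, doc_len - start)
    spans.append((start, start + length))
    if start + length == doc_len:
        break
    start += doc_stride

print(len(spans), spans[:3])  # one example -> several overlapping features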
Example #4
from Io.data_loader import create_batch_iter
from preprocessing.data_processor import read_squad_data, convert_examples_to_features, read_qa_examples
from pytorch_pretrained_bert.tokenization import BertTokenizer
from predict.predict import main
#from pytorch_pretrained_bert.modeling import BertPreTrainedModel
if __name__ == "__main__":
    read_squad_data("data/small_train_data.json", "data/")
    examples = read_qa_examples("data/", "train")
    print(len(examples))
    features = convert_examples_to_features(
        examples,
        tokenizer=BertTokenizer("pretrained_model/vocab.txt"),
        max_seq_length=512,
        doc_stride=500,
        max_query_length=32,
        is_training=True)
    print(len(features))

    # main()