Пример #1
0
def load_and_cache_examples(args, tokenizer, output_examples=False):
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Load data features from cache or dataset file
    input_file = args.train_file
    cached_features_file = os.path.join(
        os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
            'dev',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length)))
    if os.path.exists(cached_features_file
                      ) and not args.overwrite_cache and not output_examples:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", input_file)
        examples = read_squad_examples(
            input_file=input_file,
            is_training=True,
            version_2_with_negative=args.version_2_with_negative)
        features = convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    all_cls_index = torch.tensor([f.cls_index for f in features],
                                 dtype=torch.long)
    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
    all_start_positions = torch.tensor([f.start_position for f in features],
                                       dtype=torch.long)
    all_end_positions = torch.tensor([f.end_position for f in features],
                                     dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_start_positions, all_end_positions,
                            all_cls_index, all_p_mask)

    if output_examples:
        return dataset, examples, features
    return dataset
Пример #2
0
    def _create_dataset(self, examples, evaluate=False, output_examples=False):
        features = convert_examples_to_features(examples=examples,
                                                tokenizer=self._tokenizer,
                                                max_seq_length=self._max_seq_length,
                                                doc_stride=self._doc_stride,
                                                max_query_length=self._max_query_length,
                                                is_training=not evaluate)

        # Convert to Tensors and build dataset
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
        all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)

        if evaluate:
            all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
            dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                    all_example_index, all_cls_index, all_p_mask)
        else:
            all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
            all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
            dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                    all_start_positions, all_end_positions,
                                    all_cls_index, all_p_mask)

        if output_examples:
            return dataset, examples, features
        return dataset
    def preprocess_data(self, path_to_stream, input_question):
        """
        preprocess input data

        params:
            path_to_stream: document content 
            input_question: input file
        """
        examples = read_squad_examples(input_stream=path_to_stream, 
                                        is_training=False, 
                                        version_2_with_negative=False,
                                        updated_question=input_question)
        features = convert_examples_to_features(examples=examples, 
                                                tokenizer=self._tokenizer,
                                                max_seq_length=192,
                                                doc_stride=128,
                                                max_query_length=64,
                                                is_training=False)
    
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
        all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)

        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_example_index, all_cls_index, all_p_mask)
        return examples, features, dataset
Пример #4
0
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Load data features from cache or dataset file
    input_file = args.predict_file if evaluate else args.train_file
    if args.model_type=='bert':
        cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_fixbug_{}'.format('dev' if evaluate else input_file.split('/')[1].split('.')[0]))
    else:
        cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_fixbug_{}_{}'.format('dev' if evaluate else input_file.split('/')[1].split('.')[0], args.model_type))
    if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", input_file)
        examples = read_squad_examples(input_file=input_file,
                                                is_training=not evaluate,
                                                version_2_with_negative=args.version_2_with_negative)
        features = convert_examples_to_features(examples=examples,
                                                tokenizer=tokenizer,
                                                max_seq_length=args.max_seq_length,
                                                doc_stride=args.doc_stride,
                                                max_query_length=args.max_query_length,
                                                is_training=not evaluate,
                                                cls_token='[CLS]' if args.model_type == 'bert' else '<s>',
                                                sep_token='[SEP]' if args.model_type == 'bert' else '</s>',
                                                pad_token=0 if args.model_type == 'bert' else 1,
                                                cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
                                                pad_token_segment_id=3 if args.model_type in ['xlnet'] else 0,
                                                cls_token_at_end=True if args.model_type in ['xlnet'] else False,
                                                sequence_a_is_doc=True if args.model_type in ['xlnet'] else False)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
    if evaluate:
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_example_index, all_cls_index, all_p_mask)
    else:
        all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)

        all_answer_masks = torch.tensor([f.answer_mask for f in features], dtype=torch.float)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_start_positions, all_end_positions, all_answer_masks,
                                all_cls_index, all_p_mask)

    if output_examples:
        return dataset, examples, features
    return dataset
Пример #5
0
def feature_extract(context, question, tokenizer):
    """ Converts context and questions to input features """
    examples = read_squad_repl(context, question)
    features = convert_examples_to_features(examples=examples,
                                            tokenizer=tokenizer,
                                            max_seq_length=384,
                                            doc_stride=128,
                                            max_query_length=64,
                                            is_training=False)

    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    all_cls_index = torch.tensor([f.cls_index for f in features],
                                 dtype=torch.long)
    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_example_index, all_cls_index, all_p_mask)
    batch = tuple(t.to("cpu") for t in dataset[0])
    inputs = {
        'input_ids': batch[0].unsqueeze(0),
        'attention_mask': batch[1].unsqueeze(0),  # XLM don't use segment_ids
        'token_type_ids': batch[2].unsqueeze(0)
    }
    example_indices = batch[3].unsqueeze(0)
    return inputs, features, examples, example_indices
Пример #6
0
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Load data features from cache or dataset file
    input_file = [args.predict_file] if evaluate else glob.glob(os.path.join(args.train_file,'*.json'))
    cached_features_file = os.path.join(os.path.dirname(input_file[0]), 'cached_meta_{}_{}_{}'.format(
        'dev' if evaluate else 'train',
        list(filter(None, args.model_name_or_path.split('/'))).pop(),
        str(args.max_seq_length)))
    if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
        logger.info("Loading features from cached file %s", cached_features_file)
        features_list = torch.load(cached_features_file)
    else:
        features_list = []
        for infile in input_file:
            logger.info("Creating features from dataset file at %s", infile)
            examples = read_squad_examples(input_file=infile,
                                                    is_training=not evaluate,
                                                    version_2_with_negative=args.version_2_with_negative)
            features_list.append(convert_examples_to_features(examples=examples,
                                                    tokenizer=tokenizer,
                                                    max_seq_length=args.max_seq_length,
                                                    doc_stride=args.doc_stride,
                                                    max_query_length=args.max_query_length,
                                                    is_training=not evaluate,
                                                    cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
                                                    pad_token_segment_id=3 if args.model_type in ['xlnet'] else 0,
                                                    cls_token_at_end=True if args.model_type in ['xlnet'] else False,
                                                    sequence_a_is_doc=True if args.model_type in ['xlnet'] else False))
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features_list, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    def build_dataset(features):
        # Convert to Tensors and build dataset
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
        all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
        if evaluate:
            all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
            dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                    all_example_index, all_cls_index, all_p_mask)
        else:
            all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
            all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
            dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                    all_start_positions, all_end_positions,
                                    all_cls_index, all_p_mask)
        return dataset

    datasets = [build_dataset(features) for features in features_list]
    return datasets
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    # Load data features from cache or dataset file
    input_file = args.predict_file if evaluate else args.train_file
    cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
        'dev' if evaluate else 'train',
        list(filter(None, args.model_name_or_path.split('/'))).pop(),
        str(args.max_seq_length)))
    if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", input_file)
        examples = read_squad_examples(input_file=input_file,
                                       is_training=not evaluate,
                                       version_2_with_negative=args.version_2_with_negative)
        features = convert_examples_to_features(examples=examples,
                                                tokenizer=tokenizer,
                                                max_seq_length=args.max_seq_length,
                                                doc_stride=args.doc_stride,
                                                max_query_length=args.max_query_length,
                                                is_training=not evaluate)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor(
        [f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor(
        [f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor(
        [f.segment_ids for f in features], dtype=torch.long)
    all_cls_index = torch.tensor(
        [f.cls_index for f in features], dtype=torch.long)
    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
    if evaluate:
        all_example_index = torch.arange(
            all_input_ids.size(0), dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_example_index, all_cls_index, all_p_mask)
    else:
        all_start_positions = torch.tensor(
            [f.start_position for f in features], dtype=torch.long)
        all_end_positions = torch.tensor(
            [f.end_position for f in features], dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_start_positions, all_end_positions,
                                all_cls_index, all_p_mask)

    if output_examples:
        return dataset, examples, features
    return dataset
Пример #8
0
def load_and_cache_examples(args,
                            tokenizer,
                            evaluate=False,
                            output_examples=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Load data features from cache or dataset file
    input_file = args.predict_file
    cached_features_file = os.path.join(
        os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
            'test',
            list(filter(None, args.save_dir.split('/'))).pop(),
            str(args.max_seq_length)))
    # if os.path.exists(cached_features_file):
    #     logger.info("Loading features from cached file %s", cached_features_file)
    #     features = torch.load(cached_features_file)
    # else:
    logger.info("Creating features from dataset file at %s", input_file)
    examples = read_squad_examples(
        input_file=input_file,
        is_training=not evaluate,
        version_2_with_negative=args.version_2_with_negative)
    features = convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=not evaluate)
    # if args.local_rank in [-1, 0]:
    #     logger.info("Saving features into cached file %s", cached_features_file)
    #     torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    all_cls_index = torch.tensor([f.cls_index for f in features],
                                 dtype=torch.long)
    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_example_index, all_cls_index, all_p_mask)

    return dataset, examples, features
Пример #9
0
    def batch_predict_top_k(self, passages, questions, k, cutoff=8):
        examples = []
        id = 0
        for passage, question in zip(passages, questions):
            example = read_squad_example(passage, question, id)
            examples.append(example)
            id += 1
        features = convert_examples_to_features(examples, self.tokenizer,
                                                self.max_seq_length,
                                                self.doc_stride,
                                                self.max_query_length, False)

        all_input_ids = torch.tensor([f.input_ids for f in features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                       dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0),
                                         dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_example_index)
        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(dataset,
                                     sampler=eval_sampler,
                                     batch_size=16)

        all_results = []
        for batch in eval_dataloader:
            batch = tuple(t.to(self.device) for t in batch)
            with torch.no_grad():
                inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
                example_indices = batch[3]
                outputs = self.model(**inputs)

            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                result = RawResult(unique_id=unique_id,
                                   start_logits=to_list(outputs[0][i]),
                                   end_logits=to_list(outputs[1][i]))
                all_results.append(result)

        answers = get_all_predictions(examples, features, all_results, k,
                                      self.max_answer_length,
                                      self.do_lower_case, True, cutoff)

        return answers
Пример #10
0
def load_and_cache_examples(args,
                            tokenizer,
                            evaluate=False,
                            output_examples=False):

    examples = read_squad_examples(
        input_data=args.input_data,
        version_2_with_negative=args.version_2_with_negative)
    features = convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    all_cls_index = torch.tensor([f.cls_index for f in features],
                                 dtype=torch.long)
    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
    if evaluate:
        all_example_index = torch.arange(all_input_ids.size(0),
                                         dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_example_index, all_cls_index, all_p_mask)
    else:
        all_start_positions = torch.tensor(
            [f.start_position for f in features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in features],
                                         dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_start_positions, all_end_positions,
                                all_cls_index, all_p_mask)

    if output_examples:
        return dataset, examples, features
    return dataset
Пример #11
0
def load_and_cache_examples(input_file, tokenizer):
    cached_features_file = 'Ecached_dev_{}_{}'.format(model_name,
                                                      str(max_seq_length))
    """ 
    if os.path.exists(cached_features_file):
        #print("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    """
    print("Creating features from dataset file at %s", input_file)
    examples = read_squad_examples(input_file=input_file,
                                   is_training=False,
                                   version_2_with_negative=False)
    features = convert_examples_to_features(examples=examples,
                                            tokenizer=tokenizer,
                                            max_seq_length=max_seq_length,
                                            doc_stride=doc_stride,
                                            max_query_length=max_query_length,
                                            is_training=False)
    print("Saving features into cached file %s", cached_features_file)
    torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    all_cls_index = torch.tensor([f.cls_index for f in features],
                                 dtype=torch.long)
    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)

    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_example_index, all_cls_index, all_p_mask)
    return dataset, examples, features
Пример #12
0
def load_and_cache_examples(args,
                            tokenizer,
                            evaluate=False,
                            output_examples=False):
    if args.local_rank not in [
            -1, 0
    ] and not evaluate and not args.no_distributed_training:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Load data features from cache or dataset file
    # ALON adding a different path for the feature cache...
    if not os.path.exists('data/'):
        os.mkdir('data/')
    if not os.path.exists('data/pytorch_transformers_cache'):
        os.mkdir('data/pytorch_transformers_cache')
    # Load data features from cache or dataset file
    input_file = args.predict_file if evaluate else args.train_file
    cached_features_file = os.path.join(
        'data/pytorch_transformers_cache/', 'cached_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length)))
    if os.path.exists(cached_features_file
                      ) and not args.overwrite_cache and not output_examples:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", input_file)
        examples = read_squad_examples(
            input_file=input_file,
            is_training=not evaluate,
            version_2_with_negative=args.version_2_with_negative)

        # ALON support sampling the input and the evaluation
        if args.sample_size != -1:
            examples = random.sample(examples, k=args.sample_size)

        features = convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate)
        # TODO features should be recalc when data changed
        #if args.local_rank in [-1, 0] or args.no_distributed_training:
        #    logger.info("Saving features into cached file %s", cached_features_file)
        #    torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate and not args.no_distributed_training:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    all_cls_index = torch.tensor([f.cls_index for f in features],
                                 dtype=torch.long)
    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
    if evaluate:
        all_example_index = torch.arange(all_input_ids.size(0),
                                         dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_example_index, all_cls_index, all_p_mask)
    else:
        all_start_positions = torch.tensor(
            [f.start_position for f in features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in features],
                                         dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_start_positions, all_end_positions,
                                all_cls_index, all_p_mask)

    if output_examples:
        return dataset, examples, features
    return dataset
Пример #13
0
def load_and_cache_examples(args,
                            tokenizer,
                            evaluate=False,
                            output_examples=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    examples = None
    features = None
    # Load data features from cache or dataset file
    input_file = args.predict_file if evaluate else args.train_file
    cached_features_file = os.path.join(
        os.path.dirname(input_file),
        "cached_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file
                      ) and not args.overwrite_cache and not output_examples:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", input_file)

        examples = read_squad_examples(input_file=input_file,
                                       is_training=not evaluate,
                                       do_lower_case=args.do_lower_case)
        features = convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
        )

        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    if evaluate:
        all_example_index = torch.arange(all_input_ids.size(0),
                                         dtype=torch.long)
        dataset = TensorDataset(
            all_input_ids,
            all_input_mask,
            all_segment_ids,
            all_example_index,
        )
    else:
        all_start_positions = torch.tensor(
            [f.start_position for f in features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in features],
                                         dtype=torch.long)
        dataset = TensorDataset(
            all_input_ids,
            all_input_mask,
            all_segment_ids,
            all_start_positions,
            all_end_positions,
        )

    if output_examples:
        return dataset, examples, features
    return dataset
Пример #14
0
def load_and_cache_examples(args,
                            tokenizer,
                            evaluate=False,
                            output_examples=False):
    """ read_squad_data + convert_to_feature + convert_to_TensorDataset
    
    Arguments:
        args {[type]} -- [description]
        tokenizer {[type]} -- [description]
    
    Keyword Arguments:
        evaluate {bool} -- [description] (default: {False})
        output_examples {bool} -- 是否也返回 examples 和 features (default: {False})
    
    Returns:
        [type] -- [description]
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Load data features from cache or dataset file
    input_file = args.predict_file if evaluate else args.train_file
    cached_features_file = os.path.join(
        os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length)))
    if os.path.exists(cached_features_file
                      ) and not args.overwrite_cache and not output_examples:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", input_file)
        examples = read_squad_examples(
            input_file=input_file,
            is_training=not evaluate,
            version_2_with_negative=args.version_2_with_negative)
        features = convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    all_cls_index = torch.tensor([f.cls_index for f in features],
                                 dtype=torch.long)
    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
    if evaluate:
        all_example_index = torch.arange(all_input_ids.size(0),
                                         dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_example_index, all_cls_index, all_p_mask)
    else:
        all_start_positions = torch.tensor(
            [f.start_position for f in features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in features],
                                         dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_start_positions, all_end_positions,
                                all_cls_index, all_p_mask)

    if output_examples:
        return dataset, examples, features
    return dataset
Пример #15
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--job_id", default='tmp', type=str, help='Jobid to save training logs')
    parser.add_argument("--data_dir",default=None,type=str,help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--teacher_model",default=None,type=str,help="The teacher model dir.")
    parser.add_argument("--student_model",default=None,type=str,help="The student model dir.")
    parser.add_argument("--output_dir",default='output',type=str,help="The output directory where the model predictions and checkpoints will be written.")

    # default params for SQuAD
    parser.add_argument('--version_2_with_negative', 
                        action='store_true')
    parser.add_argument("--max_seq_length",
                        default=384,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than this will "
                             "be truncated to this length.")
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the nbest_predictions.json "
                             "output file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is needed because the start "
                             "and end predictions are not conditioned on one another.")
    parser.add_argument('--null_score_diff_threshold',
                        type=float, default=0.0,
                        help="If null_score - best_non_null is greater than the threshold predict null.")
    
    parser.add_argument("--batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=2e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--weight_decay', '--wd',
                        default=1e-4,
                        type=float,
                        metavar='W',
                        help='weight decay')
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument('--eval_step', type=int, default=200)
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    
    parser.add_argument('--do_eval',default = 0,type=int)
     # distillation params
    parser.add_argument('--aug_train', action='store_true',
                        help="Whether using data augmentation or not")
    parser.add_argument('--kd_type', default='no_kd', choices=['no_kd', 'two_stage', 'logit_kd', 'joint_kd'],
                        help="choose one of the kd type")
    parser.add_argument('--distill_logit', action='store_true',
                        help="Whether using distillation over logits or not")
    parser.add_argument('--distill_rep_attn', action='store_true',
                        help="Whether using distillation over reps and attns or not")
    parser.add_argument('--temperature', type=float, default=1.)
    # quantization params
    parser.add_argument("--weight_bits", default=32, type=int, help="number of bits for weight")
    parser.add_argument("--weight_quant_method", default='twn', type=str,
                        choices=['twn', 'bwn', 'uniform', 'laq'],
                        help="weight quantization methods, can be bwn, twn, laq")
    parser.add_argument("--input_bits",  default=32, type=int,
                        help="number of bits for activation")
    parser.add_argument("--input_quant_method", default='uniform', type=str, choices=['uniform', 'lsq'],
                        help="weight quantization methods, can be bwn, twn, or symmetric quantization for default")

    parser.add_argument('--learnable_scaling', action='store_true', default=True)
    parser.add_argument("--ACT2FN", default='gelu', type=str,
                        help='activation fn for ffn-mid. A8 uses uq + gelu; A4 uses lsq + relu.')
    # training config
    parser.add_argument('--sym_quant_ffn_attn', action='store_true',
                        help='whether use sym quant for attn score and ffn after act') # default asym
    parser.add_argument('--sym_quant_qkvo', action='store_true',  default=True,
                        help='whether use asym quant for Q/K/V and others.') # default sym
    # layerwise quantization config
    parser.add_argument('--clip_init_file', default='threshold_std.pkl', help='files to restore init clip values.')
    parser.add_argument('--clip_init_val', default=2.5, type=float, help='init value of clip_vals, default to (-2.5, +2.5).')
    parser.add_argument('--clip_lr', default=1e-4, type=float, help='Use a seperate lr for clip_vals / stepsize')
    parser.add_argument('--clip_wd', default=0.0, type=float, help='weight decay for clip_vals / stepsize')

    # layerwise quantization config
    parser.add_argument('--embed_layerwise', default=False, type=lambda x: bool(int(x)))
    parser.add_argument('--weight_layerwise', default=True, type=lambda x: bool(int(x)))
    parser.add_argument('--input_layerwise', default=True, type=lambda x: bool(int(x)))

    ### spliting
    parser.add_argument('--split', action='store_true',
                        help='whether to conduct tws spliting. NOTE this is only for training binarybert')
    parser.add_argument('--is_binarybert', action='store_true',
                        help='whether to use binarybert modelling.')

    args = parser.parse_args()

    log_dir = os.path.join(args.output_dir, 'record_%s.log' % args.job_id)
    init_logging(log_dir)
    print_args(vars(args))

    # Prepare devices
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()
    logging.info("device: {} n_gpu: {}".format(device, n_gpu))

    # Prepare seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    tokenizer = BertTokenizer.from_pretrained(args.teacher_model, do_lower_case=True)
    config = BertConfig.from_pretrained(args.teacher_model)
    config.num_labels = 2

    student_config = copy.deepcopy(config)
    student_config.weight_bits = args.weight_bits
    student_config.input_bits = args.input_bits
    student_config.weight_quant_method = args.weight_quant_method
    student_config.input_quant_method = args.input_quant_method
    student_config.clip_init_val = args.clip_init_val
    student_config.learnable_scaling = args.learnable_scaling
    student_config.sym_quant_qkvo = args.sym_quant_qkvo
    student_config.sym_quant_ffn_attn = args.sym_quant_ffn_attn
    student_config.embed_layerwise = args.embed_layerwise
    student_config.weight_layerwise = args.weight_layerwise
    student_config.input_layerwise = args.input_layerwise
    student_config.hidden_act = args.ACT2FN
    logging.info("***** Training data *****")
    input_file = 'train-v2.0.json' if args.version_2_with_negative else 'train-v1.1.json'
    input_file = os.path.join(args.data_dir,input_file)

    if os.path.exists(input_file+'.features.pkl'):
        logging.info("  loading from cache %s", input_file+'.features.pkl')
        train_features = pickle.load(open(input_file+'.features.pkl', 'rb'))
    else:
        _, train_examples = read_squad_examples(input_file=input_file, is_training=True, 
                                                version_2_with_negative=args.version_2_with_negative)
        train_features = convert_examples_to_features(
                examples=train_examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride,
                max_query_length=args.max_query_length,
                is_training=True)
        pickle.dump(train_features, open(input_file+'.features.pkl','wb'))
        
    args.batch_size = args.batch_size // args.gradient_accumulation_steps
    num_train_optimization_steps = int(
        len(train_features) / args.batch_size / args.gradient_accumulation_steps) * args.num_train_epochs

    logging.info("  Num examples = %d", len(train_features))
    logging.info("  Num total steps = %d", num_train_optimization_steps)
    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
    all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
    all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_start_positions, all_end_positions)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.batch_size)

    logging.info("***** Evaluation data *****")
    input_file = 'dev-v2.0.json' if args.version_2_with_negative else 'dev-v1.1.json'
    args.dev_file = os.path.join(args.data_dir,input_file)
    dev_dataset, eval_examples = read_squad_examples(
                        input_file=args.dev_file, is_training=False,
                        version_2_with_negative=args.version_2_with_negative)
    eval_features = convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=False)

    logging.info("  Num examples = %d", len(eval_features))

    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)

    if not args.do_eval:
        from transformer.modeling_dynabert import BertForQuestionAnswering
        teacher_model = BertForQuestionAnswering.from_pretrained(args.teacher_model, config = config)
        teacher_model.to(device)
        if n_gpu > 1:
            teacher_model = torch.nn.DataParallel(teacher_model)

    if args.split:
        # rename the checkpoint to restore
        split_model_dir = os.path.join(args.output_dir,'binary_model_init')
        if not os.path.exists(split_model_dir):
            os.mkdir(split_model_dir)
        # copy the json file, avoid over-writing
        source_model_dir = os.path.join(args.student_model, CONFIG_NAME)
        target_model_dir = os.path.join(split_model_dir, CONFIG_NAME)
        os.system('cp -v %s %s' % (source_model_dir, target_model_dir))

        # create the split model ckpt
        source_model_dir = os.path.join(args.student_model, WEIGHTS_NAME)
        target_model_dir = os.path.join(split_model_dir, WEIGHTS_NAME)
        target_model_dir = tws_split(source_model_dir, target_model_dir)
        args.student_model = split_model_dir  # over-write student_model dir
        print("transformed binary model stored at: {}".format(target_model_dir))

    if args.is_binarybert:
        from transformer.modeling_dynabert_binary import BertForQuestionAnswering
        student_model = BertForQuestionAnswering.from_pretrained(args.student_model, config=student_config)
    else:
        from transformer.modeling_dynabert_quant import BertForQuestionAnswering
        student_model = BertForQuestionAnswering.from_pretrained(args.student_model, config=student_config)
    student_model.to(device)
    if n_gpu > 1:
        student_model = torch.nn.DataParallel(student_model)

    learner = KDLearner(args, device, student_model, teacher_model,num_train_optimization_steps)

    if args.do_eval:
        """ evaluation """
        learner.eval(student_model, eval_dataloader, eval_features, eval_examples, dev_dataset)
        return 0

    """ perform training """
    if args.kd_type == 'joint_kd':
        learner.args.distill_logit = True
        learner.args.distill_rep_attn = True
        learner.build()
        learner.train(train_dataloader, eval_dataloader, eval_features, eval_examples, dev_dataset)

    elif args.kd_type == 'logit_kd':
        # only perform the logits kd
        learner.args.distill_logit = True
        learner.args.distill_rep_attn = False
        learner.build(lr=args.learning_rate)
        learner.train(train_dataloader, eval_dataloader, eval_features, eval_examples, dev_dataset)

    elif args.kd_type == 'two_stage':
        # stage 1: intermediate layer distillation
        learner.args.distill_logit = False
        learner.args.distill_rep_attn = True
        learner.build(lr=2.5*args.learning_rate)
        learner.train(train_dataloader, eval_dataloader, eval_features, eval_examples, dev_dataset)

        # stage 2: prediction layer distillation
        learner.student_model.load_state_dict(torch.load(os.path.join(learner.output_dir,WEIGHTS_NAME)))
        learner.args.distill_logit = True
        learner.args.distill_rep_attn = False

        learner.build(lr=args.learning_rate)  # prepare the optimizer again.
        learner.train(train_dataloader, eval_dataloader, eval_features, eval_examples, dev_dataset)

    else:
        assert args.kd_type == 'no_kd'
        # NO kd training, vanilla cross entropy with hard label
        learner.build(lr=args.learning_rate)  # prepare the optimizer again.
        learner.train(train_dataloader, eval_dataloader, eval_features, eval_examples, dev_dataset)

    del learner
    return 0
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    global_rank = -1 if args.local_rank == -1 else torch.distributed.get_rank()
    if global_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Load data features from cache or dataset file
    input_file = args.predict_file if evaluate else args.train_file
    cache_filename = 'cached.split={}.mn={}.msl={}.mql={}'.format(
        '.'.join(input_file.split('/')[-1].split('.')[:-1]),
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        str(args.max_seq_length),
        str(args.max_query_length))
    if args.num_shards != 1:
        cache_filename += '.num_shards={}.shard_no={}'.format(args.num_shards, args.shard_no)
    cached_features_file = os.path.join(os.path.dirname(input_file), cache_filename)
    print('Cached features file {} exists: {}'.format(cached_features_file, str(os.path.exists(cached_features_file))))
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        if args.num_shards == 1:
            logger.info("Loading features from cached file %s", cached_features_file)
            features = torch.load(cached_features_file)
        else:
            logger.info("Loading features from cached files %s", cached_features_file)
            features = []
            for shard_no in tqdm(range(args.num_shards)):
                features += torch.load(cached_features_file[:-1] + str(shard_no))
        if output_examples:
            logger.info("Reading examples from file %s", input_file)
            examples = read_squad_examples(input_file=input_file,
                                                    is_training=not evaluate,
                                                    version_2_with_negative=args.version_2_with_negative)
    else:
        logger.info("Reading examples from dataset file at %s", input_file)
        examples = read_squad_examples(input_file=input_file,
                                                is_training=not evaluate,
                                                version_2_with_negative=args.version_2_with_negative)
        logger.info("Creating features from dataset file at %s", input_file)
        features = convert_examples_to_features(examples=examples,
                                                tokenizer=tokenizer,
                                                max_seq_length=args.max_seq_length,
                                                doc_stride=args.doc_stride,
                                                max_query_length=args.max_query_length,
                                                is_training=not evaluate,
                                                cls_token_at_end=bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
                                                cls_token=tokenizer.cls_token,
                                                cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
                                                sep_token=tokenizer.sep_token,
                                                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                                                pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
                                                num_shards=args.num_shards,
                                                shard_no=args.shard_no)
        if global_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
    if evaluate:
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_example_index, all_cls_index, all_p_mask)
    else:
        all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_start_positions, all_end_positions,
                                all_cls_index, all_p_mask)

    if output_examples:
        return dataset, examples, features
    return dataset
Пример #17
0
def load_and_cache_examples(args,
                            tokenizer,
                            evaluate=False,
                            output_examples=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Load data features from cache or dataset file
    input_file = args.predict_file if evaluate else args.train_file
    cached_features_file = os.path.join(
        os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length)))
    if os.path.exists(cached_features_file
                      ) and not args.overwrite_cache and not output_examples:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", input_file)
        examples = read_squad_examples(
            input_file=input_file,
            is_training=not evaluate,
            version_2_with_negative=args.version_2_with_negative)
        features = convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            cls_token_at_end=bool(args.model_type in ['xlnet']),
            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token
                                                       ])[0],
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
            sep_token_extra=bool(args.model_type in ['roberta']),
            add_prefix_space=bool(args.model_type in ['roberta']))

        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    all_cls_index = torch.tensor([f.cls_index for f in features],
                                 dtype=torch.long)
    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
    if evaluate:
        all_example_index = torch.arange(all_input_ids.size(0),
                                         dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_example_index, all_cls_index, all_p_mask)
    else:
        all_start_positions = torch.tensor(
            [f.start_position for f in features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in features],
                                         dtype=torch.long)
        all_is_impossible = torch.tensor(
            [1.0 if f.is_impossible else 0.0 for f in features],
            dtype=torch.float)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_start_positions, all_end_positions,
                                all_is_impossible, all_cls_index, all_p_mask)

    if output_examples:
        return dataset, examples, features
    return dataset
Пример #18
0
def do_prediction(model_dir):
    # 1. Load a trained model

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  
    model = BertForQuestionAnswering.from_pretrained(model_dir)
    model.to(device)
    model.eval()

    # 2. Load and pre-process the test set

    dev_file = "data/sfu.json"
    predict_batch_size = 2
    max_seq_length = 384

    eval_examples = read_squad_examples(input_file=dev_file, is_training=False, version_2_with_negative=False)

    tokenizer = BertTokenizer.from_pretrained(model_dir)
    eval_features = convert_examples_to_features(
                examples=eval_examples,
                tokenizer=tokenizer,
                max_seq_length=max_seq_length,
                doc_stride=128,
                max_query_length=64,
                is_training=False)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)

    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=predict_batch_size)

    # 3. Run inference on the test set

    all_results = []
    for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader):

        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        with torch.no_grad():       
            batch_start_logits, batch_end_logits = model(input_ids, input_mask, segment_ids)
                
        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()
            eval_feature = eval_features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            all_results.append(RawResult(unique_id=unique_id,
                                                 start_logits=start_logits,
                                                 end_logits=end_logits))
            
    output_prediction_file = os.path.join(model_dir, "predictions_sfu.json")
    output_nbest_file = os.path.join(model_dir, "nbest_predictions_sfu.json")
    output_null_log_odds_file = os.path.join(model_dir, "null_odds_sfu.json")

    preds = write_predictions(eval_examples, eval_features, all_results, 20,
                          30, True, output_prediction_file,
                          output_nbest_file, output_null_log_odds_file, True,
                          False, 0.0)
Пример #19
0
def load_and_cache_examples(args,
                            tokenizer,
                            evaluate=False,
                            output_examples=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Load data features from cache or dataset file
    input_file = args.predict_file if evaluate else args.train_file
    print("input_file", input_file)

    ebay_input_file = None if evaluate else args.ebay_file

    cached_features_file = os.path.join(
        os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length)))
    print("cached_features_file", cached_features_file)
    # sys.exit(0)
    if os.path.exists(cached_features_file
                      ) and not args.overwrite_cache and not output_examples:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", input_file)

        # examples=[]
        examples = read_squad_examples(
            input_file=input_file,
            is_training=not evaluate,
            version_2_with_negative=args.version_2_with_negative)
        # print(examples[0])
        ## Mehdi ToDo ADD EBAY DATA TO features
        if ebay_input_file:
            ebay_examples = read_ebay_examples(
                input_file=ebay_input_file,
                is_training=not evaluate,
                version_2_with_negative=args.version_2_with_negative)
            examples.extend(ebay_examples)

        print("done reading examples")

        # sys.exit(0)
        features = convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate)

        print("done featureizing")
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(features, cached_features_file)
    # sys.exit(0)

    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    all_cls_index = torch.tensor([f.cls_index for f in features],
                                 dtype=torch.long)
    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
    if evaluate:
        all_example_index = torch.arange(all_input_ids.size(0),
                                         dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_example_index, all_cls_index, all_p_mask)
    else:
        all_start_positions = torch.tensor(
            [f.start_position for f in features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in features],
                                         dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_start_positions, all_end_positions,
                                all_cls_index, all_p_mask)

    if output_examples:
        return dataset, examples, features
    return dataset