Example #1
    def load_squad_for_bert(self):
        if args['datasetsize'] == '1.1':
            self.basedir = '../MR/SQUAD/'
            self.corpus_file_train = self.basedir + 'train-v1.1.json'
            self.corpus_file_dev = self.basedir + 'dev-v1.1.json'
            self.data_dump_path = args['rootDir'] + '/SQUAD_1_bert.pkl'
            # self.vocfile = args['rootDir'] + '/voc_squad_1.txt'
            self.processor = SquadV1Processor()
        elif args['datasetsize'] == '2.0':
            self.basedir = '../MR/SQUAD/'
            self.corpus_file_train = self.basedir + 'train-v2.0.json'
            self.corpus_file_dev = self.basedir + 'dev-v2.0.json'
            self.data_dump_path = args['rootDir'] + '/SQUAD_2_bert.pkl'
            # self.vocfile = args['rootDir'] + '/voc_squad_2.txt'
            self.processor = SquadV2Processor()

        datasetExist = os.path.isfile(self.data_dump_path)
        if not datasetExist:
            datasets = {'train': {}, 'dev': {}, 'test': {}}
            examples = self.processor.get_train_examples(
                self.basedir, filename=os.path.basename(self.corpus_file_train))
            features, data = squad_convert_examples_to_features(
                examples=examples,
                tokenizer=self.tokenizer,
                max_seq_length=384,
                doc_stride=128,
                max_query_length=64,
                is_training=True,
                return_dataset="pt",
                threads=1,
            )
            datasets['train']['dataset'] = data
            datasets['train']['features'] = features
            datasets['train']['examples'] = examples

            examples_dev = self.processor.get_dev_examples(
                self.basedir, filename=os.path.basename(self.corpus_file_dev))
            features_dev, data_dev = squad_convert_examples_to_features(
                examples=examples_dev,
                tokenizer=self.tokenizer,
                max_seq_length=384,
                doc_stride=128,
                max_query_length=64,
                is_training=False,
                return_dataset="pt",
                threads=1,
            )
            datasets['dev']['dataset'] = data_dev
            datasets['dev']['features'] = features_dev
            datasets['dev']['examples'] = examples_dev
            print('Saving dataset...')
            self.saveDataset(self.data_dump_path, datasets,
                             dataonly=True)  # Saving the processed PyTorch features

        else:
            datasets = self.loadDataset(self.data_dump_path, dataonly=True)
            # print('train size:\t', len(dataset['train']))
            # print('test size:\t', len(dataset['test']))
            print('loaded')
        return datasets
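A note on consuming the result: with return_dataset="pt", each datasets[split]['dataset'] is a torch TensorDataset, so it can be wrapped directly in a DataLoader. A minimal sketch, assuming a hypothetical `loader` instance that exposes the method above:

from torch.utils.data import DataLoader, RandomSampler

datasets = loader.load_squad_for_bert()
train_dataset = datasets['train']['dataset']  # TensorDataset of encoded SQuAD features
train_loader = DataLoader(train_dataset,
                          sampler=RandomSampler(train_dataset),
                          batch_size=8)
for batch in train_loader:
    input_ids, attention_mask, token_type_ids = batch[0], batch[1], batch[2]
    break  # one batch is enough for the sketch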
Example #2
def load_and_cache_examples(tokenizer, is_training=True):
    # Load data features from cache or dataset file
    cached_features_file = "cached_{}".format("train" if is_training else "dev")

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file):
        print("Loading features from cached file ", cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        print("Creating features from dataset file")
        
        if is_training:
            examples = SquadV1Processor().get_train_examples('')
        else:
            examples = SquadV1Processor().get_dev_examples('')

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=max_seq_length,
            doc_stride=128,
            max_query_length=64,
            is_training=is_training,
            return_dataset="pt")

        print("Saving features into cached file", cached_features_file)
        torch.save({"features": features, "dataset": dataset, "examples": examples}, cached_features_file)

    return dataset, examples, features
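Usage sketch for the helper above: the first call builds the features and writes the cache file, later calls reload it with torch.load. This assumes max_seq_length is defined at module level and the SQuAD v1.1 JSON files sit in the working directory (the processor's default file names are used because no filename is passed):

from transformers import BertTokenizer

max_seq_length = 384
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

train_dataset, train_examples, train_features = load_and_cache_examples(tokenizer, is_training=True)
dev_dataset, dev_examples, dev_features = load_and_cache_examples(tokenizer, is_training=False)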
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes the dataset; the others will use the cache
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    input_file = args.predict_file if evaluate else args.train_file
    cached_features_file = os.path.join(
        os.path.dirname(input_file),
        "cached_distillation_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features_and_dataset = torch.load(cached_features_file)

        try:
            features, dataset, examples = (
                features_and_dataset["features"],
                features_and_dataset["dataset"],
                features_and_dataset["examples"],
            )
        except KeyError:
            raise DeprecationWarning(
                "You seem to be loading features from an older version of this script please delete the "
                "file %s in order for it to be created again" % cached_features_file
            )
    else:
        logger.info("Creating features from dataset file at %s", input_file)
        processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
        if evaluate:
            examples = processor.get_dev_examples(args.data_dir, filename=args.predict_file)
        else:
            examples = processor.get_train_examples(args.data_dir, filename=args.train_file)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )

        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save({"features": features, "dataset": dataset, "examples": examples}, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training processes the dataset; the others will use the cache
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
Example #4
def convert_to_features(in_file,
                        evaluate,
                        doc_stride,
                        max_query_length,
                        max_seq_length,
                        num_workers,
                        tokenizer,
                        debug_features=False,
                        v2=False):
    processor = SquadV2Processor() if v2 else SquadV1Processor()
    data_dir = os.path.dirname(in_file)
    file_name = os.path.basename(in_file)
    if doc_stride >= max_seq_length - max_query_length:
        logger.warning(
            "WARNING - You've set a doc stride which may be superior to the document length in some "
            "examples. This could result in errors when building features from the examples. Please reduce the doc "
            "stride or increase the maximum length to ensure the features are correctly built."
        )
    if evaluate:
        examples = processor.get_dev_examples(data_dir, filename=file_name)
    else:
        examples = processor.get_train_examples(data_dir, filename=file_name)
    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=not evaluate,
        return_dataset="pt",
        threads=num_workers,
    )
    if debug_features:
        debug_features_examples_dataset(dataset, examples, features, tokenizer)
    return dataset, examples, features
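A hedged usage sketch for convert_to_features; the file path and tokenizer checkpoint are illustrative, not part of the original project:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
dataset, examples, features = convert_to_features(
    in_file="data/dev-v1.1.json",
    evaluate=True,
    doc_stride=128,
    max_query_length=64,
    max_seq_length=384,
    num_workers=4,
    tokenizer=tokenizer,
)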
Example #5
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache

    # Load data features from cache or dataset file
    input_dir = args.data_dir if args.data_dir else "."
    # cached_features_file = os.path.join(input_dir, 'cached_{}_{}_{}'.format(
    #     'dev' if evaluate else 'train',
    #     list(filter(None, args.model_name_or_path.split('/'))).pop(),
    #     str(args.max_seq_length))
    # )
    #
    # # Init features and dataset from cache if it exists
    # if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
    #     logger.info("Loading features from cached file %s", cached_features_file)
    #     features_and_dataset = torch.load(cached_features_file)
    #     features, dataset = features_and_dataset["features"], features_and_dataset["dataset"]
    # else:
    logger.info("Creating features from dataset file at %s", input_dir)

    if not args.data_dir and ((evaluate and not args.predict_file) or (not evaluate and not args.train_file)):
        try:
            import tensorflow_datasets as tfds
        except ImportError:
            raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.")

        if args.version_2_with_negative:
            logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.")

        tfds_examples = tfds.load("squad")
        examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
    else:
        processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()

        if evaluate:
            examples = processor.get_dev_examples(args.data_dir, filename=args.predict_file)
        else:
            examples = processor.get_train_examples(args.data_dir, filename=args.train_file)

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=not evaluate,
        return_dataset='pt',
        regression=True
    )

    # if args.local_rank in [-1, 0]:
    #     logger.info("Saving features into cached file %s", cached_features_file)
    #     torch.save({"features": features, "dataset": dataset}, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache

    if output_examples:
        return dataset, examples, features
    return dataset
Example #6
def load_bert():
    # The following code is adapted from HuggingFace transformers
    # https://github.com/huggingface/transformers/blob/master/examples/run_squad.py
    # Load pretrained model and tokenizer
    config_class, model_class, tokenizer_class = (BertConfig,
                                                  BertForQuestionAnswering,
                                                  BertTokenizer)
    config = config_class.from_pretrained(model_name_or_path,
                                          cache_dir=cache_dir)
    tokenizer = tokenizer_class.from_pretrained(model_name_or_path,
                                                do_lower_case=True,
                                                cache_dir=cache_dir)
    model = model_class.from_pretrained(model_name_or_path,
                                        from_tf=False,
                                        config=config,
                                        cache_dir=cache_dir)
    # load some examples
    processor = SquadV1Processor()
    examples = processor.get_dev_examples(None, filename=predict_file)

    # Convert examples to features
    features, dataset = squad_convert_examples_to_features(
        examples=examples[:total_samples],  # convert just enough examples for this notebook
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=False,
        return_dataset='pt')
    return model, features, dataset
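load_bert relies on a handful of module-level names; a minimal sketch with illustrative values (the checkpoint and file names below are assumptions, not taken from the original notebook):

model_name_or_path = "bert-large-uncased-whole-word-masking-finetuned-squad"
cache_dir = "./cache"
predict_file = "dev-v1.1.json"   # expected in the working directory
total_samples = 100
max_seq_length, doc_stride, max_query_length = 384, 128, 64

model, features, dataset = load_bert()
print(f"converted {len(features)} features from {total_samples} examples")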
Example #7
def load_and_cache_examples(args,
                            tokenizer,
                            evaluate=False,
                            output_examples=False):
    # Load data features from cache or dataset file
    input_dir = args.data_dir
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        logger.info("Creating features from dataset file at %s", input_dir)

        # If filename is None, the processor falls back to its default SQuAD file names.
        processor = SquadV2Processor()
        if evaluate:
            examples = processor.get_dev_examples(args.data_dir, filename=None)
        else:
            examples = processor.get_train_examples(args.data_dir,
                                                    filename=None)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )

        logger.info("Saving features into cached file %s",
                    cached_features_file)
        torch.save(
            {
                "features": features,
                "dataset": dataset,
                "examples": examples
            }, cached_features_file)

    if output_examples:
        return dataset, examples, features
    return dataset
Example #8
def load_examples(args, tokenizer, evaluate=False, output_examples=False):
    # Load data features from dataset file

    # logger.info("Creating features from dataset file at %s", )

    if not args.data_dir and ((evaluate and not args.predict_file) or
                              (not evaluate and not args.train_file)):
        try:
            import tensorflow_datasets as tfds
        except ImportError:
            raise ImportError(
                "If not data_dir is specified, tensorflow_datasets needs to be installed."
            )

        if args.version_2_with_negative:
            logger.warn(
                "tensorflow_datasets does not handle version 2 of SQuAD.")

        tfds_examples = tfds.load("squad")
        examples = SquadV1Processor().get_examples_from_dataset(
            tfds_examples, evaluate=evaluate)
    else:
        processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
        if evaluate:
            examples = processor.get_dev_examples(os.path.join(
                args.data_dir, args.task),
                                                  filename=args.predict_file)
        else:
            examples = processor.get_train_examples(os.path.join(
                args.data_dir, args.task),
                                                    filename=args.train_file)

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=not evaluate,
        return_dataset="pt",
        threads=args.threads,
    )

    # logger.info("Saving features into cached file %s", cached_features_file)
    torch.save({
        "features": features,
        "dataset": dataset,
        "examples": examples
    })

    if output_examples:
        return dataset, examples, features
    return dataset
Example #9
 def _create_dataset(self, examples, evaluate=False, output_examples=False):
     # The train and eval branches were identical except for the return value,
     # so the conversion is done once and only the return differs.
     features, dataset = squad_convert_examples_to_features(
         examples=examples,
         tokenizer=self._tokenizer,
         max_seq_length=self._max_seq_length,
         doc_stride=self._doc_stride,
         max_query_length=self._max_query_length,
         is_training=not evaluate,
         return_dataset="pt")  # pytorch
     if evaluate:
         # if output_examples:
         return dataset, examples, features
     return dataset
Example #10
def load_and_cache_examples(tokenizer, evaluate=False, output_examples=False):
    # Load data features from cache or dataset file
    input_dir = "deeplearning_needed/SQUAD_data"
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, model_type.split("/"))).pop(),
            str(384),
        ),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file):
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        processor = SquadV2Processor()
        if evaluate:
            examples = processor.get_dev_examples(input_dir,
                                                  filename='dev-v2.0.json')
        else:
            examples = processor.get_train_examples(input_dir,
                                                    filename='train-v2.0.json')

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=384,
            doc_stride=128,
            max_query_length=64,
            is_training=not evaluate,
            return_dataset="pt",
            threads=1,
        )
        torch.save(
            {
                "features": features,
                "dataset": dataset,
                "examples": examples
            }, cached_features_file)

    if output_examples:
        return dataset, examples, features
    return dataset
Example #11
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    # Load data features from cache or dataset file
    input_dir = args.output_dir if args.output_dir else "."
    if not os.path.exists(input_dir):
        os.makedirs(input_dir)
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        logger.info("Creating features from dataset file at %s", input_dir)

        if evaluate:
            examples = read_squad_examples(args.predict_file, is_training=False)
        else:
            examples = read_squad_examples(args.train_file, is_training=True)
        
        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt"
        )

        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save({"features": features, "dataset": dataset, "examples": examples}, cached_features_file)

    if output_examples:
        return dataset, examples, features
    return dataset
Example #12
def load_and_cache_examples(data_dir: str,
                            tokenizer,
                            task,
                            max_seq_length,
                            doc_stride,
                            max_query_length,
                            evaluate=False):
    if (task == "SQuAD1.1"):
        train_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
        validation_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"
        train_file = "train-v1.1.json"
        validation_file = "dev-v1.1.json"
        processor = SquadV1Processor()
    elif (task == "SQuAD2.0"):
        train_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json"
        validation_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json"
        train_file = "train-v2.0.json"
        validation_file = "dev-v2.0.json"
        processor = SquadV2Processor()
    else:
        raise NameError("Incompatible dataset detected")

    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    if evaluate:
        with urllib.request.urlopen(validation_url) as url:
            with open(data_dir + "/" + validation_file, 'w') as f:
                f.write(url.read().decode())
        examples = processor.get_dev_examples(data_dir,
                                              filename=validation_file)
    else:
        with urllib.request.urlopen(train_url) as url:
            with open(data_dir + "/" + train_file, 'w') as f:
                f.write(url.read().decode())
        examples = processor.get_train_examples(data_dir, filename=train_file)

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=not evaluate,
        return_dataset="pt",
    )
    return dataset, examples, features
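Usage sketch: the helper downloads the requested SQuAD split into data_dir before featurizing it, so the first call needs network access. The tokenizer choice is illustrative:

from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
dataset, examples, features = load_and_cache_examples(
    data_dir="./squad_data",
    tokenizer=tokenizer,
    task="SQuAD2.0",
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    evaluate=True,
)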
Example #13
File: reader.py  Project: Dyc0de/DrQA
    def __transform_to_features(self, samples):
        features, dataset = squad_convert_examples_to_features(
            examples=samples,
            tokenizer=self.tokenizer,
            max_seq_length=self.max_seq,
            doc_stride=self.doc_stride,
            max_query_length=self.max_query_length,
            is_training=False,
            return_dataset='pt',
            threads=self.workers)

        sampler = SequentialSampler(dataset)
        dataloader = DataLoader(dataset,
                                sampler=sampler,
                                batch_size=self.batch_size)

        return features, dataloader
Example #14
def setup_squadv1_dataset(data_dir: str, tokenizer: nn.Module, test: bool = False, **kwargs) -> Tuple[Dataset, torch.Tensor, torch.Tensor]:
    cached_path = os.path.join(data_dir, f"{'dev' if test else 'train'}v1.pth")
    if os.path.isfile(cached_path):
        ckpt = torch.load(cached_path)
        return ckpt["dataset"], ckpt["examples"], ckpt["features"]
    
    processor   = SquadV1Processor()
    fname       = f"{'dev' if test else 'train'}-v1.1.json"
    getter      = processor.get_dev_examples if test else processor.get_train_examples
    examples    = getter(data_dir, fname)
    features, dataset  = squad_convert_examples_to_features(
        examples         = examples,
        tokenizer        = tokenizer,
        is_training      = not test,
        return_dataset   = "pt",
        **kwargs
    )

    torch.save({ "dataset": dataset, "examples": examples, "features": features }, cached_path)
    return dataset, examples, features
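Usage sketch: any extra keyword arguments are forwarded untouched to squad_convert_examples_to_features, so the usual featurization knobs can be passed through **kwargs:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
dataset, examples, features = setup_squadv1_dataset(
    "./squad_data",            # directory containing dev-v1.1.json / train-v1.1.json
    tokenizer=tokenizer,
    test=True,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
)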
Example #15
def evaluate(model, tokenizer, device, maxSequenceLength, maxQueryLength,
             documentStride):
    processor = SquadV2Processor()
    devData = processor.get_dev_examples(".")
    features, devDataset = transformers.squad_convert_examples_to_features(
        examples=devData,
        tokenizer=tokenizer,
        max_seq_length=maxSequenceLength,
        max_query_length=maxQueryLength,
        doc_stride=documentStride,
        return_dataset="pt",
        threads=1,
        is_training=False)
    batchSize = 2
    sampler = torch.utils.data.SequentialSampler(devDataset)
    dataLoader = torch.utils.data.DataLoader(devDataset,
                                             sampler=sampler,
                                             batch_size=batchSize)

    results = []
    for batch in tqdm.tqdm(dataLoader):
        model.eval()
        batch = tuple(bat.to(device) for bat in batch)
        with torch.no_grad():
            outputs = model(input_ids=batch[0],
                            attention_mask=batch[1],
                            token_type_ids=batch[2])
        exampleIndices = batch[3]  # eval batches carry feature indices, not start positions
        for i, index in enumerate(exampleIndices):
            feature = features[index.item()]
            uniqueId = int(feature.unique_id)
            output = [o[i].detach().cpu().tolist() for o in outputs]
            results.append(
                transformers.data.processors.squad.SquadResult(
                    uniqueId, output[0], output[1]))
    # compute_predictions_logits requires the full argument list; the values
    # below are typical defaults for SQuAD 2.0 evaluation.
    predictions = transformers.data.metrics.squad_metrics.compute_predictions_logits(
        devData, features, results,
        n_best_size=20,
        max_answer_length=30,
        do_lower_case=True,
        output_prediction_file="/tmp/predictions.json",
        output_nbest_file="/tmp/nbest_predictions.json",
        output_null_log_odds_file="/tmp/null_odds.json",
        verbose_logging=False,
        version_2_with_negative=True,
        null_score_diff_threshold=0.0,
        tokenizer=tokenizer)
    results = transformers.data.metrics.squad_metrics.squad_evaluate(
        devData, predictions)
    return results
    def convert_examples_to_features(self,
                                     examples,
                                     tokenizer,
                                     output_mode='',
                                     evaluate=False):

        max_seq_length = self.configs.get('max_seq_length', 384)
        doc_stride = self.configs.get('doc_stride', 128)
        max_query_length = self.configs.get('max_query_length', 64)
        threads = self.configs.get('num_threads', 1)

        return squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=max_seq_length,
            doc_stride=doc_stride,
            max_query_length=max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=threads,
        )
Example #17
    async def predict(
            self, sources: SourcesContext
    ) -> AsyncIterator[Tuple[Record, Any, float]]:
        if not os.path.isfile(
                os.path.join(self.parent.config.output_dir,
                             "pytorch_model.bin")):
            raise ModelNotTrained("Train model before prediction.")

        self.model = AutoModelForQuestionAnswering.from_pretrained(
            self.parent.config.output_dir)  # , force_download=True)
        self.model.to(self.parent.config.device)
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.parent.config.output_dir,
            do_lower_case=self.parent.config.do_lower_case,
        )
        async for record in sources.records():

            example = SquadExample(
                qas_id=record.key,
                question_text=record.feature("question"),
                context_text=record.feature("context"),
                answer_text=record.feature("answer_text"),
                start_position_character=record.feature("start_pos_char"),
                title=record.feature("title"),
                is_impossible=record.feature("is_impossible"),
                answers=record.feature("answers"),
            )
            features, dataset = squad_convert_examples_to_features(
                examples=[example],
                tokenizer=self.tokenizer,
                max_seq_length=self.parent.config.max_seq_length,
                doc_stride=self.parent.config.doc_stride,
                max_query_length=self.parent.config.max_query_length,
                is_training=False,
                return_dataset="pt",
            )
            prediction = await self._custom_accuracy([example], features,
                                                     dataset)
            record.predicted("Answer", prediction, "Nan")
            yield record
Example #18
    def answer(self, question):
        hits = self.searcher.search(question, k=self.k)
        ir_scores = []
        paragraphs = []
        for j in range(len(hits)):
            passage = hits[j].raw
            ir_scores.append(hits[j].score)
            paragraphs.append(passage)
        input_ = build_squad_input(question, paragraphs)
        examples = self.processor._create_examples(input_["data"], "dev")
        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=self.tokenizer,
            max_seq_length=384,
            doc_stride=128,
            max_query_length=64,
            is_training=False,
            return_dataset="pt",
            threads=1,
        )
        all_results, predictions = process_one_question(
            features, dataset, self.model, self.tokenizer, examples,
            self.device, self.use_ir_score, self.mu, ir_scores)

        scores = np.array([(p['start_logit'] + p['end_logit'])
                           for p in predictions['0']])
        texts = [p['text'] for p in predictions['0']]

        predicted_p_indexes_all = scores.argsort()[::-1].argsort()
        iterator_idx = 0
        is_empty = True
        predicted_p_index = 0
        while is_empty and iterator_idx < len(predicted_p_indexes_all):
            predicted_p_index = predicted_p_indexes_all[iterator_idx]
            is_empty = texts[predicted_p_index] == "empty"
            iterator_idx += 1

        predicted_answer = texts[predicted_p_index]
        return predicted_answer
Example #19
File: dataset.py  Project: Se-Hun/SOTA_QA
    def load_squad_examples(self, mode="train"):
        if self.data_dir:
            processor = SquadV2Processor(
            ) if self.version_2_with_negative else SquadV1Processor()
            if mode == "train":
                examples = processor.get_train_examples(self.data_dir,
                                                        filename="train.json")
            elif mode == "dev":
                examples = processor.get_train_examples(
                    self.data_dir, filename="dev.json"
                )  # for obtaining start positions and end positions
            elif mode == "test":
                examples = processor.get_dev_examples(self.data_dir,
                                                      filename="dev.json")
            else:
                raise KeyError(mode)

            # For debugging: truncate to a small set.
            # Uncomment the two lines below when debugging.
            # N = 10
            # examples = examples[:N]
            # --------------------------------------

            is_training = mode != "test"  # for obtaining start positions and end positions
            features, dataset = squad_convert_examples_to_features(
                examples=examples,
                tokenizer=self.tokenizer,
                max_seq_length=self.max_seq_length,
                doc_stride=self.doc_stride,
                max_query_length=self.max_query_length,
                is_training=is_training,
                return_dataset="pt",  # Return DataType is Pytorch Tensor !
                threads=2)

        if not is_training:
            return dataset, examples, features

        return dataset
Example #20
    async def accuracy(self, sources: Sources):
        if not os.path.isfile(
                os.path.join(self.parent.config.output_dir,
                             "pytorch_model.bin")):
            raise ModelNotTrained("Train model before assessing for accuracy.")
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.parent.config.output_dir,
            do_lower_case=self.parent.config.do_lower_case,
        )
        eval_examples = await self._preprocess_data(sources)
        features, dataset = squad_convert_examples_to_features(
            examples=eval_examples,
            tokenizer=self.tokenizer,
            max_seq_length=self.parent.config.max_seq_length,
            doc_stride=self.parent.config.doc_stride,
            max_query_length=self.parent.config.max_query_length,
            is_training=False,
            return_dataset="pt",
        )

        results = {}
        if self.parent.config.local_rank in [-1, 0]:
            logger.info(
                "Loading checkpoints saved during training for evaluation")
            self.model = AutoModelForQuestionAnswering.from_pretrained(
                self.parent.config.output_dir)
            self.model.to(self.parent.config.device)

            # Evaluate
            predictions = await self._custom_accuracy(eval_examples, features,
                                                      dataset)
            results = squad_evaluate(eval_examples, predictions)

        logger.info("Results: {}".format(results))

        # return results
        return Accuracy(results["f1"])
Example #21
    def find_answer(self,
                    question,
                    context,
                    n_best_size=20,
                    max_answer_length=30,
                    full_sentence=False):
        # heavily inspired by "https://github.com/huggingface/transformers/blob/v2.3.0/examples/run_squad.py#L212-L317"
        example_id = '55555'
        example = SquadExample(example_id, question, context, None, None, None)

        features, dataset = squad_convert_examples_to_features(
            [example],
            self.tokenizer,
            self.max_seq_length,
            self.doc_stride,
            self.max_query_length,
            False,
            return_dataset='pt')

        sampler = SequentialSampler(dataset)
        dataloader = DataLoader(dataset, sampler=sampler, batch_size=1)

        all_results = []
        for batch in dataloader:
            self.model.eval()
            batch = tuple(t.to(self.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }

                if self.model_type in {"xlm", "roberta", "distilbert"}:
                    del inputs["token_type_ids"]

                example_index = batch[3]

                # XLNet and XLM use more arguments for their predictions
                if self.model_type in {"xlnet", "xlm"}:
                    inputs.update({"cls_index": batch[4], "p_mask": batch[5]})

                outputs = self.model(**inputs)
                output = [o.detach().cpu().tolist() for o in outputs]

                unique_id = int(features[example_index].unique_id)

                # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
                # models only use two.
                if len(output) >= 5:
                    start_logits = output[0]
                    start_top_index = output[1]
                    end_logits = output[2]
                    end_top_index = output[3]
                    cls_logits = output[4]

                    squad_result = SquadResult(
                        unique_id,
                        start_logits[0],
                        end_logits[0],
                        start_top_index=start_top_index[0],
                        end_top_index=end_top_index[0],
                        cls_logits=cls_logits[0],
                    )

                else:
                    start_logits, end_logits = output
                    squad_result = SquadResult(unique_id, start_logits[0],
                                               end_logits[0])

                all_results.append(squad_result)

        # XLNet and XLM use a more complex post-processing procedure
        if self.model_type in {"xlnet", "xlm"}:
            if hasattr(model, "config"):
                start_n_top = self.model.config.start_n_top
                end_n_top = self.model.config.end_n_top
            else:
                start_n_top = self.model.module.config.start_n_top
                end_n_top = self.model.module.config.end_n_top

            predictions = compute_predictions_log_probs(
                [example],
                features,
                all_results,
                n_best_size,
                max_answer_length,
                '/tmp/pred.out',
                '/tmp/nbest.out',
                '/tmp/null.out',
                start_n_top,
                end_n_top,
                self.version_2_with_negative,
                self.tokenizer,
                self.verbose,
            )
        else:
            predictions = compute_predictions_logits(
                [example],
                features,
                all_results,
                n_best_size,
                max_answer_length,
                self.do_lower_case,
                '/tmp/pred.out',
                '/tmp/nbest.out',
                '/tmp/null.out',
                self.verbose,
                self.version_2_with_negative,
                self.null_score_diff_threshold,
            )

        prediction = predictions[example_id]

        logger.debug(f'found prediction: "{prediction}"')

        # empty prediction indicates unknown answer
        if not prediction:
            logger.debug('empty prediction')
            return None

        if full_sentence:
            doc = self.nlp(context)
            for sent in doc.sents:
                if prediction in sent.text:
                    prediction = sent.text
                    break

        return prediction
def load_and_cache_examples(args_dict,
                            tokenizer,
                            evaluate=False,
                            output_examples=False):
    if args_dict[RunParameters.LOCAL_RANK.value] not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes the dataset; the others will use the cache
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    input_file = args_dict[
        RunParameters.PREDICT_FILE.value] if evaluate else args_dict[
            RunParameters.TRAIN_FILE.value]
    cached_features_file = os.path.join(
        os.path.dirname(input_file),
        "cached_distillation_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(
                filter(
                    None,
                    args_dict[RunParameters.MODEL_NAME_OR_PATH.value].split(
                        "/"))).pop(),
            str(args_dict[
                SquadArchitectureHyperparameter.MAX_SEQ_LENGTH.value]),
        ),
    )

    if os.path.exists(cached_features_file):
        logging.info(
            "deleting local cache file: {}".format(cached_features_file))
        os.remove(cached_features_file)

    download_cache_from_s3(args_dict, evaluate)

    if os.path.exists(cached_features_file) and args_dict[
            RunParameters.OVERWRTIE_CACHE.value] is False:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features_and_dataset = torch.load(cached_features_file)

        try:
            features, dataset, examples = (
                features_and_dataset["features"],
                features_and_dataset["dataset"],
                features_and_dataset["examples"],
            )
        except KeyError:
            raise DeprecationWarning(
                "You seem to be loading features from an older version of this script please delete the "
                "file %s in order for it to be created again" %
                cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", input_file)
        processor = SquadV2Processor() if args_dict[
            RunParameters.VERSION_2.value] else SquadV1Processor()
        if evaluate:
            examples = processor.get_dev_examples(
                args_dict["data_dir"],
                filename=args_dict[RunParameters.PREDICT_FILE.value])
        else:
            examples = processor.get_train_examples(
                args_dict["data_dir"],
                filename=args_dict[RunParameters.TRAIN_FILE.value])

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args_dict[
                SquadArchitectureHyperparameter.MAX_SEQ_LENGTH.value],
            doc_stride=args_dict[
                SquadArchitectureHyperparameter.DOC_STRIDE.value],
            max_query_length=args_dict[
                SquadArchitectureHyperparameter.MAX_QUERY_LENGTH.value],
            is_training=not evaluate,
            return_dataset="pt")

        if args_dict[RunParameters.LOCAL_RANK.value] in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(
                {
                    "features": features,
                    "dataset": dataset,
                    "examples": examples
                }, cached_features_file)

    if args_dict[RunParameters.LOCAL_RANK.value] == 0 and not evaluate:
        # Make sure only the first process in distributed training processes the dataset; the others will use the cache
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
Example #23
    async def train(self, sources: Sources):
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.parent.config.tokenizer_name
            if self.parent.config.tokenizer_name else
            self.parent.config.model_name_or_path,
            do_lower_case=self.parent.config.do_lower_case,
            cache_dir=self.parent.config.cache_dir
            if self.parent.config.cache_dir else None,
        )
        # `config` was referenced below but never built; load it from the same
        # checkpoint (requires AutoConfig from transformers).
        config = AutoConfig.from_pretrained(
            self.parent.config.model_name_or_path,
            cache_dir=self.parent.config.cache_dir
            if self.parent.config.cache_dir else None,
        )
        self.model = AutoModelForQuestionAnswering.from_pretrained(
            self.parent.config.model_name_or_path,
            from_tf=self.parent.config.from_tf,
            config=config,
            cache_dir=self.parent.config.cache_dir
            if self.parent.config.cache_dir else None,
        )

        if self.parent.config.local_rank == 0:
            # Make sure only the first process in distributed training will download model & vocab
            torch.distributed.barrier()

        self.model.to(self.parent.config.device)

        if self.parent.config.fp16:
            try:
                import apex

                apex.amp.register_half_function(torch, "einsum")
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )

        # Training
        train_examples = await self._preprocess_data(sources)
        _, train_dataset = squad_convert_examples_to_features(
            examples=train_examples,
            tokenizer=self.tokenizer,
            max_seq_length=self.parent.config.max_seq_length,
            doc_stride=self.parent.config.doc_stride,
            max_query_length=self.parent.config.max_query_length,
            is_training=True,
            return_dataset="pt",
        )
        global_step, tr_loss = await self._custom_train(train_dataset)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

        # Save the trained model and the tokenizer
        if (self.parent.config.local_rank == -1
                or torch.distributed.get_rank() == 0):
            # Create output directory if needed
            if not os.path.exists(self.parent.config.output_dir
                                  ) and self.parent.config.local_rank in [
                                      -1, 0
                                  ]:
                os.makedirs(self.parent.config.output_dir)

            logger.info("Saving model checkpoint to %s",
                        self.parent.config.output_dir)
            # Save a trained model, configuration and tokenizer using `save_pretrained()`.
            # Take care of distributed/parallel training
            model_to_save = (self.model.module if hasattr(
                self.model, "module") else self.model)
            model_to_save.save_pretrained(self.parent.config.output_dir)
            self.tokenizer.save_pretrained(self.parent.config.output_dir)

            # save training arguments together with the trained model
            torch.save(
                self.parent.config,
                os.path.join(self.parent.config.output_dir,
                             "training_args.bin"),
            )
Example #24
 torch.manual_seed(seed)
 torch.cuda.manual_seed_all(seed)
 model_type = "bert"
 shortcut = "bert-base-uncased"
 config = transformers.AutoConfig.from_pretrained(shortcut)
 tokenizer = transformers.AutoTokenizer.from_pretrained(shortcut)
 model = transformers.AutoModelForQuestionAnswering.from_pretrained(
     shortcut, config=config)
 model.to(device)
 # read in data
 processor = SquadV2Processor()
 trainData = processor.get_train_examples(".")
 features, trainDataset = transformers.squad_convert_examples_to_features(
     examples=trainData,
     tokenizer=tokenizer,
     max_seq_length=maxSequenceLength,
     max_query_length=maxQueryLength,
     doc_stride=documentStride,
     return_dataset="pt",
     is_training=True)
 # build up model
 batchSize = 12
 trainSampler = torch.utils.data.RandomSampler(trainDataset)
 trainDataloader = torch.utils.data.DataLoader(trainDataset,
                                               sampler=trainSampler,
                                               batch_size=batchSize)
 trainEpoch = 10
 learningRate = 3e-5
 totalTrainingStep = len(trainDataloader) * trainEpoch  # total optimizer steps across all epochs
 optimizer = transformers.AdamW(
     params=[param for _, param in model.named_parameters()], lr=learningRate)
 scheduler = transformers.get_linear_schedule_with_warmup(
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TFTrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(
        "n_replicas: %s, distributed training: %s, 16-bits training: %s",
        training_args.n_replicas,
        bool(training_args.n_replicas > 1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Prepare Question-Answering task
    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast,
    )

    with training_args.strategy.scope():
        model = TFAutoModelForQuestionAnswering.from_pretrained(
            model_args.model_name_or_path,
            from_pt=bool(".bin" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )

    # Get datasets
    if data_args.use_tfds:
        if data_args.version_2_with_negative:
            logger.warn(
                "tensorflow_datasets does not handle version 2 of SQuAD. Switch to version 1 automatically"
            )

        try:
            import tensorflow_datasets as tfds
        except ImportError:
            raise ImportError(
                "If not data_dir is specified, tensorflow_datasets needs to be installed."
            )

        tfds_examples = tfds.load("squad", data_dir=data_args.data_dir)
        train_examples = (SquadV1Processor().get_examples_from_dataset(
            tfds_examples, evaluate=False) if training_args.do_train else None)
        eval_examples = (SquadV1Processor().get_examples_from_dataset(
            tfds_examples, evaluate=True) if training_args.do_eval else None)
    else:
        processor = SquadV2Processor(
        ) if data_args.version_2_with_negative else SquadV1Processor()
        train_examples = processor.get_train_examples(
            data_args.data_dir) if training_args.do_train else None
        eval_examples = processor.get_dev_examples(
            data_args.data_dir) if training_args.do_eval else None

    train_dataset = (squad_convert_examples_to_features(
        examples=train_examples,
        tokenizer=tokenizer,
        max_seq_length=data_args.max_seq_length,
        doc_stride=data_args.doc_stride,
        max_query_length=data_args.max_query_length,
        is_training=True,
        return_dataset="tf",
    ) if training_args.do_train else None)

    train_dataset = train_dataset.apply(
        tf.data.experimental.assert_cardinality(len(train_examples)))

    eval_dataset = (squad_convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=data_args.max_seq_length,
        doc_stride=data_args.doc_stride,
        max_query_length=data_args.max_query_length,
        is_training=False,
        return_dataset="tf",
    ) if training_args.do_eval else None)

    eval_dataset = eval_dataset.apply(
        tf.data.experimental.assert_cardinality(len(eval_examples)))

    # Initialize our Trainer
    trainer = TFTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    # Training
    if training_args.do_train:
        trainer.train()
        trainer.save_model()
        tokenizer.save_pretrained(training_args.output_dir)
Example #26
    def __call__(self, *texts, **kwargs) -> Iterator[Answer]:
        """
        Args:
            We support multiple use-cases, the following are exclusive:
            X: sequence of SquadExample
            data: sequence of SquadExample
            question: (str, List[str]), batch of question(s) to map along with context
            context: (str, List[str]), batch of context(s) associated with the provided question keyword argument
        Returns:
            dict: {"answer": str, "score": float, "start": int, "end": int}
            answer: the textual answer in the initial context
            score: the model's score for this answer
            start: the character index in the original string corresponding to the beginning of the answer's span
            end: the character index in the original string corresponding to the end of the answer's span
        """
        # Set defaults values
        kwargs.setdefault("topk", 1)
        kwargs.setdefault("doc_stride", 128)
        kwargs.setdefault("max_answer_len", 15)
        kwargs.setdefault("max_seq_len", 384)
        kwargs.setdefault("max_question_len", 64)
        kwargs.setdefault("version_2_with_negative", False)
        kwargs.setdefault("batch_size", 1)
        kwargs.setdefault("threads", 1)
        kwargs.setdefault(
            "min_score", None
        )  # It has priority over "topk" and it doesn't apply to the null answer.
        kwargs.setdefault("sort_mode", DEFAULT_SORT_MODE)

        if kwargs["topk"] < 1:
            raise ValueError(
                f"topk parameter should be >= 1 (got {kwargs['topk']})")

        if kwargs["max_answer_len"] < 1:
            raise ValueError(
                f"max_answer_len parameter should be >= 1 (got {kwargs['max_answer_len']})"
            )

        if kwargs["sort_mode"] not in SORT_MODE_CHOICES:
            raise ValueError(
                f"sort_mode parameter should be in {SORT_MODE_CHOICES} (got {kwargs['sort_mode']})"
            )
        sort_with_prob = kwargs["sort_mode"] == "prob"

        # Convert inputs to features
        examples = self._args_parser(*texts, **kwargs)
        features_list_flat = squad_convert_examples_to_features(
            examples,
            self.tokenizer,
            kwargs["max_seq_len"],
            kwargs["doc_stride"],
            kwargs["max_question_len"],
            False,
            threads=kwargs["threads"],
        )

        start_logits_flat = np.empty(
            (len(features_list_flat), kwargs["max_seq_len"]))
        end_logits_flat = np.empty_like(start_logits_flat)

        max_batch_size = kwargs["batch_size"]

        for batch_idx, features_batch in enumerate(
                chunks(features_list_flat, max_batch_size)):
            kwargs_as_lists = self.inputs_for_model(
                [f.__dict__ for f in features_batch])
            # Manage tensor allocation on correct device
            with self.device_placement():
                if self.framework == "tf":
                    raise ValueError("tf not supported")
                else:
                    with torch.no_grad():
                        kwargs_as_tensors = {
                            k: torch.tensor(v,
                                            device=self.device,
                                            dtype=torch.int64)
                            for k, v in kwargs_as_lists.items()
                        }
                        start_indices, end_indices = self.model(
                            **kwargs_as_tensors)

                        batch_start_idx = batch_idx * max_batch_size
                        batch_end_idx = (batch_idx + 1) * max_batch_size
                        start_logits_flat[
                            batch_start_idx:batch_end_idx] = start_indices.cpu(
                            ).numpy()
                        end_logits_flat[
                            batch_start_idx:batch_end_idx] = end_indices.cpu(
                            ).numpy()

        # Don't convert into (batch_size, max_features_len, max_seq_length)
        # because there may be a very long doc (with a lot of features; i.e., max_features_len may be very large).

        indices_and_features_iterable = groupby(enumerate(features_list_flat),
                                                lambda t: t[1].example_index)
        for example, (_, indices_and_features) in zip(
                examples, indices_and_features_iterable):
            indices, features = zip(*indices_and_features)

            char_to_word = np.array(example.char_to_word_offset)

            start_logits, end_logits = start_logits_flat[
                indices, :], end_logits_flat[indices, :]

            # Normalize logits and spans to retrieve the answer
            start_probs = scipy.special.softmax(start_logits, axis=1)
            end_probs = scipy.special.softmax(end_logits, axis=1)

            # Mask padding and question
            p_mask = np.array([feature.p_mask for feature in features])
            p_bool_mask = p_mask == 1
            start_logits[p_bool_mask], end_logits[
                p_bool_mask] = -np.inf, -np.inf
            start_probs[p_bool_mask], end_probs[p_bool_mask] = 0, 0

            if kwargs["version_2_with_negative"]:
                null_answer = Answer(
                    instance=example,
                    text="",
                    prob=(start_probs[:, 0] * end_probs[:, 0]).min().item(),
                    logit=(start_logits[:, 0] +
                           end_logits[:, 0]).min().item(),
                    start=0,
                    sort_mode=kwargs["sort_mode"],
                )
            else:
                null_answer = None

            start_probs[:, 0] = end_probs[:, 0] = 0
            start_logits[:, 0] = end_logits[:, 0] = -np.inf

            if sort_with_prob:
                start_scores, end_scores = start_probs, end_probs
            else:
                start_scores, end_scores = start_logits, end_logits

            # We increase the top-k because there can be repeated answers here
            # (e.g., they start in different tokens of the same word).
            feature_indices, start_indices, end_indices = self.decode(
                start_scores, end_scores, 5 * kwargs["topk"],
                kwargs["max_answer_len"], sort_with_prob)

            # Convert the answer (tokens) back to the original text
            answers = (
                Answer(
                    instance=example,
                    text=" ".join(
                        example.doc_tokens[features[f].token_to_orig_map[s]:
                                           features[f].token_to_orig_map[e] +
                                           1]),
                    prob=(start_probs[f, s] * end_probs[f, e]).item(),
                    logit=(start_logits[f, s] + end_logits[f, e]).item(),
                    start=np.where(char_to_word == features[f].
                                   token_to_orig_map[s])[0][0].item(),
                    null_answer=null_answer,
                    sort_mode=kwargs["sort_mode"],
                )
                for f, s, e in zip(feature_indices, start_indices, end_indices)
                # If topk is very large, it may be the case that some invalid answer was selected:
                if s in features[f].token_to_orig_map
                and e in features[f].token_to_orig_map)

            # We leave the unique answers.
            # An answer is considered non-unique if it's inside another one.
            #
            # Note that if they have the same text and they're in the same positions then only one will be kept.
            # If they have the same text but are in different positions we leave them, and this is good
            # (i.e., it's like having more evidence).
            #
            # We use `(start, end)` to uniquely identify the answer (note they answer the same `example`).
            unique_answers_by_start_and_end: MutableMapping[Tuple[int, int],
                                                            Answer] = {}
            for answer in answers:
                # We iterate with copy because we may delete items.
                for start_and_end, another_answer in list(
                        unique_answers_by_start_and_end.items()):
                    if answer.one_inside_the_other_one(another_answer):
                        if answer.sort_score > another_answer.sort_score:
                            # TODO: we could make a new answer with the widest answer (keeping the largest score).
                            #   Or other strategies with overlapping answers.
                            del unique_answers_by_start_and_end[start_and_end]
                            unique_answers_by_start_and_end[(
                                answer.start, answer.end)] = answer
                        break
                else:
                    unique_answers_by_start_and_end[(answer.start,
                                                     answer.end)] = answer

            unique_answers = [
                answer for answer in unique_answers_by_start_and_end.values()
                if kwargs["min_score"] is None
                or answer.sort_score >= kwargs["min_score"]
            ]

            if kwargs["version_2_with_negative"]:
                unique_answers.append(null_answer)

            for answer in sorted(unique_answers,
                                 key=lambda a: a.sort_score,
                                 reverse=True)[:kwargs["topk"]]:
                yield answer
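A brief usage sketch for the __call__ method above, assuming the enclosing class is a question-answering pipeline in the style of Hugging Face's QuestionAnsweringPipeline. The class name QASpanPipeline, the checkpoint, and the question/context keyword arguments are illustrative assumptions rather than the project's actual API; the method itself is a generator that yields deduplicated answers sorted by the chosen sort_mode.

from transformers import AutoModelForQuestionAnswering, AutoTokenizer

checkpoint = "bert-large-uncased-whole-word-masking-finetuned-squad"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

# `QASpanPipeline` is a placeholder for whatever class defines __call__ above.
qa = QASpanPipeline(model=model, tokenizer=tokenizer)

answers = qa(
    question="Where were the 2016 Summer Olympics held?",
    context="The 2016 Summer Olympics were held in Rio de Janeiro, Brazil.",
    topk=3,
    max_answer_len=10,
    sort_mode="prob",
)
for answer in answers:  # answers are yielded lazily by the generator
    print(answer.text, answer.prob, answer.start)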
Example #27
0
File: squad2.py Project: yf1291/nlp4
    def load_dataset(self, split, combine=False, **kwargs):
        cache = os.path.join(
            self.args.data,
            "cached_{}_{}_{}.pth".format(split, self.args.bpe,
                                         self.args.max_seq_len))

        if os.path.exists(cache):
            examples, features = torch.load(cache)
        else:
            if split == 'valid':
                examples = self.processor.get_dev_examples(
                    self.args.data, self.train_or_dev_file[split])
            else:
                examples = self.processor.get_train_examples(
                    self.args.data, self.train_or_dev_file[split])

            features = squad_convert_examples_to_features(
                examples=examples,
                tokenizer=self.tokenizer,
                max_seq_length=self.args.max_seq_len,
                doc_stride=128,
                max_query_length=64,
                is_training=(split != 'valid'),
                return_dataset=False,
            )

            if self.args.distributed_rank == 0:
                torch.save((examples, features), cache)

        if split == 'valid' and self.do_evaluate:
            self.examples = examples
            self.features = features

        src_dataset = BaseWrapperDataset(
            [np.array(f.input_ids) for f in features])
        starts = BaseWrapperDataset(
            np.array([f.start_position for f in features]))
        ends = BaseWrapperDataset(np.array([f.end_position for f in features]))
        sizes = np.array([len(f.input_ids) for f in features])
        src_lengths = NumelDataset(src_dataset, reduce=False)
        # Input format: <s> question here ? </s> Passage </s>

        dataset = NestedDictionaryDataset(
            {
                'id': IdDataset(),
                'net_input': {
                    'src_tokens': src_dataset,
                    'src_lengths': NumelDataset(src_dataset, reduce=False),
                },
                'targets': {
                    'starts': starts,
                    'ends': ends,
                },
                'nsentences': NumSamplesDataset(),
                'ntokens': NumelDataset(src_dataset, reduce=True),
            },
            sizes=[sizes],
        )

        with data_utils.numpy_seed(self.args.seed):
            dataset = SortDataset(
                dataset,
                sort_order=[np.random.permutation(len(dataset))],
            )

        print('| Loaded {} with {} samples'.format(split, len(dataset)))

        self.datasets[split] = dataset
        return self.datasets[split]
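A small sketch of inspecting one mini-batch from the dataset built above. It assumes fairseq's standard collater behaviour for BaseWrapperDataset / NestedDictionaryDataset, and that `task` is an instance of the class defining load_dataset; the split name and indices are placeholders.

# Sketch only: `task` stands in for an instance of the task class above.
dataset = task.load_dataset('valid')
samples = [dataset[i] for i in range(4)]       # a handful of items
batch = dataset.collater(samples)              # nested dict mirroring the keys above

print(batch['id'])                             # example ids
print(batch['net_input']['src_tokens'].shape)  # (4, max_seq_len) padded token ids
print(batch['targets']['starts'], batch['targets']['ends'])
print(batch['ntokens'])                        # total token count in the batch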
Example #28
0
def load_and_cache_examples(args,
                            tokenizer,
                            evaluate=False,
                            output_examples=False):
    # COMET model setting up
    device = "0"
    comet_model = "pretrained_models/atomic_pretrained_model.pickle"
    sampling_algo = "beam-2"
    opt, state_dict = interactive.load_model_file(comet_model)

    data_loader, text_encoder = interactive.load_data("atomic", opt)

    n_ctx = data_loader.max_event + data_loader.max_effect
    n_vocab = len(text_encoder.encoder) + n_ctx
    model = interactive.make_model(opt, n_vocab, n_ctx, state_dict)
    nlp = spacy.load("en_core_web_sm")

    if device != "cpu":
        cfg.device = int(device)
        cfg.do_gpu = True
        torch.cuda.set_device(cfg.device)
        model.cuda(cfg.device)
    else:
        cfg.device = "cpu"

    sampling_algorithm = sampling_algo

    sampler = interactive.set_sampler(opt, sampling_algorithm, data_loader)

    def augment(article):
        # `article` arrives as a tf.string tensor via tf.py_function; decode it
        # and accumulate the augmented text as a plain Python string.
        context = article.numpy().decode('UTF-8')
        augmented = context

        category_list = ["xNeed", "xIntent", "xWant", "xReact"]

        for category in category_list:

            entity_list = nlp(context)
            input_event = context
            replaced = []
            replacement_list = ["PersonX", "PersonY", "PersonZ"]
            r = 0
            for entity in entity_list.ents:
                if entity.label_ == 'PERSON' or entity.label_ == 'NORP':
                    # Remember the original name so it can be restored in the COMET output.
                    replaced.append(entity.text)
                    input_event = input_event.replace(entity.text,
                                                      replacement_list[r])
                    r += 1
                    if r == 3:
                        break

            outputs = interactive.get_atomic_sequence(input_event, model,
                                                      sampler, data_loader,
                                                      text_encoder, category)

            for key in outputs:

                prefix = ""
                if (key[0] == "o"):
                    if (key == "oEffect"):
                        prefix = " Everyone else "
                    elif (key == "oReact"):
                        prefix = "They are "
                    elif (key == "oWant"):
                        prefix = "They want "
                else:
                    if (len(replaced) != 0):
                        prefix = replaced[0]
                    else:
                        prefix = "Person"
                    if (key == "xAttr"):
                        prefix += " is "
                    elif (key == "xEffect"):
                        prefix += " "
                    elif (key == "xIntent"):
                        prefix += " intends "
                    elif (key == "xReact"):
                        prefix += " is "
                    elif (key == "xNeed"):
                        prefix += " needs "
                    elif (key == "xWant"):
                        prefix += " wants "

                for j in range(len(outputs[key]["beams"])):

                    if (outputs[key]["beams"][j] != 'none'):
                        comet_inf = outputs[key]["beams"][j]
                        if (len(replaced) > 0):
                            comet_inf = comet_inf.replace(
                                "personx", replaced[0])
                            if (len(replaced) > 1):
                                comet_inf = comet_inf.replace(
                                    "persony", replaced[1])

                        augmented += prefix + comet_inf + ". "
                        break

        return augmented

    def process_example(example):

        example['context'] = tf.py_function(func=augment,
                                            inp=[example['context']],
                                            Tout=tf.string)
        return example

    ## End

    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes the dataset; the others will use the cache
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    input_dir = args.data_dir if args.data_dir else "."
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        logger.info("Creating features from dataset file at %s", input_dir)

        if not args.data_dir and ((evaluate and not args.predict_file) or
                                  (not evaluate and not args.train_file)):
            try:
                import tensorflow_datasets as tfds
            except ImportError:
                raise ImportError(
                    "If no data_dir is specified, tensorflow_datasets needs to be installed."
                )

            if args.version_2_with_negative:
                logger.warning(
                    "tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            tfds_examples["train"] = tfds_examples["train"].map(
                lambda x: process_example(x))
            examples = SquadV1Processor().get_examples_from_dataset(
                tfds_examples, evaluate=evaluate)
        else:
            processor = SquadV2Processor(
            ) if args.version_2_with_negative else SquadV1Processor()
            if evaluate:
                examples = processor.get_dev_examples(
                    args.data_dir, filename=args.predict_file)
            else:
                examples = processor.get_train_examples(
                    args.data_dir, filename=args.train_file)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )

        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(
                {
                    "features": features,
                    "dataset": dataset,
                    "examples": examples
                }, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training processes the dataset; the others will use the cache
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
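A hedged sketch of driving this loader. The argparse.Namespace carries only the attributes the function reads, and all values are placeholders; note that when args.data_dir is set, the tensorflow_datasets branch (and therefore the COMET augmentation) is skipped, so the augmentation only applies when examples come from tfds.

import argparse

from transformers import AutoTokenizer

args = argparse.Namespace(
    local_rank=-1,                    # no distributed training
    data_dir="data/SQuAD",
    model_name_or_path="bert-base-uncased",
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    threads=4,
    overwrite_cache=False,
    version_2_with_negative=False,
    train_file="train-v1.1.json",
    predict_file="dev-v1.1.json",
)

tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)
eval_dataset, eval_examples, eval_features = load_and_cache_examples(
    args, tokenizer, evaluate=True, output_examples=True)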
Example #29
0
def load_and_cache_examples(args,
                            tokenizer,
                            evaluate=False,
                            output_examples=False):
    # Load data features from cache or dataset file
    input_dir = args.cache_dir if args.cache_dir else "."
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        logger.info("Creating features from dataset file at %s", input_dir)

        if not args.data_dir and ((evaluate and not args.predict_file) or
                                  (not evaluate and not args.train_file)):
            try:
                import tensorflow_datasets as tfds
            except ImportError:
                raise ImportError(
                    "If no data_dir is specified, tensorflow_datasets needs to be installed."
                )

            if args.version_2_with_negative:
                logger.warning(
                    "tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(
                tfds_examples, evaluate=evaluate)
        else:
            processor = SquadV2Processor(
            ) if args.version_2_with_negative else SquadV1Processor()
            if evaluate:
                examples = processor.get_dev_examples(
                    os.path.join(args.data_dir, args.task),
                    filename=args.predict_file)
            else:
                examples = processor.get_train_examples(
                    os.path.join(args.data_dir, args.task),
                    filename=args.train_file)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )
        logger.info("Saving features into cached file %s",
                    cached_features_file)
        torch.save(
            {
                "features": features,
                "dataset": dataset,
                "examples": examples
            }, cached_features_file)

    if output_examples:
        return dataset, examples, features
    return dataset
import numpy as np

from transformers import (AutoTokenizer, SquadV2Processor,
                          squad_convert_examples_to_features)

output_dir = "data/SQuAD/SQuAD_dev2"
model_name = "albert-base-v1"
data_dir = "data/SQuAD"
dev_fn = "dev-v2.0.json"

tokenizer = AutoTokenizer.from_pretrained(model_name)

processor = SquadV2Processor()

# Use get_train_examples on the dev file so that gold start/end positions are included in the features.
examples = processor.get_train_examples(data_dir, filename=dev_fn)

features, dataset = squad_convert_examples_to_features(
    examples=examples,
    tokenizer=tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    is_training=True,
    return_dataset="pt",
    threads=2,
)

tensor_names = ["input_ids", "attention_mask", "token_type_ids",
                "start_positions", "end_positions"]
for i, filename_base in enumerate(tensor_names):
    np.savetxt(f"{output_dir}/{filename_base}.csv",
               dataset.tensors[i].numpy(), delimiter=",")

with open(f"{output_dir}/strings.txt", "w") as fio:
    for t in dataset.tensors[0]:
        fio.write(f"{tokenizer.decode(t)}\n")
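For completeness, a short sketch of reading the exported CSVs back into a TensorDataset; it assumes the same five tensor names and output_dir used above (np.loadtxt parses savetxt's float formatting, so the values are cast back to integers afterwards).

import numpy as np
import torch
from torch.utils.data import TensorDataset

names = ["input_ids", "attention_mask", "token_type_ids",
         "start_positions", "end_positions"]
tensors = [
    torch.tensor(np.loadtxt(f"{output_dir}/{name}.csv", delimiter=","),
                 dtype=torch.long)
    for name in names
]
reloaded = TensorDataset(*tensors)
print(len(reloaded), reloaded.tensors[0].shape)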