Example #1
    def _initialize_data_loaders(self):
        if self.distributed:
            sampler_train = DistributedSampler(self.data["train"])
        else:
            sampler_train = RandomSampler(self.data["train"])

        data_loader_train = NamedDataLoader(
            dataset=self.data["train"],
            sampler=sampler_train,
            batch_size=self.batch_size,
            tensor_names=self.tensor_names,
        )

        data_loader_dev = NamedDataLoader(
            dataset=self.data["dev"],
            sampler=SequentialSampler(self.data["dev"]),
            batch_size=self.batch_size,
            tensor_names=self.tensor_names,
        )

        if self.processor.test_filename:
            data_loader_test = NamedDataLoader(
                dataset=self.data["test"],
                sampler=SequentialSampler(self.data["test"]),
                batch_size=self.batch_size,
                tensor_names=self.tensor_names,
            )
        else:
            data_loader_test = None

        self.loaders = {
            "train": data_loader_train,
            "dev": data_loader_dev,
            "test": data_loader_test,
        }
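
Note: in all of these examples the loader yields dict batches keyed by tensor_names rather than plain tuples, which is what the surrounding loops rely on (batch[key].to(self.device)). A minimal sketch of that contract, assuming NamedDataLoader comes from farm.data_handler.dataloader and is wrapped around an ordinary TensorDataset:

import torch
from torch.utils.data import TensorDataset, SequentialSampler
from farm.data_handler.dataloader import NamedDataLoader  # assumed import path

input_ids = torch.randint(0, 100, (8, 16))       # 8 samples, seq_len 16
padding_mask = torch.ones(8, 16, dtype=torch.long)
dataset = TensorDataset(input_ids, padding_mask)

loader = NamedDataLoader(
    dataset=dataset,
    sampler=SequentialSampler(dataset),
    batch_size=4,
    tensor_names=["input_ids", "padding_mask"],  # must match the order of tensors in the dataset
)

for batch in loader:
    # each batch is a dict, e.g. {"input_ids": (4, 16) tensor, "padding_mask": (4, 16) tensor}
    print(batch["input_ids"].shape, batch["padding_mask"].shape)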
Example #2
    def _initialize_data_loaders(self):
        """ Initializing train, dev and test data loaders for the already loaded datasets """

        if self.data["dev"] is not None:
            data_loader_dev = NamedDataLoader(
                dataset=self.data["dev"],
                sampler=SequentialSampler(self.data["dev"]),
                batch_size=self.batch_size,
                tensor_names=self.tensor_names,
            )
        else:
            data_loader_dev = None

        if self.processor.test_filename:
            data_loader_test = NamedDataLoader(
                dataset=self.data["test"],
                sampler=SequentialSampler(self.data["test"]),
                batch_size=self.batch_size,
                tensor_names=self.tensor_names,
            )
        else:
            data_loader_test = None

        self.loaders = {
            "dev": data_loader_dev,
            "test": data_loader_test,
        }
Example #3
    def __init__(self, origsilo, trainset, devset, testset):
        self.tensor_names = origsilo.tensor_names
        self.data = {"train": trainset, "dev": devset, "test": testset}
        self.processor = origsilo.processor
        self.batch_size = origsilo.batch_size
        # should not be necessary, xval makes no sense with huge data
        # sampler_train = DistributedSampler(self.data["train"])
        sampler_train = RandomSampler(trainset)

        self.data_loader_train = NamedDataLoader(
            dataset=trainset,
            sampler=sampler_train,
            batch_size=self.batch_size,
            tensor_names=self.tensor_names,
        )
        self.data_loader_dev = NamedDataLoader(
            dataset=devset,
            sampler=SequentialSampler(devset),
            batch_size=self.batch_size,
            tensor_names=self.tensor_names,
        )
        self.data_loader_test = NamedDataLoader(
            dataset=testset,
            sampler=SequentialSampler(testset),
            batch_size=self.batch_size,
            tensor_names=self.tensor_names,
        )
        self.loaders = {
            "train": self.data_loader_train,
            "dev": self.data_loader_dev,
            "test": self.data_loader_test,
        }
Example #4
File: infer.py  Project: leiframming/FARM
    def run_inference(self, dicts):
        dataset, tensor_names = self.processor.dataset_from_dicts(dicts)
        samples = []
        for d in dicts:
            samples.extend(self.processor._dict_to_samples(d))

        data_loader = NamedDataLoader(
            dataset=dataset,
            sampler=SequentialSampler(dataset),
            batch_size=self.batch_size,
            tensor_names=tensor_names,
        )

        preds_all = []
        for batch in data_loader:
            batch = {key: batch[key].to(self.device) for key in batch}

            with torch.no_grad():
                logits = self.model.forward(**batch)
                preds = self.model.formatted_preds(
                    logits=logits,
                    label_maps=self.processor.label_maps,
                    samples=samples,  # TODO slice these samples to be in line with the batched input
                    tokenizer=self.processor.tokenizer,
                    **batch)
                preds_all += preds

        return preds_all
Example #5
    def _get_predictions(self, dataset, tensor_names, baskets):
        """
        Feed a preprocessed dataset to the model and get the actual predictions (forward pass + formatting).

        :param dataset: PyTorch Dataset with samples you want to predict
        :param tensor_names: Names of the tensors in the dataset
        :param baskets: For each item in the dataset, we need additional information to create formatted preds.
                        Baskets contain all relevant information for that.
                        Example: QA - input string to convert the predicted answer from indices back to string space
        :return: list of predictions
        """
        samples = [s for b in baskets for s in b.samples]

        data_loader = NamedDataLoader(
            dataset=dataset, sampler=SequentialSampler(dataset), batch_size=self.batch_size, tensor_names=tensor_names
        )
        preds_all = []
        for i, batch in enumerate(tqdm(data_loader, desc=f"Inferencing Samples", unit=" Batches", disable=self.disable_tqdm)):
            batch = {key: batch[key].to(self.device) for key in batch}
            batch_samples = samples[i * self.batch_size : (i + 1) * self.batch_size]

            # get logits
            with torch.no_grad():
                logits = self.model.forward(**batch)[0]
                preds = self.model.formatted_preds(
                    logits=[logits],
                    samples=batch_samples,
                    tokenizer=self.processor.tokenizer,
                    return_class_probs=self.return_class_probs,
                    **batch)
                preds_all += preds
        return preds_all
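
The slice samples[i * self.batch_size : (i + 1) * self.batch_size] above assumes that the SequentialSampler preserves dataset order, so batch i lines up with exactly that slice of samples. A small illustration of the alignment (plain Python, not FARM code):

samples = ["s0", "s1", "s2", "s3", "s4"]
batch_size = 2
batches = [["s0", "s1"], ["s2", "s3"], ["s4"]]   # what a sequential, non-shuffling loader yields

for i, batch in enumerate(batches):
    batch_samples = samples[i * batch_size:(i + 1) * batch_size]
    assert batch_samples == batch                # holds only because nothing was shuffled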
Example #6
File: infer.py  Project: yizhiwan/FARM
    def _run_inference(self, dataset, tensor_names, baskets, rest_api_schema=False):
        samples = [s for b in baskets for s in b.samples]

        data_loader = NamedDataLoader(
            dataset=dataset, sampler=SequentialSampler(dataset), batch_size=self.batch_size, tensor_names=tensor_names
        )
        logits_all = []
        preds_all = []
        aggregate_preds = hasattr(self.model.prediction_heads[0], "aggregate_preds")
        for i, batch in enumerate(tqdm(data_loader, desc=f"Inferencing")):
            batch = {key: batch[key].to(self.device) for key in batch}
            if not aggregate_preds:
                batch_samples = samples[i * self.batch_size : (i + 1) * self.batch_size]
            with torch.no_grad():
                logits = self.model.forward(**batch)[0]
                if not aggregate_preds:
                    preds = self.model.formatted_preds(
                        logits=[logits],
                        samples=batch_samples,
                        tokenizer=self.processor.tokenizer,
                        return_class_probs=self.return_class_probs,
                        rest_api_schema=rest_api_schema,
                        **batch)
                    preds_all += preds
                else:
                    logits_all += [l for l in logits]
        if aggregate_preds:
            # can assume that we have only complete docs i.e. all the samples of one doc are in the current chunk
            # TODO is there a better way than having to wrap logits all in list?
            # TODO can QA formatted preds deal with samples?
            preds_all = self.model.formatted_preds(logits=[logits_all],
                                                   baskets=baskets,
                                                   rest_api_schema=rest_api_schema)[0]
        return preds_all
Example #7
    def extract_vectors(self, dicts, extraction_strategy="pooled"):
        """
        Converts a text into vector(s) using the language model only (no prediction head involved).
        :param dicts: Samples to run inference on provided as a list of dicts. One dict per sample.
        :type dicts: [dict]
        :param extraction_strategy: Strategy to extract vectors. Choices: 'pooled' (sentence vector), 'per_token' (individual token vectors)
        :type extraction_strategy: str
        :return: dict of predictions
        """
        dataset, tensor_names = self.processor.dataset_from_dicts(dicts)
        samples = []
        for d in dicts:
            samples.extend(self.processor._dict_to_samples(d))

        data_loader = NamedDataLoader(
            dataset=dataset,
            sampler=SequentialSampler(dataset),
            batch_size=self.batch_size,
            tensor_names=tensor_names,
        )

        preds_all = []
        for i,batch in enumerate(data_loader):
            batch = {key: batch[key].to(self.device) for key in batch}
            batch_samples = samples[i*self.batch_size:(i+1)*self.batch_size]
            with torch.no_grad():
                preds = self.model.language_model.formatted_preds(
                    extraction_strategy=extraction_strategy,
                    samples=batch_samples,
                    tokenizer=self.processor.tokenizer,
                    **batch
                )
                preds_all += preds

        return preds_all
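
A hypothetical call to the method above, mirroring the usage shown in the docstring of Example #18 further down; the model name passed to Inferencer.load is only a placeholder and the exact load arguments depend on the FARM version:

from farm.infer import Inferencer

inferencer = Inferencer.load("bert-base-cased")  # placeholder model name
basic_texts = [{"text": "Some text we want to embed"}, {"text": "And a second one"}]

vectors = inferencer.extract_vectors(dicts=basic_texts, extraction_strategy="pooled")
print(len(vectors))                              # one prediction dict per input text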
Example #8
    def get_data_loader(self, dataset_name):
        """
        Returns a new instance of dataloader for the given dataset.

        The dataloader lazily yields from Iterable DataSets. After a complete iteration
        over the input data, the generator gets exhausted. So, for instance, in the
        case of model training, a new train dataloader must be used for each train epoch.

        :param dataset_name: 'train', 'dev', or 'test' set.
        :type dataset_name: str
        """
        filename = None
        if dataset_name == "train":
            filename = self.processor.train_filename
        elif dataset_name == "dev":
            if self.processor.dev_split > 0.0:
                raise NotImplementedError(
                            "StreamingDataSilo does not have dev_split implemented. "
                            "To use dev data, supply a dev filename when creating the Processor."
                )
            elif self.processor.dev_filename:
                filename = self.processor.dev_filename
        elif dataset_name == "test":
            if self.processor.test_filename:
                filename = self.processor.test_filename

        if not filename:
            return None

        #  Batching:
        #
        #  The model Trainer is passed a PyTorch DataLoader instance that yields dataset batches for training.
        #
        #  By default, the PyTorch DataLoader prefetches (2 * num_workers) samples. However, given the higher
        #  batch sizes (usually >64) used for model training, the default prefetch is not sufficient to keep
        #  model training saturated with data.
        #
        #  As a workaround, we yield batches of samples instead of yielding individual samples. The DataLoader
        #  can then prefetch (2 * num_workers) batches of samples.
        #
        #  Since the batching is now handled within _StreamingDataSet, we disable the batching on DataLoader side
        #  by initializing the data loader with batch_size as 1.

        if isinstance(filename, Path) and filename.is_dir():
            filepath = filename
        else:
            filepath = self.processor.data_dir / filename

        data_set = _StreamingDataSet(
            processor=self.processor,
            filepath=filepath,
            batch_size=self.batch_size,
            dataloader_workers=self.dataloader_workers,
            distributed=self.distributed,
        )

        data_loader = NamedDataLoader(
            dataset=data_set, batch_size=1, num_workers=self.dataloader_workers, pin_memory=True
        )
        return data_loader
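
The batching comment above describes the streaming workaround: the dataset itself yields whole batches, and the outer DataLoader is told not to batch again, so its (2 * num_workers) prefetch applies to batches rather than to single samples. A generic PyTorch sketch of the idea (it does not reproduce FARM's _StreamingDataSet or NamedDataLoader; batch_size=None simply disables the DataLoader's own batching):

import torch
from torch.utils.data import IterableDataset, DataLoader

class PreBatchedStream(IterableDataset):
    """Toy streaming dataset that emits whole batches instead of single samples."""

    def __init__(self, n_samples, batch_size):
        self.n_samples = n_samples
        self.batch_size = batch_size

    def __iter__(self):
        batch = []
        for i in range(self.n_samples):
            batch.append(torch.tensor([i]))
            if len(batch) == self.batch_size:
                yield torch.stack(batch)   # emit a full batch
                batch = []
        if batch:
            yield torch.stack(batch)       # emit the last, possibly smaller batch

loader = DataLoader(PreBatchedStream(n_samples=10, batch_size=4), batch_size=None, num_workers=0)
for batch in loader:
    print(batch.shape)  # (4, 1), (4, 1), (2, 1) -- each prefetched item is already a whole batch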
Example #9
File: infer.py  Project: yon606/FARM
    def _get_predictions_and_aggregate(self, dataset, tensor_names, baskets):
        """
        Feed a preprocessed dataset to the model and get the actual predictions (forward pass + logits_to_preds + formatted_preds).

        Difference to _get_predictions():
         - Additional aggregation step across predictions of individual samples
         (e.g. For QA on long texts, we extract answers from multiple passages and then aggregate them on the "document level")

        :param dataset: PyTorch Dataset with samples you want to predict
        :param tensor_names: Names of the tensors in the dataset
        :param baskets: For each item in the dataset, we need additional information to create formatted preds.
                        Baskets contain all relevant information for that.
                        Example: QA - input string to convert the predicted answer from indices back to string space
        :return: list of predictions
        """

        data_loader = NamedDataLoader(dataset=dataset,
                                      sampler=SequentialSampler(dataset),
                                      batch_size=self.batch_size,
                                      tensor_names=tensor_names)
        # TODO Sometimes this is the preds of one head, sometimes of two. We need a more advanced stacking operation
        # TODO so that preds of the right shape are passed in to formatted_preds
        unaggregated_preds_all = []

        for i, batch in enumerate(
                tqdm(data_loader,
                     desc=f"Inferencing Samples",
                     unit=" Batches",
                     disable=self.disable_tqdm)):

            batch = {key: batch[key].to(self.device) for key in batch}

            # get logits
            with torch.no_grad():
                # Aggregation works on preds, not logits. We want as much processing happening in one batch + on GPU
                # So we transform logits to preds here as well
                logits = self.model.forward(**batch)
                # preds = self.model.logits_to_preds(logits, **batch)[0] (This must somehow be useful for SQuAD)
                preds = self.model.logits_to_preds(logits, **batch)
                unaggregated_preds_all.append(preds)

        # In some use cases we want to aggregate the individual predictions.
        # This is mostly useful, if the input text is longer than the max_seq_len that the model can process.
        # In QA we can use this to get answers from long input texts by first getting predictions for smaller passages
        # and then aggregating them here.

        # At this point unaggregated_preds_all has shape [n_batches][n_heads][n_samples]

        # can assume that we have only complete docs i.e. all the samples of one doc are in the current chunk
        logits = [None]
        preds_all = self.model.formatted_preds(
            logits=logits,  # For QA we collected preds per batch and do not want to pass logits
            preds_p=unaggregated_preds_all,
            baskets=baskets)
        return preds_all
Example #10
    def _get_predictions(self, dicts):
        """
        Feed a preprocessed dataset to the model and get the actual predictions (forward pass + formatting).

        :param dicts: list of dictionaries
        examples:[{'query': "where is florida?"}, {'query': "who wrote lord of the rings?"}, ...]
                [{'passages': [{
                    "title": 'Big Little Lies (TV series)',
                    "text": 'series garnered several accolades. It received..',
                    "label": 'positive',
                    "external_id": '18768923'},
                    {"title": 'Framlingham Castle',
                    "text": 'Castle on the Hill "Castle on the Hill" is a song by English..',
                    "label": 'positive',
                    "external_id": '19930582'}, ...]
        :return: dictionary of embeddings for "passages" and "query"
        """

        dataset, tensor_names, _, baskets = self.processor.dataset_from_dicts(
            dicts, indices=[i for i in range(len(dicts))], return_baskets=True)

        data_loader = NamedDataLoader(dataset=dataset,
                                      sampler=SequentialSampler(dataset),
                                      batch_size=self.batch_size,
                                      tensor_names=tensor_names)
        all_embeddings = {"query": [], "passages": []}
        self.model.eval()

        # When running evaluations etc., we don't want a progress bar for every single query
        if len(dataset) == 1:
            disable_tqdm = True
        else:
            disable_tqdm = not self.progress_bar

        for i, batch in enumerate(
                tqdm(data_loader,
                     desc=f"Creating Embeddings",
                     unit=" Batches",
                     disable=disable_tqdm)):
            batch = {key: batch[key].to(self.device) for key in batch}

            # get logits
            with torch.no_grad():
                query_embeddings, passage_embeddings = self.model.forward(
                    **batch)[0]
                if query_embeddings is not None:
                    all_embeddings["query"].append(
                        query_embeddings.cpu().numpy())
                if passage_embeddings is not None:
                    all_embeddings["passages"].append(
                        passage_embeddings.cpu().numpy())

        if all_embeddings["passages"]:
            all_embeddings["passages"] = np.concatenate(
                all_embeddings["passages"])
        if all_embeddings["query"]:
            all_embeddings["query"] = np.concatenate(all_embeddings["query"])
        return all_embeddings
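
A hypothetical follow-up to the method above: once the embeddings dict is returned, passages are typically scored against the query with a dot product. The shapes below are made up; the embedding dimension depends on the underlying model:

import numpy as np

all_embeddings = {
    "query": np.random.rand(1, 768).astype("float32"),     # stand-in for _get_predictions() output
    "passages": np.random.rand(5, 768).astype("float32"),
}

scores = all_embeddings["passages"] @ all_embeddings["query"].T  # (5, 1) similarity scores
ranking = np.argsort(-scores.squeeze())                          # best-matching passage first
print(ranking)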
Example #11
File: infer.py  Project: wwmmqq/FARM
    def _get_predictions(self,
                         dataset,
                         tensor_names,
                         baskets,
                         rest_api_schema=False):
        """ Feed the preprocessed dataset to the model and get the actual predictions"""
        samples = [s for b in baskets for s in b.samples]

        data_loader = NamedDataLoader(dataset=dataset,
                                      sampler=SequentialSampler(dataset),
                                      batch_size=self.batch_size,
                                      tensor_names=tensor_names)
        unaggregated_preds_all = []
        preds_all = []
        aggregate_preds = hasattr(self.model.prediction_heads[0],
                                  "aggregate_preds")
        for i, batch in enumerate(tqdm(data_loader, desc=f"Inferencing")):
            batch = {key: batch[key].to(self.device) for key in batch}

            if not aggregate_preds:
                batch_samples = samples[i * self.batch_size:(i + 1) *
                                        self.batch_size]

            # get logits
            with torch.no_grad():
                if aggregate_preds:
                    # Aggregation works on preds, not logits. We want as much processing happening in one batch + on GPU
                    # So we transform logits to preds here as well
                    logits = self.model.forward(**batch)
                    preds = self.model.logits_to_preds(logits, **batch)[0]
                    unaggregated_preds_all += preds
                else:
                    logits = self.model.forward(**batch)[0]
                    preds = self.model.formatted_preds(
                        logits=[logits],
                        samples=batch_samples,
                        tokenizer=self.processor.tokenizer,
                        return_class_probs=self.return_class_probs,
                        rest_api_schema=rest_api_schema,
                        **batch)
                    preds_all += preds

        # In some use cases we want to aggregate the individual predictions.
        # This is mostly useful, if the input text is longer than the max_seq_len that the model can process.
        # In QA we can use this to get answers from long input texts by first getting predictions for smaller passages
        # and then aggregating them here.
        if aggregate_preds:
            # can assume that we have only complete docs i.e. all the samples of one doc are in the current chunk
            preds_all = self.model.formatted_preds(
                logits=[None],  # For QA we collected preds per batch and do not want to pass logits
                preds_p=unaggregated_preds_all,
                baskets=baskets,
                rest_api_schema=rest_api_schema)[0]
        return preds_all
Example #12
    def _initialize_data_loaders(self):
        """ Initializing train, dev and test data loaders for the already loaded datasets """

        if self.data["train"] is not None:
            if self.distributed:
                sampler_train = DistributedSampler(self.data["train"])
            else:
                sampler_train = RandomSampler(self.data["train"])

            data_loader_train = NamedDataLoader(
                dataset=self.data["train"],
                sampler=sampler_train,
                batch_size=self.batch_size,
                tensor_names=self.tensor_names,
            )
        else:
            data_loader_train = None

        if self.data["dev"] is not None:
            data_loader_dev = NamedDataLoader(
                dataset=self.data["dev"],
                sampler=SequentialSampler(self.data["dev"]),
                batch_size=self.eval_batch_size,
                tensor_names=self.tensor_names,
            )
        else:
            data_loader_dev = None

        if self.data["test"] is not None:
            data_loader_test = NamedDataLoader(
                dataset=self.data["test"],
                sampler=SequentialSampler(self.data["test"]),
                batch_size=self.eval_batch_size,
                tensor_names=self.tensor_names,
            )
        else:
            data_loader_test = None

        self.loaders = {
            "train": data_loader_train,
            "dev": data_loader_dev,
            "test": data_loader_test,
        }
Example #13
File: dense.py  Project: stmnk/haystack
    def _create_dataloader(self,
                           text_to_encode: List[dict]) -> NamedDataLoader:

        dataset, tensor_names, _ = self.processor.dataset_from_dicts(
            text_to_encode, indices=[i for i in range(len(text_to_encode))])
        dataloader = NamedDataLoader(dataset=dataset,
                                     sampler=SequentialSampler(dataset),
                                     batch_size=32,
                                     tensor_names=tensor_names)
        return dataloader
Example #14
File: infer.py  Project: shafiahmed/FARM
    def _get_predictions(self,
                         dataset,
                         tensor_names,
                         baskets,
                         rest_api_schema=False):
        """ Feed the preprocessed dataset to the model and get the actual predictions"""
        samples = [s for b in baskets for s in b.samples]

        data_loader = NamedDataLoader(dataset=dataset,
                                      sampler=SequentialSampler(dataset),
                                      batch_size=self.batch_size,
                                      tensor_names=tensor_names)
        logits_all = []
        preds_all = []
        aggregate_preds = hasattr(self.model.prediction_heads[0],
                                  "aggregate_preds")
        for i, batch in enumerate(tqdm(data_loader, desc=f"Inferencing")):
            batch = {key: batch[key].to(self.device) for key in batch}
            if not aggregate_preds:
                batch_samples = samples[i * self.batch_size:(i + 1) *
                                        self.batch_size]

            # get logits
            with torch.no_grad():
                logits = self.model.forward(**batch)[0]

                # either just stack the logits (and convert later to readable predictions)
                if aggregate_preds:
                    logits_all += [l for l in logits]

                # or convert directly
                else:
                    preds = self.model.formatted_preds(
                        logits=[logits],
                        samples=batch_samples,
                        tokenizer=self.processor.tokenizer,
                        return_class_probs=self.return_class_probs,
                        rest_api_schema=rest_api_schema,
                        **batch)
                    preds_all += preds

        # In some use cases we want to aggregate the individual predictions.
        # This is mostly useful, if the input text is longer than the max_seq_len that the model can process.
        # In QA we can use this to get answers from long input texts by first getting predictions for smaller passages
        # and then aggregating them here.
        if aggregate_preds:
            # can assume that we have only complete docs i.e. all the samples of one doc are in the current chunk
            # TODO is there a better way than having to wrap logits all in list?
            # TODO can QA formatted preds deal with samples?
            preds_all = self.model.formatted_preds(
                logits=[logits_all],
                baskets=baskets,
                rest_api_schema=rest_api_schema)[0]
        return preds_all
Example #15
File: infer.py  Project: fablos/FARM
    def _get_predictions(self,
                         dataset,
                         tensor_names,
                         baskets,
                         rest_api_schema=False,
                         disable_tqdm=False):
        """
        Feed a preprocessed dataset to the model and get the actual predictions (forward pass + formatting).

        :param dataset: PyTorch Dataset with samples you want to predict
        :param tensor_names: Names of the tensors in the dataset
        :param baskets: For each item in the dataset, we need additional information to create formatted preds.
                        Baskets contain all relevant information for that.
                        Example: QA - input string to convert the predicted answer from indices back to string space
        :param rest_api_schema: Whether input dicts use the format that complies with the FARM REST API.
                                Currently only used for QA to switch from squad to a more useful format in production.
                                While input is almost the same, the output contains additional metadata (offset, context, ...)
        :type rest_api_schema: bool
        :param disable_tqdm: Whether to disable tqdm logging (can get very verbose in multiprocessing)
        :type disable_tqdm: bool
        :return: list of predictions
        """
        samples = [s for b in baskets for s in b.samples]

        data_loader = NamedDataLoader(dataset=dataset,
                                      sampler=SequentialSampler(dataset),
                                      batch_size=self.batch_size,
                                      tensor_names=tensor_names)
        preds_all = []
        for i, batch in enumerate(
                tqdm(data_loader,
                     desc=f"Inferencing Samples",
                     unit=" Batches",
                     disable=disable_tqdm)):
            batch = {key: batch[key].to(self.device) for key in batch}
            batch_samples = samples[i * self.batch_size:(i + 1) *
                                    self.batch_size]

            # get logits
            with torch.no_grad():
                logits = self.model.forward(**batch)[0]
                preds = self.model.formatted_preds(
                    logits=[logits],
                    samples=batch_samples,
                    tokenizer=self.processor.tokenizer,
                    rest_api_schema=rest_api_schema,
                    return_class_probs=self.return_class_probs,
                    **batch)
                preds_all += preds
        return preds_all
Example #16
    def _get_predictions(self, dataset, tensor_names, baskets):
        samples = [s for b in baskets for s in b.samples]

        data_loader = NamedDataLoader(dataset=dataset,
                                      sampler=SequentialSampler(dataset),
                                      batch_size=self.batch_size,
                                      tensor_names=tensor_names)
        preds_all = []
        for i, batch in enumerate(
                tqdm(data_loader,
                     desc="Inferencing Samples",
                     unit=" Batches",
                     disable=self.disable_tqdm)):
            batch = {key: batch[key].to(self.device) for key in batch}
            batch_samples = samples[i * self.batch_size:(i + 1) *
                                    self.batch_size]

            # Two fundamental differences from the original:
            # 1) Use the logits of all prediction heads instead of taking only the first one
            # 2) Pass logits as the list it already is instead of wrapping it in another list
            with torch.no_grad():
                logits = self.model.forward(**batch)
                preds = self.model.formatted_preds(
                    logits=logits,
                    samples=batch_samples,
                    tokenizer=self.processor.tokenizer,
                    return_class_probs=self.return_class_probs,
                    **batch,
                )
            if self.level == "token":
                if i == 0:
                    preds_all = preds
                else:
                    for task_dict in preds:
                        preds_all_dict_id = [
                            idx for idx, dic in enumerate(preds_all)
                            if dic["task"] == task_dict["task"]
                        ]
                        if len(preds_all_dict_id) != 1:
                            raise AttributeError(
                                "Each task type must be present exactly once.")
                        idx = preds_all_dict_id[0]
                        preds_all[idx]["predictions"] += task_dict["predictions"]
            else:
                preds_all += preds
        if self.level == "token":
            preds_all = format_multitask_preds(preds_all)
        return preds_all
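
The token-level merge above assumes that each batch produces one dict per task, with a "task" key and a "predictions" list that can simply be concatenated across batches. A small data-shape illustration (task names and label values are made up):

batch_0_preds = [
    {"task": "ner", "predictions": [["B-PER", "O"]]},
    {"task": "pos", "predictions": [["NNP", "VB"]]},
]
batch_1_preds = [
    {"task": "ner", "predictions": [["O", "B-LOC"]]},
    {"task": "pos", "predictions": [["DT", "NN"]]},
]

preds_all = batch_0_preds
for task_dict in batch_1_preds:
    matching = [d for d in preds_all if d["task"] == task_dict["task"]]
    assert len(matching) == 1, "Each task type must be present exactly once."
    matching[0]["predictions"] += task_dict["predictions"]
# preds_all now holds one dict per task, each carrying the predictions of both batches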
Example #17
File: dense.py  Project: swayson/haystack
    def _get_predictions(self, dicts, tokenizer):
        """
        Feed a preprocessed dataset to the model and get the actual predictions (forward pass + formatting).

        :param dicts: list of dictionaries
        examples:[{'query': "where is florida?"}, {'query': "who wrote lord of the rings?"}, ...]
                [{'passages': [{
                    "title": 'Big Little Lies (TV series)',
                    "text": 'series garnered several accolades. It received..',
                    "label": 'positive',
                    "external_id": '18768923'},
                    {"title": 'Framlingham Castle',
                    "text": 'Castle on the Hill "Castle on the Hill" is a song by English..',
                    "label": 'positive',
                    "external_id": '19930582'}, ...]
        :return: dictionary of embeddings for "passages" and "query"
        """
        dataset, tensor_names, baskets = self.processor.dataset_from_dicts(
            dicts, indices=[i for i in range(len(dicts))], return_baskets=True)

        data_loader = NamedDataLoader(dataset=dataset,
                                      sampler=SequentialSampler(dataset),
                                      batch_size=self.batch_size,
                                      tensor_names=tensor_names)
        all_embeddings = {
            "query": torch.tensor([]).to(self.device),
            "passages": torch.tensor([]).to(self.device)
        }
        self.model.eval()
        for i, batch in enumerate(
                tqdm(data_loader,
                     desc=f"Inferencing Samples",
                     unit=" Batches",
                     disable=False)):
            batch = {key: batch[key].to(self.device) for key in batch}

            # get logits
            with torch.no_grad():
                query_embeddings, passage_embeddings = self.model.forward(
                    **batch)[0]
                all_embeddings["query"] = torch.cat((all_embeddings["query"], query_embeddings), dim=0) \
                                                     if isinstance(query_embeddings, torch.Tensor) else None
                all_embeddings["passages"] = torch.cat((all_embeddings["passages"], passage_embeddings), dim=0) \
                                                    if isinstance(passage_embeddings, torch.Tensor) else None

        # convert embeddings to numpy array
        for k, v in all_embeddings.items():
            all_embeddings[k] = v.cpu().numpy() if v is not None else None
        return all_embeddings
Example #18
File: infer.py  Project: wwmmqq/FARM
    def extract_vectors(self,
                        dicts,
                        extraction_strategy="cls_token",
                        extraction_layer=-1):
        """
        Converts a text into vector(s) using the language model only (no prediction head involved).

        Example:
            basic_texts = [{"text": "Some text we want to embed"}, {"text": "And a second one"}]
            result = inferencer.extract_vectors(dicts=basic_texts)

        :param dicts: Samples to run inference on provided as a list of dicts. One dict per sample.
        :type dicts: [dict]
        :param extraction_strategy: Strategy to extract vectors. Choices: 'cls_token' (sentence vector), 'reduce_mean'
                               (sentence vector), reduce_max (sentence vector), 'per_token' (individual token vectors)
        :type extraction_strategy: str
        :param extraction_layer: number of layer from which the embeddings shall be extracted. Default: -1 (very last layer).
        :type: int
        :return: dict of predictions
        """

        dataset, tensor_names = self.processor.dataset_from_dicts(
            dicts, rest_api_schema=True)
        samples = []
        for d in dicts:
            samples.extend(self.processor._dict_to_samples(d))

        data_loader = NamedDataLoader(dataset=dataset,
                                      sampler=SequentialSampler(dataset),
                                      batch_size=self.batch_size,
                                      tensor_names=tensor_names)

        preds_all = []
        for i, batch in enumerate(data_loader):
            batch = {key: batch[key].to(self.device) for key in batch}
            batch_samples = samples[i * self.batch_size:(i + 1) *
                                    self.batch_size]
            with torch.no_grad():
                preds = self.model.language_model.formatted_preds(
                    extraction_strategy=extraction_strategy,
                    samples=batch_samples,
                    tokenizer=self.processor.tokenizer,
                    extraction_layer=extraction_layer,
                    **batch,
                )
                preds_all += preds

        return preds_all
Example #19
    def predict(self, dicts):
        """
        This function is a simple modification of the MLMInferencer/Inferencer's run_inference method (located at farm/infer.py) except that it uses a custom processor which does not mask the input (which is already masked when running prediction).
        :param dicts: Masked samples to run prediction on provided as a list of dicts. One dict per sample.
        :type dicst: [dict]
        :return: dict of predictions

        """
        pred_processor = CharMLMPredProcessor(
            tokenizer=self.processor.tokenizer,
            max_seq_len=self.processor.max_seq_len,
            data_dir=self.processor.data_dir,
        )
        dataset, tensor_names = pred_processor.dataset_from_dicts(dicts)
        samples = []
        for d in dicts:
            samples.extend(pred_processor._dict_to_samples(d, dicts))

        data_loader = NamedDataLoader(
            dataset=dataset,
            sampler=SequentialSampler(dataset),
            batch_size=self.batch_size,
            tensor_names=tensor_names,
        )

        preds_all = []
        for i, batch in enumerate(data_loader):
            batch = {key: batch[key].to(self.device) for key in batch}
            batch_samples = samples[i * self.batch_size:(i + 1) *
                                    self.batch_size]
            with torch.no_grad():
                logits = self.model.forward(**batch)
                preds = self.model.formatted_preds(
                    logits=logits,
                    label_maps=pred_processor.label_maps,
                    samples=batch_samples,
                    tokenizer=pred_processor.tokenizer,
                    **batch,
                )
                preds_all.append(preds)
        # flatten list
        preds_all = [
            p for outer_list in preds_all for pred_dict in outer_list
            for p in pred_dict
        ]
        return preds_all
Example #20
File: infer.py  Project: cregouby/FARM
    def _run_inference_qa(self, concatdataset, tensor_names, samples):
        data_loader = NamedDataLoader(dataset=concatdataset,
                                      sampler=SequentialSampler(concatdataset),
                                      batch_size=self.batch_size,
                                      tensor_names=tensor_names)

        all_preds = []
        for batch in tqdm(data_loader, desc=f"Inferencing"):
            batch = {key: batch[key].to(self.device) for key in batch}
            with torch.no_grad():
                logits = self.model.forward(**batch)
                preds = self.model.logits_to_preds(logits=logits, **batch)
                all_preds += preds

        preds_all = self.model.prediction_heads[0].formatted_preds(
            logits=None, preds=all_preds, samples=samples)

        return [preds_all]
Example #21
File: infer.py  Project: onisimchukv/FARM
    def run_inference(self, dicts):
        """
        Runs down-stream inference using the prediction head.

        :param dicts: Samples to run inference on provided as a list of dicts. One dict per sample.
        :type dicts: [dict]
        :return: dict of predictions

        """
        if self.prediction_type == "embedder":
            raise TypeError(
                "You have called run_inference for a model without any prediction head! "
                "If you want to: "
                "a) ... extract vectors from the language model: call `Inferencer.extract_vectors(...)`"
                f"b) ... run inference on a downstream task: make sure your model path {self.name} contains a saved prediction head"
            )
        dataset, tensor_names = self.processor.dataset_from_dicts(dicts, from_inference=True)
        samples = []
        for d in dicts:
            samples.extend(self.processor._dict_to_samples(d))

        data_loader = NamedDataLoader(
            dataset=dataset,
            sampler=SequentialSampler(dataset),
            batch_size=self.batch_size,
            tensor_names=tensor_names,
        )

        preds_all = []
        for i, batch in enumerate(data_loader):
            batch = {key: batch[key].to(self.device) for key in batch}
            batch_samples = samples[i * self.batch_size : (i + 1) * self.batch_size]
            with torch.no_grad():
                logits = self.model.forward(**batch)
                preds = self.model.formatted_preds(
                    logits=logits,
                    samples=batch_samples,
                    tokenizer=self.processor.tokenizer,
                    **batch,
                )
                preds_all += preds

        return preds_all
Example #22
    def get_data_loader(self, dataset_name):
        """
        Returns a new instance of dataloader for the given dataset.

        The dataloader lazily yields from Iterable DataSets. After a complete iteration
        over the input data, the generator gets exhausted. So, for instance, in the
        case of model training, a new train dataloader must be used for each train epoch.

        :param dataset_name: 'train', 'dev', or 'test' set.
        :type dataset_name: str
        """
        if dataset_name == "train":
            filename = self.processor.train_filename

            #  Batching:
            #
            #  The model Trainer is passed a PyTorch DataLoader instance that yields dataset batches for training.
            #
            #  By default, the PyTorch DataLoader prefetches (2 * num_workers) samples. However, given the higher
            #  batch sizes (usually >64) used for model training, the default prefetch is not sufficient to keep
            #  model training saturated with data.
            #
            #  As a workaround, we yield batches of samples instead of yielding individual samples. The DataLoader
            #  can then prefetch (2 * num_workers) batches of samples.
            #
            #  Since the batching is now handled within _StreamingDataSet, we disable the batching on DataLoader side
            #  by initializing the data loader with batch_size as 1.

            data_set = _StreamingDataSet(
                processor=self.processor,
                filepath=self.processor.data_dir / filename,
                batch_size=self.batch_size,
                dataloader_workers=self.max_processes,
            )
            data_loader = NamedDataLoader(dataset=data_set,
                                          batch_size=1,
                                          num_workers=self.max_processes,
                                          pin_memory=True)
            return data_loader

        else:
            return self.loaders[dataset_name]
Example #23
    def _run_inference(self, dataset, tensor_names, samples):
        data_loader = NamedDataLoader(
            dataset=dataset, sampler=SequentialSampler(dataset), batch_size=self.batch_size, tensor_names=tensor_names
        )

        preds_all = []
        for i, batch in enumerate(data_loader):
            batch = {key: batch[key].to(self.device) for key in batch}
            batch_samples = samples[i * self.batch_size : (i + 1) * self.batch_size]
            with torch.no_grad():
                logits = self.model.forward(**batch)
                preds = self.model.formatted_preds(
                    logits=logits,
                    samples=batch_samples,  # TODO batch_samples and logits are not aligned
                    tokenizer=self.processor.tokenizer,
                    return_class_probs=self.return_class_probs,
                    **batch,
                )
                preds_all += preds

        return preds_all
Example #24
File: farm.py  Project: koyeli28/haystack
    def eval(
        self,
        document_store: BaseDocumentStore,
        device: str,
        label_index: str = "label",
        doc_index: str = "eval_document",
        label_origin: str = "gold_label",
    ):
        """
        Performs evaluation on evaluation documents in the DocumentStore.
        Returns a dict containing the following metrics:
              - "EM": Proportion of exact matches of predicted answers with their corresponding correct answers
              - "f1": Average overlap between predicted answers and their corresponding correct answers
              - "top_n_accuracy": Proportion of predicted answers that overlap with correct answer

        :param document_store: DocumentStore containing the evaluation documents
        :param device: The device on which the tensors should be processed. Choose from "cpu" and "cuda".
        :param label_index: Index/Table name where labeled questions are stored
        :param doc_index: Index/Table name where documents that are used for evaluation are stored
        """

        if self.top_k_per_candidate != 4:
            logger.info(
                f"Performing Evaluation using top_k_per_candidate = {self.top_k_per_candidate} \n"
                f"and consequently, QuestionAnsweringPredictionHead.n_best = {self.top_k_per_candidate + 1}. \n"
                f"This deviates from FARM's default where QuestionAnsweringPredictionHead.n_best = 5"
            )

        # extract all questions for evaluation
        filters = {"origin": [label_origin]}

        labels = document_store.get_all_labels(index=label_index,
                                               filters=filters)

        # Aggregate all answer labels per question
        aggregated_per_doc = defaultdict(list)
        for label in labels:
            if not label.document_id:
                logger.error(f"Label does not contain a document_id")
                continue
            aggregated_per_doc[label.document_id].append(label)

        # Create squad style dicts
        d: Dict[str, Any] = {}
        all_doc_ids = [
            x.id for x in document_store.get_all_documents(doc_index)
        ]
        for doc_id in all_doc_ids:
            doc = document_store.get_document_by_id(doc_id, index=doc_index)
            if not doc:
                logger.error(
                    f"Document with the ID '{doc_id}' is not present in the document store."
                )
                continue
            d[str(doc_id)] = {"context": doc.text}
            # get all questions / answers
            aggregated_per_question: Dict[str, Any] = defaultdict(list)
            if doc_id in aggregated_per_doc:
                for label in aggregated_per_doc[doc_id]:
                    # add to existing answers
                    if label.question in aggregated_per_question.keys():
                        # Hack to fix a problem where duplicate questions merged by doc_store processing
                        # create a QA example with 8 annotations, exceeding the 6-annotation maximum
                        if len(aggregated_per_question[label.question]["answers"]) >= 6:
                            continue
                        aggregated_per_question[label.question]["answers"].append({
                            "text": label.answer,
                            "answer_start": label.offset_start_in_doc,
                        })
                    # create new one
                    else:
                        aggregated_per_question[label.question] = {
                            "id": str(hash(str(doc_id) + label.question)),
                            "question": label.question,
                            "answers": [{
                                "text": label.answer,
                                "answer_start": label.offset_start_in_doc,
                            }],
                        }
            # Get rid of the question key again (after we aggregated we don't need it anymore)
            d[str(doc_id)]["qas"] = [
                v for v in aggregated_per_question.values()
            ]

        # Convert input format for FARM
        farm_input = [v for v in d.values()]
        n_queries = len([y for x in farm_input for y in x["qas"]])

        # Create DataLoader that can be passed to the Evaluator
        tic = perf_counter()
        indices = range(len(farm_input))
        dataset, tensor_names = self.inferencer.processor.dataset_from_dicts(
            farm_input, indices=indices)
        data_loader = NamedDataLoader(dataset=dataset,
                                      batch_size=self.inferencer.batch_size,
                                      tensor_names=tensor_names)

        evaluator = Evaluator(data_loader=data_loader,
                              tasks=self.inferencer.processor.tasks,
                              device=device)

        eval_results = evaluator.eval(self.inferencer.model)
        toc = perf_counter()
        reader_time = toc - tic
        results = {
            "EM": eval_results[0]["EM"],
            "f1": eval_results[0]["f1"],
            "top_n_accuracy": eval_results[0]["top_n_accuracy"],
            "top_n": self.inferencer.model.prediction_heads[0].n_best,
            "reader_time": reader_time,
            "seconds_per_query": reader_time / n_queries
        }
        return results
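
For reference, a single element of the farm_input list assembled above has the SQuAD-style shape sketched below; all values are illustrative:

farm_input_entry = {
    "context": "Full text of the evaluation document ...",
    "qas": [
        {
            "id": "8712438790213",                       # str(hash(str(doc_id) + question))
            "question": "Who wrote Lord of the Rings?",
            "answers": [
                {"text": "J. R. R. Tolkien", "answer_start": 42},
            ],
        },
    ],
}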
Example #25
    def eval(
        self,
        document_store: BaseDocumentStore,
        device: str,
        label_index: str = "label",
        doc_index: str = "eval_document",
        label_origin: str = "gold_label",
    ):
        """
        Performs evaluation on evaluation documents in the DocumentStore.

        Returns a dict containing the following metrics:
            - "EM": Proportion of exact matches of predicted answers with their corresponding correct answers
            - "f1": Average overlap between predicted answers and their corresponding correct answers
            - "top_n_accuracy": Proportion of predicted answers that match with correct answer

        :param document_store: DocumentStore containing the evaluation documents
        :param device: The device on which the tensors should be processed. Choose from "cpu" and "cuda".
        :param label_index: Index/Table name where labeled questions are stored
        :param doc_index: Index/Table name where documents that are used for evaluation are stored
        """

        # extract all questions for evaluation
        filters = {"origin": [label_origin]}

        labels = document_store.get_all_labels(index=label_index,
                                               filters=filters)

        # Aggregate all answer labels per question
        aggregated_per_doc = defaultdict(list)
        for label in labels:
            if not label.document_id:
                logger.error(f"Label does not contain a document_id")
                continue
            aggregated_per_doc[label.document_id].append(label)

        # Create squad style dicts
        d: Dict[str, Any] = {}
        for doc_id in aggregated_per_doc.keys():
            doc = document_store.get_document_by_id(doc_id, index=doc_index)
            if not doc:
                logger.error(
                    f"Document with the ID '{doc_id}' is not present in the document store."
                )
                continue
            d[str(doc_id)] = {"context": doc.text}
            # get all questions / answers
            aggregated_per_question: Dict[str, Any] = defaultdict(list)
            for label in aggregated_per_doc[doc_id]:
                # add to existing answers
                if label.question in aggregated_per_question.keys():
                    aggregated_per_question[label.question]["answers"].append({
                        "text":
                        label.answer,
                        "answer_start":
                        label.offset_start_in_doc
                    })
                # create new one
                else:
                    aggregated_per_question[label.question] = {
                        "id": str(hash(str(doc_id) + label.question)),
                        "question": label.question,
                        "answers": [{
                            "text": label.answer,
                            "answer_start": label.offset_start_in_doc,
                        }],
                    }
            # Get rid of the question key again (after we aggregated we don't need it anymore)
            d[str(doc_id)]["qas"] = [
                v for v in aggregated_per_question.values()
            ]

        # Convert input format for FARM
        farm_input = [v for v in d.values()]

        # Create DataLoader that can be passed to the Evaluator
        indices = range(len(farm_input))
        dataset, tensor_names = self.inferencer.processor.dataset_from_dicts(
            farm_input, indices=indices)
        data_loader = NamedDataLoader(dataset=dataset,
                                      batch_size=self.inferencer.batch_size,
                                      tensor_names=tensor_names)

        evaluator = Evaluator(data_loader=data_loader,
                              tasks=self.inferencer.processor.tasks,
                              device=device)

        eval_results = evaluator.eval(self.inferencer.model)
        results = {
            "EM": eval_results[0]["EM"],
            "f1": eval_results[0]["f1"],
            "top_n_accuracy": eval_results[0]["top_n_accuracy"]
        }
        return results
Example #26
File: farm.py  Project: vchulski/haystack
    def eval(self,
             document_store: ElasticsearchDocumentStore,
             device: str,
             label_index: str = "feedback",
             doc_index: str = "eval_document",
             label_origin: str = "gold_label"):
        """
        Performs evaluation on evaluation documents in Elasticsearch DocumentStore.

        Returns a dict containing the following metrics:
            - "EM": Proportion of exact matches of predicted answers with their corresponding correct answers
            - "f1": Average overlap between predicted answers and their corresponding correct answers
            - "top_n_recall": Proportion of predicted answers that overlap with correct answer

        :param document_store: The ElasticsearchDocumentStore containing the evaluation documents
        :type document_store: ElasticsearchDocumentStore
        :param device: The device on which the tensors should be processed. Choose from "cpu" and "cuda".
        :type device: str
        :param label_index: Elasticsearch index where labeled questions are stored
        :type label_index: str
        :param doc_index: Elasticsearch index where documents that are used for evaluation are stored
        :type doc_index: str
        """

        # extract all questions for evaluation
        filter = {"origin": label_origin}
        questions = document_store.get_all_documents_in_index(
            index=label_index, filters=filter)

        # mapping from doc_id to questions
        doc_questions_dict = {}
        id = 0
        for question in questions:
            doc_id = question["_source"]["doc_id"]
            question_dict = {
                "id": id,
                "question": question["_source"]["question"],
                "answers": question["_source"]["answers"],
                "is_impossible": not question["_source"]["answers"],
            }
            if doc_id not in doc_questions_dict:
                doc_questions_dict[doc_id] = [question_dict]
            else:
                doc_questions_dict[doc_id].append(question_dict)
            id += 1

        # extract eval documents and convert data back to SQuAD-like format
        documents = document_store.get_all_documents_in_index(index=doc_index)
        dicts = []
        for document in documents:
            doc_id = document["_source"]["doc_id"]
            text = document["_source"]["text"]
            questions = doc_questions_dict[doc_id]
            dicts.append({"qas": questions, "context": text})

        # Create DataLoader that can be passed to the Evaluator
        indices = range(len(dicts))
        dataset, tensor_names = self.inferencer.processor.dataset_from_dicts(
            dicts, indices=indices)
        data_loader = NamedDataLoader(dataset=dataset,
                                      batch_size=self.inferencer.batch_size,
                                      tensor_names=tensor_names)

        evaluator = Evaluator(data_loader=data_loader,
                              tasks=self.inferencer.processor.tasks,
                              device=device)

        eval_results = evaluator.eval(self.inferencer.model)
        results = {
            "EM": eval_results[0]["EM"],
            "f1": eval_results[0]["f1"],
            "top_n_recall": eval_results[0]["top_n_recall"]
        }
        return results