def _initialize_data_loaders(self):
    if self.distributed:
        sampler_train = DistributedSampler(self.data["train"])
    else:
        sampler_train = RandomSampler(self.data["train"])

    data_loader_train = NamedDataLoader(
        dataset=self.data["train"],
        sampler=sampler_train,
        batch_size=self.batch_size,
        tensor_names=self.tensor_names,
    )

    data_loader_dev = NamedDataLoader(
        dataset=self.data["dev"],
        sampler=SequentialSampler(self.data["dev"]),
        batch_size=self.batch_size,
        tensor_names=self.tensor_names,
    )

    if self.processor.test_filename:
        data_loader_test = NamedDataLoader(
            dataset=self.data["test"],
            sampler=SequentialSampler(self.data["test"]),
            batch_size=self.batch_size,
            tensor_names=self.tensor_names,
        )
    else:
        data_loader_test = None

    self.loaders = {
        "train": data_loader_train,
        "dev": data_loader_dev,
        "test": data_loader_test,
    }
def _initialize_data_loaders(self): """ Initializing train, dev and test data loaders for the already loaded datasets """ if self.data["dev"] is not None: data_loader_dev = NamedDataLoader( dataset=self.data["dev"], sampler=SequentialSampler(self.data["dev"]), batch_size=self.batch_size, tensor_names=self.tensor_names, ) else: data_loader_dev = None if self.processor.test_filename: data_loader_test = NamedDataLoader( dataset=self.data["test"], sampler=SequentialSampler(self.data["test"]), batch_size=self.batch_size, tensor_names=self.tensor_names, ) else: data_loader_test = None self.loaders = { "dev": data_loader_dev, "test": data_loader_test, }
def __init__(self, origsilo, trainset, devset, testset):
    self.tensor_names = origsilo.tensor_names
    self.data = {"train": trainset, "dev": devset, "test": testset}
    self.processor = origsilo.processor
    self.batch_size = origsilo.batch_size
    # should not be necessary, xval makes no sense with huge data
    # sampler_train = DistributedSampler(self.data["train"])
    sampler_train = RandomSampler(trainset)

    self.data_loader_train = NamedDataLoader(
        dataset=trainset,
        sampler=sampler_train,
        batch_size=self.batch_size,
        tensor_names=self.tensor_names,
    )
    self.data_loader_dev = NamedDataLoader(
        dataset=devset,
        sampler=SequentialSampler(devset),
        batch_size=self.batch_size,
        tensor_names=self.tensor_names,
    )
    self.data_loader_test = NamedDataLoader(
        dataset=testset,
        sampler=SequentialSampler(testset),
        batch_size=self.batch_size,
        tensor_names=self.tensor_names,
    )
    self.loaders = {
        "train": self.data_loader_train,
        "dev": self.data_loader_dev,
        "test": self.data_loader_test,
    }
def run_inference(self, dicts):
    dataset, tensor_names = self.processor.dataset_from_dicts(dicts)
    samples = []
    for d in dicts:
        samples.extend(self.processor._dict_to_samples(d))

    data_loader = NamedDataLoader(
        dataset=dataset,
        sampler=SequentialSampler(dataset),
        batch_size=self.batch_size,
        tensor_names=tensor_names,
    )

    preds_all = []
    for batch in data_loader:
        batch = {key: batch[key].to(self.device) for key in batch}
        with torch.no_grad():
            logits = self.model.forward(**batch)
            preds = self.model.formatted_preds(
                logits=logits,
                label_maps=self.processor.label_maps,
                samples=samples,  # TODO slice these samples to be in line with the batched input
                tokenizer=self.processor.tokenizer,
                **batch,
            )
            preds_all += preds
    return preds_all
def _get_predictions(self, dataset, tensor_names, baskets): """ Feed a preprocessed dataset to the model and get the actual predictions (forward pass + formatting). :param dataset: PyTorch Dataset with samples you want to predict :param tensor_names: Names of the tensors in the dataset :param baskets: For each item in the dataset, we need additional information to create formatted preds. Baskets contain all relevant infos for that. Example: QA - input string to convert the predicted answer from indices back to string space :return: list of predictions """ samples = [s for b in baskets for s in b.samples] data_loader = NamedDataLoader( dataset=dataset, sampler=SequentialSampler(dataset), batch_size=self.batch_size, tensor_names=tensor_names ) preds_all = [] for i, batch in enumerate(tqdm(data_loader, desc=f"Inferencing Samples", unit=" Batches", disable=self.disable_tqdm)): batch = {key: batch[key].to(self.device) for key in batch} batch_samples = samples[i * self.batch_size : (i + 1) * self.batch_size] # get logits with torch.no_grad(): logits = self.model.forward(**batch)[0] preds = self.model.formatted_preds( logits=[logits], samples=batch_samples, tokenizer=self.processor.tokenizer, return_class_probs=self.return_class_probs, **batch) preds_all += preds return preds_all
def _run_inference(self, dataset, tensor_names, baskets, rest_api_schema=False):
    samples = [s for b in baskets for s in b.samples]

    data_loader = NamedDataLoader(
        dataset=dataset, sampler=SequentialSampler(dataset), batch_size=self.batch_size, tensor_names=tensor_names
    )
    logits_all = []
    preds_all = []
    aggregate_preds = hasattr(self.model.prediction_heads[0], "aggregate_preds")
    for i, batch in enumerate(tqdm(data_loader, desc="Inferencing")):
        batch = {key: batch[key].to(self.device) for key in batch}
        if not aggregate_preds:
            batch_samples = samples[i * self.batch_size : (i + 1) * self.batch_size]
        with torch.no_grad():
            logits = self.model.forward(**batch)[0]
            if not aggregate_preds:
                preds = self.model.formatted_preds(
                    logits=[logits],
                    samples=batch_samples,
                    tokenizer=self.processor.tokenizer,
                    return_class_probs=self.return_class_probs,
                    rest_api_schema=rest_api_schema,
                    **batch)
                preds_all += preds
            else:
                logits_all += [l for l in logits]

    if aggregate_preds:
        # can assume that we have only complete docs i.e. all the samples of one doc are in the current chunk
        # TODO is there a better way than having to wrap logits all in list?
        # TODO can QA formatted preds deal with samples?
        preds_all = self.model.formatted_preds(logits=[logits_all],
                                               baskets=baskets,
                                               rest_api_schema=rest_api_schema)[0]
    return preds_all
def extract_vectors(self, dicts, extraction_strategy="pooled"): """ Converts a text into vector(s) using the language model only (no prediction head involved). :param dicts: Samples to run inference on provided as a list of dicts. One dict per sample. :type dicts: [dict] :param extraction_strategy: Strategy to extract vectors. Choices: 'pooled' (sentence vector), 'per_token' (individual token vectors) :type extraction_strategy: str :return: dict of predictions """ dataset, tensor_names = self.processor.dataset_from_dicts(dicts) samples = [] for dict in dicts: samples.extend(self.processor._dict_to_samples(dict)) data_loader = NamedDataLoader( dataset=dataset, sampler=SequentialSampler(dataset), batch_size=self.batch_size, tensor_names=tensor_names, ) preds_all = [] for i,batch in enumerate(data_loader): batch = {key: batch[key].to(self.device) for key in batch} batch_samples = samples[i*self.batch_size:(i+1)*self.batch_size] with torch.no_grad(): preds = self.model.language_model.formatted_preds( extraction_strategy=extraction_strategy, samples=batch_samples, tokenizer=self.processor.tokenizer, **batch ) preds_all += preds return preds_all
def get_data_loader(self, dataset_name): """ Returns a new instance of dataloader for the given dataset. The dataloader lazily yields from Iterable DataSets. After a complete iteration over the input data, the generators gets exhausted. So, for instance, in the case of model training, a new train dataloader must be used for each train epoch. :param dataset_name: 'train', 'dev', or 'test' set. :type dataset_name: str """ filename = None if dataset_name == "train": filename = self.processor.train_filename elif dataset_name == "dev": if self.processor.dev_split > 0.0: raise NotImplemented( "StreamingDataSilo does not have dev_split implemented. " "To use dev data, supply a dev filename when creating the Processor." ) elif self.processor.dev_filename: filename = self.processor.dev_filename elif dataset_name == "test": if self.processor.test_filename: filename = self.processor.test_filename if not filename: return None # Batching: # # The model Trainer is passed a PyTorch DataLoader instance that yields dataset batches for training. # # By default, the PyTorch DataLoader prefetch (2 * num_workers) samples. However, given the higher # batch sizes(usually >64) for model training, the default prefetch is not sufficient to keep the # model Training saturated with datasets. # # As a workaround, we yield batches of samples instead of yielding individual samples. The DataLoader # can then prefetch (2 * num_workers) number of batches of samples. # # Since the batching is now handled within _StreamingDataSet, we disable the batching on DataLoader side # by initializing the data loader with batch_size as 1. if isinstance(filename, Path) and filename.is_dir(): filepath = filename else: filepath = self.processor.data_dir / filename data_set = _StreamingDataSet( processor=self.processor, filepath=filepath, batch_size=self.batch_size, dataloader_workers=self.dataloader_workers, distributed = self.distributed ) data_loader = NamedDataLoader( dataset=data_set, batch_size=1, num_workers=self.dataloader_workers, pin_memory=True ) return data_loader
def _get_predictions_and_aggregate(self, dataset, tensor_names, baskets):
    """
    Feed a preprocessed dataset to the model and get the actual predictions (forward pass + logits_to_preds + formatted_preds).

    Difference to _get_predictions():
     - Additional aggregation step across predictions of individual samples
       (e.g. for QA on long texts, we extract answers from multiple passages and then aggregate them on the "document level")

    :param dataset: PyTorch Dataset with samples you want to predict
    :param tensor_names: Names of the tensors in the dataset
    :param baskets: For each item in the dataset, we need additional information to create formatted preds.
                    Baskets contain all relevant infos for that.
                    Example: QA - input string to convert the predicted answer from indices back to string space
    :return: list of predictions
    """
    data_loader = NamedDataLoader(
        dataset=dataset, sampler=SequentialSampler(dataset), batch_size=self.batch_size, tensor_names=tensor_names
    )
    # TODO Sometimes this is the preds of one head, sometimes of two. We need a more advanced stacking operation
    # TODO so that preds of the right shape are passed in to formatted_preds
    unaggregated_preds_all = []

    for i, batch in enumerate(
            tqdm(data_loader, desc="Inferencing Samples", unit=" Batches", disable=self.disable_tqdm)):
        batch = {key: batch[key].to(self.device) for key in batch}

        # get logits
        with torch.no_grad():
            # Aggregation works on preds, not logits. We want as much processing happening in one batch + on GPU
            # So we transform logits to preds here as well
            logits = self.model.forward(**batch)
            # preds = self.model.logits_to_preds(logits, **batch)[0] (This must somehow be useful for SQuAD)
            preds = self.model.logits_to_preds(logits, **batch)
            unaggregated_preds_all.append(preds)

    # In some use cases we want to aggregate the individual predictions.
    # This is mostly useful, if the input text is longer than the max_seq_len that the model can process.
    # In QA we can use this to get answers from long input texts by first getting predictions for smaller passages
    # and then aggregating them here.

    # At this point unaggregated preds has shape [n_batches][n_heads][n_samples]

    # can assume that we have only complete docs i.e. all the samples of one doc are in the current chunk
    logits = [None]
    preds_all = self.model.formatted_preds(
        logits=logits,  # For QA we collected preds per batch and do not want to pass logits
        preds_p=unaggregated_preds_all,
        baskets=baskets)
    return preds_all
def _get_predictions(self, dicts): """ Feed a preprocessed dataset to the model and get the actual predictions (forward pass + formatting). :param dicts: list of dictionaries examples:[{'query': "where is florida?"}, {'query': "who wrote lord of the rings?"}, ...] [{'passages': [{ "title": 'Big Little Lies (TV series)', "text": 'series garnered several accolades. It received..', "label": 'positive', "external_id": '18768923'}, {"title": 'Framlingham Castle', "text": 'Castle on the Hill "Castle on the Hill" is a song by English..', "label": 'positive', "external_id": '19930582'}, ...] :return: dictionary of embeddings for "passages" and "query" """ dataset, tensor_names, _, baskets = self.processor.dataset_from_dicts( dicts, indices=[i for i in range(len(dicts))], return_baskets=True) data_loader = NamedDataLoader(dataset=dataset, sampler=SequentialSampler(dataset), batch_size=self.batch_size, tensor_names=tensor_names) all_embeddings = {"query": [], "passages": []} self.model.eval() # When running evaluations etc., we don't want a progress bar for every single query if len(dataset) == 1: disable_tqdm = True else: disable_tqdm = not self.progress_bar for i, batch in enumerate( tqdm(data_loader, desc=f"Creating Embeddings", unit=" Batches", disable=disable_tqdm)): batch = {key: batch[key].to(self.device) for key in batch} # get logits with torch.no_grad(): query_embeddings, passage_embeddings = self.model.forward( **batch)[0] if query_embeddings is not None: all_embeddings["query"].append( query_embeddings.cpu().numpy()) if passage_embeddings is not None: all_embeddings["passages"].append( passage_embeddings.cpu().numpy()) if all_embeddings["passages"]: all_embeddings["passages"] = np.concatenate( all_embeddings["passages"]) if all_embeddings["query"]: all_embeddings["query"] = np.concatenate(all_embeddings["query"]) return all_embeddings
def _get_predictions(self, dataset, tensor_names, baskets, rest_api_schema=False): """ Feed the preprocessed dataset to the model and get the actual predictions""" samples = [s for b in baskets for s in b.samples] data_loader = NamedDataLoader(dataset=dataset, sampler=SequentialSampler(dataset), batch_size=self.batch_size, tensor_names=tensor_names) unaggregated_preds_all = [] preds_all = [] aggregate_preds = hasattr(self.model.prediction_heads[0], "aggregate_preds") for i, batch in enumerate(tqdm(data_loader, desc=f"Inferencing")): batch = {key: batch[key].to(self.device) for key in batch} if not aggregate_preds: batch_samples = samples[i * self.batch_size:(i + 1) * self.batch_size] # get logits with torch.no_grad(): if aggregate_preds: # Aggregation works on preds, not logits. We want as much processing happening in one batch + on GPU # So we transform logits to preds here as well logits = self.model.forward(**batch) preds = self.model.logits_to_preds(logits, **batch)[0] unaggregated_preds_all += preds else: logits = self.model.forward(**batch)[0] preds = self.model.formatted_preds( logits=[logits], samples=batch_samples, tokenizer=self.processor.tokenizer, return_class_probs=self.return_class_probs, rest_api_schema=rest_api_schema, **batch) preds_all += preds # In some use cases we want to aggregate the individual predictions. # This is mostly useful, if the input text is longer than the max_seq_len that the model can process. # In QA we can use this to get answers from long input texts by first getting predictions for smaller passages # and then aggregating them here. if aggregate_preds: # can assume that we have only complete docs i.e. all the samples of one doc are in the current chunk preds_all = self.model.formatted_preds( logits=[ None ], # For QA we collected preds per batch and do not want to pass logits preds_p=unaggregated_preds_all, baskets=baskets, rest_api_schema=rest_api_schema)[0] return preds_all
def _initialize_data_loaders(self): """ Initializing train, dev and test data loaders for the already loaded datasets """ if self.data["train"] is not None: if self.distributed: sampler_train = DistributedSampler(self.data["train"]) else: sampler_train = RandomSampler(self.data["train"]) data_loader_train = NamedDataLoader( dataset=self.data["train"], sampler=sampler_train, batch_size=self.batch_size, tensor_names=self.tensor_names, ) else: data_loader_train = None if self.data["dev"] is not None: data_loader_dev = NamedDataLoader( dataset=self.data["dev"], sampler=SequentialSampler(self.data["dev"]), batch_size=self.eval_batch_size, tensor_names=self.tensor_names, ) else: data_loader_dev = None if self.data["test"] is not None: data_loader_test = NamedDataLoader( dataset=self.data["test"], sampler=SequentialSampler(self.data["test"]), batch_size=self.eval_batch_size, tensor_names=self.tensor_names, ) else: data_loader_test = None self.loaders = { "train": data_loader_train, "dev": data_loader_dev, "test": data_loader_test, }
def _create_dataloader(self, text_to_encode: List[dict]) -> NamedDataLoader:
    dataset, tensor_names, _ = self.processor.dataset_from_dicts(
        text_to_encode, indices=[i for i in range(len(text_to_encode))]
    )
    dataloader = NamedDataLoader(
        dataset=dataset,
        sampler=SequentialSampler(dataset),
        batch_size=32,
        tensor_names=tensor_names,
    )
    return dataloader
def _get_predictions(self, dataset, tensor_names, baskets, rest_api_schema=False): """ Feed the preprocessed dataset to the model and get the actual predictions""" samples = [s for b in baskets for s in b.samples] data_loader = NamedDataLoader(dataset=dataset, sampler=SequentialSampler(dataset), batch_size=self.batch_size, tensor_names=tensor_names) logits_all = [] preds_all = [] aggregate_preds = hasattr(self.model.prediction_heads[0], "aggregate_preds") for i, batch in enumerate(tqdm(data_loader, desc=f"Inferencing")): batch = {key: batch[key].to(self.device) for key in batch} if not aggregate_preds: batch_samples = samples[i * self.batch_size:(i + 1) * self.batch_size] # get logits with torch.no_grad(): logits = self.model.forward(**batch)[0] # either just stack the logits (and convert later to readable predictions) if aggregate_preds: logits_all += [l for l in logits] # or convert directly else: preds = self.model.formatted_preds( logits=[logits], samples=batch_samples, tokenizer=self.processor.tokenizer, return_class_probs=self.return_class_probs, rest_api_schema=rest_api_schema, **batch) preds_all += preds # In some use cases we want to aggregate the individual predictions. # This is mostly useful, if the input text is longer than the max_seq_len that the model can process. # In QA we can use this to get answers from long input texts by first getting predictions for smaller passages # and then aggregating them here. if aggregate_preds: # can assume that we have only complete docs i.e. all the samples of one doc are in the current chunk # TODO is there a better way than having to wrap logits all in list? # TODO can QA formatted preds deal with samples? preds_all = self.model.formatted_preds( logits=[logits_all], baskets=baskets, rest_api_schema=rest_api_schema)[0] return preds_all
def _get_predictions(self, dataset, tensor_names, baskets, rest_api_schema=False, disable_tqdm=False):
    """
    Feed a preprocessed dataset to the model and get the actual predictions (forward pass + formatting).

    :param dataset: PyTorch Dataset with samples you want to predict
    :param tensor_names: Names of the tensors in the dataset
    :param baskets: For each item in the dataset, we need additional information to create formatted preds.
                    Baskets contain all relevant infos for that.
                    Example: QA - input string to convert the predicted answer from indices back to string space
    :param rest_api_schema: Whether input dicts use the format that complies with the FARM REST API.
                            Currently only used for QA to switch from squad to a more useful format in production.
                            While input is almost the same, output contains additional meta data (offset, context..)
    :type rest_api_schema: bool
    :param disable_tqdm: Whether to disable tqdm logging (can get very verbose in multiprocessing)
    :type disable_tqdm: bool
    :return: list of predictions
    """
    samples = [s for b in baskets for s in b.samples]

    data_loader = NamedDataLoader(
        dataset=dataset, sampler=SequentialSampler(dataset), batch_size=self.batch_size, tensor_names=tensor_names
    )
    preds_all = []
    for i, batch in enumerate(
            tqdm(data_loader, desc="Inferencing Samples", unit=" Batches", disable=disable_tqdm)):
        batch = {key: batch[key].to(self.device) for key in batch}
        batch_samples = samples[i * self.batch_size : (i + 1) * self.batch_size]

        # get logits
        with torch.no_grad():
            logits = self.model.forward(**batch)[0]
            preds = self.model.formatted_preds(
                logits=[logits],
                samples=batch_samples,
                tokenizer=self.processor.tokenizer,
                rest_api_schema=rest_api_schema,
                return_class_probs=self.return_class_probs,
                **batch)
            preds_all += preds
    return preds_all
def _get_predictions(self, dataset, tensor_names, baskets):
    samples = [s for b in baskets for s in b.samples]

    data_loader = NamedDataLoader(
        dataset=dataset, sampler=SequentialSampler(dataset), batch_size=self.batch_size, tensor_names=tensor_names
    )
    preds_all = []
    for i, batch in enumerate(
            tqdm(data_loader, desc="Inferencing Samples", unit=" Batches", disable=self.disable_tqdm)):
        batch = {key: batch[key].to(self.device) for key in batch}
        batch_samples = samples[i * self.batch_size : (i + 1) * self.batch_size]

        # Two fundamental differences from the original:
        # 1) Use all logits for all heads instead of taking only the first one
        # 2) Pass logits as the list it already is, not wrapped inside another list
        with torch.no_grad():
            logits = self.model.forward(**batch)
            preds = self.model.formatted_preds(
                logits=logits,
                samples=batch_samples,
                tokenizer=self.processor.tokenizer,
                return_class_probs=self.return_class_probs,
                **batch,
            )

        if self.level == "token":
            if i == 0:
                preds_all = preds
            else:
                for task_dict in preds:
                    preds_all_dict_id = [
                        idx for idx, dic in enumerate(preds_all) if dic["task"] == task_dict["task"]
                    ]
                    if len(preds_all_dict_id) != 1:
                        raise AttributeError("Each task type must be present exactly once.")
                    idx = preds_all_dict_id[0]
                    preds_all[idx]["predictions"] += task_dict["predictions"]
        else:
            preds_all += preds

    if self.level == "token":
        preds_all = format_multitask_preds(preds_all)

    return preds_all
def _get_predictions(self, dicts, tokenizer): """ Feed a preprocessed dataset to the model and get the actual predictions (forward pass + formatting). :param dicts: list of dictionaries examples:[{'query': "where is florida?"}, {'query': "who wrote lord of the rings?"}, ...] [{'passages': [{ "title": 'Big Little Lies (TV series)', "text": 'series garnered several accolades. It received..', "label": 'positive', "external_id": '18768923'}, {"title": 'Framlingham Castle', "text": 'Castle on the Hill "Castle on the Hill" is a song by English..', "label": 'positive', "external_id": '19930582'}, ...] :return: dictionary of embeddings for "passages" and "query" """ dataset, tensor_names, baskets = self.processor.dataset_from_dicts( dicts, indices=[i for i in range(len(dicts))], return_baskets=True) data_loader = NamedDataLoader(dataset=dataset, sampler=SequentialSampler(dataset), batch_size=self.batch_size, tensor_names=tensor_names) all_embeddings = { "query": torch.tensor([]).to(self.device), "passages": torch.tensor([]).to(self.device) } self.model.eval() for i, batch in enumerate( tqdm(data_loader, desc=f"Inferencing Samples", unit=" Batches", disable=False)): batch = {key: batch[key].to(self.device) for key in batch} # get logits with torch.no_grad(): query_embeddings, passage_embeddings = self.model.forward( **batch)[0] all_embeddings["query"] = torch.cat((all_embeddings["query"], query_embeddings), dim=0) \ if isinstance(query_embeddings, torch.Tensor) else None all_embeddings["passages"] = torch.cat((all_embeddings["passages"], passage_embeddings), dim=0) \ if isinstance(passage_embeddings, torch.Tensor) else None # convert embeddings to numpy array for k, v in all_embeddings.items(): all_embeddings[k] = v.cpu().numpy() if v != None else None return all_embeddings
def extract_vectors(self, dicts, extraction_strategy="cls_token", extraction_layer=-1): """ Converts a text into vector(s) using the language model only (no prediction head involved). Example: basic_texts = [{"text": "Some text we want to embed"}, {"text": "And a second one"}] result = inferencer.extract_vectors(dicts=basic_texts) :param dicts: Samples to run inference on provided as a list of dicts. One dict per sample. :type dicts: [dict] :param extraction_strategy: Strategy to extract vectors. Choices: 'cls_token' (sentence vector), 'reduce_mean' (sentence vector), reduce_max (sentence vector), 'per_token' (individual token vectors) :type extraction_strategy: str :param extraction_layer: number of layer from which the embeddings shall be extracted. Default: -1 (very last layer). :type: int :return: dict of predictions """ dataset, tensor_names = self.processor.dataset_from_dicts( dicts, rest_api_schema=True) samples = [] for dict in dicts: samples.extend(self.processor._dict_to_samples(dict)) data_loader = NamedDataLoader(dataset=dataset, sampler=SequentialSampler(dataset), batch_size=self.batch_size, tensor_names=tensor_names) preds_all = [] for i, batch in enumerate(data_loader): batch = {key: batch[key].to(self.device) for key in batch} batch_samples = samples[i * self.batch_size:(i + 1) * self.batch_size] with torch.no_grad(): preds = self.model.language_model.formatted_preds( extraction_strategy=extraction_strategy, samples=batch_samples, tokenizer=self.processor.tokenizer, extraction_layer=extraction_layer, **batch, ) preds_all += preds return preds_all
def predict(self, dicts): """ This function is a simple modification of the MLMInferencer/Inferencer's run_inference method (located at farm/infer.py) except that it uses a custom processor which does not mask the input (which is already masked when running prediction). :param dicts: Masked samples to run prediction on provided as a list of dicts. One dict per sample. :type dicst: [dict] :return: dict of predictions """ pred_processor = CharMLMPredProcessor( tokenizer=self.processor.tokenizer, max_seq_len=self.processor.max_seq_len, data_dir=self.processor.data_dir, ) dataset, tensor_names = pred_processor.dataset_from_dicts(dicts) samples = [] for dict in dicts: samples.extend(pred_processor._dict_to_samples(dict, dicts)) data_loader = NamedDataLoader( dataset=dataset, sampler=SequentialSampler(dataset), batch_size=self.batch_size, tensor_names=tensor_names, ) preds_all = [] for i, batch in enumerate(data_loader): batch = {key: batch[key].to(self.device) for key in batch} batch_samples = samples[i * self.batch_size:(i + 1) * self.batch_size] with torch.no_grad(): logits = self.model.forward(**batch) preds = self.model.formatted_preds( logits=logits, label_maps=pred_processor.label_maps, samples=batch_samples, tokenizer=pred_processor.tokenizer, **batch, ) preds_all.append(preds) # flatten list preds_all = [ p for outer_list in preds_all for pred_dict in outer_list for p in pred_dict ] return preds_all
def _run_inference_qa(self, concatdataset, tensor_names, samples):
    data_loader = NamedDataLoader(
        dataset=concatdataset,
        sampler=SequentialSampler(concatdataset),
        batch_size=self.batch_size,
        tensor_names=tensor_names,
    )

    all_preds = []
    for batch in tqdm(data_loader, desc="Inferencing"):
        batch = {key: batch[key].to(self.device) for key in batch}
        with torch.no_grad():
            logits = self.model.forward(**batch)
            preds = self.model.logits_to_preds(logits=logits, **batch)
            all_preds += preds

    preds_all = self.model.prediction_heads[0].formatted_preds(logits=None, preds=all_preds, samples=samples)
    return [preds_all]
def run_inference(self, dicts): """ Runs down-stream inference using the prediction head. :param dicts: Samples to run inference on provided as a list of dicts. One dict per sample. :type dicst: [dict] :return: dict of predictions """ if self.prediction_type == "embedder": raise TypeError( "You have called run_inference for a model without any prediction head! " "If you want to: " "a) ... extract vectors from the language model: call `Inferencer.extract_vectors(...)`" f"b) ... run inference on a downstream task: make sure your model path {self.name} contains a saved prediction head" ) dataset, tensor_names = self.processor.dataset_from_dicts(dicts, from_inference=True) samples = [] for dict in dicts: samples.extend(self.processor._dict_to_samples(dict)) data_loader = NamedDataLoader( dataset=dataset, sampler=SequentialSampler(dataset), batch_size=self.batch_size, tensor_names=tensor_names, ) preds_all = [] for i, batch in enumerate(data_loader): batch = {key: batch[key].to(self.device) for key in batch} batch_samples = samples[i * self.batch_size : (i + 1) * self.batch_size] with torch.no_grad(): logits = self.model.forward(**batch) preds = self.model.formatted_preds( logits=logits, samples=batch_samples, tokenizer=self.processor.tokenizer, **batch, ) preds_all += preds return preds_all
def get_data_loader(self, dataset_name): """ Returns a new instance of dataloader for the given dataset. The dataloader lazily yields from Iterable DataSets. After a complete iteration over the input data, the generators gets exhausted. So, for instance, in the case of model training, a new train dataloader must be used for each train epoch. :param dataset_name: 'train', 'dev', or 'test' set. :type dataset_name: str """ if dataset_name == "train": filename = self.processor.train_filename # Batching: # # The model Trainer is passed a PyTorch DataLoader instance that yields dataset batches for training. # # By default, the PyTorch DataLoader prefetch (2 * num_workers) samples. However, given the higher # batch sizes(usually >64) for model training, the default prefetch is not sufficient to keep the # model Training saturated with datasets. # # As a workaround, we yield batches of samples instead of yielding individual samples. The DataLoader # can then prefetch (2 * num_workers) number of batches of samples. # # Since the batching is now handled within _StreamingDataSet, we disable the batching on DataLoader side # by initializing the data loader with batch_size as 1. data_set = _StreamingDataSet( processor=self.processor, filepath=self.processor.data_dir / filename, batch_size=self.batch_size, dataloader_workers=self.max_processes, ) data_loader = NamedDataLoader(dataset=data_set, batch_size=1, num_workers=self.max_processes, pin_memory=True) return data_loader else: return self.loaders[dataset_name]
def _run_inference(self, dataset, tensor_names, samples):
    data_loader = NamedDataLoader(
        dataset=dataset,
        sampler=SequentialSampler(dataset),
        batch_size=self.batch_size,
        tensor_names=tensor_names,
    )

    preds_all = []
    for i, batch in enumerate(data_loader):
        batch = {key: batch[key].to(self.device) for key in batch}
        batch_samples = samples[i * self.batch_size : (i + 1) * self.batch_size]
        with torch.no_grad():
            logits = self.model.forward(**batch)
            preds = self.model.formatted_preds(
                logits=logits,
                samples=batch_samples,  # TODO batch_samples and logits are not aligned
                tokenizer=self.processor.tokenizer,
                return_class_probs=self.return_class_probs,
                **batch,
            )
            preds_all += preds
    return preds_all
def eval(
    self,
    document_store: BaseDocumentStore,
    device: str,
    label_index: str = "label",
    doc_index: str = "eval_document",
    label_origin: str = "gold_label",
):
    """
    Performs evaluation on evaluation documents in the DocumentStore.
    Returns a dict containing the following metrics:
        - "EM": Proportion of exact matches of predicted answers with their corresponding correct answers
        - "f1": Average overlap between predicted answers and their corresponding correct answers
        - "top_n_accuracy": Proportion of predicted answers that overlap with correct answer

    :param document_store: DocumentStore containing the evaluation documents
    :param device: The device on which the tensors should be processed. Choose from "cpu" and "cuda".
    :param label_index: Index/Table name where labeled questions are stored
    :param doc_index: Index/Table name where documents that are used for evaluation are stored
    """
    if self.top_k_per_candidate != 4:
        logger.info(
            f"Performing Evaluation using top_k_per_candidate = {self.top_k_per_candidate} \n"
            f"and consequently, QuestionAnsweringPredictionHead.n_best = {self.top_k_per_candidate + 1}. \n"
            f"This deviates from FARM's default where QuestionAnsweringPredictionHead.n_best = 5"
        )

    # extract all questions for evaluation
    filters = {"origin": [label_origin]}
    labels = document_store.get_all_labels(index=label_index, filters=filters)

    # Aggregate all answer labels per question
    aggregated_per_doc = defaultdict(list)
    for label in labels:
        if not label.document_id:
            logger.error("Label does not contain a document_id")
            continue
        aggregated_per_doc[label.document_id].append(label)

    # Create squad style dicts
    d: Dict[str, Any] = {}
    all_doc_ids = [x.id for x in document_store.get_all_documents(doc_index)]
    for doc_id in all_doc_ids:
        doc = document_store.get_document_by_id(doc_id, index=doc_index)
        if not doc:
            logger.error(f"Document with the ID '{doc_id}' is not present in the document store.")
            continue
        d[str(doc_id)] = {"context": doc.text}
        # get all questions / answers
        aggregated_per_question: Dict[str, Any] = defaultdict(list)
        if doc_id in aggregated_per_doc:
            for label in aggregated_per_doc[doc_id]:
                # add to existing answers
                if label.question in aggregated_per_question.keys():
                    # Hack to fix problem where duplicate questions are merged by doc_store processing,
                    # creating a QA example with 8 annotations > 6 annotation max
                    if len(aggregated_per_question[label.question]["answers"]) >= 6:
                        continue
                    aggregated_per_question[label.question]["answers"].append({
                        "text": label.answer,
                        "answer_start": label.offset_start_in_doc,
                    })
                # create new one
                else:
                    aggregated_per_question[label.question] = {
                        "id": str(hash(str(doc_id) + label.question)),
                        "question": label.question,
                        "answers": [{
                            "text": label.answer,
                            "answer_start": label.offset_start_in_doc,
                        }],
                    }
        # Get rid of the question key again (after we aggregated we don't need it anymore)
        d[str(doc_id)]["qas"] = [v for v in aggregated_per_question.values()]

    # Convert input format for FARM
    farm_input = [v for v in d.values()]
    n_queries = len([y for x in farm_input for y in x["qas"]])

    # Create DataLoader that can be passed to the Evaluator
    tic = perf_counter()
    indices = range(len(farm_input))
    dataset, tensor_names = self.inferencer.processor.dataset_from_dicts(farm_input, indices=indices)
    data_loader = NamedDataLoader(
        dataset=dataset, batch_size=self.inferencer.batch_size, tensor_names=tensor_names
    )

    evaluator = Evaluator(
        data_loader=data_loader, tasks=self.inferencer.processor.tasks, device=device
    )

    eval_results = evaluator.eval(self.inferencer.model)
    toc = perf_counter()
    reader_time = toc - tic
    results = {
        "EM": eval_results[0]["EM"],
        "f1": eval_results[0]["f1"],
        "top_n_accuracy": eval_results[0]["top_n_accuracy"],
        "top_n": self.inferencer.model.prediction_heads[0].n_best,
        "reader_time": reader_time,
        "seconds_per_query": reader_time / n_queries,
    }
    return results
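# Usage sketch (assumption): calling the eval method above from a Haystack-style FARMReader against
# a document store that already contains the evaluation documents and gold labels. The import path,
# model name, and the pre-built `document_store` are illustrative assumptions, not from the snippet.
from haystack.reader.farm import FARMReader

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
metrics = reader.eval(
    document_store=document_store,  # assumed to be populated with eval docs and labels beforehand
    device="cpu",
    label_index="label",
    doc_index="eval_document",
)
print(metrics["EM"], metrics["f1"], metrics["top_n_accuracy"], metrics["reader_time"])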
def eval(
    self,
    document_store: BaseDocumentStore,
    device: str,
    label_index: str = "label",
    doc_index: str = "eval_document",
    label_origin: str = "gold_label",
):
    """
    Performs evaluation on evaluation documents in the DocumentStore.
    Returns a dict containing the following metrics:
        - "EM": Proportion of exact matches of predicted answers with their corresponding correct answers
        - "f1": Average overlap between predicted answers and their corresponding correct answers
        - "top_n_accuracy": Proportion of predicted answers that match with correct answer

    :param document_store: DocumentStore containing the evaluation documents
    :param device: The device on which the tensors should be processed. Choose from "cpu" and "cuda".
    :param label_index: Index/Table name where labeled questions are stored
    :param doc_index: Index/Table name where documents that are used for evaluation are stored
    """
    # extract all questions for evaluation
    filters = {"origin": [label_origin]}
    labels = document_store.get_all_labels(index=label_index, filters=filters)

    # Aggregate all answer labels per question
    aggregated_per_doc = defaultdict(list)
    for label in labels:
        if not label.document_id:
            logger.error("Label does not contain a document_id")
            continue
        aggregated_per_doc[label.document_id].append(label)

    # Create squad style dicts
    d: Dict[str, Any] = {}
    for doc_id in aggregated_per_doc.keys():
        doc = document_store.get_document_by_id(doc_id, index=doc_index)
        if not doc:
            logger.error(f"Document with the ID '{doc_id}' is not present in the document store.")
            continue
        d[str(doc_id)] = {"context": doc.text}
        # get all questions / answers
        aggregated_per_question: Dict[str, Any] = defaultdict(list)
        for label in aggregated_per_doc[doc_id]:
            # add to existing answers
            if label.question in aggregated_per_question.keys():
                aggregated_per_question[label.question]["answers"].append({
                    "text": label.answer,
                    "answer_start": label.offset_start_in_doc,
                })
            # create new one
            else:
                aggregated_per_question[label.question] = {
                    "id": str(hash(str(doc_id) + label.question)),
                    "question": label.question,
                    "answers": [{
                        "text": label.answer,
                        "answer_start": label.offset_start_in_doc,
                    }],
                }
        # Get rid of the question key again (after we aggregated we don't need it anymore)
        d[str(doc_id)]["qas"] = [v for v in aggregated_per_question.values()]

    # Convert input format for FARM
    farm_input = [v for v in d.values()]

    # Create DataLoader that can be passed to the Evaluator
    indices = range(len(farm_input))
    dataset, tensor_names = self.inferencer.processor.dataset_from_dicts(farm_input, indices=indices)
    data_loader = NamedDataLoader(
        dataset=dataset, batch_size=self.inferencer.batch_size, tensor_names=tensor_names
    )

    evaluator = Evaluator(
        data_loader=data_loader, tasks=self.inferencer.processor.tasks, device=device
    )

    eval_results = evaluator.eval(self.inferencer.model)
    results = {
        "EM": eval_results[0]["EM"],
        "f1": eval_results[0]["f1"],
        "top_n_accuracy": eval_results[0]["top_n_accuracy"],
    }
    return results
def eval(self,
         document_store: ElasticsearchDocumentStore,
         device: str,
         label_index: str = "feedback",
         doc_index: str = "eval_document",
         label_origin: str = "gold_label"):
    """
    Performs evaluation on evaluation documents in the Elasticsearch DocumentStore.
    Returns a dict containing the following metrics:
        - "EM": Proportion of exact matches of predicted answers with their corresponding correct answers
        - "f1": Average overlap between predicted answers and their corresponding correct answers
        - "top_n_recall": Proportion of predicted answers that overlap with correct answer

    :param document_store: The ElasticsearchDocumentStore containing the evaluation documents
    :type document_store: ElasticsearchDocumentStore
    :param device: The device on which the tensors should be processed. Choose from "cpu" and "cuda".
    :type device: str
    :param label_index: Elasticsearch index where labeled questions are stored
    :type label_index: str
    :param doc_index: Elasticsearch index where documents that are used for evaluation are stored
    :type doc_index: str
    """
    # extract all questions for evaluation
    filters = {"origin": label_origin}
    questions = document_store.get_all_documents_in_index(index=label_index, filters=filters)

    # mapping from doc_id to questions
    doc_questions_dict = {}
    question_id = 0
    for question in questions:
        doc_id = question["_source"]["doc_id"]
        if doc_id not in doc_questions_dict:
            doc_questions_dict[doc_id] = [{
                "id": question_id,
                "question": question["_source"]["question"],
                "answers": question["_source"]["answers"],
                "is_impossible": False if question["_source"]["answers"] else True,
            }]
        else:
            doc_questions_dict[doc_id].append({
                "id": question_id,
                "question": question["_source"]["question"],
                "answers": question["_source"]["answers"],
                "is_impossible": False if question["_source"]["answers"] else True,
            })
        question_id += 1

    # extract eval documents and convert data back to SQuAD-like format
    documents = document_store.get_all_documents_in_index(index=doc_index)
    dicts = []
    for document in documents:
        doc_id = document["_source"]["doc_id"]
        text = document["_source"]["text"]
        questions = doc_questions_dict[doc_id]
        dicts.append({"qas": questions, "context": text})

    # Create DataLoader that can be passed to the Evaluator
    indices = range(len(dicts))
    dataset, tensor_names = self.inferencer.processor.dataset_from_dicts(dicts, indices=indices)
    data_loader = NamedDataLoader(
        dataset=dataset, batch_size=self.inferencer.batch_size, tensor_names=tensor_names
    )

    evaluator = Evaluator(
        data_loader=data_loader, tasks=self.inferencer.processor.tasks, device=device
    )

    eval_results = evaluator.eval(self.inferencer.model)
    results = {
        "EM": eval_results[0]["EM"],
        "f1": eval_results[0]["f1"],
        "top_n_recall": eval_results[0]["top_n_recall"],
    }
    return results