def test_inference(distilbert_nq, caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    model, processor = distilbert_nq
    save_dir = Path("testsave/qa_nq")
    model.save(save_dir)
    processor.save(save_dir)

    inferencer = QAInferencer.load(save_dir, batch_size=2, gpu=False, num_processes=0)
    assert inferencer is not None

    qa_format_1 = [
        {
            "questions": ["Who counted the game among the best ever made?"],
            "text": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
        }
    ]
    qa_format_2 = [
        {
            "qas": ["Who counted the game among the best ever made?"],
            "context": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created.",
        }
    ]

    result1 = inferencer.inference_from_dicts(dicts=qa_format_1)
    result2 = inferencer.inference_from_dicts(dicts=qa_format_2)
    assert result1 == result2
def convert_to_onnx(cls, model_name_or_path, opset_version: int = 11, optimize_for: Optional[str] = None):
    """
    Convert a PyTorch BERT model to ONNX format and write it to the ./onnx-export dir.
    The converted ONNX model can be loaded with the `FARMReader` by passing the export path
    as the `model_name_or_path` param.

    Usage:

        `from haystack.reader.farm import FARMReader
        FARMReader.convert_to_onnx(model_name_or_path="deepset/bert-base-cased-squad2", optimize_for="gpu_tensor_core")
        FARMReader(model_name_or_path=Path("onnx-export"))`

    :param opset_version: ONNX opset version
    :param optimize_for: Optimize the exported model for a target device. Available options are
                         "gpu_tensor_core" (GPUs with tensor cores like V100 or T4),
                         "gpu_without_tensor_core" (most other GPUs), and "cpu".
    """
    inferencer = QAInferencer.load(model_name_or_path, task_type="question_answering")
    inferencer.model.convert_to_onnx(output_path=Path("onnx-export"),
                                     opset_version=opset_version,
                                     optimize_for=optimize_for)
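# A minimal end-to-end sketch of the workflow this method enables, following the usage shown in the
# docstring above: export once, then point the reader at the export directory. The "cpu" option is one
# of the documented choices for optimize_for; the model name matches the docstring's example.
from pathlib import Path
from haystack.reader.farm import FARMReader

# One-off conversion; writes the ONNX model to ./onnx-export.
FARMReader.convert_to_onnx(model_name_or_path="deepset/bert-base-cased-squad2",
                           optimize_for="cpu")

# From then on, load the reader from the export directory instead of the original checkpoint.
onnx_reader = FARMReader(model_name_or_path=Path("onnx-export"))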
def bert_base_squad2(request): model = QAInferencer.load("deepset/bert-base-cased-squad2", task_type="question_answering", batch_size=16, num_processes=0, use_fast=request.param) return model
def bert_base_squad2(request):
    model = QAInferencer.load(
        "deepset/minilm-uncased-squad2",
        task_type="question_answering",
        batch_size=4,
        num_processes=0,
        multithreading_rust=False,
        use_fast=True  # TODO parametrize this to test slow as well
    )
    return model
def test_save_load(distilbert_squad, caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    model, processor = distilbert_squad
    save_dir = Path("testsave/qa_squad")
    model.save(save_dir)
    processor.save(save_dir)

    inferencer = QAInferencer.load(save_dir, batch_size=2, gpu=False, num_processes=0,
                                   task_type="question_answering")
    assert inferencer is not None
def test_qa_confidence():
    inferencer = QAInferencer.load("deepset/roberta-base-squad2", task_type="question_answering",
                                   batch_size=40, gpu=True)
    QA_input = [{
        "questions": ["Who counted the game among the best ever made?"],
        "text": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
    }]
    result = inferencer.inference_from_dicts(dicts=QA_input, return_json=False)[0]
    assert np.isclose(result.prediction[0].confidence, 0.990427553653717)
    assert result.prediction[0].answer == "GameTrailers"
def question_answering():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_question_answering")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 24
    n_epochs = 2
    evaluate_every = 2000
    lang_model = "roberta-base"
    do_lower_case = False  # roberta is a cased model
    train_filename = "train-v2.0.json"
    dev_filename = "dev-v2.0.json"

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    label_list = ["start_token", "end_token"]
    metric = "squad"
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        label_list=label_list,
        metric=metric,
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
        data_dir=Path("../data/squad20"),
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    #    and calculates a few descriptive statistics of our datasets
    # NOTE: In FARM, the dev set metrics differ from test set metrics in that they are calculated
    #       on a token level instead of a word level
    data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Question Answering
    prediction_head = QuestionAnsweringHead()

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={"name": "LinearWarmup", "warmup_proportion": 0.2},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("../saved_models/bert-english-qa-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    QA_input = [{
        "qas": ["Who counted the game among the best ever made?"],
        "context": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
    }]

    model = QAInferencer.load(save_dir, batch_size=40, gpu=True)
    result = model.inference_from_dicts(dicts=QA_input)[0]
    pprint.pprint(result)

    # 10. Do Inference on the whole SQuAD dataset & write the predictions file to disk
    filename = os.path.join(processor.data_dir, processor.dev_filename)
    result = model.inference_from_file(file=filename, return_json=False)
    result_squad = [x.to_squad_eval() for x in result]
    write_squad_predictions(
        predictions=result_squad,
        predictions_filename=filename,
        out_filename="predictions.json"
    )
def __init__(
    self,
    model_name_or_path: Union[str, Path],
    context_window_size: int = 150,
    batch_size: int = 50,
    use_gpu: bool = True,
    no_ans_boost: Optional[float] = None,
    top_k_per_candidate: int = 3,
    top_k_per_sample: int = 1,
    num_processes: Optional[int] = None,
    max_seq_len: int = 256,
    doc_stride: int = 128,
):
    """
    :param model_name_or_path: Directory of a saved model or the name of a public model e.g. 'bert-base-cased',
                               'deepset/bert-base-cased-squad2', 'distilbert-base-uncased-distilled-squad'.
                               See https://huggingface.co/models for full list of available models.
    :param context_window_size: The size, in characters, of the window around the answer span that is used when
                                displaying the context around the answer.
    :param batch_size: Number of samples the model receives in one batch for inference.
                       Memory consumption is much lower in inference mode. Recommendation: Increase the batch size
                       to a value so only a single batch is used.
    :param use_gpu: Whether to use GPU (if available)
    :param no_ans_boost: How much the no_answer logit is boosted/increased.
                         If set to None (default), disables returning "no answer" predictions.
                         If a negative number, there is a lower chance of "no_answer" being predicted.
                         If a positive number, there is an increased chance of "no_answer".
    :param top_k_per_candidate: How many answers to extract for each candidate doc that is coming from the retriever
                                (might be a long text). Note that this is not the number of "final answers" you will
                                receive (see `top_k` in FARMReader.predict() or Finder.get_answers() for that) and
                                that FARM includes no_answer in the sorted list of predictions.
    :param top_k_per_sample: How many answers to extract from each small text passage that the model can process at
                             once (one "candidate doc" is usually split into many smaller "passages"). You usually
                             want a very small value here, as it slows down inference and you don't gain much of
                             quality by having multiple answers from one passage. Note that this is not the number of
                             "final answers" you will receive (see `top_k` in FARMReader.predict() or
                             Finder.get_answers() for that) and that FARM includes no_answer in the sorted list of
                             predictions.
    :param num_processes: The number of processes for `multiprocessing.Pool`. Set to value of 0 to disable
                          multiprocessing. Set to None to let Inferencer determine optimum number. If you want to
                          debug the Language Model, you might need to disable multiprocessing!
    :param max_seq_len: Max sequence length of one input text for the model
    :param doc_stride: Length of striding window for splitting long texts (used if ``len(text) > max_seq_len``)
    """
    if no_ans_boost is None:
        no_ans_boost = 0
        self.return_no_answers = False
    else:
        self.return_no_answers = True
    self.top_k_per_candidate = top_k_per_candidate
    self.inferencer = QAInferencer.load(model_name_or_path, batch_size=batch_size, gpu=use_gpu,
                                        task_type="question_answering", max_seq_len=max_seq_len,
                                        doc_stride=doc_stride, num_processes=num_processes)
    self.inferencer.model.prediction_heads[0].context_window_size = context_window_size
    self.inferencer.model.prediction_heads[0].no_ans_boost = no_ans_boost
    self.inferencer.model.prediction_heads[0].n_best = top_k_per_candidate + 1  # including possible no_answer
    try:
        self.inferencer.model.prediction_heads[0].n_best_per_sample = top_k_per_sample
    except Exception:
        logger.warning("Could not set `top_k_per_sample` in FARM. Please update FARM version.")
    self.max_seq_len = max_seq_len
    self.use_gpu = use_gpu
def question_answering():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_natural_questions")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 24
    n_epochs = 1
    evaluate_every = 500
    lang_model = "deepset/roberta-base-squad2"  # start with a model that can already extract answers
    do_lower_case = False  # roberta is a cased model
    train_filename = "train_medium.jsonl"
    dev_filename = "dev_medium.jsonl"
    keep_is_impossible = 0.15  # downsample negative examples after data conversion
    downsample_context_size = 300  # reduce length of wikipedia articles to relevant part around the answer

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case
    )
    # Add HTML tag tokens to the tokenizer vocabulary, so they do not get split apart
    html_tags = [
        "<Th>", "</Th>",
        "<Td>", "</Td>",
        "<Tr>", "</Tr>",
        "<Li>", "</Li>",
        "<P>", "</P>",
        "<Ul>", "</Ul>",
        "<H1>", "</H1>",
        "<H2>", "</H2>",
        "<H3>", "</H3>",
        "<H4>", "</H4>",
        "<H5>", "</H5>",
        "<Td_colspan=",
    ]
    tokenizer.add_tokens(html_tags)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = NaturalQuestionsProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        train_filename=train_filename,
        dev_filename=dev_filename,
        keep_no_answer=keep_is_impossible,
        downsample_context_size=downsample_context_size,
        data_dir=Path("../data/natural_questions"),
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    #    and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size, caching=True)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model, n_added_tokens=len(html_tags))
    # b) and in case of Natural Questions we need two Prediction Heads:
    #    one for extractive Question Answering
    qa_head = QuestionAnsweringHead()
    #    another one for answering yes/no questions or deciding if the given text passage might contain an answer
    classification_head = TextClassificationHead(
        num_labels=len(processor.answer_type_list)  # answer_type_list = ["is_impossible", "span", "yes", "no"]
    )

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[qa_head, classification_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={"name": "LinearWarmup", "warmup_proportion": 0.2},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("../saved_models/roberta-base-squad2-nq")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Since training on the whole NQ corpus requires substantial compute resources, we trained a model and uploaded it to s3
    fetch_archive_from_http("https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/models/roberta-base-squad2-nq.zip",
                            output_dir="../saved_models/farm")

    QA_input = [
        {
            "qas": ["Did GameTrailers rate Twilight Princess as one of the best games ever created?"],
            "context": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
        }
    ]

    model = QAInferencer.load(model_name_or_path="../saved_models/farm/roberta-base-squad2-nq",
                              batch_size=batch_size, gpu=True)
    result = model.inference_from_dicts(dicts=QA_input, return_json=False)  # result is a list of QAPred objects

    print(f"\nQuestion: Did GameTrailers rate Twilight Princess as one of the best games ever created?"
          f"\nAnswer from model: {result[0].prediction[0].answer}")
    model.close_multiprocessing_pool()
def __init__(self,
             model_name_or_path: Union[str, Path],
             model_version: Optional[str] = None,
             context_window_size: int = 150,
             batch_size: int = 50,
             use_gpu: bool = True,
             no_ans_boost: float = 0.0,
             return_no_answer: bool = False,
             top_k: int = 10,
             top_k_per_candidate: int = 3,
             top_k_per_sample: int = 1,
             num_processes: Optional[int] = None,
             max_seq_len: int = 256,
             doc_stride: int = 128,
             progress_bar: bool = True,
             duplicate_filtering: int = 0,
             use_confidence_scores: bool = True):
    """
    :param model_name_or_path: Directory of a saved model or the name of a public model e.g. 'bert-base-cased',
                               'deepset/bert-base-cased-squad2', 'distilbert-base-uncased-distilled-squad'.
                               See https://huggingface.co/models for full list of available models.
    :param model_version: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
    :param context_window_size: The size, in characters, of the window around the answer span that is used when
                                displaying the context around the answer.
    :param batch_size: Number of samples the model receives in one batch for inference.
                       Memory consumption is much lower in inference mode. Recommendation: Increase the batch size
                       to a value so only a single batch is used.
    :param use_gpu: Whether to use GPU (if available)
    :param no_ans_boost: How much the no_answer logit is boosted/increased.
                         If set to 0 (default), the no_answer logit is not changed.
                         If a negative number, there is a lower chance of "no_answer" being predicted.
                         If a positive number, there is an increased chance of "no_answer".
    :param return_no_answer: Whether to include no_answer predictions in the results.
    :param top_k: The maximum number of answers to return
    :param top_k_per_candidate: How many answers to extract for each candidate doc that is coming from the retriever
                                (might be a long text). Note that this is not the number of "final answers" you will
                                receive (see `top_k` in FARMReader.predict() or Finder.get_answers() for that) and
                                that FARM includes no_answer in the sorted list of predictions.
    :param top_k_per_sample: How many answers to extract from each small text passage that the model can process at
                             once (one "candidate doc" is usually split into many smaller "passages"). You usually
                             want a very small value here, as it slows down inference and you don't gain much of
                             quality by having multiple answers from one passage. Note that this is not the number of
                             "final answers" you will receive (see `top_k` in FARMReader.predict() or
                             Finder.get_answers() for that) and that FARM includes no_answer in the sorted list of
                             predictions.
    :param num_processes: The number of processes for `multiprocessing.Pool`. Set to value of 0 to disable
                          multiprocessing. Set to None to let Inferencer determine optimum number. If you want to
                          debug the Language Model, you might need to disable multiprocessing!
    :param max_seq_len: Max sequence length of one input text for the model
    :param doc_stride: Length of striding window for splitting long texts (used if ``len(text) > max_seq_len``)
    :param progress_bar: Whether to show a tqdm progress bar or not.
                         Can be helpful to disable in production deployments to keep the logs clean.
    :param duplicate_filtering: Answers are filtered based on their position. Both start and end position of the
                                answers are considered. The higher the value, the further apart two answers can be
                                and still be filtered out as duplicates. 0 corresponds to exact duplicates.
                                -1 turns off duplicate removal.
    :param use_confidence_scores: Whether to return confidence scores scaled to the range [0,1] for each answer
                                  instead of unscaled, raw model scores.
    """
    # save init parameters to enable export of component config as YAML
    self.set_config(
        model_name_or_path=model_name_or_path, model_version=model_version,
        context_window_size=context_window_size, batch_size=batch_size, use_gpu=use_gpu,
        no_ans_boost=no_ans_boost, return_no_answer=return_no_answer, top_k=top_k,
        top_k_per_candidate=top_k_per_candidate, top_k_per_sample=top_k_per_sample,
        num_processes=num_processes, max_seq_len=max_seq_len, doc_stride=doc_stride,
        progress_bar=progress_bar, duplicate_filtering=duplicate_filtering,
        use_confidence_scores=use_confidence_scores
    )

    self.return_no_answers = return_no_answer
    self.top_k = top_k
    self.top_k_per_candidate = top_k_per_candidate
    self.inferencer = QAInferencer.load(model_name_or_path, batch_size=batch_size, gpu=use_gpu,
                                        task_type="question_answering", max_seq_len=max_seq_len,
                                        doc_stride=doc_stride, num_processes=num_processes,
                                        revision=model_version, disable_tqdm=not progress_bar,
                                        strict=False)
    self.inferencer.model.prediction_heads[0].context_window_size = context_window_size
    self.inferencer.model.prediction_heads[0].no_ans_boost = no_ans_boost
    self.inferencer.model.prediction_heads[0].n_best = top_k_per_candidate + 1  # including possible no_answer
    try:
        self.inferencer.model.prediction_heads[0].n_best_per_sample = top_k_per_sample
    except Exception:
        logger.warning("Could not set `top_k_per_sample` in FARM. Please update FARM version.")
    try:
        self.inferencer.model.prediction_heads[0].duplicate_filtering = duplicate_filtering
    except Exception:
        logger.warning("Could not set `duplicate_filtering` in FARM. Please update FARM version.")
    self.max_seq_len = max_seq_len
    self.use_gpu = use_gpu
    self.progress_bar = progress_bar
    self.device, _ = initialize_device_settings(use_cuda=self.use_gpu)
    self.use_confidence_scores = use_confidence_scores
def question_answering_confidence():
    ##########################
    ########## Logging
    ##########################
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    # reduce verbosity from transformers library
    logging.getLogger('transformers').setLevel(logging.WARNING)

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    lang_model = "deepset/roberta-base-squad2"
    do_lower_case = False
    batch_size = 80

    data_dir = Path("../data/squad20")
    # We use the same file for dev and test set only for demo purposes
    dev_filename = "dev-v2.0.json"
    test_filename = "dev-v2.0.json"
    accuracy_at = 3  # accuracy at n is useful for answers inside long documents

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=None,
        dev_filename=dev_filename,
        test_filename=test_filename,
        data_dir=data_dir,
        doc_stride=192,
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    #    and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    # 4. Load a pre-trained question-answering model
    model = AdaptiveModel.convert_from_transformers(lang_model, device=device, task_type="question_answering")
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)
    # Number of predictions the model will make per Question.
    # The multiple predictions are used for evaluating top n recall.
    model.prediction_heads[0].n_best = accuracy_at

    # 5. The calibration of model confidence scores sets one parameter, which is called temperature and can be accessed through the prediction_head.
    #    This temperature is applied to each logit in the forward pass, where each logit is divided by the temperature.
    #    A softmax function is applied to the logits afterward to get confidence scores in the range [0,1].
    #    A temperature larger than 1 decreases the model's confidence scores.
    logger.info(f"Parameter used for temperature scaling of model confidence scores: {model.prediction_heads[0].temperature_for_confidence}")

    # 6a. We can either manually set the temperature (default value is 1.0)...
    model.prediction_heads[0].temperature_for_confidence = torch.nn.Parameter((torch.ones(1) * 1.0).to(device=device))

    # 6b. ...or we can run the evaluator on the dev set and use it to calibrate confidence scores with a technique called temperature scaling.
    #     It will align the confidence scores with the model's accuracy based on the dev set data by tuning the temperature parameter.
    #     During the calibration, this parameter is automatically set internally as an attribute of the prediction head.
    evaluator_dev = Evaluator(
        data_loader=data_silo.get_data_loader("dev"),
        tasks=data_silo.processor.tasks,
        device=device
    )
    result_dev = evaluator_dev.eval(model, return_preds_and_labels=True, calibrate_conf_scores=True)
    # evaluator_dev.log_results(result_dev, "Dev", logging=False, steps=len(data_silo.get_data_loader("dev")))

    # 7. Optionally, run the evaluator on the test set to see how well the confidence scores are aligned with the model's accuracy
    evaluator_test = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device
    )
    result_test = evaluator_test.eval(model, return_preds_and_labels=True)[0]
    logger.info("Grouping predictions by confidence score and calculating metrics for each bin.")
    em_per_bin, confidence_per_bin, count_per_bin = metrics_per_bin(result_test["preds"], result_test["labels"], num_bins=10)
    for bin_number in range(10):
        logger.info(f"Bin {bin_number} - exact match: {em_per_bin[bin_number]}, average confidence score: {confidence_per_bin[bin_number]}")

    # 8. Hooray! You have a model with calibrated confidence scores.
    #    Store the model; the temperature parameter will be stored automatically as an attribute of the prediction head.
    save_dir = Path("../saved_models/qa-confidence-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. When making a prediction with the calibrated model, we can filter out predictions where the model is not confident enough.
    #    To this end, load the stored model, which will automatically load the stored temperature parameter.
    #    The confidence scores are automatically adjusted based on this temperature parameter.
    #    For each prediction, we can check the model's confidence and decide whether to output the prediction or not.
    inferencer = QAInferencer.load(save_dir, batch_size=40, gpu=True)
    logger.info(f"Loaded model with stored temperature: {inferencer.model.prediction_heads[0].temperature_for_confidence}")

    QA_input = [
        {
            "questions": ["Who counted the game among the best ever made?"],
            "text": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
        }]
    result = inferencer.inference_from_dicts(dicts=QA_input, return_json=False)[0]
    if result.prediction[0].confidence > 0.9:
        print(result.prediction[0].answer)
    else:
        print("The confidence is not high enough to give an answer.")
from farm.infer import QAInferencer
from farm.data_handler.inputs import QAInput, Question

nlp = QAInferencer.load("deepset/roberta-base-squad2",
                        task_type="question_answering",
                        batch_size=16,
                        num_processes=0)

qa_input = QAInput(doc_text="My name is Lucas and I live on Mars.",
                   questions=Question(text="Who lives on Mars?",
                                      uid="your-id"))

res = nlp.inference_from_objects([qa_input], return_json=False)[0]

# High level attributes for your query
print(res.question)
print(res.context)
print(res.no_answer_gap)
# ...
# Attributes for individual predictions (= answers)
pred = res.prediction[0]
print(pred.answer)
print(pred.answer_type)
print(pred.answer_support)
print(pred.offset_answer_start)
print(pred.offset_answer_end)
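# The same query can also be expressed with the dict-based input format used in the tests and tutorials
# above ("questions"/"text" keys). A short sketch reusing the inferencer loaded above; with the default
# return_json=True, inference_from_dicts returns JSON-style dicts instead of QAPred objects.
QA_input = [{
    "questions": ["Who lives on Mars?"],
    "text": "My name is Lucas and I live on Mars."
}]
res_json = nlp.inference_from_dicts(dicts=QA_input)[0]
print(res_json)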