def _create_examples(self, input_data, set_type):
    """Build SquadExample objects from parsed SQuAD-style JSON.

    Walks article -> paragraph -> QA triples; for answerable questions the
    first listed answer supplies the gold span, otherwise the answer fields
    stay empty and ``is_impossible`` is set.
    """
    examples = []
    for article in tqdm(input_data):
        title = article["title"]
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                impossible = qa.get("is_impossible", False)
                if impossible:
                    answers, answer_text, start_char = [], None, None
                else:
                    answers = qa["answers"]
                    first = answers[0]
                    answer_text = first["text"]
                    start_char = first["answer_start"]
                examples.append(
                    SquadExample(
                        qas_id=qa["id"],
                        question_text=qa["question"],
                        context_text=context,
                        answer_text=answer_text,
                        start_position_character=start_char,
                        title=title,
                        is_impossible=impossible,
                        answers=answers,
                    )
                )
    return examples
def convert_to_example(question, paragraphs):
    """Wrap one question over several paragraphs into a single korquadExample.

    Each paragraph becomes an unanswered SquadExample attached to a shared
    korquadExample container; the container is returned as a one-element list.
    """
    container = korquadExample(
        qas_id=QUESTION_ID,
        question_text=question,
        answer_text=None,
        title=None,
        is_impossible=False,
    )
    for paragraph in paragraphs:
        container.add_SquadExample(
            SquadExample(
                qas_id=QUESTION_ID,
                question_text=question,
                context_text=paragraph,
                answer_text=None,
                start_position_character=None,
                title=None,
                is_impossible=False,
                answers=[],
            )
        )
    return [container]
def create_squad_examples(data_path):
    """Read a SQuAD-format JSON file and return a list of SquadExample.

    The first answer of each answerable question is used as the gold span.
    """
    with open(data_path, "r", encoding="utf-8") as f:
        input_data = json.load(f)["data"]
    examples = []
    for article in tqdm(input_data):
        title = article["title"]
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                impossible = qa.get("is_impossible", False)
                answer_text, start_char, answers = None, None, []
                if not impossible:
                    answers = qa["answers"]
                    answer_text = answers[0]["text"]
                    start_char = answers[0]["answer_start"]
                examples.append(
                    SquadExample(
                        qas_id=qa["id"],
                        question_text=qa["question"],
                        context_text=context,
                        answer_text=answer_text,
                        start_position_character=start_char,
                        title=title,
                        is_impossible=impossible,
                        answers=answers,
                    )
                )
    return examples
def convert_to_korquad_example(entry, is_training):
    """Convert one KorQuAD 2.x entry (HTML context) to korquadExample objects.

    QAs with over-long answers are dropped first, the HTML is converted to
    SQuAD-style paragraphs, and every paragraph-level SquadExample is grouped
    under one korquadExample per question id.

    Returns a list of korquadExample (empty if no QA survives filtering).
    """
    title = entry["title"]
    html = entry["context"]
    qas = entry["qas"]
    # Over-long answers only hurt BERT-family model quality, so cap them.
    # (Translated from the original Korean note.)
    modified_qas = converter.get_qas_by_len(qas)
    if len(modified_qas) == 0:
        return []
    modified_paragraphs = converter.convert_to_squad_format(html, modified_qas)
    temp_examples = {}
    for modified_paragraph in modified_paragraphs:
        context_text = modified_paragraph["context"]
        for qa in modified_paragraph["qas"]:
            qas_id = qa["id"]
            question_text = qa["question"]
            start_position_character = None
            answer_text = None
            answers = []
            is_impossible = qa.get("is_impossible", False)
            if not is_impossible:
                if is_training:
                    # Training uses a single gold span.
                    answer = qa["answers"][0]
                    answer_text = answer["text"]
                    start_position_character = answer["answer_start"]
                else:
                    # Evaluation keeps every reference answer.
                    answers = qa["answers"]
            if qas_id not in temp_examples:
                temp_examples[qas_id] = korquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    answer_text=answer_text,
                    title=title,
                    is_impossible=is_impossible,
                )
            temp_examples[qas_id].add_SquadExample(
                SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    context_text=context_text,
                    answer_text=answer_text,
                    start_position_character=start_position_character,
                    title=title,
                    is_impossible=is_impossible,
                    answers=answers,
                )
            )
    # Only the grouped containers are needed; the previous comprehension
    # unpacked (and discarded) the dict keys.
    return list(temp_examples.values())
def get(self, episode_idx, entry_idx=None):
    """
    Get a specific example from the dataset.

    Builds a SquadExample from the (episode, entry) pair and returns a
    teacher-style ``action`` dict carrying the question, labels, context
    and span metadata.
    """
    ex = self.episodes[episode_idx][entry_idx]
    is_training = self.datatype == "train"
    # Synthetic id "<episode>_<entry>" uniquely names this QA pair.
    qas_id = str(episode_idx) + "_" + str(entry_idx)
    question_text = ex['text']
    answer_text = ex['labels'][0]
    start_position_character = None
    is_impossible = answer_text == NO_ANSWER_REPLY
    # if not is_impossible:
    answers = ex['labels']
    # Eval data may carry several '|'-separated answer starts; use the first.
    if not is_training:
        start_position_character = int(ex["answer_starts"].split('|')[0])
    else:
        start_position_character = int(ex["answer_starts"])
    char_start_end = (start_position_character,
                      start_position_character + len(answer_text))
    # else:
    #     answers = [NO_ANSWER_REPLY]
    #     char_start_end = (-1, -1)
    # NOTE(review): the impossible-answer branch above is commented out, so
    # span positions are computed even when is_impossible — confirm intended.
    squad_example = SquadExample(
        qas_id=qas_id,
        question_text=question_text,
        # The episode's first turn holds the shared passage.
        context_text=self.episodes[episode_idx][0]['context'],
        answer_text=answer_text,
        start_position_character=start_position_character,
        title=ex['title'],
        is_impossible=is_impossible,
        answers=answers,
    )
    action = {
        'id': 'squad',
        'turn_id': ex['turn_id'],
        'qas_id': qas_id,
        'labels': answers,
        'context': ex['context'],
        'squad_example': squad_example,
        'single_label_text': answer_text,
        'episode_done': ex['episode_done'],
        'is_impossible': is_impossible,
        'followup': ex['followup'],
        'yesno': ex['yesno'],
        'text': question_text,
        'no_answer_reply': NO_ANSWER_REPLY,
        'background': ex['background'],
        'section_title': ex['section_title'],
        'title': ex['title'],
        'character_start_end': char_start_end
    }
    return action
def get(self, episode_idx, entry_idx=None):
    """Return one SQuAD QA pair as a teacher ``action`` dict.

    Looks up the (article, paragraph, qa) triple indexed by *episode_idx*,
    builds a SquadExample, and returns the action with all answer texts as
    labels (or ``NO_ANSWER_REPLY`` for unanswerable questions).
    """
    article_idx, paragraph_idx, qa_idx = self.examples[episode_idx]
    article = self.squad[article_idx]
    paragraph = article['paragraphs'][paragraph_idx]
    context_text = paragraph["context"]
    qa = paragraph["qas"][qa_idx]
    qas_id = qa["id"]
    question_text = qa["question"]
    start_position_character = None
    answer_text = None
    answers = []
    is_impossible = qa.get("is_impossible", False)
    if not is_impossible:
        answer = qa["answers"][0]
        answer_text = answer["text"]
        start_position_character = answer["answer_start"]
        # FIX: the comprehension previously shadowed the outer `qa`
        # (`for qa in qa["answers"]`); use a distinct loop name.
        answers = [ans['text'] for ans in qa["answers"]]
    else:
        answers = [NO_ANSWER_REPLY]
    squad_example = SquadExample(
        qas_id=qas_id,
        question_text=question_text,
        context_text=context_text,
        answer_text=answer_text,
        start_position_character=start_position_character,
        title="unknown title",
        is_impossible=is_impossible,
        answers=answers,
    )
    action = {
        'id': 'squad',
        'qas_id': qas_id,
        'context': context_text,
        'labels': answers,
        'squad_example': squad_example,
        'single_label_text': answer_text,
        'episode_done': True,
        'is_impossible': is_impossible,
        'no_answer_reply': NO_ANSWER_REPLY
    }
    return action
def _create_examples(input_data, set_type):
    """Create SquadExample objects where each QA carries its own sentence-level
    context under the ``context_sent`` key.

    In training mode the answer span offset is recomputed with ``str.find``
    against the sentence context (falling back to ``plausible_answers`` when
    ``answers`` is empty); in eval mode the reference answers are kept as-is.
    """
    is_training = set_type == "train"
    examples = []
    for entry in tqdm(input_data):
        title = entry["title"]
        for paragraph in entry["paragraphs"]:
            # context_text = paragraph["context"]
            for qa in paragraph["qas"]:
                context_text = qa["context_sent"]
                start_char = None
                answer_text = None
                answers = []
                is_impossible = qa.get("is_impossible", False)
                if is_training:
                    chosen = qa["answers"][0] if qa["answers"] else qa["plausible_answers"][0]
                    answer_text = chosen["text"]
                    # Offset must be re-derived inside the sentence context.
                    start_char = context_text.find(answer_text)
                else:
                    answers = qa["answers"]
                examples.append(
                    SquadExample(
                        qas_id=qa["id"],
                        question_text=qa["question"],
                        context_text=context_text,
                        answer_text=answer_text,
                        start_position_character=start_char,
                        title=title,
                        is_impossible=is_impossible,
                        answers=answers,
                    )
                )
    return examples
async def predict(
    self, sources: SourcesContext
) -> AsyncIterator[Tuple[Record, Any, float]]:
    """Stream records through the fine-tuned QA model, yielding each record
    with its predicted answer attached.

    Raises:
        ModelNotTrained: if no saved checkpoint exists in ``output_dir``.
    """
    # A saved checkpoint is the only evidence of a completed training run.
    if not os.path.isfile(
            os.path.join(self.parent.config.output_dir, "pytorch_model.bin")):
        raise ModelNotTrained("Train model before prediction.")
    self.model = AutoModelForQuestionAnswering.from_pretrained(
        self.parent.config.output_dir)  # , force_download=True)
    self.model.to(self.parent.config.device)
    self.tokenizer = AutoTokenizer.from_pretrained(
        self.parent.config.output_dir,
        do_lower_case=self.parent.config.do_lower_case,
    )
    async for record in sources.records():
        example = SquadExample(
            qas_id=record.key,
            question_text=record.feature("question"),
            context_text=record.feature("context"),
            answer_text=record.feature("answer_text"),
            start_position_character=record.feature("start_pos_char"),
            title=record.feature("title"),
            is_impossible=record.feature("is_impossible"),
            answers=record.feature("answers"),
        )
        # One example per record; features/dataset feed _custom_accuracy.
        features, dataset = squad_convert_examples_to_features(
            examples=[example],
            tokenizer=self.tokenizer,
            max_seq_length=self.parent.config.max_seq_length,
            doc_stride=self.parent.config.doc_stride,
            max_query_length=self.parent.config.max_query_length,
            is_training=False,
            return_dataset="pt",
        )
        prediction = await self._custom_accuracy([example], features, dataset)
        record.predicted("Answer", prediction, "Nan")
        yield record
async def _preprocess_data(self, sources: Sources):
    """Collect every record exposing the full QA feature set as SquadExample
    objects and return them as a list."""
    required = [
        "question",
        "context",
        "answer_text",
        "start_pos_char",
        "title",
        "is_impossible",
        "answers",
    ]
    examples = []
    async for record in sources.with_features(required):
        feat = record.feature
        examples.append(
            SquadExample(
                qas_id=record.key,
                question_text=feat("question"),
                context_text=feat("context"),
                answer_text=feat("answer_text"),
                start_position_character=feat("start_pos_char"),
                title=feat("title"),
                is_impossible=feat("is_impossible"),
                answers=feat("answers"),
            )
        )
    return examples
def find_answer(self, question, context, n_best_size=20, max_answer_length=30, full_sentence=False):
    """Extract the answer span for *question* from *context*.

    Returns the predicted answer string (expanded to its full sentence when
    *full_sentence* is True), or None when the model predicts no answer.

    heavily inspired by "https://github.com/huggingface/transformers/blob/v2.3.0/examples/run_squad.py#L212-L317"
    """
    example_id = '55555'
    example = SquadExample(example_id, question, context, None, None, None)
    features, dataset = squad_convert_examples_to_features(
        [example], self.tokenizer, self.max_seq_length, self.doc_stride,
        self.max_query_length, False, return_dataset='pt')
    sampler = SequentialSampler(dataset)
    dataloader = DataLoader(dataset, sampler=sampler, batch_size=1)
    all_results = []
    for batch in dataloader:
        self.model.eval()
        batch = tuple(t.to(self.device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            if self.model_type in {"xlm", "roberta", "distilbert"}:
                del inputs["token_type_ids"]
            example_index = batch[3]
            # XLNet and XLM use more arguments for their predictions
            if self.model_type in {"xlnet", "xlm"}:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
            outputs = self.model(**inputs)
        output = [o.detach().cpu().tolist() for o in outputs]
        unique_id = int(features[example_index].unique_id)
        # Some models (XLNet, XLM) use 5 arguments for their predictions,
        # while the other "simpler" models only use two.
        if len(output) >= 5:
            start_logits = output[0]
            start_top_index = output[1]
            end_logits = output[2]
            end_top_index = output[3]
            cls_logits = output[4]
            squad_result = SquadResult(
                unique_id,
                start_logits[0],
                end_logits[0],
                start_top_index=start_top_index[0],
                end_top_index=end_top_index[0],
                cls_logits=cls_logits[0],
            )
        else:
            start_logits, end_logits = output
            squad_result = SquadResult(unique_id, start_logits[0], end_logits[0])
        all_results.append(squad_result)
    # XLNet and XLM use a more complex post-processing procedure
    if self.model_type in {"xlnet", "xlm"}:
        # BUG FIX: was `hasattr(model, "config")` — `model` is undefined in
        # this scope; the probe must target self.model (DataParallel wraps
        # the config under .module).
        if hasattr(self.model, "config"):
            start_n_top = self.model.config.start_n_top
            end_n_top = self.model.config.end_n_top
        else:
            start_n_top = self.model.module.config.start_n_top
            end_n_top = self.model.module.config.end_n_top
        predictions = compute_predictions_log_probs(
            [example],
            features,
            all_results,
            n_best_size,
            max_answer_length,
            '/tmp/pred.out',
            '/tmp/nbest.out',
            '/tmp/null.out',
            start_n_top,
            end_n_top,
            self.version_2_with_negative,
            # BUG FIX: was the undefined bare name `tokenizer`.
            self.tokenizer,
            self.verbose,
        )
    else:
        predictions = compute_predictions_logits(
            [example],
            features,
            all_results,
            n_best_size,
            max_answer_length,
            self.do_lower_case,
            '/tmp/pred.out',
            '/tmp/nbest.out',
            '/tmp/null.out',
            self.verbose,
            self.version_2_with_negative,
            self.null_score_diff_threshold,
        )
    prediction = predictions[example_id]
    logger.debug(f'found prediction: "{prediction}"')
    # empty prediction indicates unknown answer
    if not prediction:
        logger.debug('empty prediction')
        return None
    if full_sentence:
        # Expand the raw span to the sentence that contains it.
        doc = self.nlp(context)
        for sent in doc.sents:
            if prediction in sent.text:
                prediction = sent.text
                break
    return prediction
def run_prediction_multi(question_texts, context_texts):
    """
    Modified from run_squad.py to only produce predicted answer given the question and context.
    This function will produce multiple answers by splitting the context into paragraphs
    Input: 1. List of questions 2. List of Context
    Output: 1. Predicted answer
    """
    # NOTE(review): `model`, `tokenizer`, `device`, `to_list`, `n_best_size`,
    # `max_answer_length`, `do_lower_case` and `null_score_diff_threshold`
    # are read from module scope — confirm they are initialised before use.
    examples = []
    # Cross-product of questions x contexts; qas_id concatenates the indices.
    # NOTE(review): str(i)+str(j) collides for >9 items ("1"+"12" == "11"+"2").
    for i, question_text in enumerate(question_texts):
        for j, context_text in enumerate(context_texts):
            example = SquadExample(
                qas_id=str(i) + str(j),
                question_text=question_text,
                context_text=context_text,
                answer_text=None,
                start_position_character=None,
                title="Predict",
                is_impossible=False,
                answers=None,
            )
            examples.append(example)
    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        return_dataset="pt",
        threads=1,
    )
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)
    all_results = []
    for batch in eval_dataloader:
        model.eval()
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            example_indices = batch[3]
            outputs = model(**inputs)
            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                output = [to_list(output[i]) for output in outputs]
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)
                all_results.append(result)
    output_prediction_file = "predictions.json"
    output_nbest_file = "nbest_predictions.json"
    output_null_log_odds_file = "null_predictions.json"
    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size,
        max_answer_length,
        do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        False,  # verbose_logging
        True,  # version_2_with_negative
        null_score_diff_threshold,
        tokenizer,
    )
    return predictions
def answer_question(self, ranked_examples):
    """Run the extractive reader over retriever-ranked passages.

    Each item of *ranked_examples* must provide 'id', 'question' and
    'document'. Returns the parsed n-best predictions JSON written by
    ``compute_predictions_logits`` / ``compute_predictions_log_probs``.
    """
    squad_examples = [SquadExample(
        qas_id=str(x['id']),
        question_text=x['question'],
        context_text=x['document'],
        answer_text=None,
        start_position_character=None,
        title='',
        answers=[],
    ) for x in ranked_examples]
    squad_features, squad_dataset = squad_convert_examples_to_features(
        examples=squad_examples,
        tokenizer=self.tokenizer,
        max_seq_length=512,
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        return_dataset="pt",
        threads=cpu_count(),
    )
    eval_batch_size = self.per_gpu_eval_batch_size * max(1, self.n_gpu)
    eval_sampler = SequentialSampler(squad_dataset)
    eval_dataloader = DataLoader(squad_dataset, sampler=eval_sampler, batch_size=eval_batch_size)
    # multi-gpu evaluate
    if self.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel):
        self.model = torch.nn.DataParallel(self.model)
    # Eval!
    logger.info("***** Running evaluation of QA *****")
    logger.info(" Num examples = %d", len(squad_dataset))
    logger.info(" Batch size = %d", eval_batch_size)
    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating reader"):
        self.model.eval()
        batch = tuple(t.to(self.device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            if self.model_type in ["xlm", "roberta", "distilbert"]:
                del inputs["token_type_ids"]
            example_indices = batch[3]
            # XLNet and XLM use more arguments for their predictions
            if self.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
            # for lang_id-sensitive xlm self.models
            if hasattr(self.model, "config") and hasattr(self.model.config, "lang2id"):
                inputs.update(
                    {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * self.lang_id).to(self.device)}
                )
            outputs = self.model(**inputs)
        for i, example_index in enumerate(example_indices):
            eval_feature = squad_features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            output = [to_list(output[i]) for output in outputs]
            # Some models (XLNet, XLM) use 5 arguments for their predictions,
            # while the other "simpler" models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]
                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)
            all_results.append(result)
    # Compute predictions
    output_prediction_file = os.path.join(self.model_name_or_path, "predictions.json")
    output_nbest_file = os.path.join(self.model_name_or_path, "nbest_predictions.json")
    # FIX: collapsed the dead `if True: ... else: None` branch.
    output_null_log_odds_file = os.path.join(self.model_name_or_path, "null_odds.json")
    # XLNet and XLM use a more complex post-processing procedure
    if self.model_type in ["xlnet", "xlm"]:
        start_n_top = self.model.config.start_n_top if hasattr(self.model, "config") else self.model.module.config.start_n_top
        end_n_top = self.model.config.end_n_top if hasattr(self.model, "config") else self.model.module.config.end_n_top
        predictions = compute_predictions_log_probs(
            squad_examples,
            squad_features,
            all_results,
            n_best_size=self.n_best_size,
            max_answer_length=self.max_answer_length,
            output_prediction_file=output_prediction_file,
            output_nbest_file=output_nbest_file,
            output_null_log_odds_file=output_null_log_odds_file,
            start_n_top=start_n_top,
            end_n_top=end_n_top,
            version_2_with_negative=True,
            # BUG FIX: was `self.okenizer` (typo) — AttributeError at runtime.
            tokenizer=self.tokenizer,
            verbose_logging=True,
        )
    else:
        predictions = compute_predictions_logits(
            squad_examples,
            squad_features,
            all_results,
            n_best_size=self.n_best_size,
            max_answer_length=self.max_answer_length,
            do_lower_case=True,
            output_prediction_file=output_prediction_file,
            output_nbest_file=output_nbest_file,
            output_null_log_odds_file=output_null_log_odds_file,
            verbose_logging=True,
            version_2_with_negative=True,
            null_score_diff_threshold=0.0,
            tokenizer=self.tokenizer,
        )
    logger.info('predictions: {}'.format(predictions))
    with open(output_nbest_file) as f:
        output_nbest = json.load(f)
    return output_nbest
def run_prediction(question_texts, context_text, model_path):
    """Answer each question in *question_texts* against *context_text* using
    the QA model stored at *model_path*.

    Returns the predictions dict from ``compute_predictions_logits``
    (no prediction files are written — all output paths are None).
    """
    ### Setting hyperparameters
    max_seq_length = 512
    doc_stride = 256
    n_best_size = 1
    max_query_length = 64
    max_answer_length = 512
    do_lower_case = False
    null_score_diff_threshold = 0.0
    # model_name_or_path = "../cuad-models/roberta-base/"

    def to_list(tensor):
        # Detach to host-side plain lists for prediction post-processing.
        return tensor.detach().cpu().tolist()

    config_class, model_class, tokenizer_class = (
        AutoConfig, AutoModelForQuestionAnswering, AutoTokenizer)
    config = config_class.from_pretrained(model_path)
    # NOTE(review): tokenizer is loaded with do_lower_case=True while the
    # local do_lower_case=False is passed to compute_predictions_logits —
    # confirm this mismatch is intentional.
    tokenizer = tokenizer_class.from_pretrained(
        model_path, do_lower_case=True, use_fast=False)
    model = model_class.from_pretrained(model_path, config=config)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    processor = SquadV2Processor()  # NOTE(review): unused
    examples = []
    for i, question_text in enumerate(question_texts):
        example = SquadExample(
            qas_id=str(i),
            question_text=question_text,
            context_text=context_text,
            answer_text=None,
            start_position_character=None,
            title="Predict",
            answers=None,
        )
        examples.append(example)
    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=False,
        return_dataset="pt",
        threads=1,
    )
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)
    all_results = []
    for batch in eval_dataloader:
        model.eval()
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            example_indices = batch[3]
            outputs = model(**inputs)
            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                # to_tuple(): newer transformers return ModelOutput objects.
                output = [to_list(output[i]) for output in outputs.to_tuple()]
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)
                all_results.append(result)
    final_predictions = compute_predictions_logits(
        all_examples=examples,
        all_features=features,
        all_results=all_results,
        n_best_size=n_best_size,
        max_answer_length=max_answer_length,
        do_lower_case=do_lower_case,
        output_prediction_file=None,
        output_nbest_file=None,
        output_null_log_odds_file=None,
        verbose_logging=False,
        version_2_with_negative=True,
        null_score_diff_threshold=null_score_diff_threshold,
        tokenizer=tokenizer
    )
    return final_predictions
def answergen_albert(question_texts, context_text):
    """Answer *question_texts* against *context_text* with a pretrained
    ALBERT SQuAD-v2 model and return the predictions dict.

    Prediction artifacts are written to predictions.json /
    nbest_predictions.json / null_predictions.json in the working directory.
    """
    model_name_or_path = "ktrapeznikov/albert-xlarge-v2-squad-v2"
    output_dir = ""  # NOTE(review): unused
    # Config
    n_best_size = 1
    max_answer_length = 30
    do_lower_case = True
    null_score_diff_threshold = 0.0

    def to_list(tensor):
        # Detach to host-side plain lists for prediction post-processing.
        return tensor.detach().cpu().tolist()

    # Setup model
    config_class, model_class, tokenizer_class = (
        AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer)
    config = config_class.from_pretrained(
        "ktrapeznikov/albert-xlarge-v2-squad-v2")
    tokenizer = tokenizer_class.from_pretrained(
        "ktrapeznikov/albert-xlarge-v2-squad-v2", do_lower_case=True)
    model = model_class.from_pretrained(
        "ktrapeznikov/albert-xlarge-v2-squad-v2", config=config)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    model.to(device)
    #processor = SquadV2Processor()
    """Setup function to compute predictions"""
    examples = []
    print(question_texts)
    for i, question_text in enumerate(question_texts):
        example = SquadExample(
            qas_id=str(i),
            question_text=question_text,
            context_text=context_text,
            answer_text=None,
            start_position_character=None,
            title="Predict",
            is_impossible=False,
            answers=None,
        )
        examples.append(example)
    print(examples)
    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        return_dataset="pt",
        threads=1,
    )
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)
    all_results = []
    for batch in eval_dataloader:
        model.eval()
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            example_indices = batch[3]
            outputs = model(**inputs)
            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                output = [to_list(output[i]) for output in outputs]
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)
                all_results.append(result)
    output_prediction_file = "predictions.json"
    output_nbest_file = "nbest_predictions.json"
    output_null_log_odds_file = "null_predictions.json"
    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size,
        max_answer_length,
        do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        False,  # verbose_logging
        True,  # version_2_with_negative
        null_score_diff_threshold,
        tokenizer,
    )
    return predictions
def run_prediction(model, question_texts, context_text):
    """Setup function to compute predictions.

    *model* is expected to expose ``.model`` (the HF model) and
    ``.tokenizer``; predictions are written under ./predictions/ and the
    predictions dict is returned.
    """
    processor = SquadV2Processor()  # NOTE(review): unused
    config = model.model.config  # NOTE(review): unused
    tokenizer = model.tokenizer
    examples = []
    # Unwrap the inner HF model; `model` is rebound here.
    model = model.model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    for i, question_text in enumerate(question_texts):
        example = SquadExample(
            qas_id=str(i),
            question_text=question_text,
            context_text=context_text,
            answer_text=None,
            start_position_character=None,
            title="Predict",
            is_impossible=False,
            answers=None,
        )
        examples.append(example)
    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        return_dataset="pt",
        threads=1,
    )
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)
    all_results = []
    for batch in eval_dataloader:
        model.eval()
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            example_indices = batch[3]
            outputs = model(**inputs)
            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                output = [to_list(output[i]) for output in outputs]
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)
                all_results.append(result)
    if not os.path.exists("predictions"):
        os.mkdir("predictions")
    output_prediction_file = "predictions/predictions.json"
    output_nbest_file = "predictions/nbest_predictions.json"
    output_null_log_odds_file = "predictions/null_predictions.json"
    # NOTE(review): `n_best_size`, `max_answer_length`, `do_lower_case`,
    # `null_score_diff_threshold` and `to_list` come from module scope.
    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size,
        max_answer_length,
        do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        False,  # verbose_logging
        True,  # version_2_with_negative
        null_score_diff_threshold,
        tokenizer,
    )
    return predictions
def read_squad_examples(input_file, is_training):
    """Read a SQuAD json file into a list of SquadExample.

    BioASQ variant: answers here are yes/no labels, so examples carry an
    empty answer_text and only ``is_impossible`` matters.  In training mode
    the returned list is shuffled and balanced to equal positive/negative
    counts; in eval mode all examples are returned unmodified.
    """
    is_bioasq = True  # for BioASQ
    with open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        # 0x202F is a narrow no-break space.
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            if is_bioasq:
                # NOTE(review): str.replace returns a new string — this call
                # discards its result, so it is a no-op.  need review
                paragraph_text.replace('/', ' ')
            # NOTE(review): doc_tokens / char_to_word_offset are computed but
            # never used below — dead code candidate.
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)
            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                answer = None
                is_impossible = False
                if is_training:
                    # "answers" is the literal string "yes"/"no"; is_impossible
                    # must disagree with "yes".
                    assert (qa["is_impossible"] == True) != (qa["answers"] == "yes")
                    assert qa["answers"] in ["yes", "no"]
                    # answer = 1 if qa["answers"] == 'yes' else 0
                    is_impossible = qa["is_impossible"]
                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    context_text=paragraph_text,
                    answer_text='',
                    start_position_character=None,
                    title='',
                    answers=[],
                    is_impossible=is_impossible,
                )
                examples.append(example)
    # target_cnt = 500
    if is_training:
        # Downsample the majority class to the minority count.
        pos_cnt = sum([1 for example in examples if example.is_impossible == False])
        neg_cnt = sum([1 for example in examples if example.is_impossible == True])
        target_cnt = min(pos_cnt, neg_cnt)
        print()
        print('Imbalance btw {} vs {}'.format(pos_cnt, neg_cnt))
        random.shuffle(examples)
        new_examples = []
        new_pos_cnt = 0
        new_neg_cnt = 0
        for example in examples:
            if example.is_impossible == False and new_pos_cnt >= target_cnt:
                continue
            if example.is_impossible == True and new_neg_cnt >= target_cnt:
                continue
            else:
                new_examples.append(example)
                new_pos_cnt += (1 if example.is_impossible == False else 0)
                new_neg_cnt += (1 if example.is_impossible == True else 0)
        pos_cnt = sum([1 for example in new_examples if example.is_impossible == False])
        neg_cnt = sum([1 for example in new_examples if example.is_impossible == True])
        random.shuffle(new_examples)
        print('Balanced as {} vs {}'.format(pos_cnt, neg_cnt))
        print('Sample: {}'.format(new_examples[0]))
        return new_examples
    else:
        return examples
def infer(self, context_text, question_texts):
    """Predict an answer for every question in *question_texts* against a
    single shared *context_text*; returns the predictions dict and writes
    predictions.json / nbest_predictions.json / null_predictions.json."""
    examples = [
        SquadExample(
            qas_id=str(idx),
            question_text=q,
            context_text=context_text,
            answer_text=None,
            start_position_character=None,
            title="Predict",
            is_impossible=False,
            answers=None,
        )
        for idx, q in enumerate(question_texts)
    ]
    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=self.tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        return_dataset="pt",
        threads=1,
    )
    loader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=10)
    all_results = []
    for batch in loader:
        self.model.eval()
        batch = tuple(t.to(self.device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            example_indices = batch[3]
            outputs = self.model(**inputs)
            for i, example_index in enumerate(example_indices):
                feature = features[example_index.item()]
                start_logits, end_logits = [self._to_list(o[i]) for o in outputs]
                all_results.append(
                    SquadResult(int(feature.unique_id), start_logits, end_logits)
                )
    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        self.n_best_size,
        self.max_answer_length,
        self.do_lower_case,
        "predictions.json",
        "nbest_predictions.json",
        "null_predictions.json",
        False,  # verbose_logging
        True,  # version_2_with_negative
        self.null_score_diff_threshold,
        self.tokenizer,
    )
    return predictions
def run_prediction(question_texts, context_text):
    """Setup function to compute predictions.

    Uses module-level `model`, `tokenizer`, `device`, `to_list`,
    `n_best_size`, `max_answer_length`, `do_lower_case` and
    `null_score_diff_threshold` — NOTE(review): confirm they are
    initialised before this is called.
    """
    examples = []
    for i, question_text in enumerate(question_texts):
        example = SquadExample(
            qas_id=str(i),
            question_text=question_text,
            context_text=context_text,
            answer_text=None,
            start_position_character=None,
            title="Predict",
            is_impossible=False,
            answers=None,
        )
        examples.append(example)
    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        return_dataset="pt",
        threads=4,
    )
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)
    all_results = []
    for batch in eval_dataloader:
        model.eval()
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            example_indices = batch[3]
            outputs = model(**inputs)
            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                output = [to_list(output[i]) for output in outputs]
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)
                all_results.append(result)
    output_prediction_file = "predictions.json"
    output_nbest_file = "nbest_predictions.json"
    output_null_log_odds_file = "null_predictions.json"
    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size,
        max_answer_length,
        do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        False,  # verbose_logging
        True,  # version_2_with_negative
        null_score_diff_threshold,
        tokenizer,
    )
    return predictions


# Example usage:
# context = "New Zealand (Māori: Aotearoa) is a sovereign island country in the southwestern Pacific Ocean. It has a total land area of 268,000 square kilometres (103,500 sq mi), and a population of 4.9 million. New Zealand's capital city is Wellington, and its most populous city is Auckland."
# questions = ["How many people live in New Zealand?",
#              "What's the largest city?"]
#
# # Run method
# predictions = run_prediction(questions, context)
#
# # Print results
# for key in predictions.keys():
#     print(predictions[key])