class CoQAPredictor(Predictor):
    """
    Predictor for CoQA-style conversational reading-comprehension models.
    Converts one CoQA paragraph record (keys ``"story"``, ``"questions"``,
    ``"answers"``, ``"id"``) into a QuAC-format :class:`Instance`.
    """

    # CoQA dialogs are truncated to at most this many question/answer turns.
    _MAX_TURNS = 15

    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language="en_core_web_sm")

    def predict(self, jsonline: str) -> JsonDict:
        """Run prediction on a single JSON-line string (one paragraph record)."""
        return self.predict_json(json.loads(jsonline))

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects json that looks like one paragraph record of the original
        CoQA data file.
        """
        # FIX: removed a dead block that re-read the dataset file here — it
        # referenced an undefined `file_path` (NameError) and its result was
        # never used; the paragraph record is already passed in as `json_dict`.
        paragraph_json = json_dict
        paragraph = paragraph_json["story"]
        tokenized_paragraph = self._tokenizer.split_words(paragraph)
        questions = paragraph_json["questions"]
        golden_answers = paragraph_json["answers"]
        # Clamp negative ("unknown") answer offsets to valid positions in-place.
        self.handle_unknown_answers(golden_answers, len(paragraph))

        max_turns = self._MAX_TURNS
        metadata = {}
        paragraph_id = paragraph_json["id"]
        metadata["instance_id"] = [
            str(paragraph_id) + "_" + str(ques["turn_id"]) for ques in questions
        ][:max_turns]
        question_text_list = [
            ques["input_text"].strip().replace("\n", "") for ques in questions
        ][:max_turns]
        answer_texts_list = [[answer["span_text"]] for answer in golden_answers][:max_turns]
        metadata["question"] = question_text_list
        metadata["answer_texts_list"] = answer_texts_list
        # FIX: truncate the span-start list to the same turn limit as every
        # other per-turn list (previously only the end list was truncated, so
        # the two could disagree in length for long dialogs).
        span_start_list = [[answer["span_start"]] for answer in golden_answers][:max_turns]
        span_end_list = [[answer["span_end"]] for answer in golden_answers][:max_turns]
        # FIX: removed a stray `span_ends_list.append(span_ends)` statement —
        # a leftover from commented-out code referencing undefined names.
        # CoQA has no yes/no or follow-up annotations; use fixed placeholders.
        yesno_list = ["x" for _ in questions][:max_turns]
        followup_list = ["n" for _ in questions][:max_turns]
        instance = self._dataset_reader.text_to_instance(question_text_list,
                                                         paragraph,
                                                         span_start_list,
                                                         span_end_list,
                                                         tokenized_paragraph,
                                                         yesno_list,
                                                         followup_list,
                                                         metadata)
        return instance

    def text_to_instance(self,  # type: ignore
                         question_text_list: List[str],
                         passage_text: str,
                         start_span_list: List[List[int]] = None,
                         end_span_list: List[List[int]] = None,
                         passage_tokens: List[Token] = None,
                         yesno_list: List[int] = None,
                         followup_list: List[int] = None,
                         additional_metadata: Dict[str, Any] = None) -> Instance:
        # pylint: disable=arguments-differ
        """
        Build a QuAC-style instance, converting character-level answer spans
        into token-level spans over ``passage_tokens``.
        """
        # We need to convert character indices in `passage_text` to token indices
        # in `passage_tokens`, as the latter is what we'll actually use for
        # supervision.
        answer_token_span_list = []
        passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
        for start_list, end_list in zip(start_span_list, end_span_list):
            token_spans: List[Tuple[int, int]] = []
            for char_span_start, char_span_end in zip(start_list, end_list):
                (span_start, span_end), error = my_util.char_span_to_token_span(
                    passage_offsets, (char_span_start, char_span_end))
                if error:
                    # Span did not align cleanly with token boundaries; log
                    # context for debugging but keep the best-effort span.
                    logger.debug("Passage: %s", passage_text)
                    logger.debug("Passage tokens: %s", passage_tokens)
                    logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
                    logger.debug("Token span: (%d, %d)", span_start, span_end)
                    logger.debug("Tokens in answer: %s",
                                 passage_tokens[span_start:span_end + 1])
                    logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
                token_spans.append((span_start, span_end))
            answer_token_span_list.append(token_spans)
        question_list_tokens = [self._tokenizer.tokenize(q) for q in question_text_list]
        # Map answer texts to "CANNOTANSWER" if more than half of them marked as so.
        additional_metadata["answer_texts_list"] = [util.handle_cannot(ans_list)
                                                    for ans_list
                                                    in additional_metadata["answer_texts_list"]]
        return util.make_reading_comprehension_instance_quac(question_list_tokens,
                                                             passage_tokens,
                                                             self._token_indexers,
                                                             passage_text,
                                                             answer_token_span_list,
                                                             yesno_list,
                                                             followup_list,
                                                             additional_metadata,
                                                             self._num_context_answers)

    def handle_unknown_answers(self, answers, plen):
        """
        Clamp negative ("unknown") answer offsets in-place: starts to 0,
        ends to the last character of the paragraph (``plen - 1``).
        """
        for ans in answers:
            if ans["span_start"] < 0:
                ans["span_start"] = 0
            if ans["span_end"] < 0:
                ans["span_end"] = plen - 1
class SentenceTaggerPredictor(Predictor):
    """
    Predictor for any model that takes in a sentence and returns a single set
    of tags for it.  In particular, it can be used with the
    :class:`~allennlp.models.crf_tagger.CrfTagger` model and also the
    :class:`~allennlp.models.simple_tagger.SimpleTagger` model.
    """

    def __init__(self, model: Model, dataset_reader: DatasetReader,
                 language: str = "en_core_web_sm") -> None:
        super().__init__(model, dataset_reader)
        # FIX: `language` was accepted but silently ignored; forward it so a
        # caller-supplied spaCy model is actually used.  The default value
        # matches SpacyWordSplitter's own default, so default behavior is
        # unchanged.
        self._tokenizer = SpacyWordSplitter(
            language=language,
            split_on_spaces=True)  # , pos_tags=True

    def predict_sentence(self, sentence) -> JsonDict:
        """Convenience wrapper: predict tags for a raw sentence string."""
        return self.predict_json({"sentence": sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"sentence": "..."}`` (tokenized here)
        or ``{"tokens": [...]}`` (pre-tokenized; takes precedence when both
        keys are present, matching the original behavior).
        """
        tokens = None
        if 'sentence' in json_dict:
            sentence = json_dict["sentence"]
            tokens = self._tokenizer.tokenize(sentence)
        if 'tokens' in json_dict:
            tokens = json_dict["tokens"]
        if tokens is None:
            # FIX: previously this fell through to an UnboundLocalError.
            raise ValueError('Input JSON must contain either "sentence" or "tokens".')
        return self._dataset_reader.text_to_instance(tokens)

    @overrides
    def predictions_to_labeled_instances(
            self, instance: Instance,
            outputs: Dict[str, numpy.ndarray]) -> List[Instance]:
        """
        This function currently only handles BIOUL tags.

        Imagine an NER model predicts three named entities (each one with
        potentially multiple tokens).  For each individual entity, we create a
        new Instance that has the label set to only that entity and the rest
        of the tokens are labeled as outside.  We then return a list of those
        Instances.

        For example:

        Mary  went to Seattle to visit Microsoft Research
        U-Per O    O  U-Loc   O  O     B-Org     L-Org

        We create three instances:

        Mary  went to Seattle to visit Microsoft Research
        U-Per O    O  O       O  O     O         O

        Mary  went to Seattle to visit Microsoft Research
        O     O    O  U-LOC   O  O     O         O

        Mary  went to Seattle to visit Microsoft Research
        O     O    O  O       O  O     B-Org     L-Org
        """
        predicted_tags = outputs["tags"]
        predicted_spans = []

        i = 0
        while i < len(predicted_tags):
            tag = predicted_tags[i]
            # if its a U, add it to the list
            if tag[0] == "U":
                current_tags = [
                    t if idx == i else "O"
                    for idx, t in enumerate(predicted_tags)
                ]
                predicted_spans.append(current_tags)
            # if its a B, keep going until you hit an L.
            elif tag[0] == "B":
                begin_idx = i
                # FIX: bound the scan so a malformed sequence with no
                # terminating L- tag cannot raise IndexError; behavior is
                # identical for well-formed BIOUL input.
                while tag[0] != "L" and i < len(predicted_tags) - 1:
                    i += 1
                    tag = predicted_tags[i]
                end_idx = i
                current_tags = [
                    t if begin_idx <= idx <= end_idx else "O"
                    for idx, t in enumerate(predicted_tags)
                ]
                predicted_spans.append(current_tags)
            i += 1

        # Creates a new instance for each contiguous tag
        instances = []
        for labels in predicted_spans:
            new_instance = deepcopy(instance)
            text_field: TextField = instance["tokens"]  # type: ignore
            new_instance.add_field("tags",
                                   SequenceLabelField(labels, text_field),
                                   self._model.vocab)
            instances.append(new_instance)
        instances.reverse()  # NER tags are in the opposite order as desired for the interpret UI
        return instances
class SimpleSeq2SeqPredictor(Predictor):
    """
    Predictor for the :class:`~allennlp.models.encoder_decoder.simple_seq2seq` model.
    """

    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language="en_core_web_sm")

    @overrides
    def predict_instance(self, instance: Instance) -> JsonDict:
        """Run the model on one instance, dropping the bulky tensor outputs
        ("logits", "class_probabilities") before JSON sanitization."""
        outputs = self._model.forward_on_instance(instance)
        del outputs["logits"]
        del outputs["class_probabilities"]
        return sanitize(outputs)

    def predict(self, source: str) -> JsonDict:
        """Predict from a raw source string via ``predict_json``."""
        pred_json = self.predict_json({"source": source})
        return pred_json

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"source": "..."}``.

        NOTE(review): despite the docstring above, the body actually reads a
        full CoQA-style paragraph record ("questions", "answers", "id"), and
        it ``yield``s multiple instances — making this a generator despite the
        declared ``-> Instance`` return; the base ``Predictor`` presumably
        expects a single Instance from this hook.  Confirm how callers consume
        this before changing it.
        """
        # print(json_dict)
        paragraph_json = json_dict
        all_questions = paragraph_json['questions']
        golden_answers = paragraph_json['answers']
        paragraph_id = paragraph_json['id']

        # READ THE BIDAF++ OUTPUTS
        # NOTE(review): `file_path` is not defined anywhere in this method or
        # class — as written this line raises NameError.  Confirm where the
        # path to the BiDAF++ output file is supposed to come from.
        bidafplus_output_filename = os.path.join(
            os.path.dirname(os.path.realpath(file_path)),
            'bidafplus_output_formatted.json')
        with open(bidafplus_output_filename) as bidafplus_outputs:
            best_span_str_json = json.load(bidafplus_outputs)
            best_span_str = best_span_str_json['data']

        # extractive outputs from BIDAF++, keyed by paragraph id
        best_span_str_list = best_span_str[paragraph_id]

        # metadata
        # NOTE(review): `metadata` is built here but never passed to
        # text_to_instance below — it is dropped.  Confirm whether it was
        # meant to be attached to the instances.
        metadata = {}
        metadata['paragraph_id'] = paragraph_id
        metadata['questions'] = [
            ques["input_text"].strip().replace("\n", "")
            for ques in all_questions
        ][:15]

        # All per-turn lists are truncated to the first 15 dialog turns.
        questions_list = [
            ques["input_text"].strip().replace("\n", "")
            for ques in all_questions
        ][:15]
        golden_rationale_list = [
            answer['span_text'].strip().replace("\n", "")
            for answer in golden_answers
        ][:15]
        answers_list = [
            answer['input_text'].strip().replace("\n", "")
            for answer in golden_answers
        ][:15]
        bidafplus_rationale_list = [
            answer['answer_text'].strip().replace("\n", "")
            for answer in best_span_str_list
        ][:15]
        # Seq2seq source: predicted rationale, a separator tag, then the
        # question text.
        # NOTE(review): `self.question_tag` is not set in the __init__ visible
        # here — presumably assigned elsewhere; verify.
        ques_rat_list = [
            ' '.join([
                bidafplus_rationale_list[i], self.question_tag,
                questions_list[i]
            ])
            for i in range(len(questions_list))
        ]
        for i in range(len(questions_list)):
            yield self.text_to_instance(ques_rat_list[i], answers_list[i],
                                        paragraph_id, i)
        # yield self.text_to_instance(rationale_list[i], answers_list[i])

    def text_to_instance(self, source_string: str, target_string: str = None,
                         paragraph_id: str = None,
                         turn_id: int = 0) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        """
        Build a seq2seq instance from a source string, wrapping the tokens in
        START/END symbols.

        NOTE(review): `target_field` is constructed when `target_string` is
        given but never used — both branches return an identical instance
        containing only "source_tokens".  A "target_tokens" field was likely
        intended; confirm against the dataset reader before changing.
        NOTE(review): `self._token_indexers` is not set in the __init__
        visible here — presumably assigned elsewhere; verify.
        `paragraph_id` and `turn_id` are accepted but unused.
        """
        tokenized_source = self._tokenizer.tokenize(source_string)
        tokenized_source.insert(0, Token(START_SYMBOL))
        tokenized_source.append(Token(END_SYMBOL))
        source_field = TextField(tokenized_source, self._token_indexers)
        if target_string is not None:
            tokenized_target = self._tokenizer.tokenize(target_string)
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target, self._token_indexers)
            return Instance({"source_tokens": source_field})
        else:
            return Instance({"source_tokens": source_field})