class CoQAPredictor(Predictor):
    """Predictor for CoQA-style conversational question answering models.

    Converts one raw CoQA paragraph record (a ``story`` with its parallel
    lists of ``questions`` and ``answers``) into an ``Instance`` via the
    underlying dataset reader.  At most 15 dialog turns are kept, matching
    the truncation used at training time.
    """

    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language="en_core_web_sm")

    def predict(self, jsonline: str) -> JsonDict:
        """Predict from a single JSON-encoded line (one paragraph record)."""
        return self.predict_json(json.loads(jsonline))

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects json that looks like one paragraph of the original data file:
        ``{"id": ..., "story": ..., "questions": [...], "answers": [...]}``.
        """
        # BUG FIX: the original body started by calling
        # ``cached_path(file_path)`` on an undefined name (NameError) and
        # loaded a dataset file whose contents were never used -- dead code
        # copied from a dataset reader.  The paragraph comes from json_dict.
        paragraph_json = json_dict
        paragraph = paragraph_json["story"]
        tokenized_paragraph = self._tokenizer.split_words(paragraph)
        questions = paragraph_json["questions"]
        golden_answers = paragraph_json["answers"]
        # Clamp negative (unanswerable) spans to valid character offsets.
        self.handle_unknown_answers(golden_answers, len(paragraph))

        max_turns = 15  # keep at most 15 dialog turns, as at training time

        metadata = {}
        paragraph_id = paragraph_json["id"]
        metadata["instance_id"] = [
            str(paragraph_id) + "_" + str(ques["turn_id"]) for ques in questions
        ][:max_turns]

        question_text_list = [
            ques["input_text"].strip().replace("\n", "") for ques in questions
        ][:max_turns]

        answer_texts_list = [[answer["span_text"]] for answer in golden_answers][:max_turns]

        metadata["question"] = question_text_list
        metadata["answer_texts_list"] = answer_texts_list

        # BUG FIX: span_start_list was previously never truncated while
        # span_end_list was, so the two parallel lists could differ in length
        # for paragraphs with more than 15 turns.
        span_start_list = [[answer["span_start"]] for answer in golden_answers][:max_turns]
        span_end_list = [[answer["span_end"]] for answer in golden_answers][:max_turns]

        # Placeholder yes/no and follow-up labels; the model predicts these.
        yesno_list = ["x" for _ in questions][:max_turns]
        followup_list = ["n" for _ in questions][:max_turns]
        instance = self._dataset_reader.text_to_instance(question_text_list,
                                                         paragraph,
                                                         span_start_list,
                                                         span_end_list,
                                                         tokenized_paragraph,
                                                         yesno_list,
                                                         followup_list,
                                                         metadata)
        return instance

    def text_to_instance(self,  # type: ignore
                         question_text_list: List[str],
                         passage_text: str,
                         start_span_list: List[List[int]] = None,
                         end_span_list: List[List[int]] = None,
                         passage_tokens: List[Token] = None,
                         yesno_list: List[int] = None,
                         followup_list: List[int] = None,
                         additional_metadata: Dict[str, Any] = None) -> Instance:
        # pylint: disable=arguments-differ
        """Build a QuAC-style reading-comprehension instance.

        We need to convert character indices in ``passage_text`` to token
        indices in ``passage_tokens``, as the latter is what we'll actually
        use for supervision.
        """
        answer_token_span_list = []
        passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
        for start_list, end_list in zip(start_span_list, end_span_list):
            token_spans: List[Tuple[int, int]] = []
            for char_span_start, char_span_end in zip(start_list, end_list):
                (span_start, span_end), error = my_util.char_span_to_token_span(
                    passage_offsets, (char_span_start, char_span_end))
                if error:
                    # Char span did not align with token boundaries; log the
                    # best-effort token span we fell back to.
                    logger.debug("Passage: %s", passage_text)
                    logger.debug("Passage tokens: %s", passage_tokens)
                    logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
                    logger.debug("Token span: (%d, %d)", span_start, span_end)
                    logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
                    logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
                token_spans.append((span_start, span_end))
            answer_token_span_list.append(token_spans)
        question_list_tokens = [self._tokenizer.tokenize(q) for q in question_text_list]
        # Map answer texts to "CANNOTANSWER" if more than half of them marked as so.
        additional_metadata["answer_texts_list"] = [util.handle_cannot(ans_list) for ans_list \
                                                    in additional_metadata["answer_texts_list"]]
        return util.make_reading_comprehension_instance_quac(question_list_tokens,
                                                             passage_tokens,
                                                             self._token_indexers,
                                                             passage_text,
                                                             answer_token_span_list,
                                                             yesno_list,
                                                             followup_list,
                                                             additional_metadata,
                                                             self._num_context_answers)

    def handle_unknown_answers(self, answers: List[Dict[str, Any]], plen: int) -> None:
        """Clamp negative span offsets in-place.

        Unanswerable questions are marked with negative ``span_start`` /
        ``span_end``; map them onto the valid range ``[0, plen - 1]`` so
        downstream span conversion does not fail.
        """
        for ans in answers:
            if ans["span_start"] < 0:
                ans["span_start"] = 0
            if ans["span_end"] < 0:
                ans["span_end"] = plen - 1
class SentenceTaggerPredictor(Predictor):
    """
    Predictor for any model that takes in a sentence and returns
    a single set of tags for it.  In particular, it can be used with
    the :class:`~allennlp.models.crf_tagger.CrfTagger` model
    and also
    the :class:`~allennlp.models.simple_tagger.SimpleTagger` model.
    """
    def __init__(self,
                 model: Model,
                 dataset_reader: DatasetReader,
                 language: str = "en_core_web_sm") -> None:
        super().__init__(model, dataset_reader)
        # BUG FIX: ``language`` was accepted but silently ignored; honor it
        # so non-default spaCy models can be used.  The default matches the
        # previous hard-coded behavior.  (POS tagging stays disabled.)
        self._tokenizer = SpacyWordSplitter(language=language, split_on_spaces=True)

    def predict_sentence(self, sentence) -> JsonDict:
        """Predict tags for a single raw sentence string."""
        return self.predict_json({"sentence": sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"sentence": "..."}`` or
        ``{"tokens": [...]}``.  Pre-tokenized ``"tokens"`` take precedence
        over ``"sentence"`` (as in the original: the second lookup won).
        """
        if "tokens" in json_dict:
            tokens = json_dict["tokens"]
        elif "sentence" in json_dict:
            tokens = self._tokenizer.tokenize(json_dict["sentence"])
        else:
            # BUG FIX: previously fell through to an UnboundLocalError.
            raise ValueError("Input JSON must contain either 'sentence' or 'tokens'.")
        return self._dataset_reader.text_to_instance(tokens)

    @overrides
    def predictions_to_labeled_instances(
            self, instance: Instance,
            outputs: Dict[str, numpy.ndarray]) -> List[Instance]:
        """
        This function currently only handles BIOUL tags.

        Imagine an NER model predicts three named entities (each one with potentially
        multiple tokens). For each individual entity, we create a new Instance that has
        the label set to only that entity and the rest of the tokens are labeled as outside.
        We then return a list of those Instances.

        For example:
        Mary  went to Seattle to visit Microsoft Research
        U-Per  O    O   U-Loc  O   O     B-Org     L-Org

        We create three instances.
        Mary  went to Seattle to visit Microsoft Research
        U-Per  O    O    O     O   O       O         O

        Mary  went to Seattle to visit Microsoft Research
        O      O    O   U-LOC  O   O       O         O

        Mary  went to Seattle to visit Microsoft Research
        O      O    O    O     O   O     B-Org     L-Org
        """
        predicted_tags = outputs["tags"]
        predicted_spans = []

        num_tags = len(predicted_tags)
        i = 0
        while i < num_tags:
            tag = predicted_tags[i]
            # A U (unit-length) tag is a complete span on its own.
            if tag[0] == "U":
                current_tags = [
                    t if idx == i else "O"
                    for idx, t in enumerate(predicted_tags)
                ]
                predicted_spans.append(current_tags)
            # A B opens a multi-token span; scan forward until the closing L.
            elif tag[0] == "B":
                begin_idx = i
                # BUG FIX: a malformed sequence ending without an L used to
                # run past the end of the list (IndexError); stop at the end
                # and treat the final tag as the span close.
                while tag[0] != "L" and i < num_tags - 1:
                    i += 1
                    tag = predicted_tags[i]
                end_idx = i
                current_tags = [
                    t if begin_idx <= idx <= end_idx else "O"
                    for idx, t in enumerate(predicted_tags)
                ]
                predicted_spans.append(current_tags)
            i += 1

        # Creates a new instance for each contiguous tag
        instances = []
        for labels in predicted_spans:
            new_instance = deepcopy(instance)
            text_field: TextField = instance["tokens"]  # type: ignore
            new_instance.add_field("tags",
                                   SequenceLabelField(labels, text_field),
                                   self._model.vocab)
            instances.append(new_instance)
        # NER tags are in the opposite order as desired for the interpret UI.
        instances.reverse()

        return instances
# NOTE(review): the two lines below ("Exemplo n.º 3" / "0") are extraction
# artifacts from the code-listing this file was scraped from; commented out
# so the module parses.
# Exemplo n.º 3
# 0
class SimpleSeq2SeqPredictor(Predictor):
    """
    Predictor for the :class:`~allennlp.models.encoder_decoder.simple_seq2seq` model.

    This variant pairs each CoQA question with an extractive rationale read
    from a pre-computed BiDAF++ output file, and feeds the concatenation to
    the seq2seq model.
    """
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language="en_core_web_sm")

    @overrides
    def predict_instance(self, instance: Instance) -> JsonDict:
        """Run the model and drop the bulky tensor outputs before sanitizing."""
        outputs = self._model.forward_on_instance(instance)
        del outputs["logits"]
        del outputs["class_probabilities"]
        return sanitize(outputs)

    def predict(self, source: str) -> JsonDict:
        """Predict from a single raw source string."""
        return self.predict_json({"source": source})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like one CoQA paragraph:
        ``{"id": ..., "questions": [...], "answers": [...]}``.

        NOTE(review): despite the ``-> Instance`` annotation this is a
        generator (it ``yield``s one instance per dialog turn, capped at 15)
        -- confirm callers iterate the result.  It also reads
        ``self.question_tag``, which is not set in this class; it must be
        assigned elsewhere before prediction.
        """
        paragraph_json = json_dict
        all_questions = paragraph_json['questions']
        golden_answers = paragraph_json['answers']
        paragraph_id = paragraph_json['id']

        # Read the pre-computed extractive BiDAF++ outputs, keyed by
        # paragraph id, from a JSON file next to this module.
        # BUG FIX: previously used an undefined name ``file_path`` here
        # (NameError); ``__file__`` appears to be what was intended -- TODO
        # confirm the file really lives beside this module.
        bidafplus_output_filename = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            'bidafplus_output_formatted.json')
        with open(bidafplus_output_filename) as bidafplus_outputs:
            best_span_str_json = json.load(bidafplus_outputs)
            best_span_str = best_span_str_json['data']

        # extractive outputs from BIDAF++
        best_span_str_list = best_span_str[paragraph_id]

        # metadata
        metadata = {}
        metadata['paragraph_id'] = paragraph_id
        metadata['questions'] = [
            ques["input_text"].strip().replace("\n", "")
            for ques in all_questions
        ][:15]

        # All lists are truncated to the first 15 dialog turns.
        questions_list = [
            ques["input_text"].strip().replace("\n", "")
            for ques in all_questions
        ][:15]
        golden_rationale_list = [
            answer['span_text'].strip().replace("\n", "")
            for answer in golden_answers
        ][:15]
        answers_list = [
            answer['input_text'].strip().replace("\n", "")
            for answer in golden_answers
        ][:15]
        bidafplus_rationale_list = [
            answer['answer_text'].strip().replace("\n", "")
            for answer in best_span_str_list
        ][:15]
        # Source sequence = "<rationale> <question_tag> <question>".
        ques_rat_list = [
            ' '.join([
                bidafplus_rationale_list[i], self.question_tag,
                questions_list[i]
            ]) for i in range(len(questions_list))
        ]
        for i in range(len(questions_list)):
            yield self.text_to_instance(ques_rat_list[i], answers_list[i],
                                        paragraph_id, i)

    def text_to_instance(self,
                         source_string: str,
                         target_string: str = None,
                         paragraph_id: str = None,
                         turn_id: int = 0) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        """Tokenize source (and optional target), wrapping both in START/END
        symbols, and build a seq2seq instance.

        ``paragraph_id`` and ``turn_id`` are accepted for interface
        compatibility but are not stored in the instance.
        """
        tokenized_source = self._tokenizer.tokenize(source_string)
        tokenized_source.insert(0, Token(START_SYMBOL))
        tokenized_source.append(Token(END_SYMBOL))
        source_field = TextField(tokenized_source, self._token_indexers)
        if target_string is not None:
            tokenized_target = self._tokenizer.tokenize(target_string)
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target, self._token_indexers)
            # BUG FIX: the target field was previously built and then
            # discarded -- both branches returned a source-only instance.
            return Instance({"source_tokens": source_field,
                             "target_tokens": target_field})
        else:
            return Instance({"source_tokens": source_field})