Example #1
class TestDepLabelIndexer(AllenNlpTestCase):
    def setUp(self):
        super(TestDepLabelIndexer, self).setUp()
        self.tokenizer = SpacyWordSplitter(parse=True)

    def test_count_vocab_items_uses_dep_labels(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        indexer = DepLabelIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)

        assert counter["dep_labels"] == {"ROOT": 1, "nsubj": 1,
                                         "det": 1, "NONE": 2, "attr": 1, "punct": 1}

    def test_tokens_to_indices_uses_dep_labels(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [t for t in tokens] + [Token("</S>")]
        vocab = Vocabulary()
        root_index = vocab.add_token_to_namespace('ROOT', namespace='dep_labels')
        none_index = vocab.add_token_to_namespace('NONE', namespace='dep_labels')
        indexer = DepLabelIndexer()
        assert indexer.tokens_to_indices([tokens[1]], vocab, "tokens1") == {"tokens1": [root_index]}
        assert indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1") == {"tokens-1": [none_index]}

    def test_padding_functions(self):
        indexer = DepLabelIndexer()
        assert indexer.get_padding_token() == 0
        assert indexer.get_padding_lengths(0) == {}

    def test_as_array_produces_token_sequence(self):
        indexer = DepLabelIndexer()
        padded_tokens = indexer.pad_token_sequence({'key': [1, 2, 3, 4, 5]}, {'key': 10}, {})
        assert padded_tokens == {'key': [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]}
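The dependency labels counted in the tests above come straight from spaCy's parser; a minimal sketch (reusing the same SpacyWordSplitter API as the tests, outside the test class) of where they originate:

# Each Token returned by SpacyWordSplitter(parse=True) carries a dep_ attribute,
# which DepLabelIndexer counts and indexes.
tokenizer = SpacyWordSplitter(parse=True)
for token in tokenizer.split_words("This is a sentence."):
    print(token.text, token.dep_)   # e.g. "is" -> "ROOT", "This" -> "nsubj"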
Example #2
class TestNerTagIndexer(AllenNlpTestCase):
    def setUp(self):
        super(TestNerTagIndexer, self).setUp()
        self.tokenizer = SpacyWordSplitter(ner=True)

    def test_count_vocab_items_uses_ner_tags(self):
        tokens = self.tokenizer.split_words("Larry Page is CEO of Google.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        indexer = NerTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["ner_tags"] == {'PERSON': 2, 'ORG': 1, 'NONE': 6}

    def test_token_to_indices_uses_ner_tags(self):
        tokens = self.tokenizer.split_words("Larry Page is CEO of Google.")
        tokens = [t for t in tokens] + [Token("</S>")]
        vocab = Vocabulary()
        person_index = vocab.add_token_to_namespace('PERSON', namespace='ner_tags')
        none_index = vocab.add_token_to_namespace('NONE', namespace='ner_tags')
        vocab.add_token_to_namespace('ORG', namespace='ner_tags')
        indexer = NerTagIndexer()
        assert indexer.token_to_indices(tokens[1], vocab) == person_index
        assert indexer.token_to_indices(tokens[-1], vocab) == none_index

    def test_padding_functions(self):
        indexer = NerTagIndexer()
        assert indexer.get_padding_token() == 0
        assert indexer.get_padding_lengths(0) == {}

    def test_as_array_produces_token_sequence(self):
        indexer = NerTagIndexer()
        padded_tokens = indexer.pad_token_sequence([1, 2, 3, 4, 5], 10, {})
        assert padded_tokens == [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]
Example #3
class TestPosTagIndexer(AllenNlpTestCase):
    def setUp(self):
        super(TestPosTagIndexer, self).setUp()
        self.tokenizer = SpacyWordSplitter(pos_tags=True)

    def test_count_vocab_items_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        indexer = PosTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["pos_tags"] == {'DT': 2, 'VBZ': 1, '.': 1, 'NN': 1, 'NONE': 2}

        indexer._coarse_tags = True  # pylint: disable=protected-access
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["pos_tags"] == {'VERB': 1, 'PUNCT': 1, 'DET': 2, 'NOUN': 1, 'NONE': 2}

    def test_tokens_to_indices_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [t for t in tokens] + [Token("</S>")]
        vocab = Vocabulary()
        verb_index = vocab.add_token_to_namespace('VERB', namespace='pos_tags')
        cop_index = vocab.add_token_to_namespace('VBZ', namespace='pos_tags')
        none_index = vocab.add_token_to_namespace('NONE', namespace='pos_tags')
        # Have to add other tokens too, since we're calling `tokens_to_indices` on all of them
        vocab.add_token_to_namespace('DET', namespace='pos_tags')
        vocab.add_token_to_namespace('NOUN', namespace='pos_tags')
        vocab.add_token_to_namespace('PUNCT', namespace='pos_tags')

        indexer = PosTagIndexer(coarse_tags=True)

        indices = indexer.tokens_to_indices(tokens, vocab, "tokens")
        assert len(indices) == 1
        assert "tokens" in indices
        assert indices["tokens"][1] == verb_index
        assert indices["tokens"][-1] == none_index

        indexer._coarse_tags = False  # pylint: disable=protected-access
        assert indexer.tokens_to_indices([tokens[1]], vocab, "coarse") == {"coarse": [cop_index]}

    def test_padding_functions(self):
        indexer = PosTagIndexer()
        assert indexer.get_padding_token() == 0
        assert indexer.get_padding_lengths(0) == {}

    def test_as_array_produces_token_sequence(self):
        indexer = PosTagIndexer()
        padded_tokens = indexer.pad_token_sequence({'key': [1, 2, 3, 4, 5]}, {'key': 10}, {})
        assert padded_tokens == {'key': [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]}
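The _coarse_tags switch exercised above maps onto spaCy's two tag attributes; a small standalone sketch (same assumptions as the tests) showing both:

# tag_ is the fine-grained Penn Treebank tag, pos_ the coarse universal tag;
# PosTagIndexer(coarse_tags=True) reads pos_, otherwise it reads tag_.
tokenizer = SpacyWordSplitter(pos_tags=True)
for token in tokenizer.split_words("This is a sentence."):
    print(token.text, token.tag_, token.pos_)   # e.g. "is" -> "VBZ" / "VERB"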
Example #4
class TestNerTagIndexer(AllenNlpTestCase):
    def setUp(self):
        super(TestNerTagIndexer, self).setUp()
        self.tokenizer = SpacyWordSplitter(ner=True)

    def test_count_vocab_items_uses_ner_tags(self):
        tokens = self.tokenizer.split_words("Larry Page is CEO of Google.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        indexer = NerTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["ner_tokens"] == {'PERSON': 2, 'ORG': 1, 'NONE': 6}

    def test_tokens_to_indices_uses_ner_tags(self):
        tokens = self.tokenizer.split_words("Larry Page is CEO of Google.")
        tokens = [t for t in tokens] + [Token("</S>")]
        vocab = Vocabulary()
        person_index = vocab.add_token_to_namespace('PERSON', namespace='ner_tags')
        none_index = vocab.add_token_to_namespace('NONE', namespace='ner_tags')
        vocab.add_token_to_namespace('ORG', namespace='ner_tags')
        indexer = NerTagIndexer(namespace='ner_tags')
        assert indexer.tokens_to_indices([tokens[1]], vocab, "tokens1") == {"tokens1": [person_index]}
        assert indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1") == {"tokens-1": [none_index]}

    def test_padding_functions(self):
        indexer = NerTagIndexer()
        assert indexer.get_padding_token() == 0
        assert indexer.get_padding_lengths(0) == {}

    def test_as_array_produces_token_sequence(self):
        indexer = NerTagIndexer()
        padded_tokens = indexer.pad_token_sequence({'key': [1, 2, 3, 4, 5]}, {'key': 10}, {})
        assert padded_tokens == {'key': [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]}

    def test_blank_ner_tag(self):
        tokens = [Token(token) for token in "allennlp is awesome .".split(" ")]
        for token in tokens:
            token.ent_type_ = ""
        indexer = NerTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        # spacy uses an empty string to indicate "no NER tag"
        # we convert it to "NONE"
        assert counter["ner_tokens"]["NONE"] == 4
        vocab = Vocabulary(counter)
        none_index = vocab.get_token_index('NONE', 'ner_tokens')
        # should raise no exception
        indices = indexer.tokens_to_indices(tokens, vocab, index_name="ner")
        assert {"ner": [none_index, none_index, none_index, none_index]} == indices
Example #5
class DialogQAPredictor(Predictor):
    def __init__(self, model: Model, dataset_reader: DatasetReader, language: str = 'en_core_web_sm') -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language=language)

    def predict(self, jsonline: str) -> JsonDict:
        """
        Make a dialog-style question answering prediction on the supplied input.
        The supplied input json must contain a list of
        question-answer pairs, each containing question, answer, yesno, followup and id,
        as well as the context (passage).

        Parameters
        ----------
        jsonline: ``str``
            A json line that has the same format as the quac data file.

        Returns
        -------
        A dictionary that represents the prediction made by the system.  The answer string will be under the
        "best_span_str" key.
        """
        return self.predict_json(json.loads(jsonline))

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects json that looks like the original quac data file.
        """
        paragraph_json = json_dict["paragraphs"][0]
        paragraph = paragraph_json['context']
        tokenized_paragraph = self._tokenizer.split_words(paragraph)
        qas = paragraph_json['qas']
        metadata = {}
        metadata["instance_id"] = [qa['id'] for qa in qas]
        question_text_list = [qa["question"].strip().replace("\n", "") for qa in qas]
        answer_texts_list = [[answer['text'] for answer in qa['answers']] for qa in qas]
        metadata["answer_texts_list"] = answer_texts_list
        metadata["question"] = question_text_list
        span_starts_list = [[answer['answer_start'] for answer in qa['answers']] for qa in qas]
        span_ends_list = []
        for st_list, an_list in zip(span_starts_list, answer_texts_list):
            span_ends = [start + len(answer) for start, answer in zip(st_list, an_list)]
            span_ends_list.append(span_ends)
        yesno_list = [str(qa['yesno']) for qa in qas]
        followup_list = [str(qa['followup']) for qa in qas]
        instance = self._dataset_reader.text_to_instance(question_text_list,
                                                         paragraph,
                                                         span_starts_list,
                                                         span_ends_list,
                                                         tokenized_paragraph,
                                                         yesno_list,
                                                         followup_list,
                                                         metadata)
        return instance
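A hedged usage sketch for the predictor above (the archive path is hypothetical and "dialog_qa" as the registered predictor name is an assumption, not taken from the snippet); it builds one minimal QuAC-style json line and passes it to predict():

import json
from allennlp.models.archival import load_archive
from allennlp.predictors import Predictor

archive = load_archive("/path/to/dialog_qa_model.tar.gz")   # hypothetical path
predictor = Predictor.from_archive(archive, "dialog_qa")    # assumed registered name
jsonline = json.dumps({
    "paragraphs": [{
        "context": "Saint Petersburg was founded by Peter the Great in 1703. CANNOTANSWER",
        "qas": [{
            "id": "q0",
            "question": "Who founded the city?",
            "answers": [{"text": "Peter the Great", "answer_start": 32}],
            "yesno": "x",
            "followup": "n",
        }],
    }],
})
print(predictor.predict(jsonline)["best_span_str"])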
Example #6
class TestPosTagIndexer(AllenNlpTestCase):
    def setUp(self):
        super(TestPosTagIndexer, self).setUp()
        self.tokenizer = SpacyWordSplitter(pos_tags=True)

    def test_count_vocab_items_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        indexer = PosTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["pos_tags"] == {'DT': 2, 'VBZ': 1, '.': 1, 'NN': 1, 'NONE': 2}

        indexer._coarse_tags = True  # pylint: disable=protected-access
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["pos_tags"] == {'VERB': 1, 'PUNCT': 1, 'DET': 2, 'NOUN': 1, 'NONE': 2}

    def test_token_to_indices_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [t for t in tokens] + [Token("</S>")]
        vocab = Vocabulary()
        verb_index = vocab.add_token_to_namespace('VERB', namespace='pos_tags')
        cop_index = vocab.add_token_to_namespace('VBZ', namespace='pos_tags')
        none_index = vocab.add_token_to_namespace('NONE', namespace='pos_tags')
        indexer = PosTagIndexer(coarse_tags=True)
        assert indexer.token_to_indices(tokens[1], vocab) == verb_index
        assert indexer.token_to_indices(tokens[-1], vocab) == none_index
        indexer._coarse_tags = False  # pylint: disable=protected-access
        assert indexer.token_to_indices(tokens[1], vocab) == cop_index

    def test_padding_functions(self):
        indexer = PosTagIndexer()
        assert indexer.get_padding_token() == 0
        assert indexer.get_padding_lengths(0) == {}

    def test_as_array_produces_token_sequence(self):
        indexer = PosTagIndexer()
        padded_tokens = indexer.pad_token_sequence([1, 2, 3, 4, 5], 10, {})
        assert padded_tokens == [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]
Example #7
class TestSpacyWordSplitter(AllenNlpTestCase):
    def setUp(self):
        super(TestSpacyWordSplitter, self).setUp()
        self.word_splitter = SpacyWordSplitter()

    def test_tokenize_handles_complex_punctuation(self):
        sentence = "this (sentence) has 'crazy' \"punctuation\"."
        expected_tokens = ["this", "(", "sentence", ")", "has", "'", "crazy", "'", '"',
                           "punctuation", '"', "."]
        tokens = self.word_splitter.split_words(sentence)
        token_text = [t.text for t in tokens]
        assert token_text == expected_tokens
        for token in tokens:
            start = token.idx
            end = start + len(token.text)
            assert sentence[start:end] == token.text

    def test_tokenize_handles_contraction(self):
        # note that "would've" is kept together, while "ain't" is not.
        sentence = "it ain't joe's problem; would been yesterday"
        expected_tokens = ["it", "ai", "n't", "joe", "'s", "problem", ";", "would", "been",
                           "yesterday"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_multiple_contraction(self):
        sentence = "wouldn't've"
        expected_tokens = ["would", "n't", "'ve"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_final_apostrophe(self):
        sentence = "the jones' house"
        expected_tokens = ["the", "jones", "'", "house"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_removes_whitespace_tokens(self):
        sentence = "the\n jones'   house  \x0b  55"
        expected_tokens = ["the", "jones", "'", "house", "55"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_special_cases(self):
        # note that the etc. doesn't quite work --- we can special case this if we want.
        sentence = "Mr. and Mrs. Jones, etc., went to, e.g., the store"
        expected_tokens = ["Mr.", "and", "Mrs.", "Jones", ",", "etc", ".", ",", "went", "to", ",",
                           "e.g.", ",", "the", "store"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens
Example #8
class SentenceTaggerPredictor(Predictor):
    """
    Predictor for any model that takes in a sentence and returns
    a single set of tags for it.  In particular, it can be used with
    the :class:`~allennlp.models.crf_tagger.CrfTagger` model
    and also
    the :class:`~allennlp.models.simple_tagger.SimpleTagger` model.
    """
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language='en_core_web_sm', pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        return self.predict_json({"sentence" : sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        Runs the underlying model, and adds the ``"words"`` to the output.
        """
        sentence = json_dict["sentence"]
        tokens = self._tokenizer.split_words(sentence)
        return self._dataset_reader.text_to_instance(tokens)
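A hedged usage sketch for the predictor above (the archive path is hypothetical; "sentence-tagger" as the registered predictor name is an assumption taken from stock AllenNLP, not from this snippet):

from allennlp.models.archival import load_archive
from allennlp.predictors import Predictor

archive = load_archive("/path/to/tagger_model.tar.gz")        # hypothetical path
predictor = Predictor.from_archive(archive, "sentence-tagger")
result = predictor.predict("AllenNLP is built on PyTorch.")
print(list(zip(result["words"], result["tags"])))             # one tag per word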
Example #9
 def setUp(self):
     super(TestSpacyWordSplitter, self).setUp()
     self.word_splitter = SpacyWordSplitter()
Example #10
 def normal_tokenizer(x: str):
     return [
         w.text for w in SpacyWordSplitter(language='en_core_web_sm',
                                           pos_tags=False).split_words(x)
         [:max_seq_len]
     ]
    def text_to_instance(
            self,
            para_id: str,
            sentence_texts: List[str],
            participants: List[str],
            states: List[
                List[str]] = None,  # states[i][j] is ith participant at time j
            filename: str = '',
            score: float = None) -> Instance:

        tokenizer = WordTokenizer(word_splitter=SpacyWordSplitter(
            pos_tags=True))

        paragraph = " ".join(sentence_texts)

        # Tokenize the sentences
        sentences = [
            tokenizer.tokenize(sentence_text)
            for sentence_text in sentence_texts
        ]

        # Find the verbs
        verb_indexes = [[
            1 if token.pos_ == "VERB" else 0 for token in sentence
        ] for sentence in sentences]

        if states is not None:
            # Actions is (num_participants, num_events)
            actions = [_infer_actions(states_i) for states_i in states]

            tokenized_states = [[
                tokenizer.tokenize(state_ij) for state_ij in states_i
            ] for states_i in states]

            location_spans = [
                _compute_location_spans(states_i, sentences)
                for states_i in tokenized_states
            ]

        # Create indicators for the participants.
        participant_tokens = [
            tokenizer.tokenize(participant) for participant in participants
        ]
        participant_indicators: List[List[List[int]]] = []

        for participant_i_tokens in participant_tokens:
            targets = [
                list(token_group)
                for is_semicolon, token_group in itertools.groupby(
                    participant_i_tokens, lambda t: t.text == ";")
                if not is_semicolon
            ]

            participant_i_indicators: List[List[int]] = []

            for sentence in sentences:
                sentence_indicator = [0 for _ in sentence]

                for target in targets:
                    start = 0
                    while True:
                        span_start, span_end = _find_span(target,
                                                          sentence,
                                                          start,
                                                          target_is_noun=True)
                        if span_start >= 0:
                            for j in range(span_start, span_end + 1):
                                sentence_indicator[j] = 1
                            start = span_start + 1
                        else:
                            break

                participant_i_indicators.append(sentence_indicator)

            participant_indicators.append(participant_i_indicators)

        fields: Dict[str, Field] = {}
        fields["paragraph"] = TextField(tokenizer.tokenize(paragraph),
                                        self._token_indexers)
        fields["participants"] = ListField([
            TextField(tokenizer.tokenize(participant), self._token_indexers)
            for participant in participants
        ])

        # One per sentence
        fields["sentences"] = ListField([
            TextField(sentence, self._token_indexers) for sentence in sentences
        ])

        # One per sentence
        fields["verbs"] = ListField([
            SequenceLabelField(verb_indexes[i],
                               fields["sentences"].field_list[i])
            for i in range(len(sentences))
        ])
        # And also at the paragraph level
        fields["paragraph_verbs"] = SequenceLabelField([
            verb_indicator for verb_indexes_i in verb_indexes
            for verb_indicator in verb_indexes_i
        ], fields["paragraph"])

        if states is not None:
            # Outer ListField is one per participant
            fields["actions"] = ListField([
                # Inner ListField is one per sentence
                ListField([
                    # action is an Enum, so call .value to get an int
                    LabelField(action.value, skip_indexing=True)
                    for action in participant_actions
                ]) for participant_actions in actions
            ])

            # Outer ListField is one per participant
            fields["before_locations"] = ListField([
                # Inner ListField is one per sentence
                ListField([
                    SpanField(start, end, fields["sentences"].field_list[i])
                    for i, ((start, end),
                            _) in enumerate(participant_location_spans)
                ]) for participant_location_spans in location_spans
            ])
            # Outer ListField is one per participant
            fields["after_locations"] = ListField([
                # Inner ListField is one per sentence
                ListField([
                    SpanField(start, end, fields["sentences"].field_list[i])
                    for i, (_, (start,
                                end)) in enumerate(participant_location_spans)
                ]) for participant_location_spans in location_spans
            ])

        # one per participant
        fields["participant_indicators"] = ListField([
            # one per sentence
            ListField([
                SequenceLabelField(sentence_indicator,
                                   fields["sentences"].field_list[i]) for i,
                sentence_indicator in enumerate(participant_i_indicators)
            ]) for participant_i_indicators in participant_indicators
        ])

        # and also at the paragraph level
        # one per participant
        fields["paragraph_participant_indicators"] = ListField([
            SequenceLabelField([
                indicator for sentence_indicator in participant_i_indicators
                for indicator in sentence_indicator
            ], fields["paragraph"])
            for participant_i_indicators in participant_indicators
        ])

        # Finally, we want to indicate before / inside / after for each sentence.
        paragraph_sentence_indicators: List[SequenceLabelField] = []
        for i in range(len(sentences)):
            before_length = sum(len(sentence) for sentence in sentences[:i])
            sentence_length = len(sentences[i])
            after_length = sum(
                len(sentence) for sentence in sentences[(i + 1):])
            paragraph_sentence_indicators.append(
                SequenceLabelField([0] * before_length +
                                   [1] * sentence_length + [2] * after_length,
                                   fields["paragraph"]))

        fields["paragraph_sentence_indicators"] = ListField(
            paragraph_sentence_indicators)

        # These fields are passed on to the decoder trainer that internally uses it
        # to compute commonsense scores for predicted actions
        fields["para_id"] = MetadataField(para_id)
        fields["participant_strings"] = MetadataField(participants)

        fields["filename"] = MetadataField(filename)

        if score is not None:
            fields["score"] = MetadataField(score)

        return Instance(fields)
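The participant-indicator block above splits a multi-part participant such as "water; ice" into separate token targets; a standalone illustration of that groupby idiom (plain strings stand in for Tokens):

import itertools
tokens = ["water", ";", "ice"]
targets = [list(group)
           for is_semicolon, group in itertools.groupby(tokens, lambda t: t == ";")
           if not is_semicolon]
assert targets == [["water"], ["ice"]]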
class MTClassifierDatasetReader(DatasetReader):
    """
    Reads a file in the MT Classifier assignment format.

    Parameters
    ----------
    source_language : ``str``, optional (default = 'de_core_news_sm')
        The name of the spaCy model used to tokenize the source sentences.
        Models can be found at <https://spacy.io/models/>.
    candidate_language : ``str``, optional (default = 'en_core_web_sm')
        The name of the spaCy model used to tokenize the candidate sentences.
    token_indexers : ``Dict[str, TokenIndexer]``, optional (default=``{"tokens": SingleIdTokenIndexer()}``)
        The token indexers to be applied to the words TextField.
    """
    def __init__(self,
                 source_language: str = 'de_core_news_sm',
                 candidate_language: str = 'en_core_web_sm',
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
        self._source_tokenizer = SpacyWordSplitter(language=source_language)
        self._candidate_tokenizer = \
                SpacyWordSplitter(language=candidate_language)


    @overrides
    def _read(self, file_path: str):
        file_path = cached_path(file_path)

        with open(file_path, 'r') as mt_file:
            logger.info("Reading MT instances dataset at: %s", file_path)

            for line in mt_file:
                if not line:
                    continue
                else:
                    inputs = line.strip().split("\t")
                    source = inputs[0]
                    candidate = inputs[1]
                    label = inputs[2]

                    yield self.text_to_instance(source, candidate, label)

    @overrides
    def text_to_instance(self,  # type: ignore
                         source: str,
                         candidate: str,
                         label: str = None) -> Instance:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        source : ``str``, required
            The translation's source sentence.
        candidate : ``str``, required
            The translation candidate.
        label : ``str``, optional (default = None)
            Whether the candidate is human- or machine-translated, if known.
        """
        fields: Dict[str, Field] = {}

        source_tokens = self._source_tokenizer.split_words(source)
        candidate_tokens = self._candidate_tokenizer.split_words(candidate)

        fields["candidate"] = TextField(candidate_tokens, self._token_indexers)
        fields["source"] = TextField(source_tokens, self._token_indexers)

        tags = " ".join([c[1] for c in nltk.pos_tag(nltk.word_tokenize(candidate))])
        tag_tokens = self._candidate_tokenizer.split_words(tags)
        fields["candidate_pos"] = TextField(tag_tokens, self._token_indexers)

        source_lengths = [len(token) for token in source_tokens]
        candidate_lengths = [len(token) for token in candidate_tokens]

        source_punctuation = [1 if len(token) == 1 else 0 for token in source_tokens]
        candidate_punctuation = [1 if len(token) == 1 else 0 for token in candidate_tokens]
        
        german = ["ä", "ö", "ü", "ß"]
        test = [1 if any(c in token.text for c in german) else 0 for token in candidate_tokens]

        german2 = ["lich", "enz", "ionen", "jek", "stech", "nik"]
        test2 = [1 if any(c in token.text for c in german2) else 0 for token in candidate_tokens]

        features = [
            len(source_tokens),
            len(candidate_tokens),
            sum(source_lengths),
            sum(candidate_lengths),
            sum(source_punctuation),
            sum(candidate_punctuation),
            sum(test),
            sum(test2)
        ]
        fields["features"] = ArrayField(np.array(features))

        # print("source_punctuation")
        # print(source_punctuation)
        if label:
            fields["label"] = LabelField(label)
        
        fields["metadata"] = MetadataField({"source": source, 
                                            "candidate": candidate})

        return Instance(fields)
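A hedged usage sketch for the reader above (the source/candidate strings and the "H" label value are made up for illustration); the reader expects tab-separated "source<TAB>candidate<TAB>label" lines, which _read() feeds into text_to_instance():

reader = MTClassifierDatasetReader()
instance = reader.text_to_instance(
    source="Das ist ein Test.",
    candidate="This is a test.",
    label="H",                               # hypothetical label value
)
print(instance.fields["features"].array)     # the eight hand-crafted features built above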
 def setUp(self):
     super().setUp()
     self.word_splitter = SpacyWordSplitter()
Example #14
 def setUp(self):
     super(TestNerTagIndexer, self).setUp()
     self.tokenizer = SpacyWordSplitter(ner=True)
Example #15
class BiaffineDependencyParserPredictor(Predictor):
    """
    Predictor for the :class:`~allennlp.models.BiaffineDependencyParser` model.
    """
    def __init__(self,
                 model: Model,
                 dataset_reader: DatasetReader,
                 language: str = "en_core_web_sm") -> None:
        super().__init__(model, dataset_reader)
        # TODO(Mark) Make the language configurable and based on a model attribute.
        self._tokenizer = SpacyWordSplitter(language=language, pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        """
        Predict a dependency parse for the given sentence.
        Parameters
        ----------
        sentence The sentence to parse.

        Returns
        -------
        A dictionary representation of the dependency tree.
        """
        return self.predict_json({"sentence": sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        """
        spacy_tokens = self._tokenizer.split_words(json_dict["sentence"])
        sentence_text = [token.text for token in spacy_tokens]
        if self._dataset_reader.use_language_specific_pos:  # type: ignore
            # fine-grained part of speech
            pos_tags = [token.tag_ for token in spacy_tokens]
        else:
            # coarse-grained part of speech (Universal Dependencies format)
            pos_tags = [token.pos_ for token in spacy_tokens]
        return self._dataset_reader.text_to_instance(sentence_text, pos_tags)

    @overrides
    def predict_instance(self, instance: Instance) -> JsonDict:
        outputs = self._model.forward_on_instance(instance)

        words = outputs["words"]
        pos = outputs["pos"]
        heads = outputs["predicted_heads"]
        tags = outputs["predicted_dependencies"]
        outputs["hierplane_tree"] = self._build_hierplane_tree(
            words, heads, tags, pos)
        return sanitize(outputs)

    @overrides
    def predict_batch_instance(self,
                               instances: List[Instance]) -> List[JsonDict]:
        outputs = self._model.forward_on_instances(instances)
        for output in outputs:
            words = output["words"]
            pos = output["pos"]
            heads = output["predicted_heads"]
            tags = output["predicted_dependencies"]
            output["hierplane_tree"] = self._build_hierplane_tree(
                words, heads, tags, pos)
        return sanitize(outputs)

    @staticmethod
    def _build_hierplane_tree(words: List[str], heads: List[int],
                              tags: List[str],
                              pos: List[str]) -> Dict[str, Any]:
        """
        Returns
        -------
        A JSON dictionary render-able by Hierplane for the given tree.
        """

        word_index_to_cumulative_indices: Dict[int, Tuple[int, int]] = {}
        cumulative_index = 0
        for i, word in enumerate(words):
            word_length = len(word) + 1
            word_index_to_cumulative_indices[i] = (cumulative_index,
                                                   cumulative_index +
                                                   word_length)
            cumulative_index += word_length

        def node_constructor(index: int):
            children = []
            for next_index, child in enumerate(heads):
                if child == index + 1:
                    children.append(node_constructor(next_index))

            # These are the icons which show up in the bottom right
            # corner of the node.
            attributes = [pos[index]]
            start, end = word_index_to_cumulative_indices[index]

            hierplane_node = {
                "word": words[index],
                # The type of the node - all nodes with the same
                # type have a unified colour.
                "nodeType": tags[index],
                # Attributes of the node.
                "attributes": attributes,
                # The link between the node and its parent.
                "link": tags[index],
                "spans": [{
                    "start": start,
                    "end": end
                }],
            }
            if children:
                hierplane_node["children"] = children
            return hierplane_node

        # We are guaranteed that there is a single word pointing to
        # the root index, so we can find it just by searching for 0 in the list.
        root_index = heads.index(0)
        hierplane_tree = {
            "text": " ".join(words),
            "root": node_constuctor(root_index),
            "nodeTypeToStyle": NODE_TYPE_TO_STYLE,
            "linkToPosition": LINK_TO_POSITION,
        }
        return hierplane_tree
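A hedged usage sketch (archive path hypothetical; "biaffine-dependency-parser" as the registered name is an assumption): parse one sentence and inspect the keys produced by predict_instance above:

from allennlp.models.archival import load_archive
from allennlp.predictors import Predictor

archive = load_archive("/path/to/biaffine_parser.tar.gz")   # hypothetical path
predictor = Predictor.from_archive(archive, "biaffine-dependency-parser")
output = predictor.predict("The cat sat on the mat.")
print(output["predicted_heads"])             # head index per word (0 marks the root)
print(output["predicted_dependencies"])      # dependency label per word
print(output["hierplane_tree"]["root"]["word"])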
Example #16
class QaSrlParserPredictor(Predictor):
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language='en_core_web_sm',
                                            pos_tags=True)
        self._model_vocab = model.vocab

        self._verb_map = read_verb_file(
            "data/wiktionary/en_verb_inflections.txt")

        self._pretrained_vectors = read_pretrained_file(
            "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz"
        )

    def _sentence_to_qasrl_instances(
            self, json_dict: JsonDict) -> Tuple[List[Instance], JsonDict, List[str], List[int]]:
        sentence = json_dict["sentence"]
        tokens = self._tokenizer.split_words(sentence)
        words = [token.text for token in tokens]
        text = " ".join(words)

        result_dict: JsonDict = {"words": words, "verbs": []}

        instances: List[Instance] = []

        verb_indexes = []
        for i, word in enumerate(tokens):
            if word.pos_ == "VERB" and not word.text.lower() in AUX_VERBS:
                verb = word.text
                result_dict["verbs"].append(verb)

                instance = self._dataset_reader._make_instance_from_text(
                    text, i)
                instances.append(instance)
                verb_indexes.append(i)

        return instances, result_dict, words, verb_indexes

    @overrides
    def predict_json(self, inputs: JsonDict, cuda_device: int = 0) -> JsonDict:

        instances, results, words, verb_indexes = self._sentence_to_qasrl_instances(
            inputs)

        # Expand vocab
        cleansed_words = cleanse_sentence_text(words)
        added_words = []
        added_vectors = []
        for w in cleansed_words:
            w = w.lower()
            if self._model_vocab.get_token_index(
                    w) == 1 and w in self._pretrained_vectors:
                added_words.append(w)
                added_vectors.append(self._pretrained_vectors[w])
        if added_words:
            first_ind = self._model_vocab.get_vocab_size("tokens")
            for w in added_words:
                self._model_vocab.add_token_to_namespace(w, "tokens")

            num_added_words = len(added_words)
            added_weights = torch.cat(added_vectors, dim=0)

            span_weights = self._model.span_detector.text_field_embedder.token_embedder_tokens.weight.data
            num_words, embsize = span_weights.size()
            new_weights = span_weights.new().resize_(
                num_words + num_added_words, embsize)
            new_weights[:num_words].copy_(span_weights)
            # reshape the concatenated 1-D vectors into (num_added_words, embsize);
            # integer shapes are required here (the original float division breaks torch.reshape)
            new_weights[num_words:].copy_(
                torch.reshape(added_weights, (num_added_words, embsize)))
            self._model.span_detector.text_field_embedder.token_embedder_tokens.weight = Parameter(
                new_weights)

            ques_weights = self._model.question_predictor.text_field_embedder.token_embedder_tokens.weight.data
            num_words, embsize = ques_weights.size()
            new_weights = ques_weights.new().resize_(
                num_words + num_added_words, embsize)
            new_weights[:num_words].copy_(ques_weights)
            new_weights[num_words:].copy_(
                torch.reshape(added_weights, (num_added_words, embsize)))
            self._model.question_predictor.text_field_embedder.token_embedder_tokens.weight = Parameter(
                new_weights)

        verbs_for_instances = results["verbs"]
        results["verbs"] = []

        instances_with_spans = []
        instance_spans = []
        if instances:
            span_outputs = self._model.span_detector.forward_on_instances(
                instances)

            for instance, span_output in zip(instances, span_outputs):
                field_dict = instance.fields
                text_field = field_dict['text']

                spans = [s[0] for s in span_output['spans'] if s[1] >= 0.5]
                if len(spans) > 0:
                    instance_spans.append(spans)

                    labeled_span_field = ListField([
                        SpanField(span.start(), span.end(), text_field)
                        for span in spans
                    ])
                    field_dict['labeled_spans'] = labeled_span_field
                    instances_with_spans.append(Instance(field_dict))

        if instances_with_spans:
            outputs = self._model.question_predictor.forward_on_instances(
                instances_with_spans)

            for output, spans, verb, index in zip(outputs, instance_spans,
                                                  verbs_for_instances,
                                                  verb_indexes):
                questions = {}
                for question, span in zip(output['questions'], spans):
                    question_text = self.make_question_text(question, verb)
                    span_text = " ".join([
                        words[i] for i in range(span.start(),
                                                span.end() + 1)
                    ])
                    span_rep = {
                        "start": span.start(),
                        "end": span.end(),
                        "text": span_text
                    }
                    questions.setdefault(question_text, []).append(span_rep)

                qa_pairs = []
                for question, spans in questions.items():
                    qa_pairs.append({"question": question, "spans": spans})

                results["verbs"].append({
                    "verb": verb,
                    "qa_pairs": qa_pairs,
                    "index": index
                })

        return results

    def make_question_text(self, slots, verb):
        slots = list(slots)
        verb_slot = slots[3]
        split = verb_slot.split(" ")
        verb = verb.lower()
        if verb in self._verb_map:
            split[-1] = self._verb_map[verb][split[-1]]
        else:
            split[-1] = verb
        slots[3] = " ".join(split)
        sent_text = " ".join([slot for slot in slots if slot != "_"]) + "?"
        sent_text = sent_text[0].upper() + sent_text[1:]
        return sent_text
Example #17
    def __init__(
        self,
        knowledge_graph: KnowledgeGraph,
        utterance_tokens: List[Token],
        token_indexers: Dict[str, TokenIndexer],
        tokenizer: Tokenizer = None,
        feature_extractors: List[str] = None,
        entity_tokens: List[List[Token]] = None,
        linking_features: List[List[List[float]]] = None,
        include_in_vocab: bool = True,
        max_table_tokens: int = None,
    ) -> None:

        self.knowledge_graph = knowledge_graph
        self._tokenizer = tokenizer or WordTokenizer(
            word_splitter=SpacyWordSplitter(pos_tags=True))
        if not entity_tokens:
            entity_texts = [
                knowledge_graph.entity_text[entity].lower()
                for entity in knowledge_graph.entities
            ]
            # TODO(mattg): Because we do tagging on each of these entities in addition to just
            # tokenizations, this is quite slow, and about half of our data processing time just
            # goes to this (~15 minutes when there are 7k instances).  The reason we do tagging is
            # so that we can add lemma features.  If we can remove the need for lemma / other
            # hand-written features, like with a CNN, we can cut down our data processing time by a
            # factor of 2.
            self.entity_texts = self._tokenizer.batch_tokenize(entity_texts)
        else:
            self.entity_texts = entity_tokens
        self.utterance_tokens = utterance_tokens
        self._token_indexers: Dict[str, TokenIndexer] = token_indexers
        self._include_in_vocab = include_in_vocab
        self._indexed_entity_texts: Dict[str, TokenList] = None
        self._max_table_tokens = max_table_tokens

        feature_extractors = (feature_extractors
                              if feature_extractors is not None else [
                                  "number_token_match",
                                  "exact_token_match",
                                  "contains_exact_token_match",
                                  "lemma_match",
                                  "contains_lemma_match",
                                  "edit_distance",
                                  "related_column",
                                  "related_column_lemma",
                                  "span_overlap_fraction",
                                  "span_lemma_overlap_fraction",
                              ])
        self._feature_extractors: List[Callable[
            [str, List[Token], Token, int, List[Token]], float]] = []
        for feature_extractor_name in feature_extractors:
            extractor = getattr(self, "_" + feature_extractor_name, None)
            if not extractor:
                raise ConfigurationError(
                    f"Invalid feature extractor name: {feature_extractor_name}"
                )
            self._feature_extractors.append(extractor)

        if not linking_features:
            # For quicker lookups in our feature functions, we'll additionally store some
            # dictionaries that map entity strings to useful information about the entity.
            self._entity_text_map: Dict[str, List[Token]] = {}
            for entity, entity_text in zip(knowledge_graph.entities,
                                           self.entity_texts):
                self._entity_text_map[entity] = entity_text

            self._entity_text_exact_text: Dict[str, Set[str]] = {}
            for entity, entity_text in zip(knowledge_graph.entities,
                                           self.entity_texts):
                self._entity_text_exact_text[entity] = set(
                    e.text for e in entity_text)

            self._entity_text_lemmas: Dict[str, Set[str]] = {}
            for entity, entity_text in zip(knowledge_graph.entities,
                                           self.entity_texts):
                self._entity_text_lemmas[entity] = set(e.lemma_
                                                       for e in entity_text)
            self.linking_features = self._compute_linking_features()
        else:
            self.linking_features = linking_features
Example #18
 def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
     super().__init__(model, dataset_reader)
     self._tokenizer = WordTokenizer(word_splitter=SpacyWordSplitter(
         pos_tags=True))
     self.nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
Example #19
class ConstituencyParserPredictor(Predictor):
    """
    Predictor for the :class:`~allennlp.models.SpanConstituencyParser` model.
    """
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language='en_core_web_sm',
                                            pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        """
        Predict a constituency parse for the given sentence.
        Parameters
        ----------
        sentence The sentence to parse.

        Returns
        -------
        A dictionary representation of the constituency tree.
        """
        return self.predict_json({"sentence": sentence})

    @overrides
    def _json_to_instance(self,
                          json_dict: JsonDict) -> Tuple[Instance, JsonDict]:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        """
        spacy_tokens = self._tokenizer.split_words(json_dict["sentence"])
        sentence_text = [token.text for token in spacy_tokens]
        pos_tags = [token.tag_ for token in spacy_tokens]
        return self._dataset_reader.text_to_instance(sentence_text,
                                                     pos_tags), {}

    @overrides
    def predict_json(self, inputs: JsonDict) -> JsonDict:
        instance, return_dict = self._json_to_instance(inputs)
        outputs = self._model.forward_on_instance(instance)
        return_dict.update(outputs)

        # format the NLTK tree as a string on a single line.
        tree = return_dict.pop("trees")
        return_dict["hierplane_tree"] = self._build_hierplane_tree(
            tree, 0, is_root=True)
        return_dict["trees"] = tree.pformat(margin=1000000)
        return sanitize(return_dict)

    @overrides
    def predict_batch_json(self, inputs: List[JsonDict]) -> List[JsonDict]:
        instances, return_dicts = zip(*self._batch_json_to_instances(inputs))
        outputs = self._model.forward_on_instances(instances)
        for output, return_dict in zip(outputs, return_dicts):
            return_dict.update(output)
            # format the NLTK tree as a string on a single line.
            tree = return_dict.pop("trees")
            return_dict["hierplane_tree"] = self._build_hierplane_tree(
                tree, 0, is_root=True)
            return_dict["trees"] = tree.pformat(margin=1000000)
        return sanitize(return_dicts)

    def _build_hierplane_tree(self, tree: Tree, index: int,
                              is_root: bool) -> JsonDict:
        """
        Recursively builds a JSON dictionary from an NLTK ``Tree`` suitable for
        rendering trees using the `Hierplane library<https://allenai.github.io/hierplane/>`.

        Parameters
        ----------
        tree : ``Tree``, required.
            The tree to convert into Hierplane JSON.
        index : int, required.
            The character index into the tree, used for creating spans.
        is_root : bool
            An indicator which allows us to add the outer Hierplane JSON which
            is required for rendering.

        Returns
        -------
        A JSON dictionary render-able by Hierplane for the given tree.
        """
        children = []
        for child in tree:
            if isinstance(child, Tree):
                # If the child is a tree, it has children,
                # as NLTK leaves are just strings.
                children.append(
                    self._build_hierplane_tree(child, index, is_root=False))
            else:
                # We're at a leaf, so add the length of
                # the word to the character index.
                index += len(child)

        label = tree.label()
        span = " ".join(tree.leaves())
        hierplane_node = {
            "word": span,
            "nodeType": label,
            "attributes": [label],
            "link": label
        }
        if children:
            hierplane_node["children"] = children
        # TODO(Mark): Figure out how to span highlighting to the leaves.
        if is_root:
            hierplane_node = {
                "linkNameToLabel": LINK_TO_LABEL,
                "nodeTypeToStyle": NODE_TYPE_TO_STYLE,
                "text": span,
                "root": hierplane_node
            }
        return hierplane_node
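A standalone sketch of the Hierplane conversion above on a toy NLTK tree; object.__new__ skips __init__ so no trained model is needed (illustration only, and it assumes NODE_TYPE_TO_STYLE and LINK_TO_LABEL are defined in this module, as the method requires):

from nltk.tree import Tree

toy_tree = Tree.fromstring("(S (NP (DT The) (NN cat)) (VP (VBD sat)))")
predictor = object.__new__(ConstituencyParserPredictor)    # bypass __init__ for the demo
node = predictor._build_hierplane_tree(toy_tree, 0, is_root=True)
print(node["text"])                 # "The cat sat"
print(node["root"]["nodeType"])     # "S"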
Example #20
# imports needed by this snippet (added; module paths assume an AllenNLP 0.x install)
import logging

from allennlp.data.instance import Instance
from allennlp.data.fields import TextField
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
from allennlp.modules.token_embedders import PretrainedBertEmbedder
import torch

import bert_indexer

logger = logging.getLogger()
#
# parser = argparse.ArgumentParser('description: experiments on datasets')
# parser.add_argument('input_file')
# parser.add_argument('output_file')
# args = parser.parse_args()

tokenizer = WordTokenizer(
    word_splitter=SpacyWordSplitter(pos_tags=True, ner=True))
token_indexer = bert_indexer.PretrainedBertIndexer(
    '../TransformerCoqa/bert-base-uncased-vocab.txt',
    do_lowercase=False,
    max_pieces=8,
    doc_stride=3)
token_embedder = PretrainedBertEmbedder(
    '../TransformerCoqa/bert-base-uncased.tar.gz')

# with open(args.input_file, 'w') as f:
#     data = json.load(f)['data']
#
# for article in data:
#     story = article['story']

a = "the man went to the store and bought a gallon of milk"
Example #21
 def __init__(self, model, dataset_reader):
     super(SentenceTaggerPredictor, self).__init__(model, dataset_reader)
     self._tokenizer = SpacyWordSplitter(language=u'en_core_web_sm',
                                         pos_tags=True)
Example #22
 def setUp(self):
     super(TestPosTagIndexer, self).setUp()
     self.tokenizer = SpacyWordSplitter(pos_tags=True)
Example #23
 def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
     super().__init__(model, dataset_reader)
     # TODO(Mark) Make the language configurable and based on a model attribute.
     self._tokenizer = SpacyWordSplitter(language='en_core_web_sm', pos_tags=True)
Example #24
class SentenceTaggerPredictor(Predictor):
    """
    Predictor for any model that takes in a sentence and returns
    a single set of tags for it.  In particular, it can be used with
    the :class:`~allennlp.models.crf_tagger.CrfTagger` model
    and also
    the :class:`~allennlp.models.simple_tagger.SimpleTagger` model.
    """
    def __init__(self,
                 model: Model,
                 dataset_reader: DatasetReader,
                 language: str = 'en_core_web_sm') -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language=language, pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        return self.predict_json({"sentence": sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        Runs the underlying model, and adds the ``"words"`` to the output.
        """
        sentence = json_dict["sentence"]
        tokens = self._tokenizer.split_words(sentence)
        return self._dataset_reader.text_to_instance(tokens)

    @overrides
    def predictions_to_labeled_instances(
            self, instance: Instance,
            outputs: Dict[str, numpy.ndarray]) -> List[Instance]:
        """
        This function currently only handles BIOUL tags.

        Imagine an NER model predicts three named entities (each one with potentially
        multiple tokens). For each individual entity, we create a new Instance that has
        the label set to only that entity and the rest of the tokens are labeled as outside.
        We then return a list of those Instances.

        For example:
        Mary  went to Seattle to visit Microsoft Research
        U-Per  O    O   U-Loc  O   O     B-Org     L-Org

        We create three instances.
        Mary  went to Seattle to visit Microsoft Research
        U-Per  O    O    O     O   O       O         O

        Mary  went to Seattle to visit Microsoft Research
        O      O    O   U-LOC  O   O       O         O

        Mary  went to Seattle to visit Microsoft Research
        O      O    O    O     O   O     B-Org     L-Org
        """
        predicted_tags = outputs['tags']
        predicted_spans = []

        i = 0
        while i < len(predicted_tags):
            tag = predicted_tags[i]
            # if it's a U, add it to the list
            if tag[0] == 'U':
                current_tags = [
                    t if idx == i else 'O'
                    for idx, t in enumerate(predicted_tags)
                ]
                predicted_spans.append(current_tags)
            # if it's a B, keep going until you hit an L.
            elif tag[0] == 'B':
                begin_idx = i
                while tag[0] != 'L':
                    i += 1
                    tag = predicted_tags[i]
                end_idx = i
                current_tags = [t if idx >= begin_idx and idx <= end_idx else 'O' \
                    for idx, t in enumerate(predicted_tags)]
                predicted_spans.append(current_tags)
            i += 1

        # Creates a new instance for each contiguous tag
        instances = []
        for labels in predicted_spans:
            new_instance = deepcopy(instance)
            text_field: TextField = instance['tokens']  # type: ignore
            new_instance.add_field('tags',
                                   SequenceLabelField(labels, text_field),
                                   self._model.vocab)
            instances.append(new_instance)

        return instances
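The BIOUL splitting described in the docstring above can be checked in isolation; a standalone sketch running the same loop on the docstring's tag sequence (no model required):

predicted_tags = ['U-Per', 'O', 'O', 'U-Loc', 'O', 'O', 'B-Org', 'L-Org']
predicted_spans = []
i = 0
while i < len(predicted_tags):
    tag = predicted_tags[i]
    if tag[0] == 'U':
        predicted_spans.append([t if idx == i else 'O'
                                for idx, t in enumerate(predicted_tags)])
    elif tag[0] == 'B':
        begin_idx = i
        while tag[0] != 'L':
            i += 1
            tag = predicted_tags[i]
        predicted_spans.append([t if begin_idx <= idx <= i else 'O'
                                for idx, t in enumerate(predicted_tags)])
    i += 1
assert len(predicted_spans) == 3    # one relabelled copy per predicted entity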
Example #25
class SimpleSeq2SeqPredictor(Predictor):
    """
    Predictor for the :class:`~allennlp.models.encoder_decoder.simple_seq2seq` model.
    """
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language="en_core_web_sm")

    @overrides
    def predict_instance(self, instance: Instance) -> JsonDict:
        outputs = self._model.forward_on_instance(instance)
        del outputs["logits"]
        del outputs["class_probabilities"]
        return sanitize(outputs)

    def predict(self, source: str) -> JsonDict:
        pred_json = self.predict_json({"source": source})
        return pred_json

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"source": "..."}``.
        """
        # print(json_dict)
        paragraph_json = json_dict
        all_questions = paragraph_json['questions']
        golden_answers = paragraph_json['answers']
        paragraph_id = paragraph_json['id']

        # READ THE BIDAF++ OUTPUTS
        bidafplus_output_filename = os.path.join(
            os.path.dirname(os.path.realpath(file_path)),
            'bidafplus_output_formatted.json')
        with open(bidafplus_output_filename) as bidafplus_outputs:
            best_span_str_json = json.load(bidafplus_outputs)
            best_span_str = best_span_str_json['data']

        # extractive outputs from BIDAF++
        best_span_str_list = best_span_str[paragraph_id]

        # metadata
        metadata = {}
        metadata['paragraph_id'] = paragraph_id
        metadata['questions'] = [
            ques["input_text"].strip().replace("\n", "")
            for ques in all_questions
        ][:15]

        questions_list = [
            ques["input_text"].strip().replace("\n", "")
            for ques in all_questions
        ][:15]
        golden_rationale_list = [
            answer['span_text'].strip().replace("\n", "")
            for answer in golden_answers
        ][:15]
        answers_list = [
            answer['input_text'].strip().replace("\n", "")
            for answer in golden_answers
        ][:15]
        bidafplus_rationale_list = [
            answer['answer_text'].strip().replace("\n", "")
            for answer in best_span_str_list
        ][:15]
        ques_rat_list = [
            ' '.join([
                bidafplus_rationale_list[i], self.question_tag,
                questions_list[i]
            ]) for i in range(len(questions_list))
        ]
        for i in range(len(questions_list)):
            yield self.text_to_instance(ques_rat_list[i], answers_list[i],
                                        paragraph_id, i)
            # yield self.text_to_instance(rationale_list[i], answers_list[i])

    def text_to_instance(self,
                         source_string: str,
                         target_string: str = None,
                         paragraph_id: str = None,
                         turn_id: int = 0) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        tokenized_source = self._tokenizer.tokenize(source_string)
        tokenized_source.insert(0, Token(START_SYMBOL))
        tokenized_source.append(Token(END_SYMBOL))
        source_field = TextField(tokenized_source, self._token_indexers)
        if target_string is not None:
            tokenized_target = self._tokenizer.tokenize(target_string)
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target, self._token_indexers)
            return Instance({"source_tokens": source_field, "target_tokens": target_field})
        else:
            return Instance({"source_tokens": source_field})
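A minimal sketch of the source-side wrapping done in ``text_to_instance`` above, assuming an AllenNLP 0.x install with the ``en_core_web_sm`` spaCy model available (the sentence is made up):

from allennlp.common.util import START_SYMBOL, END_SYMBOL
from allennlp.data.tokenizers import Token
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter

tokenizer = SpacyWordSplitter(language="en_core_web_sm")
tokenized_source = tokenizer.split_words("translate this sentence")
tokenized_source.insert(0, Token(START_SYMBOL))
tokenized_source.append(Token(END_SYMBOL))
print([t.text for t in tokenized_source])
# e.g. ['@start@', 'translate', 'this', 'sentence', '@end@']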
Example #26
 def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
     super().__init__(model, dataset_reader)
     # TODO(Mark) Make the language configurable and based on a model attribute.
     self._tokenizer = SpacyWordSplitter(language='en_core_web_sm',
                                         pos_tags=True)
Example #27
class SemanticRoleLabelerPredictor(Predictor):
    """
    Wrapper for the :class:`~allennlp.models.bidaf.SemanticRoleLabeler` model.
    """
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language='en_core_web_sm',
                                            pos_tags=True)

    @staticmethod
    def make_srl_string(words: List[str], tags: List[str]) -> str:
        frame = []
        chunk = []

        for (token, tag) in zip(words, tags):
            if tag.startswith("I-"):
                chunk.append(token)
            else:
                if chunk:
                    frame.append("[" + " ".join(chunk) + "]")
                    chunk = []

                if tag.startswith("B-"):
                    chunk.append(tag[2:] + ": " + token)
                elif tag == "O":
                    frame.append(token)

        if chunk:
            frame.append("[" + " ".join(chunk) + "]")

        return " ".join(frame)

    @overrides
    def _json_to_instance(self, json_dict: JsonDict):
        raise NotImplementedError(
            "The SRL model uses a different API for creating instances.")

    def _sentence_to_srl_instances(
            self, json_dict: JsonDict) -> Tuple[List[Instance], JsonDict]:
        """
        The SRL model has a slightly different API from other models, as the model is run
        forward for every verb in the sentence. This means that for a single sentence, we need
        to generate a ``List[Instance]``, where the length of this list corresponds to the number
        of verbs in the sentence. Additionally, all of these verbs share the same return dictionary
        after being passed through the model (as really we care about all the frames of the sentence
        together, rather than separately).

        Parameters
        ----------
        json_dict : ``JsonDict``, required.
            JSON that looks like ``{"sentence": "..."}``.

        Returns
        -------
        instances : ``List[Instance]``
            One instance per verb.
        result_dict : ``JsonDict``
            A dictionary containing the words of the sentence and the verbs extracted
            by the Spacy POS tagger. These will be replaced in ``predict_json`` with the
            SRL frame for the verb.
        """
        sentence = json_dict["sentence"]
        tokens = self._tokenizer.split_words(sentence)
        words = [token.text for token in tokens]
        result_dict: JsonDict = {"words": words, "verbs": []}
        instances: List[Instance] = []
        for i, word in enumerate(tokens):
            if word.pos_ == "VERB":
                verb = word.text
                result_dict["verbs"].append(verb)
                verb_labels = [0 for _ in words]
                verb_labels[i] = 1
                instance = self._dataset_reader.text_to_instance(
                    tokens, verb_labels)
                instances.append(instance)
        return instances, result_dict

    @overrides
    def predict_batch_json(self,
                           inputs: List[JsonDict],
                           cuda_device: int = -1) -> List[JsonDict]:
        """
        Expects JSON that looks like ``[{"sentence": "..."}, {"sentence": "..."}, ...]``
        and returns JSON that looks like

        .. code-block:: js

            [
                {"words": [...],
                 "verbs": [
                    {"verb": "...", "description": "...", "tags": [...]},
                    ...
                    {"verb": "...", "description": "...", "tags": [...]},
                ]},
                {"words": [...],
                 "verbs": [
                    {"verb": "...", "description": "...", "tags": [...]},
                    ...
                    {"verb": "...", "description": "...", "tags": [...]},
                ]}
            ]
        """
        # For SRL, we have more instances than sentences, but the user specified
        # a batch size with respect to the number of sentences passed, so we respect
        # that here by taking the batch size which we use to be the number of sentences
        # we are given.
        batch_size = len(inputs)
        instances_per_sentence, return_dicts = zip(
            *[self._sentence_to_srl_instances(json) for json in inputs])

        flattened_instances = [
            instance for sentence_instances in instances_per_sentence
            for instance in sentence_instances
        ]

        if not flattened_instances:
            return sanitize(return_dicts)

        # Make the instances into batches and check the last batch for
        # padded elements as the number of instances might not be perfectly
        # divisible by the batch size.
        batched_instances = group_by_count(flattened_instances, batch_size,
                                           None)
        batched_instances[-1] = [
            instance for instance in batched_instances[-1]
            if instance is not None
        ]
        # Run the model on the batches.
        outputs = []
        for batch in batched_instances:
            outputs.extend(self._model.forward_on_instances(
                batch, cuda_device))

        output_index = 0
        for results in return_dicts:
            # We just added the verbs to the list in _sentence_to_srl_instances
            # but we actually want to replace them with their frames, so we
            # reset them here.
            verbs_for_sentence: List[str] = results["verbs"]
            results["verbs"] = []
            # The verbs are in order, but nested as we have multiple sentences.
            # The outputs are already flattened from running through the model,
            # so we just index into this flat list for each verb, updating as we go.
            for verb in verbs_for_sentence:
                output = outputs[output_index]
                tags = output['tags']
                description = self.make_srl_string(results["words"], tags)
                results["verbs"].append({
                    "verb": verb,
                    "description": description,
                    "tags": tags,
                })
                output_index += 1

        return sanitize(return_dicts)

    @overrides
    def predict_json(self,
                     inputs: JsonDict,
                     cuda_device: int = -1) -> JsonDict:
        """
        Expects JSON that looks like ``{"sentence": "..."}``
        and returns JSON that looks like

        .. code-block:: js

            {"words": [...],
             "verbs": [
                {"verb": "...", "description": "...", "tags": [...]},
                ...
                {"verb": "...", "description": "...", "tags": [...]},
            ]}
        """
        instances, results = self._sentence_to_srl_instances(inputs)
        # We just added the verbs to the list in _sentence_to_srl_instances
        # but we actually want to replace them with their frames, so we
        # reset them here.
        verbs_for_instances: List[str] = results["verbs"]
        results["verbs"] = []

        if not instances:
            return sanitize(results)

        outputs = self._model.forward_on_instances(instances, cuda_device)

        for output, verb in zip(outputs, verbs_for_instances):
            tags = output['tags']
            description = self.make_srl_string(results["words"], tags)
            results["verbs"].append({
                "verb": verb,
                "description": description,
                "tags": tags,
            })

        return sanitize(results)
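Because ``make_srl_string`` is a pure static method, its behaviour is easy to check in isolation; the words and BIO tags below are hypothetical:

words = ["The", "cat", "ate", "the", "fish", "."]
tags = ["B-ARG0", "I-ARG0", "B-V", "B-ARG1", "I-ARG1", "O"]
print(SemanticRoleLabelerPredictor.make_srl_string(words, tags))
# [ARG0: The cat] [V: ate] [ARG1: the fish] .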
Example #28
def train(model_dir):

    # prepare data
    #reader = CoqaDatasetReader()
    #reader = CoqaDatasetReader(tokenizer=lambda x: WordTokenizer().tokenize(text=x))
    #reader = LanguageModelingReader(tokenizer=WordTokenizer(word_splitter=SpacyWordSplitter(language='en_core_web_sm')))
    reader = SimpleLanguageModelingDatasetReader(tokenizer=WordTokenizer(
        word_splitter=SpacyWordSplitter(language='en_core_web_sm')))
    train_dataset = reader.read(
        cached_path(
            '/mnt/DATA/ML/data/corpora/QA/CoQA/stories_only/coqa-train-v1.0_extract100.json'
        ))
    validation_dataset = reader.read(
        cached_path(
            '/mnt/DATA/ML/data/corpora/QA/CoQA/stories_only/coqa-dev-v1.0.json'
        ))

    vocab = None
    model_fn = os.path.join(model_dir, 'model.th')
    vocab_fn = os.path.join(model_dir, 'vocab')
    if os.path.exists(model_dir):
        if os.path.exists(vocab_fn):
            logging.info('load vocab from: %s...' % vocab_fn)
            vocab = Vocabulary.from_files(vocab_fn)
    else:
        os.makedirs(model_dir)
    if vocab is None:
        #vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
        vocab = Vocabulary.from_instances(train_dataset)
        #TODO: re-add!
        #vocab.extend_from_instances(validation_dataset)
        logging.info('save vocab to: %s...' % vocab_fn)
        vocab.save_to_files(vocab_fn)
    logging.info('data prepared')

    model = create_model(vocab)

    if os.path.exists(model_fn):
        logging.info('load model weights from: %s...' % model_fn)
        with open(model_fn, 'rb') as f:
            model.load_state_dict(torch.load(f))
    logging.info('model prepared')

    # prepare training
    # optimizer = optim.SGD(model.parameters(), lr=0.1)
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    iterator = BasicIterator(batch_size=32)
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=10,
                      num_epochs=10)
    logging.info('training prepared')

    trainer.train()

    logging.info('save model to: %s...' % model_fn)
    with open(model_fn, 'wb') as f:
        torch.save(model.state_dict(), f)
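A possible entry point for the ``train`` function above; the output directory is hypothetical, and ``create_model`` as well as the hard-coded CoQA paths have to come from the rest of the original script:

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    train('/tmp/coqa_lm_model')  # hypothetical model directory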
Example #29
from numpy import dot
from numpy.linalg import norm
import json
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter

tokenizer = SpacyWordSplitter(language='en_core_web_sm', keep_spacy_tokens=True)

def cosine_similarity(a, b):
    return dot(a, b) / (norm(a) * norm(b))


def extract_keys(lines, key: str):
    return [line[key] for line in lines]


def get_from_rankings(rankings, dictionary):
    return [dictionary[index] for _, index in rankings]


def split_data(data, dev_start: float, test_start: float, is_test: bool):
    train, dev, test = split_all_data(data, dev_start, test_start)
    if is_test:
        return train, test
    else:
        return train, dev


def split_all_data(data, dev_start: float, test_start: float):
    return (data[:int(dev_start * len(data))],
            data[int(dev_start * len(data)): int(test_start * len(data))],
            data[int(test_start * len(data)):])
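A small worked example of the helpers above on toy data (the values are illustrative only):

data = list(range(10))
train, dev, test = split_all_data(data, dev_start=0.8, test_start=0.9)
print(train, dev, test)                                # [0..7]  [8]  [9]
print(split_data(data, 0.8, 0.9, is_test=False)[1])    # dev split: [8]
print(round(cosine_similarity([1.0, 0.0], [1.0, 1.0]), 3))  # ~0.707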
 def tokenizer(x: str):
     return [
         w.text for w in SpacyWordSplitter(language='en_core_web_sm',
                                           pos_tags=False).split_words(x)
     ]
class TestSpacyWordSplitter(AllenNlpTestCase):
    def setUp(self):
        super().setUp()
        self.word_splitter = SpacyWordSplitter()

    def test_tokenize_handles_complex_punctuation(self):
        sentence = "this (sentence) has 'crazy' \"punctuation\"."
        expected_tokens = [
            "this",
            "(",
            "sentence",
            ")",
            "has",
            "'",
            "crazy",
            "'",
            '"',
            "punctuation",
            '"',
            ".",
        ]
        tokens = self.word_splitter.split_words(sentence)
        token_text = [t.text for t in tokens]
        assert token_text == expected_tokens
        for token in tokens:
            start = token.idx
            end = start + len(token.text)
            assert sentence[start:end] == token.text

    def test_tokenize_handles_contraction(self):
        # note that "would've" is kept together, while "ain't" is not.
        sentence = "it ain't joe's problem; would been yesterday"
        expected_tokens = [
            "it",
            "ai",
            "n't",
            "joe",
            "'s",
            "problem",
            ";",
            "would",
            "been",
            "yesterday",
        ]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_multiple_contraction(self):
        sentence = "wouldn't've"
        expected_tokens = ["would", "n't", "'ve"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_final_apostrophe(self):
        sentence = "the jones' house"
        expected_tokens = ["the", "jones", "'", "house"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_removes_whitespace_tokens(self):
        sentence = "the\n jones'   house  \x0b  55"
        expected_tokens = ["the", "jones", "'", "house", "55"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_special_cases(self):
        # note that the etc. doesn't quite work --- we can special case this if we want.
        sentence = "Mr. and Mrs. Jones, etc., went to, e.g., the store"
        expected_tokens = [
            "Mr.",
            "and",
            "Mrs.",
            "Jones",
            ",",
            "etc",
            ".",
            ",",
            "went",
            "to",
            ",",
            "e.g.",
            ",",
            "the",
            "store",
        ]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_batch_tokenization(self):
        sentences = [
            "This is     a sentence",
            "This isn't a sentence.",
            "This is the 3rd     sentence.",
            "Here's the 'fourth' sentence.",
        ]
        batch_split = self.word_splitter.batch_split_words(sentences)
        separately_split = [self.word_splitter.split_words(sentence) for sentence in sentences]
        assert len(batch_split) == len(separately_split)
        for batch_sentence, separate_sentence in zip(batch_split, separately_split):
            assert len(batch_sentence) == len(separate_sentence)
            for batch_word, separate_word in zip(batch_sentence, separate_sentence):
                assert batch_word.text == separate_word.text

    def test_keep_spacy_tokens(self):
        word_splitter = SpacyWordSplitter()
        sentence = "This should be an allennlp Token"
        tokens = word_splitter.split_words(sentence)
        assert tokens
        assert all(isinstance(token, Token) for token in tokens)

        word_splitter = SpacyWordSplitter(keep_spacy_tokens=True)
        sentence = "This should be a spacy Token"
        tokens = word_splitter.split_words(sentence)
        assert tokens
        assert all(isinstance(token, spacy.tokens.Token) for token in tokens)
class CoQAPredictor(Predictor):
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language="en_core_web_sm")

    def predict(self, jsonline: str) -> JsonDict:
        return self.predict_json(json.loads(jsonline))

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like a single paragraph entry from the original
        CoQA data file, i.e. a dict with "story", "questions", "answers" and "id".
        """

        paragraph_json = json_dict

        # for paragraph_json in dataset:
        paragraph = paragraph_json["story"]
        tokenized_paragraph = self._tokenizer.split_words(paragraph)
        questions = paragraph_json["questions"]
        golden_answers = paragraph_json["answers"]
        self.handle_unknown_answers(golden_answers, len(paragraph))
        metadata = {}
        paragraph_id = paragraph_json["id"]
        metadata["instance_id"] = [str(paragraph_id) + "_" + str(ques["turn_id"]) for ques in questions]

        if (len(metadata["instance_id"]) > 15):
            metadata["instance_id"] = metadata["instance_id"][:15]

        question_text_list = [ques["input_text"].strip().replace("\n", "") for ques in questions]
        if (len(question_text_list) > 15):
            question_text_list = question_text_list[:15]

        answer_texts_list = [[answer["span_text"]] for answer in golden_answers]
        if (len(answer_texts_list) > 15):
            answer_texts_list = answer_texts_list[:15]

        metadata["question"] = question_text_list
        metadata["answer_texts_list"] = answer_texts_list

        span_start_list = [[answer["span_start"]] for answer in golden_answers]
        span_end_list = [[answer["span_end"]] for answer in golden_answers]
        if (len(span_start_list) > 15):
            span_start_list = span_start_list[:15]
        if (len(span_end_list) > 15):
            span_end_list = span_end_list[:15]

        # for st_list, an_list in zip(span_starts_list, answer_texts_list):
        #     span_ends = [start + len(answer) for start, answer in zip(st_list, an_list)]
        #     span_ends_list.append(span_ends)

        yesno_list = [str("x") for ques in questions][:15]
        followup_list = [str("n") for ques in questions][:15]
        instance = self._dataset_reader.text_to_instance(question_text_list,
                                                         paragraph,
                                                         span_start_list,
                                                         span_end_list,
                                                         tokenized_paragraph,
                                                         yesno_list,
                                                         followup_list,
                                                         metadata)
        return instance

    def text_to_instance(self,  # type: ignore
                         question_text_list: List[str],
                         passage_text: str,
                         start_span_list: List[List[int]] = None,
                         end_span_list: List[List[int]] = None,
                         passage_tokens: List[Token] = None,
                         yesno_list: List[int] = None,
                         followup_list: List[int] = None,
                         additional_metadata: Dict[str, Any] = None) -> Instance:
        # pylint: disable=arguments-differ
        # We need to convert character indices in `passage_text` to token indices in
        # `passage_tokens`, as the latter is what we'll actually use for supervision.
        answer_token_span_list = []
        passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
        for start_list, end_list in zip(start_span_list, end_span_list):
            token_spans: List[Tuple[int, int]] = []
            for char_span_start, char_span_end in zip(start_list, end_list):
                (span_start, span_end), error = my_util.char_span_to_token_span(passage_offsets,
                                                                             (char_span_start, char_span_end))
                if error:
                    logger.debug("Passage: %s", passage_text)
                    logger.debug("Passage tokens: %s", passage_tokens)
                    logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
                    logger.debug("Token span: (%d, %d)", span_start, span_end)
                    logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
                    logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
                token_spans.append((span_start, span_end))
            answer_token_span_list.append(token_spans)
        question_list_tokens = [self._tokenizer.tokenize(q) for q in question_text_list]
        # Map answer texts to "CANNOTANSWER" if more than half of them marked as so.
        additional_metadata["answer_texts_list"] = [util.handle_cannot(ans_list)
                                                    for ans_list in additional_metadata["answer_texts_list"]]
        return util.make_reading_comprehension_instance_quac(question_list_tokens,
                                                             passage_tokens,
                                                             self._token_indexers,
                                                             passage_text,
                                                             answer_token_span_list,
                                                             yesno_list,
                                                             followup_list,
                                                             additional_metadata,
                                                             self._num_context_answers)

    def handle_unknown_answers(self, answers, plen):
        for ans in answers:
            if ans["span_start"] < 0:
                ans["span_start"] = 0
            if ans["span_end"] < 0:
                ans["span_end"] = plen - 1
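A hedged usage sketch for ``CoQAPredictor``: the paragraph below only mirrors the fields read in ``_json_to_instance`` (``story``, ``questions``, ``answers``, ``id``) with made-up values, and the model/dataset-reader wiring is left as a comment because it depends on a trained archive:

example = {
    "id": "example-paragraph",
    "story": "Alice went to the market. She bought apples.",
    "questions": [{"turn_id": 1, "input_text": "Where did Alice go?"}],
    "answers": [{"input_text": "the market", "span_text": "to the market",
                 "span_start": 11, "span_end": 24}],
}
# predictor = CoQAPredictor(model, dataset_reader)  # from a trained archive
# prediction = predictor.predict(json.dumps(example))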
 def __init__(self,
              model: Model,
              dataset_reader: DatasetReader,
              language: str = 'en_core_web_sm') -> None:
     super().__init__(model, dataset_reader)
     self._tokenizer = SpacyWordSplitter(language=language, pos_tags=True)
Example #34
class SemanticRoleLabelerPredictor(Predictor):
    """
    Predictor for the :class:`~allennlp.models.bidaf.SemanticRoleLabeler` model.
    """
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language='en_core_web_sm', pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        """
        Predicts the semantic roles of the supplied sentence and returns a dictionary
        with the results.

        .. code-block:: js

            {"words": [...],
             "verbs": [
                {"verb": "...", "description": "...", "tags": [...]},
                ...
                {"verb": "...", "description": "...", "tags": [...]},
            ]}

        Parameters
        ----------
        sentence : ``str``
            The sentence to parse via semantic role labeling.

        Returns
        -------
        A dictionary representation of the semantic roles in the sentence.
        """
        return self.predict_json({"sentence" : sentence})


    @staticmethod
    def make_srl_string(words: List[str], tags: List[str]) -> str:
        frame = []
        chunk = []

        for (token, tag) in zip(words, tags):
            if tag.startswith("I-"):
                chunk.append(token)
            else:
                if chunk:
                    frame.append("[" + " ".join(chunk) + "]")
                    chunk = []

                if tag.startswith("B-"):
                    chunk.append(tag[2:] + ": " + token)
                elif tag == "O":
                    frame.append(token)

        if chunk:
            frame.append("[" + " ".join(chunk) + "]")

        return " ".join(frame)

    @overrides
    def _json_to_instance(self, json_dict: JsonDict):
        raise NotImplementedError("The SRL model uses a different API for creating instances.")

    def _sentence_to_srl_instances(self, json_dict: JsonDict) -> List[Instance]:
        """
        The SRL model has a slightly different API from other models, as the model is run
        forward for every verb in the sentence. This means that for a single sentence, we need
        to generate a ``List[Instance]``, where the length of this list corresponds to the number
        of verbs in the sentence. Additionally, all of these verbs share the same return dictionary
        after being passed through the model (as really we care about all the frames of the sentence
        together, rather than separately).

        Parameters
        ----------
        json_dict : ``JsonDict``, required.
            JSON that looks like ``{"sentence": "..."}``.

        Returns
        -------
        instances : ``List[Instance]``
            One instance per verb.
        """
        sentence = json_dict["sentence"]
        tokens = self._tokenizer.split_words(sentence)
        words = [token.text for token in tokens]
        instances: List[Instance] = []
        for i, word in enumerate(tokens):
            if word.pos_ == "VERB":
                verb_labels = [0 for _ in words]
                verb_labels[i] = 1
                instance = self._dataset_reader.text_to_instance(tokens, verb_labels)
                instances.append(instance)
        return instances

    @overrides
    def predict_batch_json(self, inputs: List[JsonDict]) -> List[JsonDict]:
        """
        Expects JSON that looks like ``[{"sentence": "..."}, {"sentence": "..."}, ...]``
        and returns JSON that looks like

        .. code-block:: js

            [
                {"words": [...],
                 "verbs": [
                    {"verb": "...", "description": "...", "tags": [...]},
                    ...
                    {"verb": "...", "description": "...", "tags": [...]},
                ]},
                {"words": [...],
                 "verbs": [
                    {"verb": "...", "description": "...", "tags": [...]},
                    ...
                    {"verb": "...", "description": "...", "tags": [...]},
                ]}
            ]
        """
        # For SRL, we have more instances than sentences, but the user specified
        # a batch size with respect to the number of sentences passed, so we respect
        # that here by taking the batch size which we use to be the number of sentences
        # we are given.
        batch_size = len(inputs)
        instances_per_sentence = [self._sentence_to_srl_instances(json) for json in inputs]

        flattened_instances = [instance for sentence_instances in instances_per_sentence
                               for instance in sentence_instances]

        if not flattened_instances:
            return sanitize([{"verbs": [], "words": self._tokenizer.split_words(x["sentence"])}
                             for x in inputs])

        # Make the instances into batches and check the last batch for
        # padded elements as the number of instances might not be perfectly
        # divisible by the batch size.
        batched_instances = group_by_count(flattened_instances, batch_size, None)
        batched_instances[-1] = [instance for instance in batched_instances[-1]
                                 if instance is not None]
        # Run the model on the batches.
        outputs = []
        for batch in batched_instances:
            outputs.extend(self._model.forward_on_instances(batch))

        verbs_per_sentence = [len(sent) for sent in instances_per_sentence]
        return_dicts: List[JsonDict] = [{"verbs": []} for x in inputs]

        output_index = 0
        for sentence_index, verb_count in enumerate(verbs_per_sentence):
            if verb_count == 0:
                # We didn't run any predictions for sentences with no verbs,
                # so we don't have a way to extract the original sentence.
                # Here we just tokenize the input again.
                original_text = self._tokenizer.split_words(inputs[sentence_index]["sentence"])
                return_dicts[sentence_index]["words"] = original_text
                continue

            for _ in range(verb_count):
                output = outputs[output_index]
                words = output["words"]
                tags = output['tags']
                description = self.make_srl_string(words, tags)
                return_dicts[sentence_index]["words"] = words
                return_dicts[sentence_index]["verbs"].append({
                        "verb": output["verb"],
                        "description": description,
                        "tags": tags,
                })
                output_index += 1

        return sanitize(return_dicts)

    @overrides
    def predict_json(self, inputs: JsonDict) -> JsonDict:
        """
        Expects JSON that looks like ``{"sentence": "..."}``
        and returns JSON that looks like

        .. code-block:: js

            {"words": [...],
             "verbs": [
                {"verb": "...", "description": "...", "tags": [...]},
                ...
                {"verb": "...", "description": "...", "tags": [...]},
            ]}
        """
        instances = self._sentence_to_srl_instances(inputs)

        if not instances:
            return sanitize({"verbs": [], "words": self._tokenizer.split_words(inputs["sentence"])})

        outputs = self._model.forward_on_instances(instances)

        results = {"verbs": [], "words": outputs[0]["words"]}
        for output in outputs:
            tags = output['tags']
            description = self.make_srl_string(output["words"], tags)
            results["verbs"].append({
                    "verb": output["verb"],
                    "description": description,
                    "tags": tags,
            })

        return sanitize(results)
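The batching in ``predict_batch_json`` relies on ``group_by_count`` padding the last chunk with ``None``, after which the padding is stripped. The stand-alone sketch below reproduces that behaviour locally for illustration; it is not the AllenNLP utility itself:

def group_by_count_sketch(items, count, default):
    # Pad the list up to a multiple of `count`, then slice it into chunks.
    padded = items + [default] * (-len(items) % count)
    return [padded[i:i + count] for i in range(0, len(padded), count)]

batches = group_by_count_sketch(list(range(7)), 3, None)
# [[0, 1, 2], [3, 4, 5], [6, None, None]]
batches[-1] = [x for x in batches[-1] if x is not None]
print(batches)  # [[0, 1, 2], [3, 4, 5], [6]]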
 def setUp(self):
     super(TestNerTagIndexer, self).setUp()
     self.tokenizer = SpacyWordSplitter(ner=True)
Example #36
class MTBDatasetReader(DatasetReader):
    """
    Reads a JSON file of few-shot relation classification episodes, where each episode
    contains a set of support examples under ``meta_train`` and a single query example
    under ``meta_test``, and creates a dataset suitable for matching-the-blanks style
    relation classification.

    Each example is a dict with a ``tokens`` list and head/tail entity annotations
    (``h`` and ``t``); other fields are ignored.

    The output of ``read`` is a list of ``Instance`` s with the fields:
        sentences: ``ListField[ListField[TextField]]``
        locations: ``ListField[ListField[MetadataField]]``
        clean_tokens: ``ListField[ListField[MetadataField]]``
        test: ``TextField``
        test_location: ``MetadataField``
        test_clean_text: ``MetadataField``
        label: ``IndexField``, optional

    where ``label`` is the index of the correct support relation for the query example.

    Parameters
    ----------
    lazy : ``bool`` (optional, default=False)
        Passed to ``DatasetReader``.  If this is ``True``, training will start sooner, but will
        take longer per batch.  This also allows training with datasets that are too large to fit
        in memory.
    tokenizer : ``Tokenizer``, optional
        Tokenizer to use to split the relation sentences into words or other kinds of tokens.
        Defaults to ``WordTokenizer()``.
    token_indexers : ``Dict[str, TokenIndexer]``, optional
        Indexers used to define input token representations. Defaults to ``{"tokens":
        SingleIdTokenIndexer()}``.
    """
    def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        self.spacy_splitter = SpacyWordSplitter(keep_spacy_tokens=True)
        self.TRAIN_DATA = "meta_train"
        self.TEST_DATA = "meta_test"

    @overrides
    def _read(self, file_path):
        with open(cached_path(file_path), "r") as data_file:
            logger.info("Reading instances from json file at: %s", file_path)
            data = json.load(data_file)
            labels = data[1]
            data = data[0]
            for x, l in zip(data, labels):
                yield self.text_to_instance(x, l)

    @overrides
    def text_to_instance(
            self,
            data: dict,
            relation_type: int = None) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        N_relations = []
        location_list = []
        all_tokens_sentences = []
        for i, K_examples in enumerate(data[self.TRAIN_DATA]):
            toknized_sentences = []
            sentences_location = []
            clean_text_for_debug = []
            for relation in K_examples:
                tokenized_tokens = self._tokenizer.tokenize(" ".join(
                    relation["tokens"]))
                head_location, tail_location = self.addStartEntityTokens(
                    tokenized_tokens, relation['h'], relation['t'])

                assert tokenized_tokens[head_location].text == head_start_token
                assert tokenized_tokens[tail_location].text == tail_start_token

                field_of_tokens = TextField(tokenized_tokens,
                                            self._token_indexers)
                locations_of_entities = MetadataField({
                    "head": head_location,
                    "tail": tail_location
                })
                clean_text_for_debug.append(MetadataField(tokenized_tokens))

                sentences_location.append(locations_of_entities)
                toknized_sentences.append(field_of_tokens)
            assert len(sentences_location) == len(toknized_sentences) == len(
                clean_text_for_debug)

            sentences_location = ListField(sentences_location)
            clean_text_for_debug = ListField(clean_text_for_debug)
            toknized_sentences = ListField(toknized_sentences)

            all_tokens_sentences.append(clean_text_for_debug)
            location_list.append(sentences_location)
            N_relations.append(toknized_sentences)

        assert len(N_relations) == len(location_list) == len(
            all_tokens_sentences)
        N_relations = ListField(N_relations)
        location_list = ListField(location_list)
        all_tokens_sentences = ListField(all_tokens_sentences)
        fields = {
            'sentences': N_relations,
            "locations": location_list,
            "clean_tokens": all_tokens_sentences
        }

        test_dict = data[self.TEST_DATA]
        tokenized_tokens = self._tokenizer.tokenize(" ".join(
            test_dict["tokens"]))
        head_location, tail_location = self.addStartEntityTokens(
            tokenized_tokens, test_dict['h'], test_dict['t'])
        test_clean_text_for_debug = MetadataField(tokenized_tokens)
        locations_of_entities = MetadataField({
            "head": head_location,
            "tail": tail_location
        })
        field_of_tokens = TextField(tokenized_tokens, self._token_indexers)

        fields['test'] = field_of_tokens
        fields['test_location'] = locations_of_entities
        fields['test_clean_text'] = test_clean_text_for_debug

        if relation_type is not None:
            fields['label'] = IndexField(relation_type, N_relations)
        return Instance(fields)

    def addStartEntityTokens(self, tokens_list, head_full_data,
                             tail_full_data):
        if len(head_full_data[0]) > len(
                tail_full_data[0]
        ):  # this is for handling nested tail and head entities,
            # for example: head = "NEC" and tail = "NEC corp".
            # Solution: make sure there are no overlapping entity mentions.
            head_start_location, head_end_location = self.find_locations(
                head_full_data, tokens_list)
            tail_start_location, tail_end_location = self.find_locations(
                tail_full_data, tokens_list)
            if tail_start_location[0] >= head_start_location[
                    0] and tail_start_location[0] <= head_end_location[0]:
                tail_end_location, tail_start_location = self.deny_overlapping(
                    tokens_list, head_end_location, tail_full_data)

        else:
            tail_start_location, tail_end_location = self.find_locations(
                tail_full_data, tokens_list)
            head_start_location, head_end_location = self.find_locations(
                head_full_data, tokens_list)
            if head_start_location[0] >= tail_start_location[
                    0] and head_start_location[0] <= tail_end_location[0]:
                head_end_location, head_start_location = self.deny_overlapping(
                    tokens_list, tail_end_location, head_full_data)

        # TODO: try different approaches for choosing which entity location to use.
        (h_start_location, head_end_location,
         tail_start_location, tail_end_location) = find_closest_distance_between_entities(
             head_start_location, head_end_location, tail_start_location, tail_end_location)

        x = self._tokenizer.tokenize(head_start_token)
        y = self._tokenizer.tokenize(head_end_token)
        z = self._tokenizer.tokenize(tail_start_token)
        w = self._tokenizer.tokenize(tail_end_token)

        offset_tail = 2 * (tail_start_location > h_start_location)
        tokens_list.insert(h_start_location,
                           x[0])  # arbitrarily pick one token as the marker
        tokens_list.insert(head_end_location + 1 + 1,
                           y[0])  # arbitrarily pick one token as the marker
        tokens_list.insert(tail_start_location + offset_tail,
                           z[0])  # arbitrarily pick one token as the marker
        tokens_list.insert(tail_end_location + 2 + offset_tail,
                           w[0])  # arbitrarily pick one token as the marker

        return h_start_location + 2 - offset_tail, tail_start_location + offset_tail

    def deny_overlapping(self, tokens_list, longest_entity_end_location,
                         shortest_entity_full_data):
        start_location, end_location = self.find_locations(
            shortest_entity_full_data,
            tokens_list[longest_entity_end_location[0] + 1:])
        start_location[0] = start_location[0] + longest_entity_end_location[0]
        end_location[0] = end_location[0] + longest_entity_end_location[0]
        return end_location, start_location

    def return_lower_text_from_tokens(self, tokens):
        return list(map(lambda x: x.text.lower(), tokens))

    def compare_two_token_lists(self, x, y):
        return self.return_lower_text_from_tokens(
            x) == self.return_lower_text_from_tokens(y)

    def spacy_work_toknizer(self, text):
        return list(
            map(lambda x: x.text, self.spacy_splitter.split_words(text)))

    def find_locations(self, head_full_data, token_list):
        end_location, start_location = self._find_entity_name(
            token_list, head_full_data)
        if len(end_location) == 0 or len(start_location) == 0:
            end_location, start_location = self._find_entity_name(
                token_list, head_full_data, True)

        assert len(start_location) == len(end_location)
        assert len(start_location) == len(head_full_data[2])

        return start_location, end_location

    def _find_entity_name(self,
                          token_list,
                          head_full_data,
                          use_spacy_toknizer_before=False):
        if use_spacy_toknizer_before:
            spacy_head_tokens = self.spacy_work_toknizer(head_full_data[0])
            head = self._tokenizer.tokenize(" ".join(spacy_head_tokens))
        else:
            head = self._tokenizer.tokenize(" ".join([head_full_data[0]]))
        start_head_entity_name = head[0]
        start_location = []
        end_location = []
        for i, token in enumerate(token_list):
            if self.compare_two_token_lists([token], [start_head_entity_name]):
                if self.compare_two_token_lists(token_list[i:i + len(head)],
                                                head):
                    start_location.append(i)
                    end_location.append(i + len(head) - 1)
                    if len(start_location) == len(head_full_data[2]):
                        break
        return end_location, start_location
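The core idea of ``addStartEntityTokens`` is to wrap the head and tail entity spans with marker tokens while keeping the offsets consistent. The toy sketch below shows that idea on plain strings; the marker names and spans are hypothetical, whereas the real reader works on AllenNLP ``Token`` objects and resolves the spans with ``find_locations``:

def mark_entities(tokens, head_span, tail_span,
                  head_markers=("[E1]", "[/E1]"), tail_markers=("[E2]", "[/E2]")):
    # Insert markers right-to-left so earlier offsets stay valid.
    spans = sorted([(head_span, head_markers), (tail_span, tail_markers)],
                   key=lambda x: x[0][0], reverse=True)
    tokens = list(tokens)
    for (start, end), (open_marker, close_marker) in spans:
        tokens.insert(end + 1, close_marker)
        tokens.insert(start, open_marker)
    return tokens

print(mark_entities(["NEC", "Corp", "acquired", "the", "startup"],
                    head_span=(0, 1), tail_span=(4, 4)))
# ['[E1]', 'NEC', 'Corp', '[/E1]', 'acquired', 'the', '[E2]', 'startup', '[/E2]']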
Example #37
 def __init__(self, language):
     super().__init__()
     self.tokenizer = SpacyWordSplitter(language=language, pos_tags=True)
Example #38
 def setUp(self):
     super().setUp()
     self.tokenizer = SpacyWordSplitter(pos_tags=True)
class BiaffineDependencyParserPredictor(Predictor):
    """
    Predictor for the :class:`~allennlp.models.BiaffineDependencyParser` model.
    """
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        # TODO(Mark) Make the language configurable and based on a model attribute.
        self._tokenizer = SpacyWordSplitter(language='en_core_web_sm', pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        """
        Predict a dependency parse for the given sentence.
        Parameters
        ----------
        sentence : ``str``
            The sentence to parse.

        Returns
        -------
        A dictionary representation of the dependency tree.
        """
        return self.predict_json({"sentence" : sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        """
        spacy_tokens = self._tokenizer.split_words(json_dict["sentence"])
        sentence_text = [token.text for token in spacy_tokens]
        if self._dataset_reader.use_language_specific_pos: # type: ignore
            # fine-grained part of speech
            pos_tags = [token.tag_ for token in spacy_tokens]
        else:
            # coarse-grained part of speech (Universal Dependencies format)
            pos_tags = [token.pos_ for token in spacy_tokens]
        return self._dataset_reader.text_to_instance(sentence_text, pos_tags)

    @overrides
    def predict_instance(self, instance: Instance) -> JsonDict:
        outputs = self._model.forward_on_instance(instance)

        words = outputs["words"]
        pos = outputs["pos"]
        heads = outputs["predicted_heads"]
        tags = outputs["predicted_dependencies"]
        outputs["hierplane_tree"] = self._build_hierplane_tree(words, heads, tags, pos)
        return sanitize(outputs)

    @overrides
    def predict_batch_instance(self, instances: List[Instance]) -> List[JsonDict]:
        outputs = self._model.forward_on_instances(instances)
        for output in outputs:
            words = output["words"]
            pos = output["pos"]
            heads = output["predicted_heads"]
            tags = output["predicted_dependencies"]
            output["hierplane_tree"] = self._build_hierplane_tree(words, heads, tags, pos)
        return sanitize(outputs)

    @staticmethod
    def _build_hierplane_tree(words: List[str],
                              heads: List[int],
                              tags: List[str],
                              pos: List[str]) -> Dict[str, Any]:
        """
        Returns
        -------
        A JSON dictionary render-able by Hierplane for the given tree.
        """

        word_index_to_cumulative_indices: Dict[int, Tuple[int, int]] = {}
        cumulative_index = 0
        for i, word in enumerate(words):
            word_length = len(word) + 1
            word_index_to_cumulative_indices[i] = (cumulative_index, cumulative_index + word_length)
            cumulative_index += word_length

        def node_constuctor(index: int):
            children = []
            for next_index, child in enumerate(heads):
                if child == index + 1:
                    children.append(node_constuctor(next_index))

            # These are the icons which show up in the bottom right
            # corner of the node.
            attributes = [pos[index]]
            start, end = word_index_to_cumulative_indices[index]

            hierplane_node = {
                    "word": words[index],
                    # The type of the node - all nodes with the same
                    # type have a unified colour.
                    "nodeType": tags[index],
                    # Attributes of the node.
                    "attributes": attributes,
                    # The link between the node and its parent.
                    "link": tags[index],
                    "spans": [{"start": start, "end": end}]
            }
            if children:
                hierplane_node["children"] = children
            return hierplane_node
        # We are guaranteed that there is a single word pointing to
        # the root index, so we can find it just by searching for 0 in the list.
        root_index = heads.index(0)
        hierplane_tree = {
                "text": " ".join(words),
                "root": node_constuctor(root_index),
                "nodeTypeToStyle": NODE_TYPE_TO_STYLE,
                "linkToPosition": LINK_TO_POSITION
        }
        return hierplane_tree
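The ``heads`` list consumed by ``node_constuctor`` follows the convention that ``heads[i]`` is the 1-based index of token ``i``'s parent and ``0`` marks the root; the values below are hypothetical parser output for a short sentence:

words = ["The", "dog", "barked", "."]
heads = [2, 3, 0, 3]          # "The" -> "dog", "dog" -> "barked", "barked" is the root, "." -> "barked"
root_index = heads.index(0)   # 2, i.e. "barked"
children_of_root = [i for i, head in enumerate(heads) if head == root_index + 1]
print(words[root_index], [words[i] for i in children_of_root])
# barked ['dog', '.']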
Example #40
class TestPosTagIndexer(AllenNlpTestCase):
    def setUp(self):
        super().setUp()
        self.tokenizer = SpacyWordSplitter(pos_tags=True)

    def test_count_vocab_items_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        indexer = PosTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["pos_tokens"] == {
            'DT': 2,
            'VBZ': 1,
            '.': 1,
            'NN': 1,
            'NONE': 2
        }

        indexer._coarse_tags = True  # pylint: disable=protected-access
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["pos_tokens"] == {
            'VERB': 1,
            'PUNCT': 1,
            'DET': 2,
            'NOUN': 1,
            'NONE': 2
        }

    def test_tokens_to_indices_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [t for t in tokens] + [Token("</S>")]
        vocab = Vocabulary()
        verb_index = vocab.add_token_to_namespace('VERB', namespace='pos_tags')
        cop_index = vocab.add_token_to_namespace('VBZ', namespace='pos_tags')
        none_index = vocab.add_token_to_namespace('NONE', namespace='pos_tags')
        # Have to add other tokens too, since we're calling `tokens_to_indices` on all of them
        vocab.add_token_to_namespace('DET', namespace='pos_tags')
        vocab.add_token_to_namespace('NOUN', namespace='pos_tags')
        vocab.add_token_to_namespace('PUNCT', namespace='pos_tags')

        indexer = PosTagIndexer(namespace='pos_tags', coarse_tags=True)

        indices = indexer.tokens_to_indices(tokens, vocab, "tokens")
        assert len(indices) == 1
        assert "tokens" in indices
        assert indices["tokens"][1] == verb_index
        assert indices["tokens"][-1] == none_index

        indexer._coarse_tags = False  # pylint: disable=protected-access
        assert indexer.tokens_to_indices([tokens[1]], vocab, "coarse") == {
            "coarse": [cop_index]
        }

    def test_padding_functions(self):
        indexer = PosTagIndexer()
        assert indexer.get_padding_lengths(0) == {}

    def test_as_array_produces_token_sequence(self):
        indexer = PosTagIndexer()
        padded_tokens = indexer.as_padded_tensor({'key': [1, 2, 3, 4, 5]},
                                                 {'key': 10}, {})
        assert padded_tokens["key"].tolist() == [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]

    def test_blank_pos_tag(self):
        tokens = [
            Token(token)._replace(pos_="")
            for token in "allennlp is awesome .".split(" ")
        ]
        indexer = PosTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        # spaCy uses an empty string to indicate "no POS tag";
        # we convert it to "NONE".
        assert counter["pos_tokens"]["NONE"] == 4
        vocab = Vocabulary(counter)
        none_index = vocab.get_token_index('NONE', 'pos_tokens')
        # should raise no exception
        indices = indexer.tokens_to_indices(tokens, vocab, index_name="pos")
        assert {
            "pos": [none_index, none_index, none_index, none_index]
        } == indices
Example #41
class ConstituencyParserPredictor(Predictor):
    """
    Predictor for the :class:`~allennlp.models.SpanConstituencyParser` model.
    """
    def __init__(self, model: Model, dataset_reader: DatasetReader, language: str = 'en_core_web_sm') -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language=language, pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        """
        Predict a constituency parse for the given sentence.
        Parameters
        ----------
        sentence : ``str``
            The sentence to parse.

        Returns
        -------
        A dictionary representation of the constituency tree.
        """
        return self.predict_json({"sentence" : sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        """
        spacy_tokens = self._tokenizer.split_words(json_dict["sentence"])
        sentence_text = [token.text for token in spacy_tokens]
        pos_tags = [token.tag_ for token in spacy_tokens]
        return self._dataset_reader.text_to_instance(sentence_text, pos_tags)

    @overrides
    def predict_instance(self, instance: Instance) -> JsonDict:
        outputs = self._model.forward_on_instance(instance)

        # format the NLTK tree as a string on a single line.
        tree = outputs.pop("trees")
        outputs["hierplane_tree"] = self._build_hierplane_tree(tree, 0, is_root=True)
        outputs["trees"] = tree.pformat(margin=1000000)
        return sanitize(outputs)

    @overrides
    def predict_batch_instance(self, instances: List[Instance]) -> List[JsonDict]:
        outputs = self._model.forward_on_instances(instances)
        for output in outputs:
            # format the NLTK tree as a string on a single line.
            tree = output.pop("trees")
            output["hierplane_tree"] = self._build_hierplane_tree(tree, 0, is_root=True)
            output["trees"] = tree.pformat(margin=1000000)
        return sanitize(outputs)


    def _build_hierplane_tree(self, tree: Tree, index: int, is_root: bool) -> JsonDict:
        """
        Recursively builds a JSON dictionary from an NLTK ``Tree`` suitable for
        rendering trees using the `Hierplane library<https://allenai.github.io/hierplane/>`.

        Parameters
        ----------
        tree : ``Tree``, required.
            The tree to convert into Hierplane JSON.
        index : int, required.
            The character index into the tree, used for creating spans.
        is_root : bool
            An indicator which allows us to add the outer Hierplane JSON which
            is required for rendering.

        Returns
        -------
        A JSON dictionary render-able by Hierplane for the given tree.
        """
        children = []
        for child in tree:
            if isinstance(child, Tree):
                # If the child is a tree, it has children,
                # as NLTK leaves are just strings.
                children.append(self._build_hierplane_tree(child, index, is_root=False))
            else:
                # We're at a leaf, so add the length of
                # the word to the character index.
                index += len(child)

        label = tree.label()
        span = " ".join(tree.leaves())
        hierplane_node = {
                "word": span,
                "nodeType": label,
                "attributes": [label],
                "link": label
        }
        if children:
            hierplane_node["children"] = children
        # TODO(Mark): Figure out how to span highlighting to the leaves.
        if is_root:
            hierplane_node = {
                    "linkNameToLabel": LINK_TO_LABEL,
                    "nodeTypeToStyle": NODE_TYPE_TO_STYLE,
                    "text": span,
                    "root": hierplane_node
            }
        return hierplane_node
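A small sketch of the NLTK ``Tree`` handling used by this predictor (the tree string is made up); ``pformat(margin=1000000)`` is what keeps the parse on a single line in ``predict_instance``:

from nltk import Tree

tree = Tree.fromstring("(S (NP (DT The) (NN dog)) (VP (VBD barked)) (. .))")
print(" ".join(tree.leaves()))       # The dog barked .
print(tree.label())                  # S
print(tree.pformat(margin=1000000))  # the whole parse on one line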
 def setUp(self):
     super(TestDepLabelIndexer, self).setUp()
     self.tokenizer = SpacyWordSplitter(parse=True)
 def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
     super().__init__(model, dataset_reader)
     self._tokenizer = SpacyWordSplitter(language="en_core_web_sm")
 def setUp(self):
     super(TestPosTagIndexer, self).setUp()
     self.tokenizer = SpacyWordSplitter(pos_tags=True)
Example #45
 def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
     super().__init__(model, dataset_reader)
     self._tokenizer = SpacyWordSplitter(language='en_core_web_sm', pos_tags=True)
Example #46
 def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
     super().__init__(model, dataset_reader)
     self._tokenizer = WordTokenizer(word_splitter=SpacyWordSplitter(
         pos_tags=True))