class TestDepLabelIndexer(AllenNlpTestCase):
    def setUp(self):
        super(TestDepLabelIndexer, self).setUp()
        self.tokenizer = SpacyWordSplitter(parse=True)

    def test_count_vocab_items_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        indexer = DepLabelIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["dep_labels"] == {"ROOT": 1, "nsubj": 1, "advmod": 3, "NONE": 2}

    def test_token_to_indices_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [t for t in tokens] + [Token("</S>")]
        vocab = Vocabulary()
        root_index = vocab.add_token_to_namespace('ROOT', namespace='dep_labels')
        none_index = vocab.add_token_to_namespace('NONE', namespace='dep_labels')
        indexer = DepLabelIndexer()
        assert indexer.token_to_indices(tokens[1], vocab) == root_index
        assert indexer.token_to_indices(tokens[-1], vocab) == none_index

    def test_padding_functions(self):
        indexer = DepLabelIndexer()
        assert indexer.get_padding_token() == 0
        assert indexer.get_padding_lengths(0) == {}

    def test_as_array_produces_token_sequence(self):
        indexer = DepLabelIndexer()
        padded_tokens = indexer.pad_token_sequence([1, 2, 3, 4, 5], 10, {})
        assert padded_tokens == [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]
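
For context, a minimal sketch (not part of the test class above) of how the counts an indexer gathers can seed a Vocabulary; the placeholder tokens and printed size are illustrative only:

from collections import defaultdict

from allennlp.data import Token, Vocabulary
from allennlp.data.token_indexers import DepLabelIndexer

# Untagged placeholder tokens carry no dependency label, so they count as "NONE".
indexer = DepLabelIndexer()
counter = defaultdict(lambda: defaultdict(int))
for token in [Token("<S>"), Token("</S>")]:
    indexer.count_vocab_items(token, counter)

# The gathered counts populate the "dep_labels" namespace of the vocabulary.
vocab = Vocabulary(counter)
print(vocab.get_vocab_size("dep_labels"))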
Example #2
class TestNerTagIndexer(AllenNlpTestCase):
    def setUp(self):
        super(TestNerTagIndexer, self).setUp()
        self.tokenizer = SpacyWordSplitter(ner=True)

    def test_count_vocab_items_uses_ner_tags(self):
        tokens = self.tokenizer.split_words("Larry Page is CEO of Google.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        indexer = NerTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["ner_tags"] == {'PERSON': 2, 'ORG': 1, 'NONE': 6}

    def test_token_to_indices_uses_ner_tags(self):
        tokens = self.tokenizer.split_words("Larry Page is CEO of Google.")
        tokens = [t for t in tokens] + [Token("</S>")]
        vocab = Vocabulary()
        person_index = vocab.add_token_to_namespace('PERSON', namespace='ner_tags')
        none_index = vocab.add_token_to_namespace('NONE', namespace='ner_tags')
        vocab.add_token_to_namespace('ORG', namespace='ner_tags')
        indexer = NerTagIndexer()
        assert indexer.token_to_indices(tokens[1], vocab) == person_index
        assert indexer.token_to_indices(tokens[-1], vocab) == none_index

    def test_padding_functions(self):
        indexer = NerTagIndexer()
        assert indexer.get_padding_token() == 0
        assert indexer.get_padding_lengths(0) == {}

    def test_as_array_produces_token_sequence(self):
        indexer = NerTagIndexer()
        padded_tokens = indexer.pad_token_sequence([1, 2, 3, 4, 5], 10, {})
        assert padded_tokens == [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]
class TestNerTagIndexer(AllenNlpTestCase):
    def setUp(self):
        super(TestNerTagIndexer, self).setUp()
        self.tokenizer = SpacyWordSplitter(ner=True)

    def test_count_vocab_items_uses_ner_tags(self):
        tokens = self.tokenizer.split_words("Larry Page is CEO of Google.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        indexer = NerTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["ner_tokens"] == {'PERSON': 2, 'ORG': 1, 'NONE': 6}

    def test_tokens_to_indices_uses_ner_tags(self):
        tokens = self.tokenizer.split_words("Larry Page is CEO of Google.")
        tokens = [t for t in tokens] + [Token("</S>")]
        vocab = Vocabulary()
        person_index = vocab.add_token_to_namespace('PERSON',
                                                    namespace='ner_tags')
        none_index = vocab.add_token_to_namespace('NONE', namespace='ner_tags')
        vocab.add_token_to_namespace('ORG', namespace='ner_tags')
        indexer = NerTagIndexer(namespace='ner_tags')
        assert indexer.tokens_to_indices([tokens[1]], vocab, "tokens1") == {
            "tokens1": [person_index]
        }
        assert indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1") == {
            "tokens-1": [none_index]
        }

    def test_padding_functions(self):
        indexer = NerTagIndexer()
        assert indexer.get_padding_token() == 0
        assert indexer.get_padding_lengths(0) == {}

    def test_as_array_produces_token_sequence(self):
        indexer = NerTagIndexer()
        padded_tokens = indexer.pad_token_sequence({'key': [1, 2, 3, 4, 5]},
                                                   {'key': 10}, {})
        assert padded_tokens == {'key': [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]}

    def test_blank_ner_tag(self):
        tokens = [
            Token(token)._replace(ent_type_="")
            for token in "allennlp is awesome .".split(" ")
        ]
        indexer = NerTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        # spacy uses an empty string to indicate "no NER tag"
        # we convert it to "NONE"
        assert counter["ner_tokens"]["NONE"] == 4
        vocab = Vocabulary(counter)
        none_index = vocab.get_token_index('NONE', 'ner_tokens')
        # should raise no exception
        indices = indexer.tokens_to_indices(tokens, vocab, index_name="ner")
        assert {
            "ner": [none_index, none_index, none_index, none_index]
        } == indices
Example #4
class TestDepLabelIndexer(AllenNlpTestCase):
    def setUp(self):
        super().setUp()
        self.tokenizer = SpacyWordSplitter(parse=True)

    def test_count_vocab_items_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        indexer = DepLabelIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)

        assert counter["dep_labels"] == {"ROOT": 1, "nsubj": 1,
                                         "det": 1, "NONE": 2, "attr": 1, "punct": 1}

    def test_tokens_to_indices_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [t for t in tokens] + [Token("</S>")]
        vocab = Vocabulary()
        root_index = vocab.add_token_to_namespace('ROOT', namespace='dep_labels')
        none_index = vocab.add_token_to_namespace('NONE', namespace='dep_labels')
        indexer = DepLabelIndexer()
        assert indexer.tokens_to_indices([tokens[1]], vocab, "tokens1") == {"tokens1": [root_index]}
        assert indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1") == {"tokens-1": [none_index]}

    def test_padding_functions(self):
        indexer = DepLabelIndexer()
        assert indexer.get_padding_lengths(0) == {}

    def test_as_array_produces_token_sequence(self):
        indexer = DepLabelIndexer()
        padded_tokens = indexer.as_padded_tensor({'key': [1, 2, 3, 4, 5]}, {'key': 10}, {})
        assert padded_tokens["key"].tolist() == [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]
class TestDepLabelIndexer(AllenNlpTestCase):
    def setUp(self):
        super(TestDepLabelIndexer, self).setUp()
        self.tokenizer = SpacyWordSplitter(parse=True)

    def test_count_vocab_items_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        indexer = DepLabelIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)

        assert counter["dep_labels"] == {"ROOT": 1, "nsubj": 1,
                                         "det": 1, "NONE": 2, "attr": 1, "punct": 1}

    def test_tokens_to_indices_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [t for t in tokens] + [Token("</S>")]
        vocab = Vocabulary()
        root_index = vocab.add_token_to_namespace('ROOT', namespace='dep_labels')
        none_index = vocab.add_token_to_namespace('NONE', namespace='dep_labels')
        indexer = DepLabelIndexer()
        assert indexer.tokens_to_indices([tokens[1]], vocab, "tokens1") == {"tokens1": [root_index]}
        assert indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1") == {"tokens-1": [none_index]}

    def test_padding_functions(self):
        indexer = DepLabelIndexer()
        assert indexer.get_padding_token() == 0
        assert indexer.get_padding_lengths(0) == {}

    def test_as_array_produces_token_sequence(self):
        indexer = DepLabelIndexer()
        padded_tokens = indexer.pad_token_sequence({'key': [1, 2, 3, 4, 5]}, {'key': 10}, {})
        assert padded_tokens == {'key': [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]}
Example #6
class MILU(NLU):
    """Multi-intent language understanding model."""
    def __init__(
            self,
            archive_file=DEFAULT_ARCHIVE_FILE,
            cuda_device=DEFAULT_CUDA_DEVICE,
            model_file="https://convlab.blob.core.windows.net/convlab-2/milu_multiwoz_all_context.tar.gz",
            context_size=3):
        """ Constructor for NLU class. """

        self.context_size = context_size
        cuda_device = 0 if torch.cuda.is_available() else DEFAULT_CUDA_DEVICE
        check_for_gpu(cuda_device)

        if not os.path.isfile(archive_file):
            if not model_file:
                raise Exception("No model for MILU is specified!")

            archive_file = cached_path(model_file)

        archive = load_archive(archive_file, cuda_device=cuda_device)
        self.tokenizer = SpacyWordSplitter(language="en_core_web_sm")
        _special_case = [{ORTH: u"id", LEMMA: u"id"}]
        self.tokenizer.spacy.tokenizer.add_special_case(u"id", _special_case)

        dataset_reader_params = archive.config["dataset_reader"]
        self.dataset_reader = DatasetReader.from_params(dataset_reader_params)
        self.model = archive.model
        self.model.eval()

    def predict(self, utterance, context=list()):
        """
        Predict the dialog act of a natural language utterance and apply error model.
        Args:
            utterance (str): A natural language utterance.
        Returns:
            output (list): The dialog act of the utterance, as a list of [intent, domain, slot, value] quadruples.
        """
        if len(utterance) == 0:
            return []

        if self.context_size > 0 and len(context) > 0:
            context_tokens = sum([
                self.tokenizer.split_words(utterance + " SENT_END")
                for utterance in context[-self.context_size:]
            ], [])
        else:
            context_tokens = self.tokenizer.split_words("SENT_END")
        tokens = self.tokenizer.split_words(utterance)
        instance = self.dataset_reader.text_to_instance(context_tokens, tokens)
        outputs = self.model.forward_on_instance(instance)

        tuples = []
        for domain_intent, svs in outputs['dialog_act'].items():
            for slot, value in svs:
                domain, intent = domain_intent.split('-')
                tuples.append([intent, domain, slot, value])
        return tuples
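
A rough usage sketch for the MILU class above; the import path is an assumption (ConvLab-2 style packaging) and the default model archive is downloaded on first use:

# Hypothetical import path for the class shown above.
from convlab2.nlu.milu.multiwoz import MILU

nlu = MILU()  # loads (and, if needed, downloads) the default model archive
dialog_acts = nlu.predict(
    "I am looking for a cheap restaurant in the north",
    context=["Hello , how can I help you ?"])
# Each entry is an [intent, domain, slot, value] quadruple.
print(dialog_acts)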
Example #7
class DialogQAPredictor(Predictor):
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language='en_core_web_sm')

    def predict(self, jsonline: str) -> JsonDict:
        out = self.predict_json(json.loads(jsonline))
        print('OUT:')
        print(out)
        return out

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects json that looks like the original quac data file.
        """
        paragraph_json = json_dict[0]['paragraphs'][0]
        paragraph = paragraph_json['context']
        paragraph_pos = paragraph_json['context_pos']
        tokenized_paragraph = self._tokenizer.split_words(paragraph)
        qas = paragraph_json['qas']
        metadata = {}
        metadata["instance_id"] = [qa['id'] for qa in qas]
        question_text_list = [
            qa["question"].strip().replace("\n", "") for qa in qas
        ]
        question_pos_list = [qa["question_pos"] for qa in qas]
        answer_texts_list = [[qa['answer']] for qa in qas]
        answer_pos_list = [
            paragraph_pos[qa['answer_start']:qa['answer_start'] +
                          len(qa['answer'])] for qa in qas
        ]
        metadata["answer_texts_list"] = answer_texts_list
        metadata["question_tokens"] = [
            self._tokenizer.split_words(q) for q in question_text_list
        ]
        metadata["question_pos"] = question_pos_list
        metadata["answer_pos"] = answer_pos_list
        metadata["passage_tokens"] = [tokenized_paragraph]
        metadata["predict"] = True
        span_starts_list = [[qa['answer_start']] for qa in qas]
        span_ends_list = []
        for st_list, an_list in zip(span_starts_list, answer_texts_list):
            span_ends = [
                start + len(answer) for start, answer in zip(st_list, an_list)
            ]
            span_ends_list.append(span_ends)
        yesno_list = [str(qa['yesno']) for qa in qas]
        followup_list = [str(qa['followup']) for qa in qas]
        instance = self._dataset_reader.text_to_instance(
            question_text_list, question_pos_list, paragraph, paragraph_pos,
            span_starts_list, span_ends_list, tokenized_paragraph, yesno_list,
            followup_list, metadata)
        return instance
class TestSpacyWordSplitter(AllenNlpTestCase):
    def test_keep_spacy_tokens(self):
        word_splitter = SpacyWordSplitter()
        sentence = "This should be an allennlp Token"
        tokens = word_splitter.split_words(sentence)
        assert tokens
        assert all(isinstance(token, Token) for token in tokens)

        word_splitter = SpacyWordSplitter(keep_spacy_tokens=True)
        sentence = "This should be a spacy Token"
        tokens = word_splitter.split_words(sentence)
        assert tokens
        assert all(isinstance(token, spacy.tokens.Token) for token in tokens)
Example #9
class MILU(NLU):
    """Multi-intent language understanding model."""
    def __init__(self,
                 archive_file=DEFAULT_ARCHIVE_FILE,
                 cuda_device=DEFAULT_CUDA_DEVICE,
                 model_file=None,
                 context_size=3):
        """ Constructor for NLU class. """

        self.context_size = context_size

        check_for_gpu(cuda_device)

        if not os.path.isfile(archive_file):
            if not model_file:
                raise Exception("No model for MILU is specified!")

            archive_file = cached_path(model_file)

        archive = load_archive(archive_file, cuda_device=cuda_device)
        self.tokenizer = SpacyWordSplitter(language="en_core_web_sm")
        _special_case = [{ORTH: u"id", LEMMA: u"id"}]
        self.tokenizer.spacy.tokenizer.add_special_case(u"id", _special_case)

        dataset_reader_params = archive.config["dataset_reader"]
        self.dataset_reader = DatasetReader.from_params(dataset_reader_params)
        self.model = archive.model
        self.model.eval()

    def parse(self, utterance, context=[]):
        """
        Predict the dialog act of a natural language utterance and apply error model.
        Args:
            utterance (str): A natural language utterance.
        Returns:
            output (dict): The dialog act of utterance.
        """
        if len(utterance) == 0:
            return {}

        if self.context_size > 0 and len(context) > 0:
            context_tokens = sum([
                self.tokenizer.split_words(utterance + " SENT_END")
                for utterance in context[-self.context_size:]
            ], [])
        else:
            context_tokens = self.tokenizer.split_words("SENT_END")
        tokens = self.tokenizer.split_words(utterance)
        instance = self.dataset_reader.text_to_instance(context_tokens, tokens)
        outputs = self.model.forward_on_instance(instance)

        return outputs["dialog_act"]
Example #10
class TestPosTagIndexer(AllenNlpTestCase):
    def setUp(self):
        super(TestPosTagIndexer, self).setUp()
        self.tokenizer = SpacyWordSplitter(pos_tags=True)

    def test_count_vocab_items_uses_pos_tags(self):
        tokens = self.tokenizer.split_words(u"This is a sentence.")
        tokens = [Token(u"<S>")] + [t for t in tokens] + [Token(u"</S>")]
        indexer = PosTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter[u"pos_tags"] == {u'DT': 2, u'VBZ': 1, u'.': 1, u'NN': 1, u'NONE': 2}

        indexer._coarse_tags = True  # pylint: disable=protected-access
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter[u"pos_tags"] == {u'VERB': 1, u'PUNCT': 1, u'DET': 2, u'NOUN': 1, u'NONE': 2}

    def test_tokens_to_indices_uses_pos_tags(self):
        tokens = self.tokenizer.split_words(u"This is a sentence.")
        tokens = [t for t in tokens] + [Token(u"</S>")]
        vocab = Vocabulary()
        verb_index = vocab.add_token_to_namespace(u'VERB', namespace=u'pos_tags')
        cop_index = vocab.add_token_to_namespace(u'VBZ', namespace=u'pos_tags')
        none_index = vocab.add_token_to_namespace(u'NONE', namespace=u'pos_tags')
        # Have to add other tokens too, since we're calling `tokens_to_indices` on all of them
        vocab.add_token_to_namespace(u'DET', namespace=u'pos_tags')
        vocab.add_token_to_namespace(u'NOUN', namespace=u'pos_tags')
        vocab.add_token_to_namespace(u'PUNCT', namespace=u'pos_tags')

        indexer = PosTagIndexer(coarse_tags=True)

        indices = indexer.tokens_to_indices(tokens, vocab, u"tokens")
        assert len(indices) == 1
        assert u"tokens" in indices
        assert indices[u"tokens"][1] == verb_index
        assert indices[u"tokens"][-1] == none_index

        indexer._coarse_tags = False  # pylint: disable=protected-access
        assert indexer.tokens_to_indices([tokens[1]], vocab, u"coarse") == {u"coarse": [cop_index]}

    def test_padding_functions(self):
        indexer = PosTagIndexer()
        assert indexer.get_padding_token() == 0
        assert indexer.get_padding_lengths(0) == {}

    def test_as_array_produces_token_sequence(self):
        indexer = PosTagIndexer()
        padded_tokens = indexer.pad_token_sequence({u'key': [1, 2, 3, 4, 5]}, {u'key': 10}, {})
        assert padded_tokens == {u'key': [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]}
Example #11
class TestPosTagIndexer(AllenNlpTestCase):
    def setUp(self):
        super(TestPosTagIndexer, self).setUp()
        self.tokenizer = SpacyWordSplitter(pos_tags=True)

    def test_count_vocab_items_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        indexer = PosTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["pos_tags"] == {'DT': 2, 'VBZ': 1, '.': 1, 'NN': 1, 'NONE': 2}

        indexer._coarse_tags = True  # pylint: disable=protected-access
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["pos_tags"] == {'VERB': 1, 'PUNCT': 1, 'DET': 2, 'NOUN': 1, 'NONE': 2}

    def test_tokens_to_indices_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [t for t in tokens] + [Token("</S>")]
        vocab = Vocabulary()
        verb_index = vocab.add_token_to_namespace('VERB', namespace='pos_tags')
        cop_index = vocab.add_token_to_namespace('VBZ', namespace='pos_tags')
        none_index = vocab.add_token_to_namespace('NONE', namespace='pos_tags')
        # Have to add other tokens too, since we're calling `tokens_to_indices` on all of them
        vocab.add_token_to_namespace('DET', namespace='pos_tags')
        vocab.add_token_to_namespace('NOUN', namespace='pos_tags')
        vocab.add_token_to_namespace('PUNCT', namespace='pos_tags')

        indexer = PosTagIndexer(coarse_tags=True)

        indices = indexer.tokens_to_indices(tokens, vocab, "tokens")
        assert len(indices) == 1
        assert "tokens" in indices
        assert indices["tokens"][1] == verb_index
        assert indices["tokens"][-1] == none_index

        indexer._coarse_tags = False  # pylint: disable=protected-access
        assert indexer.tokens_to_indices([tokens[1]], vocab, "coarse") == {"coarse": [cop_index]}

    def test_padding_functions(self):
        indexer = PosTagIndexer()
        assert indexer.get_padding_token() == 0
        assert indexer.get_padding_lengths(0) == {}

    def test_as_array_produces_token_sequence(self):
        indexer = PosTagIndexer()
        padded_tokens = indexer.pad_token_sequence({'key': [1, 2, 3, 4, 5]}, {'key': 10}, {})
        assert padded_tokens == {'key': [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]}
Example #12
class TestNerTagIndexer(AllenNlpTestCase):
    def setUp(self):
        super(TestNerTagIndexer, self).setUp()
        self.tokenizer = SpacyWordSplitter(ner=True)

    def test_count_vocab_items_uses_ner_tags(self):
        tokens = self.tokenizer.split_words("Larry Page is CEO of Google.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        indexer = NerTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["ner_tokens"] == {'PERSON': 2, 'ORG': 1, 'NONE': 6}

    def test_tokens_to_indices_uses_ner_tags(self):
        tokens = self.tokenizer.split_words("Larry Page is CEO of Google.")
        tokens = [t for t in tokens] + [Token("</S>")]
        vocab = Vocabulary()
        person_index = vocab.add_token_to_namespace('PERSON', namespace='ner_tags')
        none_index = vocab.add_token_to_namespace('NONE', namespace='ner_tags')
        vocab.add_token_to_namespace('ORG', namespace='ner_tags')
        indexer = NerTagIndexer(namespace='ner_tags')
        assert indexer.tokens_to_indices([tokens[1]], vocab, "tokens1") == {"tokens1": [person_index]}
        assert indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1") == {"tokens-1": [none_index]}

    def test_padding_functions(self):
        indexer = NerTagIndexer()
        assert indexer.get_padding_token() == 0
        assert indexer.get_padding_lengths(0) == {}

    def test_as_array_produces_token_sequence(self):
        indexer = NerTagIndexer()
        padded_tokens = indexer.pad_token_sequence({'key': [1, 2, 3, 4, 5]}, {'key': 10}, {})
        assert padded_tokens == {'key': [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]}

    def test_blank_ner_tag(self):
        tokens = [Token(token) for token in "allennlp is awesome .".split(" ")]
        for token in tokens:
            token.ent_type_ = ""
        indexer = NerTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        # spacy uses an empty string to indicate "no NER tag"
        # we convert it to "NONE"
        assert counter["ner_tokens"]["NONE"] == 4
        vocab = Vocabulary(counter)
        none_index = vocab.get_token_index('NONE', 'ner_tokens')
        # should raise no exception
        indices = indexer.tokens_to_indices(tokens, vocab, index_name="ner")
        assert {"ner": [none_index, none_index, none_index, none_index]} == indices
Example #13
class TestSpacyWordSplitter(AllenNlpTestCase):
    def setUp(self):
        super(TestSpacyWordSplitter, self).setUp()
        self.word_splitter = SpacyWordSplitter()

    def test_tokenize_handles_complex_punctuation(self):
        sentence = "this (sentence) has 'crazy' \"punctuation\"."
        expected_tokens = [
            "this", "(", "sentence", ")", "has", "'", "crazy", "'", '"',
            "punctuation", '"', "."
        ]
        tokens = self.word_splitter.split_words(sentence)
        token_text = [t.text for t in tokens]
        assert token_text == expected_tokens
        for token in tokens:
            start = token.idx
            end = start + len(token.text)
            assert sentence[start:end] == token.text

    def test_tokenize_handles_contraction(self):
        # note that "would've" is kept together, while "ain't" is not.
        sentence = "it ain't joe's problem; would've been yesterday"
        expected_tokens = [
            "it", "ai", "n't", "joe", "'s", "problem", ";", "would've", "been",
            "yesterday"
        ]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_multiple_contraction(self):
        sentence = "wouldn't've"
        expected_tokens = ["would", "n't", "'ve"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_final_apostrophe(self):
        sentence = "the jones' house"
        expected_tokens = ["the", "jones", "'", "house"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_special_cases(self):
        # note that the etc. doesn't quite work --- we can special case this if we want.
        sentence = "Mr. and Mrs. Jones, etc., went to, e.g., the store"
        expected_tokens = [
            "Mr.", "and", "Mrs.", "Jones", ",", "etc", ".", ",", "went", "to",
            ",", "e.g.", ",", "the", "store"
        ]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens
Example #14
class VanillaDependencyParserPredictor(Predictor):
    def __init__(self,
                 model: Model,
                 dataset_reader: DatasetReader,
                 language: str = 'xx_ent_wiki_sm') -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language=language, pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        return self.predict_json({"sentence": sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        spacy_tokens = self._tokenizer.split_words(json_dict["sentence"])
        sentence_text = [token.text for token in spacy_tokens]
        if self._dataset_reader.use_language_specific_pos:  # type: ignore
            # fine-grained part of speech
            pos_tags = [token.tag_ for token in spacy_tokens]
        else:
            # coarse-grained part of speech (Universal Dependencies format)
            pos_tags = [token.pos_ for token in spacy_tokens]
        return self._dataset_reader.text_to_instance(sentence_text, pos_tags)

    @overrides
    def dump_line(self, outputs: JsonDict) -> str:
        words = outputs["words"]
        pos = outputs["pos"]
        heads = outputs["predicted_heads"]
        tags = outputs["predicted_dependencies"]
        return ''.join([('{0}\t{1}\t{1}\t{2}\t{2}\t_\t{3}\t{4}\t_\t_\n'.format(
                i + 1, words[i], pos[i], heads[i], tags[i])) \
                        for i in range(len(words))]) + '\n'
def allen_spacy_tokeniser(text: str) -> List[str]:
    '''
    Tokenises the given text with the allennlp English spacy tokeniser and
    returns the tokens as a list of strings.
    '''
    splitter = SpacyWordSplitter()
    return [token.text for token in splitter.split_words(text)]
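# A quick, illustrative call to the helper above; exact token boundaries depend
# on the installed spaCy English model.
tokens = allen_spacy_tokeniser("AllenNLP wraps spaCy's tokeniser.")
# e.g. ['AllenNLP', 'wraps', 'spaCy', "'s", 'tokeniser', '.']
print(tokens)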
class IntentParamPredictor(Predictor):
    """"Predictor wrapper for the IntentClassifier"""
    def __init__(self,
                 model: Model,
                 dataset_reader: DatasetReader,
                 language: str = 'en_core_web_sm') -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language=language, pos_tags=True)
        # solve the kernel size greater than input size error
        #self._dataset_reader._token_indexers['token_characters']._min_padding_length = 5

    def predict_json(self, inputs: JsonDict) -> JsonDict:
        instance = self._json_to_instance(inputs)
        output_dict = self.predict_instance(instance)
        # label_dict will be like {0: "positive_preference", 1: "no_intent", ...}
        label_dict = self._model.vocab.get_index_to_token_vocabulary('labels')
        # Convert it to list ["positive_preference", "no_intent", ...]
        all_labels = [label_dict[i] for i in range(len(label_dict))]
        output_dict["all_labels"] = all_labels
        output_dict["user_utterance"] = inputs['user_utterance']
        return output_dict

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        user_utterance = json_dict['user_utterance']
        prev_user_utterance = json_dict['prev_user_utterance']
        prev_sys_utterance = json_dict['prev_sys_utterance']
        tokens = self._tokenizer.split_words(user_utterance)
        intent = json_dict['class']
        return self._dataset_reader.text_to_instance(
            user_utterance=user_utterance,
            prev_user_utterance=prev_user_utterance,
            prev_sys_utterance=prev_sys_utterance,
            tokens=tokens,
            intent=intent)
Example #17
class BiaffineDependencyParserPredictor(Predictor):
    """
    Predictor for the :class:`~allennlp.models.BiaffineDependencyParser` model.
    """
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        # TODO(Mark) Make the language configurable and based on a model attribute.
        self._tokenizer = SpacyWordSplitter(language='en_core_web_sm', pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        """
        Predict a dependency parse for the given sentence.
        Parameters
        ----------
        sentence The sentence to parse.

        Returns
        -------
        A dictionary representation of the dependency tree.
        """
        return self.predict_json({"sentence" : sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        """
        spacy_tokens = self._tokenizer.split_words(json_dict["sentence"])
        sentence_text = [token.text for token in spacy_tokens]
        pos_tags = [token.tag_ for token in spacy_tokens]
        return self._dataset_reader.text_to_instance(sentence_text, pos_tags)
class SentenceTaggerPredictor(Predictor):
    """
    Predictor for any model that takes in a sentence and returns
    a single set of tags for it.  In particular, it can be used with
    the :class:`~allennlp.models.crf_tagger.CrfTagger` model
    and also
    the :class:`~allennlp.models.simple_tagger.SimpleTagger` model.
    """
    def __init__(self,
                 model: Model,
                 dataset_reader: DatasetReader,
                 language: str = 'en_core_web_sm') -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language=language, pos_tags=True)

    @overrides
    def predict_json(self, inputs: JsonDict) -> JsonDict:
        instance = self._json_to_instance(inputs)
        #return format_ner_result(self.predict_instance(instance))
        return self.predict_instance(instance)

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        Runs the underlying model, and adds the ``"words"`` to the output.
        """
        sentence = json_dict["sentence"]
        #sentence = " ".join(json_dict["sentence"].strip().replace(" ", ""))
        #print(sentence)
        tokens = self._tokenizer.split_words(sentence)
        #tokens = sentence
        return self._dataset_reader.text_to_instance(tokens)
Example #19
class ParamTaggerPredictor(Predictor):
    """
    Predictor for any model that takes in a user_utterance and returns
    a single set of tags for it.  In particular, it can be used with
    the :class:`~allennlp.models.crf_tagger.CrfTagger` model
    and also
    the :class:`~allennlp.models.simple_tagger.SimpleTagger` model.
    """
    def __init__(self,
                 model: Model,
                 dataset_reader: DatasetReader,
                 language: str = 'en_core_web_sm') -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language=language, pos_tags=True)
        # solve the kernel size greater than input size error
        self._dataset_reader._token_indexers[
            'token_characters']._min_padding_length = 5

    def predict(self, user_utterance: str) -> JsonDict:
        return self.predict_json({"user_utterance": user_utterance})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"user_utterance": "..."}``.
        Runs the underlying model, and adds the ``"words"`` to the output.
        """
        user_utterance = json_dict["user_utterance"]
        tokens = self._tokenizer.split_words(user_utterance)
        return self._dataset_reader.text_to_instance(tokens)
Example #20
def load_data(data_path: str,
              tokenize: bool = False,
              tokenizer_type: str = "just_spaces") -> List[str]:
    if tokenizer_type == "just_spaces":
        tokenizer = SpacyWordSplitter()
    elif tokenizer_type == "spacy":
        nlp = spacy.load('en')
        tokenizer = Tokenizer(nlp.vocab)
    tokenized_examples = []
    with tqdm(open(data_path, "r"), desc=f"loading {data_path}") as f:
        for line in f:
            if data_path.endswith(".jsonl") or data_path.endswith(".json"):
                example = json.loads(line)
            else:
                example = {"text": line}
            if tokenize:
                if tokenizer_type == 'just_spaces':
                    tokens = list(
                        map(str, tokenizer.split_words(example['text'])))
                elif tokenizer_type == 'spacy':
                    tokens = list(map(str, tokenizer(example['text'])))
                text = ' '.join(tokens)
            else:
                text = example['text']
            tokenized_examples.append(text)
    return tokenized_examples
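
A usage sketch for load_data; the file path below is a placeholder and each line is assumed to be either raw text or a JSON object with a "text" field:

# "train.jsonl" is a placeholder path; every JSON line should look like {"text": "..."}.
examples = load_data("train.jsonl", tokenize=True, tokenizer_type="just_spaces")
print(len(examples), examples[0] if examples else None)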
Example #21
class SentenceTaggerPredictor(Predictor):
    """
    Predictor for any model that takes in a sentence and returns
    a single set of tags for it.  In particular, it can be used with
    the :class:`~allennlp.models.crf_tagger.CrfTagger` model
    and also
    the :class:`~allennlp.models.simple_tagger.SimpleTagger` model.
    """
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language='en_core_web_sm',
                                            pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        return self.predict_json({"sentence": sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        Runs the underlying model, and adds the ``"words"`` to the output.
        """
        sentence = json_dict["sentence"]
        tokens = self._tokenizer.split_words(sentence)
        return self._dataset_reader.text_to_instance(tokens)
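
A minimal sketch of loading such a predictor from a trained archive; the archive path is a placeholder, and it is assumed the predictor is registered under a name such as "sentence-tagger":

from allennlp.predictors import Predictor

# "/path/to/model.tar.gz" stands in for a trained tagging model archive.
predictor = Predictor.from_path("/path/to/model.tar.gz",
                                predictor_name="sentence-tagger")
print(predictor.predict_json({"sentence": "AllenNLP makes sequence tagging easy."}))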
Example #22
class UniversalDependenciesRawDatasetReader(DatasetReader):
    """Like UniversalDependenciesDatasetReader, but reads raw sentences and tokenizes them first."""
    def __init__(self,
                 dataset_reader: DatasetReader,
                 tokenizer: WordSplitter = None) -> None:
        super().__init__(lazy=dataset_reader.lazy)
        self.dataset_reader = dataset_reader
        if tokenizer:
            self.tokenizer = tokenizer
        else:
            self.tokenizer = SpacyWordSplitter(language="xx_ent_wiki_sm")

    @overrides
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, 'r') as conllu_file:
            for sentence in conllu_file:
                if sentence:
                    words = [
                        word.text
                        for word in self.tokenizer.split_words(sentence)
                    ]
                    yield self.text_to_instance(words)

    @overrides
    def text_to_instance(self, words: List[str]) -> Instance:
        return self.dataset_reader.text_to_instance(words)
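
A sketch of how the wrapping reader might be used, assuming the standard UniversalDependenciesDatasetReader as the inner reader and a plain text file with one raw sentence per line:

from allennlp.data.dataset_readers import UniversalDependenciesDatasetReader

# "raw_sentences.txt" is a placeholder: one untokenised sentence per line.
reader = UniversalDependenciesRawDatasetReader(
    dataset_reader=UniversalDependenciesDatasetReader())
for instance in reader.read("raw_sentences.txt"):
    print(instance)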
Example #23
class DialogQAPredictor(Predictor):
    def __init__(self,
                 model: Model,
                 dataset_reader: DatasetReader,
                 language: str = 'en_core_web_sm') -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language=language)

    def predict(self, jsonline: str) -> JsonDict:
        """
        Make a dialog-style question answering prediction on the supplied input.
        The supplied input json must contain a list of
        question answer pairs, containing question, answer, yesno, followup, id
        as well as the context (passage).

        Parameters
        ----------
        jsonline: ``str``
            A json line that has the same format as the quac data file.

        Returns
        ----------
        A dictionary that represents the prediction made by the system.  The answer string will be under the
        "best_span_str" key.
        """
        return self.predict_json(json.loads(jsonline))

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects json that looks like the original quac data file.
        """
        paragraph_json = json_dict["paragraphs"][0]
        paragraph = paragraph_json['context']
        tokenized_paragraph = self._tokenizer.split_words(paragraph)
        qas = paragraph_json['qas']
        metadata = {}
        metadata["instance_id"] = [qa['id'] for qa in qas]
        question_text_list = [
            qa["question"].strip().replace("\n", "") for qa in qas
        ]
        answer_texts_list = [[answer['text'] for answer in qa['answers']]
                             for qa in qas]
        metadata["answer_texts_list"] = answer_texts_list
        metadata["question"] = question_text_list
        span_starts_list = [
            [answer['answer_start'] for answer in qa['answers']] for qa in qas
        ]
        span_ends_list = []
        for st_list, an_list in zip(span_starts_list, answer_texts_list):
            span_ends = [
                start + len(answer) for start, answer in zip(st_list, an_list)
            ]
            span_ends_list.append(span_ends)
        yesno_list = [str(qa['yesno']) for qa in qas]
        followup_list = [str(qa['followup']) for qa in qas]
        instance = self._dataset_reader.text_to_instance(
            question_text_list, paragraph, span_starts_list, span_ends_list,
            tokenized_paragraph, yesno_list, followup_list, metadata)
        return instance
def load_data(data_path: str,
              tokenize: bool = False,
              tokenizer_type: str = "just_spaces",
              token_field_names: List[str] = ["text"]) -> Dict[str, List[str]]:
    if tokenizer_type == "just_spaces":
        tokenizer = SpacyWordSplitter()
    elif tokenizer_type == "spacy":
        nlp = spacy.load('en')
        tokenizer = Tokenizer(nlp.vocab)

    named_tokenized_examples = {
        token_field_name: []
        for token_field_name in token_field_names
    }

    with tqdm(open(data_path, "r"), desc=f"loading {data_path}") as f:
        for line in f:
            for token_field_name in token_field_names:
                example = json.loads(line)
                assert token_field_name in example
                if tokenize:
                    if tokenizer_type == 'just_spaces':
                        tokens = list(
                            map(
                                str,
                                tokenizer.split_words(
                                    example[token_field_name])))
                    elif tokenizer_type == 'spacy':
                        tokens = list(
                            map(str, tokenizer(example[token_field_name])))
                    text = ' '.join(tokens)
                else:
                    text = example[token_field_name]
                named_tokenized_examples[token_field_name].append(text)
    return named_tokenized_examples
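# A usage sketch for the multi-field load_data above; the file path and field
# names are illustrative and must match the actual JSON-lines data.
# Each line of "pairs.jsonl" is assumed to look like {"text": "...", "summary": "..."}.
columns = load_data("pairs.jsonl", tokenize=False,
                    token_field_names=["text", "summary"])
print(len(columns["text"]), len(columns["summary"]))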
class DependencyParserPredictor(Predictor):
    """
    Predictor for the :class:`~allennlp.models.BiaffineDependencyParser` model.
    """
    def __init__(self,
                 model: Model,
                 dataset_reader: DatasetReader,
                 language: str = 'en_core_web_sm') -> None:
        super().__init__(model, dataset_reader)
        # TODO(Mark) Make the language configurable and based on a model attribute.
        self._tokenizer = SpacyWordSplitter(language=language, pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        """
        Predict a dependency parse for the given sentence.
        Parameters
        ----------
        sentence The sentence to parse.
        Returns
        -------
        A dictionary representation of the dependency tree.
        """
        return self.predict_json({"sentence": sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        """
        spacy_tokens = self._tokenizer.split_words(json_dict["sentence"])
        sentence_text = [token.text for token in spacy_tokens]
        if self._dataset_reader.use_language_specific_pos:  # type: ignore
            # fine-grained part of speech
            pos_tags = [token.tag_ for token in spacy_tokens]
        else:
            # coarse-grained part of speech (Universal Dependencies format)
            pos_tags = [token.pos_ for token in spacy_tokens]
        return self._dataset_reader.text_to_instance(sentence_text, pos_tags)

    @overrides
    def predict_instance(self, instance: Instance) -> JsonDict:
        outputs = self._model.forward_on_instance(instance)

        words = outputs["words"]
        pos = outputs["pos"]
        heads = outputs["predicted_heads"]
        tags = outputs["predicted_dependencies"]
        return outputs

    @overrides
    def predict_batch_instance(self, instances: List[Instance]) -> JsonDict:
        outputs = self._model.forward_on_instances(instances)
        for output in outputs:
            words = output["words"]
            pos = output["pos"]
            heads = output["predicted_heads"]
            tags = output["predicted_dependencies"]
        return outputs
Example #26
class NEREmbedding(TokenEmbedder):
    def __init__(self, serialization_dir, cuda_device=0) -> None:
        super(NEREmbedding, self).__init__()

        from allennlp.models.archival import load_archive

        self.serialization_dir = serialization_dir
        self.parameter_filename = os.path.join(serialization_dir,
                                               "config.json")
        self.weights_filename = os.path.join(serialization_dir, "weights.th")
        self.cuda_device = cuda_device

        self.config = Params.from_file(self.parameter_filename)
        self.archive = load_archive(self.serialization_dir)
        self.model = self.archive.model
        self.model.eval()
        self.dataset_reader_params = self.config["dataset_reader"]
        self.dataset_reader = DatasetReader.from_params(
            self.dataset_reader_params)
        self.tokenizer = SpacyWordSplitter(language='en_core_web_sm',
                                           ner=True,
                                           wst=True)

    def forward(self, inputs):
        texts = self.inputs_to_texts(inputs)
        instances = self.texts_to_instances(texts)
        dataset = Batch(instances)
        dataset.index_instances(self.model.vocab)
        cp_inputs = util.move_to_device(dataset.as_tensor_dict(),
                                        self.cuda_device)
        tokens = cp_inputs['tokens']
        embedded_text_input = self.model.text_field_embedder(tokens)
        mask = get_text_field_mask(tokens)
        encoded_text = self.model.encoder(embedded_text_input, mask)
        return encoded_text.detach()

    def texts_to_instances(self, texts):
        instances = []
        for text in texts:
            tokens = self.tokenizer.split_words(text)
            instance = self.dataset_reader.text_to_instance(tokens)
            instances.append(instance)
        return instances

    def inputs_to_texts(self, inputs, k='words'):
        texts = [' '.join(x[k]) for x in inputs['metadata']]
        return texts

    @classmethod
    def from_params(cls, vocab: Vocabulary, params: Params):
        serialization_dir = params.pop('serialization_dir')
        cuda_device = params.pop_int('cuda_device')
        return cls(serialization_dir, cuda_device)

    def get_output_dim(self) -> int:
        return self.model.encoder.get_output_dim()
Example #27
class DialogQAPredictor(Predictor):
    def __init__(self, model: Model, dataset_reader: DatasetReader, language: str = 'en_core_web_sm') -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language=language)

    def predict(self, jsonline: str) -> JsonDict:
        """
        Make a dialog-style question answering prediction on the supplied input.
        The supplied input json must contain a list of
        question answer pairs, containing question, answer, yesno, followup, id
        as well as the context (passage).

        Parameters
        ----------
        jsonline: ``str``
            A json line that has the same format as the quac data file.

        Returns
        ----------
        A dictionary that represents the prediction made by the system.  The answer string will be under the
        "best_span_str" key.
        """
        return self.predict_json(json.loads(jsonline))

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects json that looks like the original quac data file.
        """
        paragraph_json = json_dict["paragraphs"][0]
        paragraph = paragraph_json['context']
        tokenized_paragraph = self._tokenizer.split_words(paragraph)
        qas = paragraph_json['qas']
        metadata = {}
        metadata["instance_id"] = [qa['id'] for qa in qas]
        question_text_list = [qa["question"].strip().replace("\n", "") for qa in qas]
        answer_texts_list = [[answer['text'] for answer in qa['answers']] for qa in qas]
        metadata["answer_texts_list"] = answer_texts_list
        metadata["question"] = question_text_list
        span_starts_list = [[answer['answer_start'] for answer in qa['answers']] for qa in qas]
        span_ends_list = []
        for st_list, an_list in zip(span_starts_list, answer_texts_list):
            span_ends = [start + len(answer) for start, answer in zip(st_list, an_list)]
            span_ends_list.append(span_ends)
        yesno_list = [str(qa['yesno']) for qa in qas]
        followup_list = [str(qa['followup']) for qa in qas]
        instance = self._dataset_reader.text_to_instance(question_text_list,
                                                         paragraph,
                                                         span_starts_list,
                                                         span_ends_list,
                                                         tokenized_paragraph,
                                                         yesno_list,
                                                         followup_list,
                                                         metadata)
        return instance
Example #28
class TestPosTagIndexer(AllenNlpTestCase):
    def setUp(self):
        super(TestPosTagIndexer, self).setUp()
        self.tokenizer = SpacyWordSplitter(pos_tags=True)

    def test_count_vocab_items_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        indexer = PosTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["pos_tags"] == {'DT': 2, 'VBZ': 1, '.': 1, 'NN': 1, 'NONE': 2}

        indexer._coarse_tags = True  # pylint: disable=protected-access
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["pos_tags"] == {'VERB': 1, 'PUNCT': 1, 'DET': 2, 'NOUN': 1, 'NONE': 2}

    def test_token_to_indices_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [t for t in tokens] + [Token("</S>")]
        vocab = Vocabulary()
        verb_index = vocab.add_token_to_namespace('VERB', namespace='pos_tags')
        cop_index = vocab.add_token_to_namespace('VBZ', namespace='pos_tags')
        none_index = vocab.add_token_to_namespace('NONE', namespace='pos_tags')
        indexer = PosTagIndexer(coarse_tags=True)
        assert indexer.token_to_indices(tokens[1], vocab) == verb_index
        assert indexer.token_to_indices(tokens[-1], vocab) == none_index
        indexer._coarse_tags = False  # pylint: disable=protected-access
        assert indexer.token_to_indices(tokens[1], vocab) == cop_index

    def test_padding_functions(self):
        indexer = PosTagIndexer()
        assert indexer.get_padding_token() == 0
        assert indexer.get_padding_lengths(0) == {}

    def test_as_array_produces_token_sequence(self):
        indexer = PosTagIndexer()
        padded_tokens = indexer.pad_token_sequence([1, 2, 3, 4, 5], 10, {})
        assert padded_tokens == [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]
Example #29
class SentenceClassifierPredictor(Predictor):
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language='en_core_web_sm', pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        return self.predict_json({"sentence" : sentence})

    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        sentence = json_dict["sentence"]
        tokens = self._tokenizer.split_words(sentence)
        return self._dataset_reader.text_to_instance([str(t) for t in tokens])
Example #30
class JointPredictor(Predictor):
    def __init__(self, model: Model, dataset_reader: DatasetReader):
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language='en_core_web_sm',
                                            pos_tags=True)

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        if 'tokens' in json_dict:
            tokens = json_dict['tokens']
        else:
            tokens = self._tokenizer.split_words(json_dict['sentence'])
        return self._dataset_reader.text_to_instance(tokens=tokens)
Example #31
class SentenceClassifierPredictor(Predictor):
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language='en_core_web_sm',
                                            pos_tags=True)

    @overrides
    def predict_json(self, json_dict: JsonDict) -> JsonDict:
        sentence = json_dict["sentence"]
        tokens = self._tokenizer.split_words(sentence)
        inst = self._dataset_reader.text_to_instance([str(t) for t in tokens])
        logits = self.predict_instance(inst)['logits']
        label_id = np.argmax(logits)
        return self._model.vocab.get_token_from_index(label_id, 'labels')
class DifferenceTaggerPredictor(Predictor):
    """
    Predictor for any model that takes in a sentence and returns
    a single set of tags for it.  In particular, it can be used with
    the :class:`~allennlp.models.crf_tagger.CrfTagger` model
    and also
    the :class:`~allennlp.models.simple_tagger.SimpleTagger` model.
    """
    def __init__(self,
                 model: Model,
                 dataset_reader: DatasetReader,
                 language: str = 'en_core_web_sm') -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language=language, pos_tags=True)
        self.model = model

    @overrides
    def predict_json(self, inputs: JsonDict) -> JsonDict:
        instance = self._json_to_instance(inputs)
        self.model.vocab.extend_from_instances(None, instances=[instance])
        #self.model.extend_embedder_vocab(embedding_sources_mapping)
        output = self.predict_instance(instance)
        print('predict json', output)

        table = BeautifulTable(max_width=120)
        table.set_style(BeautifulTable.STYLE_RST)
        table.append_column('words', output['words'])
        table.append_column('tags', output['tags'])
        table.append_column('logits', [
            " ".join([str(round(f, 1)) for f in l]) for l in output['logits']
        ])

        print('predict json', output)
        return str(table).replace('\n', '<br />')

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        Runs the underlying model, and adds the ``"words"`` to the output.
        """
        print("HALLO json")
        print(json_dict)

        sentence = json_dict["sentence"]
        tokens = self._tokenizer.split_words(sentence)
        return self._dataset_reader.text_to_instance(tokens)
def allenNLP_split_words(context):
    """Uses AllenNLP to conduct word splitting, as an alternative to mystring.split()"""
    # # Split context (alternative method)
    # from allennlp.predictors import SentenceTaggerPredictor
    # predictor = SentenceTaggerPredictor([],[])
    # context_split = predictor._tokenizer.split_words(context);
    #
    from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
    ws = SpacyWordSplitter()
    context_split = ws.split_words(context)
    # Convert context_split to str's
    for i, c in enumerate(context_split):
        context_split[i] = str(context_split[i])

    return context_split
Example #34
class UniversalDependenciesRawDatasetReader(UniversalDependenciesDatasetReader):
    def __init__(self, language):
        super().__init__()
        self.tokenizer = SpacyWordSplitter(language=language, pos_tags=True)

    @overrides
    def load(self, file_path):
        file_path = cached_path(file_path)
        counter = 1
        with open(file_path, 'r') as conllu_file:
            for sentence in conllu_file:
                if sentence:
                    tokens = self.tokenizer.split_words(sentence)
                    words = [word.text for word in tokens]
                    upos_tags = [word.tag_ for word in tokens]
                    xpos_tags = upos_tags
                    seq_len = len(words)
                    ids = [i + 1 for i in range(seq_len)]
                    lemmas = ["_" for i in range(seq_len)]
                    feats = lemmas
                    heads = [1 for i in range(seq_len)]
                    dep_rels = ["<UNK>" for i in range(seq_len)]
                    multiword_ids = []
                    multiword_forms = []
                    sentence = UD_Sentence(ids, words, lemmas, upos_tags,
                                           xpos_tags, feats, heads, dep_rels,
                                           multiword_ids, multiword_forms)
                    self.sentences.append(sentence)
                    self.ids.append(counter)
                    counter = counter + 1
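
# A small stand-alone sketch of the placeholder CoNLL-U fields that ``load``
# builds for one raw sentence, assuming an already-tokenized word list so it
# can run without spaCy or the UD_Sentence class.
words = ["Dogs", "bark", "."]
seq_len = len(words)
ids = [i + 1 for i in range(seq_len)]         # [1, 2, 3]
lemmas = ["_" for _ in range(seq_len)]        # lemmas and feats left unset
feats = lemmas
heads = [1 for _ in range(seq_len)]           # every token provisionally attached to token 1
dep_rels = ["<UNK>" for _ in range(seq_len)]  # relations left for the parser to fill in
print(list(zip(ids, words, heads, dep_rels)))
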
class TestSpacyWordSplitter(AllenNlpTestCase):
    def setUp(self):
        super(TestSpacyWordSplitter, self).setUp()
        self.word_splitter = SpacyWordSplitter()

    def test_tokenize_handles_complex_punctuation(self):
        sentence = "this (sentence) has 'crazy' \"punctuation\"."
        expected_tokens = ["this", "(", "sentence", ")", "has", "'", "crazy", "'", '"',
                           "punctuation", '"', "."]
        tokens = self.word_splitter.split_words(sentence)
        token_text = [t.text for t in tokens]
        assert token_text == expected_tokens
        for token in tokens:
            start = token.idx
            end = start + len(token.text)
            assert sentence[start:end] == token.text

    def test_tokenize_handles_contraction(self):
        # note that "would've" is kept together, while "ain't" is not.
        sentence = "it ain't joe's problem; would been yesterday"
        expected_tokens = ["it", "ai", "n't", "joe", "'s", "problem", ";", "would", "been",
                           "yesterday"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_multiple_contraction(self):
        sentence = "wouldn't've"
        expected_tokens = ["would", "n't", "'ve"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_final_apostrophe(self):
        sentence = "the jones' house"
        expected_tokens = ["the", "jones", "'", "house"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_removes_whitespace_tokens(self):
        sentence = "the\n jones'   house  \x0b  55"
        expected_tokens = ["the", "jones", "'", "house", "55"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_special_cases(self):
        # note that the etc. doesn't quite work --- we can special case this if we want.
        sentence = "Mr. and Mrs. Jones, etc., went to, e.g., the store"
        expected_tokens = ["Mr.", "and", "Mrs.", "Jones", ",", "etc", ".", ",", "went", "to", ",",
                           "e.g.", ",", "the", "store"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens
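
# A minimal sketch (outside the test harness) of the offset property checked in
# test_tokenize_handles_complex_punctuation above: tokens from SpacyWordSplitter
# keep their character offset in ``idx``, so slicing the original string
# recovers each token's text.
splitter = SpacyWordSplitter()
text = "this (sentence) has 'crazy' \"punctuation\"."
for token in splitter.split_words(text):
    assert text[token.idx:token.idx + len(token.text)] == token.text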
Example #36
class SentenceTaggerPredictor(Predictor):
    """
    Predictor for any model that takes in a sentence and returns
    a single set of tags for it.  In particular, it can be used with
    the :class:`~allennlp.models.crf_tagger.CrfTagger` model
    and also
    the :class:`~allennlp.models.simple_tagger.SimpleTagger` model.
    """
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language='en_core_web_sm', pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        return self.predict_json({"sentence" : sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        Runs the underlying model, and adds the ``"words"`` to the output.
        """
        sentence = json_dict["sentence"]
        tokens = self._tokenizer.split_words(sentence)
        return self._dataset_reader.text_to_instance(tokens)
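
# Hedged usage sketch for SentenceTaggerPredictor: ``model`` and ``reader`` are
# placeholders for a trained tagging model (e.g. CrfTagger or SimpleTagger) and
# its matching DatasetReader; neither is defined in this snippet.
predictor = SentenceTaggerPredictor(model, reader)
result = predictor.predict("AllenNLP is a library for NLP research.")
for word, tag in zip(result["words"], result["tags"]):
    print(word, tag)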
Example #37
class ConstituencyParserPredictor(Predictor):
    """
    Predictor for the :class:`~allennlp.models.SpanConstituencyParser` model.
    """
    def __init__(self, model: Model, dataset_reader: DatasetReader, language: str = 'en_core_web_sm') -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language=language, pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        """
        Predict a constituency parse for the given sentence.
        Parameters
        ----------
        sentence The sentence to parse.

        Returns
        -------
        A dictionary representation of the constituency tree.
        """
        return self.predict_json({"sentence" : sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        """
        spacy_tokens = self._tokenizer.split_words(json_dict["sentence"])
        sentence_text = [token.text for token in spacy_tokens]
        pos_tags = [token.tag_ for token in spacy_tokens]
        return self._dataset_reader.text_to_instance(sentence_text, pos_tags)

    @overrides
    def predict_instance(self, instance: Instance) -> JsonDict:
        outputs = self._model.forward_on_instance(instance)

        # format the NLTK tree as a string on a single line.
        tree = outputs.pop("trees")
        outputs["hierplane_tree"] = self._build_hierplane_tree(tree, 0, is_root=True)
        outputs["trees"] = tree.pformat(margin=1000000)
        return sanitize(outputs)

    @overrides
    def predict_batch_instance(self, instances: List[Instance]) -> List[JsonDict]:
        outputs = self._model.forward_on_instances(instances)
        for output in outputs:
            # format the NLTK tree as a string on a single line.
            tree = output.pop("trees")
            output["hierplane_tree"] = self._build_hierplane_tree(tree, 0, is_root=True)
            output["trees"] = tree.pformat(margin=1000000)
        return sanitize(outputs)


    def _build_hierplane_tree(self, tree: Tree, index: int, is_root: bool) -> JsonDict:
        """
        Recursively builds a JSON dictionary from an NLTK ``Tree`` suitable for
        rendering trees using the `Hierplane library <https://allenai.github.io/hierplane/>`_.

        Parameters
        ----------
        tree : ``Tree``, required.
            The tree to convert into Hierplane JSON.
        index : int, required.
            The character index into the tree, used for creating spans.
        is_root : bool
            An indicator which allows us to add the outer Hierplane JSON which
            is required for rendering.

        Returns
        -------
        A JSON dictionary render-able by Hierplane for the given tree.
        """
        children = []
        for child in tree:
            if isinstance(child, Tree):
                # If the child is a tree, it has children,
                # as NLTK leaves are just strings.
                children.append(self._build_hierplane_tree(child, index, is_root=False))
            else:
                # We're at a leaf, so add the length of
                # the word to the character index.
                index += len(child)

        label = tree.label()
        span = " ".join(tree.leaves())
        hierplane_node = {
                "word": span,
                "nodeType": label,
                "attributes": [label],
                "link": label
        }
        if children:
            hierplane_node["children"] = children
        # TODO(Mark): Figure out how to do span highlighting for the leaves.
        if is_root:
            hierplane_node = {
                    "linkNameToLabel": LINK_TO_LABEL,
                    "nodeTypeToStyle": NODE_TYPE_TO_STYLE,
                    "text": span,
                    "root": hierplane_node
            }
        return hierplane_node
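
# A rough sketch of the Hierplane JSON that _build_hierplane_tree produces for
# the toy parse (S (NP I) (VP (V saw) (NP it))): "word" is the joined leaves,
# while "nodeType", "attributes" and "link" all carry the constituent label,
# and only the root call adds the wrapper that re-uses the module-level
# LINK_TO_LABEL / NODE_TYPE_TO_STYLE constants referenced in the class above.
toy_hierplane = {
    "linkNameToLabel": LINK_TO_LABEL,
    "nodeTypeToStyle": NODE_TYPE_TO_STYLE,
    "text": "I saw it",
    "root": {
        "word": "I saw it", "nodeType": "S", "attributes": ["S"], "link": "S",
        "children": [
            {"word": "I", "nodeType": "NP", "attributes": ["NP"], "link": "NP"},
            {"word": "saw it", "nodeType": "VP", "attributes": ["VP"], "link": "VP",
             "children": [
                 {"word": "saw", "nodeType": "V", "attributes": ["V"], "link": "V"},
                 {"word": "it", "nodeType": "NP", "attributes": ["NP"], "link": "NP"},
             ]},
        ],
    },
}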
Example #38
class SemanticRoleLabelerPredictor(Predictor):
    """
    Predictor for the :class:`~allennlp.models.semantic_role_labeler.SemanticRoleLabeler` model.
    """
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language='en_core_web_sm', pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        """
        Predicts the semantic roles of the supplied sentence and returns a dictionary
        with the results.

        .. code-block:: js

            {"words": [...],
             "verbs": [
                {"verb": "...", "description": "...", "tags": [...]},
                ...
                {"verb": "...", "description": "...", "tags": [...]},
            ]}

        Parameters
        ----------
        sentence : ``str``
            The sentence to parse via semantic role labeling.

        Returns
        -------
        A dictionary representation of the semantic roles in the sentence.
        """
        return self.predict_json({"sentence" : sentence})


    @staticmethod
    def make_srl_string(words: List[str], tags: List[str]) -> str:
        frame = []
        chunk = []

        for (token, tag) in zip(words, tags):
            if tag.startswith("I-"):
                chunk.append(token)
            else:
                if chunk:
                    frame.append("[" + " ".join(chunk) + "]")
                    chunk = []

                if tag.startswith("B-"):
                    chunk.append(tag[2:] + ": " + token)
                elif tag == "O":
                    frame.append(token)

        if chunk:
            frame.append("[" + " ".join(chunk) + "]")

        return " ".join(frame)

    @overrides
    def _json_to_instance(self, json_dict: JsonDict):
        raise NotImplementedError("The SRL model uses a different API for creating instances.")

    def _sentence_to_srl_instances(self, json_dict: JsonDict) -> List[Instance]:
        """
        The SRL model has a slightly different API from other models, as the model is run
        forward for every verb in the sentence. This means that for a single sentence, we need
        to generate a ``List[Instance]``, where the length of this list corresponds to the number
        of verbs in the sentence. Additionally, all of these verbs share the same return dictionary
        after being passed through the model (as really we care about all the frames of the sentence
        together, rather than separately).

        Parameters
        ----------
        json_dict : ``JsonDict``, required.
            JSON that looks like ``{"sentence": "..."}``.

        Returns
        -------
        instances : ``List[Instance]``
            One instance per verb.
        """
        sentence = json_dict["sentence"]
        tokens = self._tokenizer.split_words(sentence)
        words = [token.text for token in tokens]
        instances: List[Instance] = []
        for i, word in enumerate(tokens):
            if word.pos_ == "VERB":
                verb_labels = [0 for _ in words]
                verb_labels[i] = 1
                instance = self._dataset_reader.text_to_instance(tokens, verb_labels)
                instances.append(instance)
        return instances

    @overrides
    def predict_batch_json(self, inputs: List[JsonDict]) -> List[JsonDict]:
        """
        Expects JSON that looks like ``[{"sentence": "..."}, {"sentence": "..."}, ...]``
        and returns JSON that looks like

        .. code-block:: js

            [
                {"words": [...],
                 "verbs": [
                    {"verb": "...", "description": "...", "tags": [...]},
                    ...
                    {"verb": "...", "description": "...", "tags": [...]},
                ]},
                {"words": [...],
                 "verbs": [
                    {"verb": "...", "description": "...", "tags": [...]},
                    ...
                    {"verb": "...", "description": "...", "tags": [...]},
                ]}
            ]
        """
        # For SRL, we have more instances than sentences, but the user specified
        # a batch size with respect to the number of sentences passed, so we respect
        # that here by taking the batch size which we use to be the number of sentences
        # we are given.
        batch_size = len(inputs)
        instances_per_sentence = [self._sentence_to_srl_instances(json) for json in inputs]

        flattened_instances = [instance for sentence_instances in instances_per_sentence
                               for instance in sentence_instances]

        if not flattened_instances:
            return sanitize([{"verbs": [], "words": self._tokenizer.split_words(x["sentence"])}
                             for x in inputs])

        # Make the instances into batches and check the last batch for
        # padded elements as the number of instances might not be perfectly
        # divisible by the batch size.
        batched_instances = group_by_count(flattened_instances, batch_size, None)
        batched_instances[-1] = [instance for instance in batched_instances[-1]
                                 if instance is not None]
        # Run the model on the batches.
        outputs = []
        for batch in batched_instances:
            outputs.extend(self._model.forward_on_instances(batch))

        verbs_per_sentence = [len(sent) for sent in instances_per_sentence]
        return_dicts: List[JsonDict] = [{"verbs": []} for x in inputs]

        output_index = 0
        for sentence_index, verb_count in enumerate(verbs_per_sentence):
            if verb_count == 0:
                # We didn't run any predictions for sentences with no verbs,
                # so we don't have a way to extract the original sentence.
                # Here we just tokenize the input again.
                original_text = self._tokenizer.split_words(inputs[sentence_index]["sentence"])
                return_dicts[sentence_index]["words"] = original_text
                continue

            for _ in range(verb_count):
                output = outputs[output_index]
                words = output["words"]
                tags = output['tags']
                description = self.make_srl_string(words, tags)
                return_dicts[sentence_index]["words"] = words
                return_dicts[sentence_index]["verbs"].append({
                        "verb": output["verb"],
                        "description": description,
                        "tags": tags,
                })
                output_index += 1

        return sanitize(return_dicts)

    @overrides
    def predict_json(self, inputs: JsonDict) -> JsonDict:
        """
        Expects JSON that looks like ``{"sentence": "..."}``
        and returns JSON that looks like

        .. code-block:: js

            {"words": [...],
             "verbs": [
                {"verb": "...", "description": "...", "tags": [...]},
                ...
                {"verb": "...", "description": "...", "tags": [...]},
            ]}
        """
        instances = self._sentence_to_srl_instances(inputs)

        if not instances:
            return sanitize({"verbs": [], "words": self._tokenizer.split_words(inputs["sentence"])})

        outputs = self._model.forward_on_instances(instances)

        results = {"verbs": [], "words": outputs[0]["words"]}
        for output in outputs:
            tags = output['tags']
            description = self.make_srl_string(output["words"], tags)
            results["verbs"].append({
                    "verb": output["verb"],
                    "description": description,
                    "tags": tags,
            })

        return sanitize(results)
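
# A small worked example of make_srl_string (the BIO tags below are
# hand-written, not model output): consecutive B-/I- tags are grouped into one
# bracketed chunk labelled with the tag name, while "O" tokens pass through.
words = ["John", "ate", "an", "apple"]
tags = ["B-ARG0", "B-V", "B-ARG1", "I-ARG1"]
assert SemanticRoleLabelerPredictor.make_srl_string(words, tags) == \
    "[ARG0: John] [V: ate] [ARG1: an apple]"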
class BiaffineDependencyParserPredictor(Predictor):
    """
    Predictor for the :class:`~allennlp.models.BiaffineDependencyParser` model.
    """
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        # TODO(Mark) Make the language configurable and based on a model attribute.
        self._tokenizer = SpacyWordSplitter(language='en_core_web_sm', pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        """
        Predict a dependency parse for the given sentence.
        Parameters
        ----------
        sentence The sentence to parse.

        Returns
        -------
        A dictionary representation of the dependency tree.
        """
        return self.predict_json({"sentence" : sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        """
        spacy_tokens = self._tokenizer.split_words(json_dict["sentence"])
        sentence_text = [token.text for token in spacy_tokens]
        if self._dataset_reader.use_language_specific_pos: # type: ignore
            # fine-grained part of speech
            pos_tags = [token.tag_ for token in spacy_tokens]
        else:
            # coarse-grained part of speech (Universal Dependencies format)
            pos_tags = [token.pos_ for token in spacy_tokens]
        return self._dataset_reader.text_to_instance(sentence_text, pos_tags)

    @overrides
    def predict_instance(self, instance: Instance) -> JsonDict:
        outputs = self._model.forward_on_instance(instance)

        words = outputs["words"]
        pos = outputs["pos"]
        heads = outputs["predicted_heads"]
        tags = outputs["predicted_dependencies"]
        outputs["hierplane_tree"] = self._build_hierplane_tree(words, heads, tags, pos)
        return sanitize(outputs)

    @overrides
    def predict_batch_instance(self, instances: List[Instance]) -> List[JsonDict]:
        outputs = self._model.forward_on_instances(instances)
        for output in outputs:
            words = output["words"]
            pos = output["pos"]
            heads = output["predicted_heads"]
            tags = output["predicted_dependencies"]
            output["hierplane_tree"] = self._build_hierplane_tree(words, heads, tags, pos)
        return sanitize(outputs)

    @staticmethod
    def _build_hierplane_tree(words: List[str],
                              heads: List[int],
                              tags: List[str],
                              pos: List[str]) -> Dict[str, Any]:
        """
        Returns
        -------
        A JSON dictionary render-able by Hierplane for the given tree.
        """

        word_index_to_cumulative_indices: Dict[int, Tuple[int, int]] = {}
        cumulative_index = 0
        for i, word in enumerate(words):
            word_length = len(word) + 1
            word_index_to_cumulative_indices[i] = (cumulative_index, cumulative_index + word_length)
            cumulative_index += word_length

        def node_constructor(index: int):
            children = []
            for next_index, child in enumerate(heads):
                if child == index + 1:
                    children.append(node_constructor(next_index))

            # These are the icons which show up in the bottom right
            # corner of the node.
            attributes = [pos[index]]
            start, end = word_index_to_cumulative_indices[index]

            hierplane_node = {
                    "word": words[index],
                    # The type of the node - all nodes with the same
                    # type have a unified colour.
                    "nodeType": tags[index],
                    # Attributes of the node.
                    "attributes": attributes,
                    # The link between the node and its parent.
                    "link": tags[index],
                    "spans": [{"start": start, "end": end}]
            }
            if children:
                hierplane_node["children"] = children
            return hierplane_node
        # We are guaranteed that there is a single word pointing to
        # the root index, so we can find it just by searching for 0 in the list.
        root_index = heads.index(0)
        hierplane_tree = {
                "text": " ".join(words),
                "root": node_constuctor(root_index),
                "nodeTypeToStyle": NODE_TYPE_TO_STYLE,
                "linkToPosition": LINK_TO_POSITION
        }
        return hierplane_tree
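
# A toy call to _build_hierplane_tree (the heads/tags/pos below are
# hand-written, using the parser's 1-indexed head convention with 0 marking
# the root, not real model output).
tree = BiaffineDependencyParserPredictor._build_hierplane_tree(
    words=["John", "loves", "Mary"],
    heads=[2, 0, 2],                    # "loves" is the root; both nouns attach to it
    tags=["nsubj", "root", "dobj"],
    pos=["PROPN", "VERB", "PROPN"],
)
# tree["root"]["word"] == "loves", with "John" and "Mary" as its children, and
# character spans computed over the whitespace-joined sentence "John loves Mary".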