Example #1
    def test_elmo_token_representation_bos_eos(self):
        # The additional <S> and </S> embeddings added by the embedder should be as expected.
        indexer = ELMoTokenCharactersIndexer()

        elmo_token_embedder = _ElmoCharacterEncoder(self.options_file,
                                                    self.weight_file)

        for correct_index, token in [[0, "<S>"], [2, "</S>"]]:
            indices = indexer.tokens_to_indices([Token(token)], Vocabulary())
            indices = torch.from_numpy(numpy.array(indices["tokens"])).view(
                1, 1, -1)
            embeddings = elmo_token_embedder(indices)["token_embedding"]
            assert numpy.allclose(embeddings[0, correct_index, :].data.numpy(),
                                  embeddings[0, 1, :].data.numpy())
Example #2
    def predictions_to_labeled_instances(self, instance: Instance,
                                         outputs: Dict[str, numpy.ndarray]):
        new_instance = deepcopy(instance)
        token_field: TextField = instance["tokens"]  # type: ignore
        mask_targets = [
            Token(target_top_k[0]) for target_top_k in outputs["words"]
        ]

        new_instance.add_field(
            "target_ids",
            TextField(mask_targets, token_field._token_indexers),
            vocab=self._model.vocab,
        )
        return [new_instance]
Example #3
    def test_elmo_token_representation_bos_eos(self):
        # The additional <S> and </S> embeddings added by the embedder should be as expected.
        indexer = ELMoTokenCharactersIndexer()

        options_file = os.path.join(FIXTURES, 'options.json')
        weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5')

        elmo_token_embedder = _ElmoCharacterEncoder(options_file, weight_file)

        for correct_index, token in [[0, '<S>'], [2, '</S>']]:
            indices = indexer.token_to_indices(Token(token), Vocabulary())
            indices = Variable(torch.from_numpy(numpy.array(indices))).view(1, 1, -1)
            embeddings = elmo_token_embedder(indices)['token_embedding']
            assert numpy.allclose(embeddings[0, correct_index, :].data.numpy(), embeddings[0, 1, :].data.numpy())
Example #4
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like either:
        1. ``{"text": "..."}``
        2. ``{"text": "...", "tokens": ["..."]}``
        3. ``{"text": "...", "tokens": ["..."], "pos_tags": ["..."]}``
        The first form uses the tokenizer and POS tagger passed to the constructor.
        The second assumes that only tokens are needed and that they are provided
        directly. The last is similar to the second, but POS tags are also provided
        because the classifier requires them.
        """
        if 'tokens' in json_dict:
            tokens = [Token(token) for token in json_dict['tokens']]
            input_dict = {'tokens': tokens}
            if 'pos_tags' in json_dict and \
               'pos_tags' in self._model.vocab._token_to_index:
                input_dict['pos_tags'] = json_dict['pos_tags']
            return self._dataset_reader.text_to_instance(**input_dict)
        # Using the tokenizer and pos tagger from the constructor
        text = json_dict['text']
        tokenized_text = self._tokenizer.split_words(text)
        tokens = []
        pos_tags = []
        for allen_token in tokenized_text:
            tokens.append(Token(allen_token.text))

            if self._pos_tags:
                if self._fine_grained_tags:
                    pos_tag = allen_token.tag_
                else:
                    pos_tag = allen_token.pos_
                pos_tags.append(pos_tag)
        if 'pos_tags' in self._model.vocab._token_to_index and self._pos_tags:
            return self._dataset_reader.text_to_instance(tokens=tokens,
                                                         pos_tags=pos_tags)
        else:
            return self._dataset_reader.text_to_instance(tokens=tokens)
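For reference, a minimal sketch of the three accepted payload shapes, assuming a hypothetical `predictor` object that wraps the method above (the predictor construction is not shown and its name is illustrative):

# Hypothetical usage sketch for the _json_to_instance method above.
json_inputs = [
    {"text": "The cat sat on the mat."},                             # tokenized and POS-tagged internally
    {"text": "The cat sat .", "tokens": ["The", "cat", "sat", "."]}, # pre-tokenized
    {"text": "The cat sat .", "tokens": ["The", "cat", "sat", "."],
     "pos_tags": ["DET", "NOUN", "VERB", "PUNCT"]},                  # pre-tokenized with POS tags
]
instances = [predictor._json_to_instance(json_dict) for json_dict in json_inputs]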
Example #5
    def __init__(self, db_id: str, utterance: str, tokenizer: Tokenizer,
                 tables_file: str, dataset_path: str):
        self.dataset_path = dataset_path
        self.tables_file = tables_file
        self.db_id = db_id
        self.utterance = utterance

        # A lemma is the basic form of a word, for example the singular form of a
        # noun or the infinitive of a verb, as shown at the start of a dictionary entry.
        tokenized_utterance = tokenizer.tokenize(utterance.lower())

        # For example: if utterance.lower() == "biggest departments",
        # tokenized_utterance holds two spaCy tokens whose .text values are
        # "biggest" and "departments" and whose .lemma_ values are "big" and "department".

        # The Token objects built below are not the same objects as those in
        # tokenized_utterance: we keep only the text and the lemma, so each Token
        # is a simplified copy of the corresponding spaCy token.
        self.tokenized_utterance = [
            Token(text=t.text, lemma=t.lemma_) for t in tokenized_utterance
        ]

        if db_id not in SpiderDBContext.schemas:
            SpiderDBContext.schemas = read_dataset_schema(self.tables_file)
        self.schema = SpiderDBContext.schemas[db_id]

        self.knowledge_graph = self.get_db_knowledge_graph(db_id)

        entity_texts = [
            self.knowledge_graph.entity_text[entity].lower()
            for entity in self.knowledge_graph.entities
        ]
        entity_tokens = tokenizer.batch_tokenize(entity_texts)
        self.entity_tokens = [[Token(text=t.text, lemma=t.lemma_) for t in et]
                              for et in entity_tokens]
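A minimal, self-contained sketch of the text/lemma simplification described in the comments above, using spaCy directly (the en_core_web_sm model is an assumption, and recent AllenNLP versions spell the keyword argument lemma_ rather than lemma):

import spacy
from allennlp.data.tokenizers import Token

# Keep only the surface text and the lemma of each spaCy token.
nlp = spacy.load("en_core_web_sm")
simplified = [Token(text=t.text, lemma_=t.lemma_) for t in nlp("biggest departments")]
# simplified[0].text == "biggest"; simplified[0].lemma_ is typically "big"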
Example #6
    def test_elmo_bilm(self):
        # get the raw data
        sentences, expected_lm_embeddings = self._load_sentences_embeddings()

        # load the test model
        elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

        # Deal with the data.
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for batch in zip(*sentences):
            for sentence in batch:
                tokens = [Token(token) for token in sentence.split()]
                field = TextField(tokens, {'character_ids': indexer})
                instance = Instance({"elmo": field})
                instances.append(instance)

        vocab = Vocabulary()

        # Now finally we can iterate through batches.
        iterator = BasicIterator(3)
        iterator.index_with(vocab)
        for i, batch in enumerate(iterator(instances, num_epochs=1, shuffle=False)):
            lm_embeddings = elmo_bilm(batch['elmo']['character_ids'])
            top_layer_embeddings, mask = remove_sentence_boundaries(
                    lm_embeddings['activations'][2],
                    lm_embeddings['mask']
            )

            # check the mask lengths
            lengths = mask.data.numpy().sum(axis=1)
            batch_sentences = [sentences[k][i] for k in range(3)]
            expected_lengths = [
                    len(sentence.split()) for sentence in batch_sentences
            ]
            self.assertEqual(lengths.tolist(), expected_lengths)

            # get the expected embeddings and compare!
            expected_top_layer = [expected_lm_embeddings[k][i] for k in range(3)]
            for k in range(3):
                self.assertTrue(
                        numpy.allclose(
                                top_layer_embeddings[k, :lengths[k], :].data.numpy(),
                                expected_top_layer[k],
                                atol=1.0e-6
                        )
                )
Example #7
 def test_tokens_to_indices_uses_pos_tags(self):
     tokens = self.tokenizer.split_words("This is a sentence.")
     tokens = [t for t in tokens] + [Token("</S>")]
     vocab = Vocabulary()
     root_index = vocab.add_token_to_namespace("ROOT",
                                               namespace="dep_labels")
     none_index = vocab.add_token_to_namespace("NONE",
                                               namespace="dep_labels")
     indexer = DepLabelIndexer()
     assert indexer.tokens_to_indices([tokens[1]], vocab, "tokens1") == {
         "tokens1": [root_index]
     }
     assert indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1") == {
         "tokens-1": [none_index]
     }
Example #8
 def get_instances(self):
     field1 = TextField(
         [Token(t) for t in [u"this", u"is", u"a", u"sentence", u"."]],
         self.token_indexer)
     field2 = TextField([
         Token(t)
         for t in [u"this", u"is", u"a", u"different", u"sentence", u"."]
     ], self.token_indexer)
     field3 = TextField(
         [Token(t) for t in [u"here", u"is", u"a", u"sentence", u"."]],
         self.token_indexer)
     field4 = TextField([Token(t) for t in [u"this", u"is", u"short"]],
                        self.token_indexer)
     instances = [
         Instance({
             u"text1": field1,
             u"text2": field2
         }),
         Instance({
             u"text1": field3,
             u"text2": field4
         })
     ]
     return instances
Example #9
    def test_padding_lengths_are_computed_correctly(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"words_length": 5, "num_tokens": 5}

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"characters": TokenCharactersIndexer("characters",
                                                                               min_padding_length=1)})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5, "characters_length": 5, "num_token_characters": 8}

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"characters": TokenCharactersIndexer("characters",
                                                                               min_padding_length=1),
                                          "words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5,
                                   "characters_length": 5,
                                   "words_length": 5,
                                   "num_token_characters": 8}
Example #10
 def get_dataset(self):
     field1 = TextField(
         [Token(t) for t in ["this", "is", "a", "sentence", "."]],
         self.token_indexer)
     field2 = TextField([
         Token(t)
         for t in ["this", "is", "a", "different", "sentence", "."]
     ], self.token_indexer)
     field3 = TextField(
         [Token(t) for t in ["here", "is", "a", "sentence", "."]],
         self.token_indexer)
     field4 = TextField([Token(t) for t in ["this", "is", "short"]],
                        self.token_indexer)
     instances = [
         Instance({
             "text1": field1,
             "text2": field2
         }),
         Instance({
             "text1": field3,
             "text2": field4
         })
     ]
     return Dataset(instances)
Example #11
 def test_tokens_to_indices_uses_ner_tags(self):
     tokens = self.tokenizer.split_words("Larry Page is CEO of Google.")
     tokens = [t for t in tokens] + [Token("</S>")]
     vocab = Vocabulary()
     person_index = vocab.add_token_to_namespace('PERSON',
                                                 namespace='ner_tags')
     none_index = vocab.add_token_to_namespace('NONE', namespace='ner_tags')
     vocab.add_token_to_namespace('ORG', namespace='ner_tags')
     indexer = NerTagIndexer(namespace='ner_tags')
     assert indexer.tokens_to_indices([tokens[1]], vocab, "tokens1") == {
         "tokens1": [person_index]
     }
     assert indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1") == {
         "tokens-1": [none_index]
     }
Example #12
    def _add_transformers_vocab_if_necessary(self, vocab):
        """Adds the transformers vocabulary to the `vocab`

        Parameters
        ----------
        vocab
            The transformers vocabulary will be added to this vocab
        """
        # AllenNLP's PretrainedTransformerIndexer adds its transformer-specific vocabulary to the
        # model's vocab the first time `tokens_to_indices()` is called, so we trigger that here by
        # passing a dummy token. (It is not entirely clear why the vocab needs to be extended at all ...)
        transformers_indexer = self.backbone.featurizer.indexer.get(
            TransformersFeatures.namespace)
        if transformers_indexer is not None:
            transformers_indexer.tokens_to_indices([Token("")], vocab)
Example #13
    def __init__(self, db_id: str, tokenizer: Tokenizer, tables_file: str,
                 database_path: str, utterance: List[Token], bert_mode: str = "v0"):
        self.database_path = database_path
        self.tables_file = tables_file
        self.db_id = db_id
        self.tokenized_utterance = utterance

        if db_id not in SparcDBContext.db_schemas:
            SparcDBContext.db_schemas, SparcDBContext.db_schemas_id_col, SparcDBContext.db_schemas_id_tab \
                = read_dataset_schema(self.tables_file)
        self.schema = SparcDBContext.db_schemas[db_id]
        # get id to column/table
        self.id_to_col = SparcDBContext.db_schemas_id_col[db_id]
        self.id_to_tab = SparcDBContext.db_schemas_id_tab[db_id]

        self.bert_mode = bert_mode
        self.knowledge_graph = self.get_db_knowledge_graph(db_id)

        entity_texts = [self.knowledge_graph.entity_text[entity].lower()
                        for entity in self.knowledge_graph.entities]
        entity_tokens = tokenizer.batch_tokenize(entity_texts)

        self.entity_tokens = [[Token(text=t.text, lemma_=t.lemma_) if t.lemma_ != '-PRON-'
                               else Token(text=t.text, lemma_=t.text) for t in et] for et in entity_tokens]
Example #14
    def text_to_instance(self, text: str, target_matrix_start: np.array, target_matrix_end: np.array) -> Instance:
        "训练的时候,输入这些用于训练我们的模型。至于验证时,则应重新写一个验证数据读取类"
        if self.pretrained_tokenizer is not None:
            tokens = get_word_from_pretrained(self.pretrained_tokenizer, text)
        else:
            tokens = [Token(w) for w in text]
        text_field = TextField(tokens, self._token_indexers)

        fields = {
            "tokens": text_field,
            "target_start": ArrayField(target_matrix_start),
            "target_end": ArrayField(target_matrix_end),
            "metadata": MetadataField(None) # 训练的时候,不需要知道这个。而验证集需要,故占此位置
        }
        return Instance(fields)
Example #15
    def _sentences_to_ids(self, sentences):
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for sentence in sentences:
            tokens = [Token(token) for token in sentence]
            field = TextField(tokens, {u'character_ids': indexer})
            instance = Instance({u'elmo': field})
            instances.append(instance)

        dataset = Batch(instances)
        vocab = Vocabulary()
        dataset.index_instances(vocab)
        return dataset.as_tensor_dict()[u'elmo'][u'character_ids']
Example #16
 def test_tokens_to_indices_uses_ner_tags(self):
     tokens = self.tokenizer.tokenize("Larry Page is CEO of Google.")
     tokens = [t for t in tokens] + [Token("</S>")]
     vocab = Vocabulary()
     person_index = vocab.add_token_to_namespace("PERSON",
                                                 namespace="ner_tags")
     none_index = vocab.add_token_to_namespace("NONE", namespace="ner_tags")
     vocab.add_token_to_namespace("ORG", namespace="ner_tags")
     indexer = NerTagIndexer(namespace="ner_tags")
     assert indexer.tokens_to_indices([tokens[1]], vocab) == {
         "tokens": [person_index]
     }
     assert indexer.tokens_to_indices([tokens[-1]], vocab) == {
         "tokens": [none_index]
     }
Example #17
def batch_to_ids(batch):
    """
    Given a batch (as list of tokenized sentences), return a batch
    of padded character ids.
    """
    instances = []
    indexer = ELMoTokenCharactersIndexer()  # maps each token to its character ids
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids']
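AllenNLP ships an equivalent helper as allennlp.modules.elmo.batch_to_ids; a short usage sketch for the function above:

# Two tokenized sentences in, one padded character-id tensor out.
character_ids = batch_to_ids([["I", "love", "NLP", "."], ["Hello"]])
print(character_ids.shape)  # torch.Size([2, 4, 50]) -- ELMo uses 50 character ids per token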
Example #18
    def test_equality(self):
        index_field1 = IndexField(4, self.text)
        index_field2 = IndexField(4, self.text)
        index_field3 = IndexField(
            4,
            TextField(
                [Token(t) for t in ["AllenNLP", "is", "the", "bomb", "!"]],
                {"words": SingleIdTokenIndexer("words")}))

        assert index_field1 == 4
        assert index_field1 == index_field1
        assert index_field1 == index_field2

        assert index_field1 != index_field3
        assert index_field2 != index_field3
        assert index_field3 == index_field3
Example #19
    def text_to_instance(self,
                         abstract: str,
                         labels: List[str] = None,
                         title: str = None):
        # split the abstract into individual characters (character-level tokens)
        abstract = [Token(w) for w in abstract]
        abstract_field = TextField(abstract, self._token_indexers)
        meta_field = MetadataField(abstract)
        fields = {'abstract': abstract_field, 'metadata': meta_field}
        if labels:
            labels_field = SequenceLabelField(labels, abstract_field)

            t = {'labels': labels_field}
            fields.update(t)

        return Instance(fields)
Example #20
 def test_blank_pos_tag(self):
     tokens = [Token(token) for token in "allennlp is awesome .".split(" ")]
     indexer = PosTagIndexer()
     counter = defaultdict(lambda: defaultdict(int))
     for token in tokens:
         indexer.count_vocab_items(token, counter)
     # spaCy uses an empty string to indicate "no POS tag",
     # which we convert to "NONE"
     assert counter["pos_tokens"]["NONE"] == 4
     vocab = Vocabulary(counter)
     none_index = vocab.get_token_index("NONE", "pos_tokens")
     # should raise no exception
     indices = indexer.tokens_to_indices(tokens, vocab)
     assert {
         "tokens": [none_index, none_index, none_index, none_index]
     } == indices
Example #21
 def test_blank_ner_tag(self):
     tokens = [
         Token(token)._replace(ent_type_="") for token in "allennlp is awesome .".split(" ")
     ]
     indexer = NerTagIndexer()
     counter = defaultdict(lambda: defaultdict(int))
     for token in tokens:
         indexer.count_vocab_items(token, counter)
     # spaCy uses an empty string to indicate "no NER tag",
     # which we convert to "NONE"
     assert counter["ner_tokens"]["NONE"] == 4
     vocab = Vocabulary(counter)
     none_index = vocab.get_token_index("NONE", "ner_tokens")
     # should raise no exception
     indices = indexer.tokens_to_indices(tokens, vocab, index_name="ner")
     assert {"ner": [none_index, none_index, none_index, none_index]} == indices
Example #22
    def _read(self, file_path: str) -> Iterable[Instance]:
        with open(file_path, 'rb') as stream:
            dataset, dicts = pickle.load(stream)

            query_idx2text = {idx: text for text, idx in dicts["token_ids"].items()}
            intent_idx2text = {idx: text for text, idx in dicts["intent_ids"].items()}
            entities_idx2text = {idx: text for text, idx in dicts["slot_ids"].items()}

            for index in range(len(dataset["query"])):
                query = dataset["query"][index]
                query = [Token(text=query_idx2text[idx]) for idx in query]

                entities = [entities_idx2text[idx] for idx in dataset["slot_labels"][index]]
                intent = intent_idx2text[dataset["intent_labels"][index][0]]

                yield self.text_to_instance(query, entities, intent)
Example #23
    def get_vocab_and_both_elmo_indexed_ids(batch: List[List[str]]):
        instances = []
        indexer = ELMoTokenCharactersIndexer()
        indexer2 = SingleIdTokenIndexer()
        for sentence in batch:
            tokens = [Token(token) for token in sentence]
            field = TextField(tokens,
                              {'character_ids': indexer,
                               'tokens': indexer2})
            instance = Instance({"elmo": field})
            instances.append(instance)

        dataset = Batch(instances)
        vocab = Vocabulary.from_instances(instances)
        dataset.index_instances(vocab)
        return vocab, dataset.as_tensor_dict()["elmo"]
Example #24
    def test_max_vocab_size_partial_dict(self):
        indexers = {"tokens": SingleIdTokenIndexer(),
                    "token_characters": TokenCharactersIndexer(min_padding_length=3)}
        instance = Instance({
                'text': TextField([Token(w) for w in 'Abc def ghi jkl mno pqr stu vwx yz'.split(' ')], indexers)
        })
        dataset = Batch([instance])
        params = Params({
                "max_vocab_size": {
                        "tokens": 1
                }
        })

        vocab = Vocabulary.from_params(params=params, instances=dataset)
        assert len(vocab.get_index_to_token_vocabulary("tokens").values()) == 3 # 1 + 2
        assert len(vocab.get_index_to_token_vocabulary("token_characters").values()) == 28 # 26 + 2
Example #25
    def predictions_to_labeled_instances(self, instance: Instance,
                                         outputs: Dict[str, numpy.ndarray]):
        new_instance = instance.duplicate()
        token_field: TextField = instance["tokens"]  # type: ignore
        mask_targets = [
            Token(target_top_k_text[0], text_id=target_top_id_id)
            for (target_top_k_text, target_top_id_id
                 ) in zip(outputs["words"], outputs["token_ids"])
        ]

        new_instance.add_field(
            "target_ids",
            TextField(mask_targets, token_field._token_indexers),
            vocab=self._model.vocab,
        )
        return [new_instance]
Example #26
    def text_to_instance(self, tokenized_sentence: List[str],
                         spans: List[List[int]]) -> Instance:
        allennlp_sentence_tokens = [Token(text=t) for t in tokenized_sentence]
        sentence_token_indexes = TextField(allennlp_sentence_tokens,
                                           self._token_indexers)

        span_fields = []
        for span_start, span_end_exclusive in spans:
            span_field = SpanField(span_start, span_end_exclusive - 1,
                                   sentence_token_indexes)
            span_fields.append(span_field)

        fields: Dict[str, Field] = {}
        fields["tokens"] = sentence_token_indexes
        fields["spans"] = ListField(span_fields)
        return Instance(fields)
Example #27
    def test_elmo_bilm(self):
        # get the raw data
        sentences, expected_lm_embeddings = self._load_sentences_embeddings()

        # load the test model
        elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

        # Deal with the data.
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for batch in zip(*sentences):
            for sentence in batch:
                tokens = [Token(token) for token in sentence.split()]
                field = TextField(tokens, {"character_ids": indexer})
                instance = Instance({"elmo": field})
                instances.append(instance)

        vocab = Vocabulary()
        # Now finally we can iterate through batches.
        loader = SimpleDataLoader(instances, 3)
        loader.index_with(vocab)
        for i, batch in enumerate(loader):
            lm_embeddings = elmo_bilm(
                batch["elmo"]["character_ids"]["elmo_tokens"])
            top_layer_embeddings, mask = remove_sentence_boundaries(
                lm_embeddings["activations"][2], lm_embeddings["mask"])

            # check the mask lengths
            lengths = mask.data.numpy().sum(axis=1)
            batch_sentences = [sentences[k][i] for k in range(3)]
            expected_lengths = [
                len(sentence.split()) for sentence in batch_sentences
            ]
            assert lengths.tolist() == expected_lengths

            # get the expected embeddings and compare!
            expected_top_layer = [
                expected_lm_embeddings[k][i] for k in range(3)
            ]
            for k in range(3):
                assert numpy.allclose(
                    top_layer_embeddings[k, :lengths[k], :].data.numpy(),
                    expected_top_layer[k],
                    atol=1.0e-6,
                )
Example #28
    def test_list_of_text_padding(self):
        from allennlp.data.token_indexers import PretrainedTransformerIndexer
        from allennlp.data.tokenizers import Token
        from allennlp.data.fields import (
            TextField,
            ListField,
        )
        from allennlp.data import Vocabulary

        word_indexer = {
            "tokens": PretrainedTransformerIndexer("albert-base-v2")
        }
        text_field = TextField(
            [
                Token(t, text_id=2, type_id=1)
                for t in ["▁allen", "n", "lp", "▁has", "▁no", "▁bugs", "."]
            ],
            word_indexer,
        )
        list_field = ListField([text_field])

        vocab = Vocabulary()
        list_field.index(vocab)

        padding_lengths = {
            "list_tokens___mask": 10,
            "list_tokens___token_ids": 10,
            "list_tokens___type_ids": 10,
            "num_fields": 2,
        }

        tensors = list_field.as_tensor(padding_lengths)["tokens"]
        assert tensors["mask"].size() == (2, 10)
        assert tensors["mask"][0, 0] == True  # noqa: E712
        assert tensors["mask"][0, 9] == False  # noqa: E712
        assert (tensors["mask"][1, :] == False).all()  # noqa: E712

        assert tensors["token_ids"].size() == (2, 10)
        assert tensors["token_ids"][0, 0] == 2
        assert tensors["token_ids"][0, 9] == 0
        assert (tensors["token_ids"][1, :] == 0).all()

        assert tensors["type_ids"].size() == (2, 10)
        assert tensors["type_ids"][0, 0] == 1
        assert tensors["type_ids"][0, 9] == 0
        assert (tensors["type_ids"][1, :] == 0).all()
Example #29
    def test_text_to_instance(self):
        dbo_classes = set([
            dbo for dbo in self.predicates if dbo.split("/")[-1][0].isupper()
        ])
        binary_predicates = set(self.predicates) - dbo_classes

        reader = LCQuADReader(executor=self.executor,
                              predicates=self.predicates)
        doc = self.dataset[0]
        # print(doc["logical_form"])
        # print([entity['uri'] for entity in doc['entities']])
        # print(doc.get('predicate_candidates', self.predicates))
        blah = reader.text_to_instance(
            [Token(x) for x in reader.tokenizer(doc["question"])],
            doc["logical_form"], [entity['uri'] for entity in doc['entities']],
            doc.get('predicate_candidates', self.predicates))
        print(blah)
Example #30
    def test_remap(self):
        bert_fixture = get_bert_test_fixture()
        indexer = bert_fixture['indexer']

        tokens = [Token(t) for t in 'The words dog overst .'.split()]
        vocab = Vocabulary()
        indexed = indexer.tokens_to_indices(tokens, vocab, 'wordpiece')

        original_span_indices = [[0, 0], [0, 1], [2, 3], [3, 3], [2, 4]]
        offsets = indexed['wordpiece-offsets']

        expected_remapped = [[1, 1], [1, 2], [3, 5], [4, 5], [3, 6]]

        remapped = remap_span_indices_after_subword_tokenization(
            original_span_indices, offsets, len(indexed['wordpiece']))

        self.assertEqual(expected_remapped, remapped)
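A hedged sketch of the remapping idea this test exercises (an illustration, not the library's implementation), assuming offsets[i] is the index of the last wordpiece of original token i and that [CLS] occupies wordpiece position 0:

def remap_spans(span_indices, offsets):
    remapped = []
    for start, end in span_indices:
        new_start = offsets[start - 1] + 1 if start > 0 else 1  # first wordpiece of the start token
        new_end = offsets[end]                                  # last wordpiece of the end token
        remapped.append([new_start, new_end])
    return remapped

# With offsets == [1, 2, 3, 5, 6] (i.e. "overst" splits into two wordpieces), this
# reproduces the expected_remapped values above: [[1, 1], [1, 2], [3, 5], [4, 5], [3, 6]].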