def test_elmo_token_representation_bos_eos(self):
    """Check the <S> and </S> embeddings that the character encoder adds.

    Each boundary token is indexed on its own and reshaped to a ``(1, 1, -1)``
    tensor.  The encoder output at position 0 (for ``<S>``) or 2 (for
    ``</S>``) must match the output at position 1, which is presumably the
    embedding of the input token itself.
    """
    char_indexer = ELMoTokenCharactersIndexer()
    encoder = _ElmoCharacterEncoder(self.options_file, self.weight_file)

    boundary_cases = [[0, "<S>"], [2, "</S>"]]
    for boundary_position, boundary_text in boundary_cases:
        indexed = char_indexer.tokens_to_indices(
            [Token(boundary_text)], Vocabulary()
        )
        char_ids = torch.from_numpy(numpy.array(indexed["tokens"])).view(1, 1, -1)
        token_embedding = encoder(char_ids)["token_embedding"]

        added_by_encoder = token_embedding[0, boundary_position, :].data.numpy()
        from_input_token = token_embedding[0, 1, :].data.numpy()
        assert numpy.allclose(added_by_encoder, from_input_token)
def predictions_to_labeled_instances(self, instance: Instance,
                                     outputs: Dict[str, numpy.ndarray]):
    """Return a deep copy of ``instance`` carrying a ``target_ids`` field.

    The first element of every entry in ``outputs["words"]`` becomes one
    target token.  The new ``TextField`` reuses the token indexers of the
    instance's ``"tokens"`` field.

    Returns a single-element list containing the labeled copy.
    """
    labeled = deepcopy(instance)
    source_tokens: TextField = instance["tokens"]  # type: ignore

    predicted_targets = []
    for candidates in outputs["words"]:
        # Only the first candidate of each entry is used as the label.
        predicted_targets.append(Token(candidates[0]))

    target_field = TextField(predicted_targets, source_tokens._token_indexers)
    labeled.add_field("target_ids", target_field, vocab=self._model.vocab)
    return [labeled]
def test_elmo_token_representation_bos_eos(self):
    """Check the <S> and </S> embeddings that the character encoder adds.

    The encoder is loaded from the ``options.json`` / ``lm_weights.hdf5``
    fixtures.  Each boundary token is indexed alone, reshaped to
    ``(1, 1, -1)``, and the encoder output at position 0 (``<S>``) or 2
    (``</S>``) is compared with the output at position 1.
    """
    char_indexer = ELMoTokenCharactersIndexer()
    options_path = os.path.join(FIXTURES, 'options.json')
    weights_path = os.path.join(FIXTURES, 'lm_weights.hdf5')
    encoder = _ElmoCharacterEncoder(options_path, weights_path)

    boundary_cases = [[0, '<S>'], [2, '</S>']]
    for boundary_position, boundary_text in boundary_cases:
        char_ids = char_indexer.token_to_indices(Token(boundary_text), Vocabulary())
        char_tensor = Variable(torch.from_numpy(numpy.array(char_ids))).view(1, 1, -1)
        token_embedding = encoder(char_tensor)['token_embedding']

        added_by_encoder = token_embedding[0, boundary_position, :].data.numpy()
        from_input_token = token_embedding[0, 1, :].data.numpy()
        assert numpy.allclose(added_by_encoder, from_input_token)
def _json_to_instance(self, json_dict: JsonDict) -> Instance:
    """
    Build an ``Instance`` from JSON in one of three accepted shapes:

    1. ``{"text": "..."}``
    2. ``{"text": "...", "tokens": ["..."]}``
    3. ``{"text": "...", "tokens": ["..."], "pos_tags": ["..."]}``

    Shape 1 is tokenized (and optionally POS-tagged) with the tokenizer held
    by this object.  Shapes 2 and 3 use the supplied tokens as-is.  Supplied
    POS tags are forwarded only when the model vocabulary has a ``pos_tags``
    namespace.
    """
    if 'tokens' in json_dict:
        reader_kwargs = {'tokens': [Token(word) for word in json_dict['tokens']]}
        tags_supplied = 'pos_tags' in json_dict
        if tags_supplied and 'pos_tags' in self._model.vocab._token_to_index:
            reader_kwargs['pos_tags'] = json_dict['pos_tags']
        return self._dataset_reader.text_to_instance(**reader_kwargs)

    # No pre-tokenized input: fall back to our own tokenizer / tagger.
    raw_text = json_dict['text']
    words = []
    tags = []
    for source_token in self._tokenizer.split_words(raw_text):
        words.append(Token(source_token.text))
        if not self._pos_tags:
            continue
        # Fine-grained tags come from ``tag_``, coarse ones from ``pos_``.
        tags.append(source_token.tag_ if self._fine_grained_tags else source_token.pos_)

    if 'pos_tags' in self._model.vocab._token_to_index and self._pos_tags:
        return self._dataset_reader.text_to_instance(tokens=words, pos_tags=tags)
    return self._dataset_reader.text_to_instance(tokens=words)
def __init__(self, db_id: str, utterance: str, tokenizer: Tokenizer,
             tables_file: str, dataset_path: str):
    """Set up the context for one utterance against one database.

    Stores the raw arguments, tokenizes the lower-cased utterance, loads the
    schema for ``db_id`` (re-reading ``tables_file`` when the class-level
    cache lacks it), builds the knowledge graph, and tokenizes the
    lower-cased text of every graph entity.
    """
    self.dataset_path = dataset_path
    self.tables_file = tables_file
    self.db_id = db_id
    self.utterance = utterance

    def simplify(raw_token):
        # Keep only the surface text and the lemma of a tokenizer token.
        # Per the original note, e.g. 'departments' would carry the lemma
        # 'department'.
        return Token(text=raw_token.text, lemma=raw_token.lemma_)

    raw_utterance_tokens = tokenizer.tokenize(utterance.lower())
    self.tokenized_utterance = [simplify(tok) for tok in raw_utterance_tokens]

    # The schema cache is shared through the class attribute; a miss replaces
    # it with whatever ``read_dataset_schema`` returns for this tables file.
    if db_id not in SpiderDBContext.schemas:
        SpiderDBContext.schemas = read_dataset_schema(self.tables_file)
    self.schema = SpiderDBContext.schemas[db_id]

    self.knowledge_graph = self.get_db_knowledge_graph(db_id)

    lowered_entity_texts = []
    for entity in self.knowledge_graph.entities:
        lowered_entity_texts.append(self.knowledge_graph.entity_text[entity].lower())

    tokenized_entities = tokenizer.batch_tokenize(lowered_entity_texts)
    self.entity_tokens = [
        [simplify(tok) for tok in entity_token_list]
        for entity_token_list in tokenized_entities
    ]
def test_elmo_bilm(self):
    """Compare the biLM top-layer output with stored reference embeddings.

    Sentences come from ``self._load_sentences_embeddings()`` and are indexed
    as ``sentences[k][i]`` for ``k`` in ``range(3)``.  After sentence
    boundaries are removed, both the mask lengths and the layer-2 activations
    must match the expected values.
    """
    sentences, expected_lm_embeddings = self._load_sentences_embeddings()
    elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)
    indexer = ELMoTokenCharactersIndexer()

    # One Instance per sentence, visiting the data group by group.
    instances = [
        Instance({
            "elmo": TextField(
                [Token(word) for word in sentence.split()],
                {'character_ids': indexer},
            )
        })
        for group in zip(*sentences)
        for sentence in group
    ]

    vocab = Vocabulary()
    iterator = BasicIterator(3)  # presumably a batch size of 3
    iterator.index_with(vocab)

    batches = iterator(instances, num_epochs=1, shuffle=False)
    for batch_index, tensors in enumerate(batches):
        bilm_output = elmo_bilm(tensors['elmo']['character_ids'])
        top_layer, mask = remove_sentence_boundaries(
            bilm_output['activations'][2], bilm_output['mask']
        )

        # The mask row sums must equal the whitespace-token counts.
        lengths = mask.data.numpy().sum(axis=1)
        expected_lengths = [
            len(sentences[k][batch_index].split()) for k in range(3)
        ]
        self.assertEqual(lengths.tolist(), expected_lengths)

        # The unpadded part of each sequence must match the reference values.
        expected_top_layer = [
            expected_lm_embeddings[k][batch_index] for k in range(3)
        ]
        for k, expected in enumerate(expected_top_layer):
            actual = top_layer[k, :lengths[k], :].data.numpy()
            self.assertTrue(numpy.allclose(actual, expected, atol=1.0e-6))
def test_tokens_to_indices_uses_pos_tags(self):
    """``DepLabelIndexer`` should emit ids from the ``dep_labels`` namespace.

    The second token of the sentence is expected to map to ``ROOT`` and the
    appended ``</S>`` token to ``NONE``.

    NOTE(review): the test name mentions POS tags, but the indexer under test
    is ``DepLabelIndexer`` -- confirm whether the name is a leftover.
    """
    sentence_tokens = self.tokenizer.split_words("This is a sentence.")
    sentence_tokens = list(sentence_tokens) + [Token("</S>")]

    vocab = Vocabulary()
    root_index = vocab.add_token_to_namespace("ROOT", namespace="dep_labels")
    none_index = vocab.add_token_to_namespace("NONE", namespace="dep_labels")
    indexer = DepLabelIndexer()

    second_token_ids = indexer.tokens_to_indices(
        [sentence_tokens[1]], vocab, "tokens1")
    assert second_token_ids == {"tokens1": [root_index]}

    appended_token_ids = indexer.tokens_to_indices(
        [sentence_tokens[-1]], vocab, "tokens-1")
    assert appended_token_ids == {"tokens-1": [none_index]}
def get_instances(self):
    """Return two fixture instances, each holding ``text1`` and ``text2``.

    Every field is a ``TextField`` built with ``self.token_indexer``.
    """
    def as_text_field(words):
        # One Token per word, all sharing this object's token indexer(s).
        return TextField([Token(word) for word in words], self.token_indexer)

    sentence = as_text_field([u"this", u"is", u"a", u"sentence", u"."])
    different_sentence = as_text_field(
        [u"this", u"is", u"a", u"different", u"sentence", u"."])
    here_sentence = as_text_field([u"here", u"is", u"a", u"sentence", u"."])
    short_sentence = as_text_field([u"this", u"is", u"short"])

    first_pair = Instance({u"text1": sentence, u"text2": different_sentence})
    second_pair = Instance({u"text1": here_sentence, u"text2": short_sentence})
    return [first_pair, second_pair]
def test_padding_lengths_are_computed_correctly(self):
    """Check ``TextField.get_padding_lengths`` for three indexer setups.

    The same five-token sentence is indexed with a word indexer only, with a
    character indexer only, and with both.
    """
    words = ["This", "is", "a", "sentence", "."]

    def padding_lengths_for(token_indexers):
        # Build a fresh field, index it against the test vocab, and report.
        field = TextField([Token(word) for word in words],
                          token_indexers=token_indexers)
        field.index(self.vocab)
        return field.get_padding_lengths()

    word_only = padding_lengths_for({"words": SingleIdTokenIndexer("words")})
    assert word_only == {"words_length": 5, "num_tokens": 5}

    chars_only = padding_lengths_for(
        {"characters": TokenCharactersIndexer("characters", min_padding_length=1)})
    assert chars_only == {"num_tokens": 5,
                          "characters_length": 5,
                          "num_token_characters": 8}

    chars_and_words = padding_lengths_for(
        {"characters": TokenCharactersIndexer("characters", min_padding_length=1),
         "words": SingleIdTokenIndexer("words")})
    assert chars_and_words == {"num_tokens": 5,
                               "characters_length": 5,
                               "words_length": 5,
                               "num_token_characters": 8}
def get_dataset(self):
    """Return a ``Dataset`` of two instances, each with ``text1`` and ``text2``.

    Every field is a ``TextField`` built with ``self.token_indexer``.
    """
    word_lists = [
        ["this", "is", "a", "sentence", "."],
        ["this", "is", "a", "different", "sentence", "."],
        ["here", "is", "a", "sentence", "."],
        ["this", "is", "short"],
    ]
    text_fields = []
    for word_list in word_lists:
        tokens = [Token(word) for word in word_list]
        text_fields.append(TextField(tokens, self.token_indexer))

    # Fields are paired in order: (0, 1) and (2, 3).
    instances = [
        Instance({"text1": text_fields[0], "text2": text_fields[1]}),
        Instance({"text1": text_fields[2], "text2": text_fields[3]}),
    ]
    return Dataset(instances)
def test_tokens_to_indices_uses_ner_tags(self):
    """``NerTagIndexer`` should emit ids from the ``ner_tags`` namespace.

    The second token ("Page") is expected to map to ``PERSON`` and the
    appended ``</S>`` token to ``NONE``.
    """
    sentence_tokens = self.tokenizer.split_words("Larry Page is CEO of Google.")
    sentence_tokens = list(sentence_tokens) + [Token("</S>")]

    vocab = Vocabulary()
    person_index = vocab.add_token_to_namespace('PERSON', namespace='ner_tags')
    none_index = vocab.add_token_to_namespace('NONE', namespace='ner_tags')
    # ORG is registered too, although its index is not asserted on.
    vocab.add_token_to_namespace('ORG', namespace='ner_tags')
    indexer = NerTagIndexer(namespace='ner_tags')

    second_token_ids = indexer.tokens_to_indices(
        [sentence_tokens[1]], vocab, "tokens1")
    assert second_token_ids == {"tokens1": [person_index]}

    appended_token_ids = indexer.tokens_to_indices(
        [sentence_tokens[-1]], vocab, "tokens-1")
    assert appended_token_ids == {"tokens-1": [none_index]}
def _add_transformers_vocab_if_necessary(self, vocab):
    """Adds the transformers vocabulary to the `vocab`, if an indexer exists

    Parameters
    ----------
    vocab
        The transformers vocabulary will be added to this vocab
    """
    indexer = self.backbone.featurizer.indexer.get(TransformersFeatures.namespace)
    if indexer is None:
        # No transformers feature configured; nothing to add.
        return

    # Per the original author's note, AllenNLP's PretrainedTransformerIndexer
    # adds its own vocabulary to the given vocab on the first
    # `tokens_to_indices()` call, so a dummy token is indexed to trigger it.
    indexer.tokens_to_indices([Token("")], vocab)
def __init__(self, db_id: str, tokenizer: Tokenizer, tables_file: str,
             database_path: str, utterance: List[Token], bert_mode: str = "v0"):
    """Set up the context for an already-tokenized utterance on one database.

    Stores the arguments, loads the schema and the id-to-column / id-to-table
    maps for ``db_id`` (re-reading ``tables_file`` when the class-level cache
    lacks it), builds the knowledge graph, and tokenizes the lower-cased text
    of every graph entity.
    """
    self.database_path = database_path
    self.tables_file = tables_file
    self.db_id = db_id
    self.tokenized_utterance = utterance

    # A cache miss replaces all three class-level maps at once.
    if db_id not in SparcDBContext.db_schemas:
        loaded = read_dataset_schema(self.tables_file)
        (SparcDBContext.db_schemas,
         SparcDBContext.db_schemas_id_col,
         SparcDBContext.db_schemas_id_tab) = loaded

    self.schema = SparcDBContext.db_schemas[db_id]
    self.id_to_col = SparcDBContext.db_schemas_id_col[db_id]
    self.id_to_tab = SparcDBContext.db_schemas_id_tab[db_id]
    self.bert_mode = bert_mode

    self.knowledge_graph = self.get_db_knowledge_graph(db_id)

    def simplify(raw_token):
        # A '-PRON-' lemma is replaced by the token's own text.
        if raw_token.lemma_ == '-PRON-':
            return Token(text=raw_token.text, lemma_=raw_token.text)
        return Token(text=raw_token.text, lemma_=raw_token.lemma_)

    lowered_entity_texts = []
    for entity in self.knowledge_graph.entities:
        lowered_entity_texts.append(self.knowledge_graph.entity_text[entity].lower())

    tokenized_entities = tokenizer.batch_tokenize(lowered_entity_texts)
    self.entity_tokens = [
        [simplify(tok) for tok in entity_token_list]
        for entity_token_list in tokenized_entities
    ]
def text_to_instance(self, text: str, target_matrix_start: np.array,
                     target_matrix_end: np.array) -> Instance:
    """Build a training instance from text and its start/end target matrices.

    These inputs are what the model is trained on.  Per the original note,
    validation should use a separately written data-reader class.
    """
    if self.pretrained_tokenizer is None:
        # Without a pretrained tokenizer, every character is one token.
        tokens = [Token(char) for char in text]
    else:
        tokens = get_word_from_pretrained(self.pretrained_tokenizer, text)

    fields = {}
    fields["tokens"] = TextField(tokens, self._token_indexers)
    fields["target_start"] = ArrayField(target_matrix_start)
    fields["target_end"] = ArrayField(target_matrix_end)
    # Not needed for training; the validation set needs it, so the slot is
    # kept as a placeholder.
    fields["metadata"] = MetadataField(None)
    return Instance(fields)
def _sentences_to_ids(self, sentences):
    """Convert tokenized sentences into the batch's ELMo character-id tensor.

    Each sentence (an iterable of token strings) becomes one ``Instance``
    with a single ``elmo`` text field.  The instances are batched, indexed
    against an empty ``Vocabulary``, and the ``character_ids`` entry of the
    tensor dict is returned.
    """
    indexer = ELMoTokenCharactersIndexer()

    def to_instance(sentence):
        token_field = TextField([Token(word) for word in sentence],
                                {u'character_ids': indexer})
        return Instance({u'elmo': token_field})

    batch = Batch([to_instance(sentence) for sentence in sentences])
    batch.index_instances(Vocabulary())
    tensors = batch.as_tensor_dict()
    return tensors[u'elmo'][u'character_ids']
def test_tokens_to_indices_uses_ner_tags(self):
    """``NerTagIndexer`` should emit ids from the ``ner_tags`` namespace.

    The second token ("Page") is expected to map to ``PERSON`` and the
    appended ``</S>`` token to ``NONE``.  Results are returned under the
    ``"tokens"`` key.
    """
    sentence_tokens = self.tokenizer.tokenize("Larry Page is CEO of Google.")
    sentence_tokens = list(sentence_tokens) + [Token("</S>")]

    vocab = Vocabulary()
    person_index = vocab.add_token_to_namespace("PERSON", namespace="ner_tags")
    none_index = vocab.add_token_to_namespace("NONE", namespace="ner_tags")
    # ORG is registered too, although its index is not asserted on.
    vocab.add_token_to_namespace("ORG", namespace="ner_tags")
    indexer = NerTagIndexer(namespace="ner_tags")

    second_token_ids = indexer.tokens_to_indices([sentence_tokens[1]], vocab)
    assert second_token_ids == {"tokens": [person_index]}

    appended_token_ids = indexer.tokens_to_indices([sentence_tokens[-1]], vocab)
    assert appended_token_ids == {"tokens": [none_index]}
def batch_to_ids(batch):
    """
    Turn a batch (a list of tokenized sentences) into a batch of padded
    character ids.
    """
    # NOTE(review): ``indexer`` is not defined in this function; it is
    # presumably a module-level token indexer -- confirm it exists.
    def to_instance(sentence):
        token_field = TextField([Token(word) for word in sentence],
                                {'character_ids': indexer})
        return Instance({"elmo": token_field})

    dataset = Batch([to_instance(sentence) for sentence in batch])
    # Indexing is done against a fresh, empty vocabulary.
    dataset.index_instances(Vocabulary())
    return dataset.as_tensor_dict()['elmo']['character_ids']
def test_equality(self):
    """Check ``IndexField`` equality against ints and other index fields.

    Two fields with the same index and the same sequence field compare
    equal.  A field with the same index but a different sequence field does
    not.
    """
    same_a = IndexField(4, self.text)
    same_b = IndexField(4, self.text)

    other_sequence = TextField(
        [Token(word) for word in ["AllenNLP", "is", "the", "bomb", "!"]],
        {"words": SingleIdTokenIndexer("words")},
    )
    different = IndexField(4, other_sequence)

    # Comparison against a plain int.
    assert same_a == 4
    # Reflexive and same-sequence equality.
    assert same_a == same_a
    assert same_a == same_b
    # Same index, different sequence field.
    assert same_a != different
    assert same_b != different
    assert different == different
def text_to_instance(self, abstract: str, labels: List[str] = None, title: str = None):
    """Build an instance from an abstract, tokenized one character at a time.

    The instance always has an ``abstract`` text field and a ``metadata``
    field holding the character tokens.  A ``labels`` sequence-label field is
    added when ``labels`` is truthy.  ``title`` is accepted but not used.
    """
    # Character-level units: one Token per character.
    char_tokens = [Token(char) for char in abstract]
    abstract_field = TextField(char_tokens, self._token_indexers)

    fields = {
        'abstract': abstract_field,
        'metadata': MetadataField(char_tokens),
    }
    if labels:
        fields['labels'] = SequenceLabelField(labels, abstract_field)
    return Instance(fields)
def test_blank_pos_tag(self):
    """Tokens without a POS tag should be counted and indexed as ``NONE``."""
    words = "allennlp is awesome .".split(" ")
    tokens = [Token(word) for word in words]
    indexer = PosTagIndexer()

    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)

    # Per the original note, spacy marks "no POS tag" with an empty string,
    # which is converted to "NONE".
    assert counter["pos_tokens"]["NONE"] == 4

    vocab = Vocabulary(counter)
    none_index = vocab.get_token_index("NONE", "pos_tokens")

    # Indexing must not raise, and every token must map to NONE.
    indices = indexer.tokens_to_indices(tokens, vocab)
    assert {"tokens": [none_index] * 4} == indices
def test_blank_ner_tag(self):
    """Tokens with an empty ``ent_type_`` should be counted and indexed as ``NONE``."""
    words = "allennlp is awesome .".split(" ")
    tokens = []
    for word in words:
        # Explicitly blank out the entity type on every token.
        tokens.append(Token(word)._replace(ent_type_=""))
    indexer = NerTagIndexer()

    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)

    # Per the original note, spacy marks "no NER tag" with an empty string,
    # which is converted to "NONE".
    assert counter["ner_tokens"]["NONE"] == 4

    vocab = Vocabulary(counter)
    none_index = vocab.get_token_index("NONE", "ner_tokens")

    # Indexing must not raise, and every token must map to NONE.
    indices = indexer.tokens_to_indices(tokens, vocab, index_name="ner")
    assert {"ner": [none_index] * 4} == indices
def _read(self, file_path: str) -> Iterable[Instance]:
    """Yield one instance per query stored in a pickled dataset file.

    The pickle must contain a ``(dataset, dicts)`` pair.  ``dicts`` holds the
    ``token_ids``, ``intent_ids`` and ``slot_ids`` mappings from text to id;
    they are inverted here so that the id sequences in ``dataset`` can be
    turned back into text.

    For each index, ``dataset["query"]`` gives the token ids,
    ``dataset["slot_labels"]`` the entity ids, and the first element of
    ``dataset["intent_labels"]`` the intent id.
    """
    # SECURITY: ``pickle.load`` can execute arbitrary code while
    # deserializing.  Only point this reader at trusted dataset files.
    # The file is closed as soon as loading is done, so no handle is held
    # open while the generator is suspended between yields.
    with open(file_path, 'rb') as stream:
        dataset, dicts = pickle.load(stream)

    query_idx2text = {idx: text for text, idx in dicts["token_ids"].items()}
    intent_idx2text = {idx: text for text, idx in dicts["intent_ids"].items()}
    entities_idx2text = {idx: text for text, idx in dicts["slot_ids"].items()}

    queries = dataset["query"]
    # The index is kept because three parallel sequences are read with it.
    for index in range(len(queries)):
        query = [Token(text=query_idx2text[idx]) for idx in queries[index]]
        entities = [entities_idx2text[idx] for idx in dataset["slot_labels"][index]]
        intent = intent_idx2text[dataset["intent_labels"][index][0]]
        yield self.text_to_instance(query, entities, intent)
def get_vocab_and_both_elmo_indexed_ids(batch: List[List[str]]):
    """Index a batch with both ELMo character ids and single token ids.

    Each sentence becomes one ``Instance`` whose ``elmo`` text field is
    indexed by an ``ELMoTokenCharactersIndexer`` (``character_ids``) and a
    ``SingleIdTokenIndexer`` (``tokens``).

    Returns the vocabulary built from the instances and the ``elmo`` entry of
    the batch's tensor dict.
    """
    token_indexers = {
        'character_ids': ELMoTokenCharactersIndexer(),
        'tokens': SingleIdTokenIndexer(),
    }

    instances = []
    for sentence in batch:
        sentence_tokens = [Token(word) for word in sentence]
        # Each field gets its own dict holding the two shared indexers.
        sentence_field = TextField(sentence_tokens, dict(token_indexers))
        instances.append(Instance({"elmo": sentence_field}))

    dataset = Batch(instances)
    vocab = Vocabulary.from_instances(instances)
    dataset.index_instances(vocab)
    return vocab, dataset.as_tensor_dict()["elmo"]
def test_max_vocab_size_partial_dict(self):
    """A per-namespace ``max_vocab_size`` should cap only that namespace.

    Only ``tokens`` is limited (to 1).  ``token_characters`` is left
    unrestricted.
    """
    words = 'Abc def ghi jkl mno pqr stu vwx yz'.split(' ')
    indexers = {
        "tokens": SingleIdTokenIndexer(),
        "token_characters": TokenCharactersIndexer(min_padding_length=3),
    }
    text_field = TextField([Token(word) for word in words], indexers)
    dataset = Batch([Instance({'text': text_field})])

    params = Params({"max_vocab_size": {"tokens": 1}})
    vocab = Vocabulary.from_params(params=params, instances=dataset)

    token_entries = vocab.get_index_to_token_vocabulary("tokens").values()
    # 1 + 2: the one allowed token plus two extra entries
    # (presumably padding and OOV).
    assert len(token_entries) == 3

    character_entries = vocab.get_index_to_token_vocabulary("token_characters").values()
    # 26 + 2: one entry per distinct character plus the same two extras.
    assert len(character_entries) == 28
def predictions_to_labeled_instances(self, instance: Instance,
                                     outputs: Dict[str, numpy.ndarray]):
    """Return a duplicate of ``instance`` carrying a ``target_ids`` field.

    ``outputs["words"]`` and ``outputs["token_ids"]`` are walked in lockstep.
    The first element of each words entry becomes the token text, and the
    paired token id becomes its ``text_id``.  The new ``TextField`` reuses
    the token indexers of the instance's ``"tokens"`` field.

    Returns a single-element list containing the labeled duplicate.
    """
    labeled = instance.duplicate()
    source_tokens: TextField = instance["tokens"]  # type: ignore

    predicted_targets = []
    for candidates, token_id in zip(outputs["words"], outputs["token_ids"]):
        predicted_targets.append(Token(candidates[0], text_id=token_id))

    target_field = TextField(predicted_targets, source_tokens._token_indexers)
    labeled.add_field("target_ids", target_field, vocab=self._model.vocab)
    return [labeled]
def text_to_instance(self, tokenized_sentence: List[str],
                     spans: List[List[int]]) -> Instance:
    """Build an instance with a ``tokens`` field and a list of ``spans``.

    Each span is given as ``(start, end_exclusive)``.  ``SpanField`` receives
    ``end_exclusive - 1`` as its end index.
    """
    sentence_field = TextField(
        [Token(text=word) for word in tokenized_sentence],
        self._token_indexers,
    )
    span_fields = [
        SpanField(start, end_exclusive - 1, sentence_field)
        for start, end_exclusive in spans
    ]
    fields: Dict[str, Field] = {
        "tokens": sentence_field,
        "spans": ListField(span_fields),
    }
    return Instance(fields)
def test_elmo_bilm(self):
    """Compare the biLM top-layer output with stored reference embeddings.

    Sentences come from ``self._load_sentences_embeddings()`` and are indexed
    as ``sentences[k][i]`` for ``k`` in ``range(3)``.  After sentence
    boundaries are removed, both the mask lengths and the layer-2 activations
    must match the expected values.
    """
    sentences, expected_lm_embeddings = self._load_sentences_embeddings()
    elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)
    indexer = ELMoTokenCharactersIndexer()

    def to_instance(sentence):
        # One TextField per whitespace-tokenized sentence.
        words = [Token(word) for word in sentence.split()]
        return Instance({"elmo": TextField(words, {"character_ids": indexer})})

    instances = []
    for group in zip(*sentences):
        instances.extend(to_instance(sentence) for sentence in group)

    vocab = Vocabulary()
    loader = SimpleDataLoader(instances, 3)
    loader.index_with(vocab)

    for batch_index, tensors in enumerate(loader):
        character_ids = tensors["elmo"]["character_ids"]["elmo_tokens"]
        bilm_output = elmo_bilm(character_ids)
        top_layer, mask = remove_sentence_boundaries(
            bilm_output["activations"][2], bilm_output["mask"]
        )

        # The mask row sums must equal the whitespace-token counts.
        lengths = mask.data.numpy().sum(axis=1)
        expected_lengths = [
            len(sentences[k][batch_index].split()) for k in range(3)
        ]
        assert lengths.tolist() == expected_lengths

        # The unpadded part of each sequence must match the reference values.
        expected_top_layer = [
            expected_lm_embeddings[k][batch_index] for k in range(3)
        ]
        for k, expected in enumerate(expected_top_layer):
            actual = top_layer[k, :lengths[k], :].data.numpy()
            assert numpy.allclose(actual, expected, atol=1.0e-6)
def test_list_of_text_padding(self):
    """Padding a ``ListField`` of transformer-indexed text to a larger shape.

    A single seven-token text field is wrapped in a ``ListField`` and padded
    to 2 fields of length 10.  Row 0 must carry the real values followed by
    padding, and row 1 must be padding only.
    """
    from allennlp.data.token_indexers import PretrainedTransformerIndexer
    from allennlp.data.tokenizers import Token
    from allennlp.data.fields import (
        TextField,
        ListField,
    )
    from allennlp.data import Vocabulary

    word_pieces = ["▁allen", "n", "lp", "▁has", "▁no", "▁bugs", "."]
    word_indexer = {"tokens": PretrainedTransformerIndexer("albert-base-v2")}
    text_field = TextField(
        [Token(piece, text_id=2, type_id=1) for piece in word_pieces],
        word_indexer,
    )
    list_field = ListField([text_field])
    list_field.index(Vocabulary())

    padding_lengths = {
        "list_tokens___mask": 10,
        "list_tokens___token_ids": 10,
        "list_tokens___type_ids": 10,
        "num_fields": 2,
    }
    tensors = list_field.as_tensor(padding_lengths)["tokens"]

    mask = tensors["mask"]
    assert mask.size() == (2, 10)
    assert mask[0, 0] == True  # noqa: E712
    assert mask[0, 9] == False  # noqa: E712
    assert (mask[1, :] == False).all()  # noqa: E712

    # Every token was created with text_id=2; padding positions are 0.
    token_ids = tensors["token_ids"]
    assert token_ids.size() == (2, 10)
    assert token_ids[0, 0] == 2
    assert token_ids[0, 9] == 0
    assert (token_ids[1, :] == 0).all()

    # Every token was created with type_id=1; padding positions are 0.
    type_ids = tensors["type_ids"]
    assert type_ids.size() == (2, 10)
    assert type_ids[0, 0] == 1
    assert type_ids[0, 9] == 0
    assert (type_ids[1, :] == 0).all()
def test_text_to_instance(self):
    """Smoke-test ``LCQuADReader.text_to_instance`` on the first dataset entry.

    The resulting instance is only printed.  No assertion is made on it.
    """
    # NOTE(review): these two sets are computed but never used below.
    # Predicates whose last path segment starts with an upper-case letter
    # are treated as classes; the rest are the binary predicates.
    dbo_classes = {
        dbo for dbo in self.predicates if dbo.split("/")[-1][0].isupper()
    }
    binary_predicates = set(self.predicates) - dbo_classes

    reader = LCQuADReader(executor=self.executor, predicates=self.predicates)
    doc = self.dataset[0]

    question_tokens = [Token(piece) for piece in reader.tokenizer(doc["question"])]
    logical_form = doc["logical_form"]
    entity_uris = [entity['uri'] for entity in doc['entities']]
    # Fall back to the full predicate list when the doc has no candidates.
    predicate_candidates = doc.get('predicate_candidates', self.predicates)

    instance = reader.text_to_instance(
        question_tokens, logical_form, entity_uris, predicate_candidates)
    print(instance)
def test_remap(self):
    """Word-level span indices should be remapped to wordpiece positions.

    The sentence is indexed with the BERT fixture indexer.  The resulting
    ``wordpiece-offsets`` and the wordpiece count drive the remapping of five
    word-level spans.
    """
    indexer = get_bert_test_fixture()['indexer']
    words = 'The words dog overst .'.split()
    tokens = [Token(word) for word in words]

    indexed = indexer.tokens_to_indices(tokens, Vocabulary(), 'wordpiece')
    wordpiece_offsets = indexed['wordpiece-offsets']
    wordpiece_count = len(indexed['wordpiece'])

    word_level_spans = [[0, 0], [0, 1], [2, 3], [3, 3], [2, 4]]
    expected_remapped = [[1, 1], [1, 2], [3, 5], [4, 5], [3, 6]]

    remapped = remap_span_indices_after_subword_tokenization(
        word_level_spans, wordpiece_offsets, wordpiece_count)
    self.assertEqual(expected_remapped, remapped)