def test_multilabel_field_empty_field_works(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("label1", namespace="test_empty_labels")
    vocab.add_token_to_namespace("label2", namespace="test_empty_labels")

    f = MultiLabelField([], label_namespace="test_empty_labels")
    f.index(vocab)
    tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0]))

    g = f.empty_field()
    g.index(vocab)
    tensor = g.as_tensor(g.get_padding_lengths()).detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0]))

    h = MultiLabelField(
        [0, 0, 1], label_namespace="test_empty_labels", num_labels=3, skip_indexing=True
    )
    tensor = h.empty_field().as_tensor(None).detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0, 0]))
def test_token_type_ids(self):
    tokenizer = SpacyTokenizer()

    sentence = "the laziest fox"
    tokens = tokenizer.tokenize(sentence)
    #  2  15  10 11 6   17    2  15  10 11 6
    # the laziest fox [SEP] the laziest fox
    tokens = (
        tokens + [Token("[SEP]")] + tokens
    )  # have to do this b/c tokenizer splits `[SEP]` in three

    vocab = Vocabulary()
    vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
    token_indexer = PretrainedBertIndexer(str(vocab_path))

    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab)

    # [CLS] 2, 15, 10, 11, 6, 17, 2 15, 10, 11, 6, [SEP]
    assert indexed_tokens["token_type_ids"] == [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
def main():
    infilename = 'test/fixtures/bioul_to_span.json'
    with open(infilename) as f:
        d = json.load(f)
    docs = d['tag']

    vocab = Vocabulary()
    vocab.add_token_to_namespace('O', namespace='span_labels')  # reserved label for no-entity
    for doc in docs:
        for label in doc:
            if label != 'O':
                # drop the first two characters; they are not part of the span label
                span_label = label[2:]
                vocab.add_token_to_namespace(
                    span_label, namespace='span_labels')  # TODO: is this the right namespace?

    # this function expects the vocab to already be initialized with span labels
    batched_bioul_to_span_tesnors(docs, vocab)
def test_max_length(self):
    config = BertConfig(len(self.token_indexer.vocab))
    model = BertModel(config)
    embedder = BertEmbedder(model)

    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
    sentence = "the " * 1000
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()
    instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})

    batch = Batch([instance])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]
    embedder(tokens["bert"], tokens["bert-offsets"])
def load_elmo_model():
    elmo_embedders = ElmoTokenEmbedder(OPTION_FILE, WEIGHT_FILE)
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedders})
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(word_embeddings.get_output_dim(),
                      HIDDEN_DIM,
                      bidirectional=True,
                      batch_first=True))
    vocabulary = Vocabulary()
    model = BaseModel(word_embeddings=word_embeddings,
                      encoder=encoder,
                      vocabulary=vocabulary)

    output_elmo_model_file = os.path.join(PRETRAINED_ELMO, "lstm_elmo_model.bin")
    model.load_state_dict(torch.load(output_elmo_model_file))
    return model
def test_namespaces(self):
    vocab = Vocabulary()
    initial_vocab_size = vocab.get_vocab_size()
    word_index = vocab.add_token_to_namespace("word", namespace='1')
    assert "word" in vocab.get_index_to_token_vocabulary(namespace='1').values()
    assert vocab.get_token_index("word", namespace='1') == word_index
    assert vocab.get_token_from_index(word_index, namespace='1') == "word"
    assert vocab.get_vocab_size(namespace='1') == initial_vocab_size + 1

    # Now add it again, in a different namespace and a different word, and make sure it's like
    # new.
    word2_index = vocab.add_token_to_namespace("word2", namespace='2')
    word_index = vocab.add_token_to_namespace("word", namespace='2')
    assert "word" in vocab.get_index_to_token_vocabulary(namespace='2').values()
    assert "word2" in vocab.get_index_to_token_vocabulary(namespace='2').values()
    assert vocab.get_token_index("word", namespace='2') == word_index
    assert vocab.get_token_index("word2", namespace='2') == word2_index
    assert vocab.get_token_from_index(word_index, namespace='2') == "word"
    assert vocab.get_token_from_index(word2_index, namespace='2') == "word2"
    assert vocab.get_vocab_size(namespace='2') == initial_vocab_size + 2
def test_from_params(self):
    # Save a vocab to check we can load it from_params.
    vocab_dir = self.TEST_DIR / "vocab_save"
    vocab = Vocabulary(non_padded_namespaces=["a", "c"])
    vocab.add_tokens_to_namespace(["a0", "a1", "a2"], namespace="a")  # non-padded, should start at 0
    vocab.add_tokens_to_namespace(["b2", "b3"], namespace="b")  # padded, should start at 2
    vocab.save_to_files(vocab_dir)

    params = Params({"type": "from_files", "directory": vocab_dir})
    vocab2 = Vocabulary.from_params(params)
    assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
    assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")

    # Test case where we build a vocab from a dataset.
    vocab2 = Vocabulary.from_params(Params({}), instances=self.dataset)
    assert vocab2.get_index_to_token_vocabulary("tokens") == {
        0: "@@PADDING@@",
        1: "@@UNKNOWN@@",
        2: "a",
        3: "c",
        4: "b",
    }

    # Test that from_params raises when we have neither a dataset nor a vocab directory.
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(Params({}))

    # Test that from_params raises when there are any other dict keys
    # present apart from 'directory' and we aren't calling from_dataset.
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(
            Params({"type": "from_files", "directory": vocab_dir, "min_count": {"tokens": 2}})
        )
def test_from_params(self):
    # Save a vocab to check we can load it from_params.
    vocab_dir = os.path.join(self.TEST_DIR, 'vocab_save')
    vocab = Vocabulary(non_padded_namespaces=["a", "c"])
    vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
    vocab.add_token_to_namespace("a1", namespace="a")
    vocab.add_token_to_namespace("a2", namespace="a")
    vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
    vocab.add_token_to_namespace("b3", namespace="b")
    vocab.save_to_files(vocab_dir)

    params = Params({"directory_path": vocab_dir})
    vocab2 = Vocabulary.from_params(params)
    assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
    assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")

    # Test case where we build a vocab from a dataset.
    vocab2 = Vocabulary.from_params(Params({}), self.dataset)
    assert vocab2.get_index_to_token_vocabulary("tokens") == {
        0: '@@PADDING@@',
        1: '@@UNKNOWN@@',
        2: 'a',
        3: 'c',
        4: 'b'
    }

    # Test that from_params raises when we have neither a dataset nor a vocab directory.
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(Params({}))

    # Test that from_params raises when there are any other dict keys
    # present apart from 'vocabulary_directory' and we aren't calling from_dataset.
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(
            Params({
                "directory_path": vocab_dir,
                "min_count": 2
            }))
def test_truncate_window_dont_split_wordpieces(self):
    """
    Tests that the sentence is not truncated inside a word made of 2 or more wordpieces.
    """
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

    sentence = "the quickest quick brown fox jumped over the quickest dog"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()
    vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
    token_indexer = PretrainedBertIndexer(str(vocab_path),
                                          truncate_long_sequences=True,
                                          use_starting_offsets=True,
                                          max_pieces=12)

    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

    # 16 = [CLS], 17 = [SEP]
    # 1 full window + 1 half window with start/end tokens
    assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 17]
    # We could fit one more piece here, but we don't, so we avoid cutting
    # in the middle of a word
    assert indexed_tokens["bert-offsets"] == [1, 2, 4, 5, 6, 7, 8, 9]
    assert indexed_tokens["bert-type-ids"] == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

    token_indexer = PretrainedBertIndexer(str(vocab_path),
                                          truncate_long_sequences=True,
                                          use_starting_offsets=False,
                                          max_pieces=12)

    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

    # 16 = [CLS], 17 = [SEP]
    # 1 full window + 1 half window with start/end tokens
    assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 17]
    # We could fit one more piece here, but we don't, so we avoid cutting
    # in the middle of a word
    assert indexed_tokens["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8, 9]
def test():
    from pprint import pprint

    params = Params({'token_embedder': {'num_embeddings': 4, 'embedding_dim': 3}})
    vocab = Vocabulary()
    while True:
        vocab_size = vocab.get_vocab_size()
        if vocab_size == 4:
            break
        vocab.add_token_to_namespace('a' + str(vocab_size))
    model = BaselineModel(params=params, vocab=vocab)

    premise = {'tokens': torch.randint(low=0, high=4, size=(5, 6))}
    hypothesis = {'tokens': torch.randint(low=0, high=4, size=(5, 7))}
    label = torch.randint(low=0, high=3, size=(5,))
    output = model(premise=premise, hypothesis=hypothesis, label=label)
    pprint(output)
    pprint(model.get_metrics())
def test_padding_for_equal_length_indices(self):
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

    #  2  3     5     6   8      9    2  14   12
    sentence = "the quick brown fox jumped over the lazy dog"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()
    instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})

    batch = Batch([instance])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]

    assert tokens["bert"].tolist() == [[16, 2, 3, 5, 6, 8, 9, 2, 14, 12, 17]]
    assert tokens["bert-offsets"].tolist() == [[1, 2, 3, 4, 5, 6, 7, 8, 9]]
def create_target_weight():
    vocab = Vocabulary().from_files("data/vocabulary")
    token_weight_list = []
    for index, token in vocab.get_index_to_token_vocabulary().items():
        token_weight = get_target_distribution(token, vocab)
        token_weight_list.append(token_weight)
    weight = torch.stack(token_weight_list)
    s = Score.score
    torch.save(
        weight,
        "data/targets/target_{}{}{}{}{}{}.th".format(
            s["token_name"],
            s["key_name"],
            s["key_number"],
            s["triad_form"],
            s["figbass"],
            s["note_pair"],
        ),
    )
def test_get_embedding_layer_uses_correct_embedding_dim(self):
    vocab = Vocabulary()
    embeddings_filename = self.TEST_DIR + "embeddings.gz"
    with gzip.open(embeddings_filename, 'wb') as embeddings_file:
        embeddings_file.write("word1 1.0 2.3 -1.0\n".encode('utf-8'))
        embeddings_file.write("word2 0.1 0.4 -4.0\n".encode('utf-8'))
    embedding_layer = get_pretrained_embedding_layer(embeddings_filename, vocab)
    assert embedding_layer.get_output_dim() == 3

    with gzip.open(embeddings_filename, 'wb') as embeddings_file:
        embeddings_file.write("word1 1.0 2.3 -1.0 3.1\n".encode('utf-8'))
        embeddings_file.write("word2 0.1 0.4 -4.0 -1.2\n".encode('utf-8'))
    embedding_layer = get_pretrained_embedding_layer(embeddings_filename, vocab)
    assert embedding_layer.get_output_dim() == 4

    embedding_layer = get_pretrained_embedding_layer(embeddings_filename, vocab, projection_dim=2)
    assert embedding_layer.get_output_dim() == 2
def __init__(self, split, only_use_relevant_dets=True, add_image_as_a_box=True,
             embs_to_load='bert_da', conditioned_answer_choice=0):
    """
    :param split: train, val, or test
    :param mode: answer or rationale
    :param only_use_relevant_dets: True, if we will only use the detections mentioned in the
                                   question and answer. False, if we should use all detections.
    :param add_image_as_a_box: True to add the image in as an additional 'detection'. It'll go
                               first in the list of objects.
    :param embs_to_load: Which precomputed embeddings to load.
    :param conditioned_answer_choice: If you're in test mode, the answer labels aren't provided,
                                      which could be a problem for the QA->R task. Pass in
                                      'conditioned_answer_choice=i' to always condition on the
                                      i-th answer. What does this mean? Why is this operation
                                      needed at test time? Explanation:
                                      https://groups.google.com/forum/?hl=en#!topic/visualcommonsense/lxEgFYRz5ho
    """
    if split not in ('test', 'train', 'val'):
        raise ValueError("Mode must be in test, train, or val. Supplied {}".format('answer-rationale'))

    print("Loading {} embeddings".format(split), flush=True)
    self.split = split
    self.only_use_relevant_dets = only_use_relevant_dets
    print("Only relevant dets" if only_use_relevant_dets else "Using all detections", flush=True)

    self.add_image_as_a_box = add_image_as_a_box
    self.conditioned_answer_choice = conditioned_answer_choice

    with open(os.path.join(VCR_ANNOTS_DIR, split, '{}.jsonl'.format(split)), 'r') as f:
        self.items = np.array(list(f))

    self.token_indexers = {'elmo': ELMoTokenCharactersIndexer()}
    self.vocab = Vocabulary()

    with open(os.path.join(VCR_ANNOTS_DIR, 'dataloaders', 'cocoontology.json'), 'r') as f:
        coco = json.load(f)
    self.coco_objects = ['__background__'] + [x['name'] for k, x in sorted(coco.items(), key=lambda x: int(x[0]))]
    # '__background__' is included here; think about how to make use of the background class later
    self.coco_obj_to_ind = {o: i for i, o in enumerate(self.coco_objects)}

    self.embs_to_load = embs_to_load
    self.h5fn_answer = os.path.join(VCR_ANNOTS_DIR, self.split, f'{self.embs_to_load}_answer_{self.split}.h5')
    self.h5fn_rationale = os.path.join(VCR_ANNOTS_DIR, self.split, f'{self.embs_to_load}_rationale_{self.split}.h5')
    self.h5fn_image = os.path.join(VCR_ANNOTS_DIR, self.split, f'attribute_features_{self.split}.h5')
def word_embeddings(self):
    words = re.split(r'\W+', self.text)
    text = ' '.join(words)
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
    tokens = tokenizer.tokenize(text)

    vocab = Vocabulary()
    token_indexer = PretrainedBertIndexer('bert-base-uncased')
    instance = Instance({"tokens": TextField(tokens, {'bert': token_indexer})})

    batch = Batch([instance])
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]

    model = PretrainedBertEmbedder('bert-base-uncased')
    bert_vectors = model(tokens["bert"])
    return bert_vectors
def test_set_from_file_reads_non_padded_files(self):
    vocab_filename = self.TEST_DIR / "vocab_file"
    with codecs.open(vocab_filename, "w", "utf-8") as vocab_file:
        vocab_file.write("B-PERS\n")
        vocab_file.write("I-PERS\n")
        vocab_file.write("O\n")
        vocab_file.write("B-ORG\n")
        vocab_file.write("I-ORG\n")

    vocab = Vocabulary()
    vocab.set_from_file(vocab_filename, is_padded=False, namespace="tags")

    assert vocab.get_token_index("B-PERS", namespace="tags") == 0
    assert vocab.get_token_index("I-PERS", namespace="tags") == 1
    assert vocab.get_token_index("O", namespace="tags") == 2
    assert vocab.get_token_index("B-ORG", namespace="tags") == 3
    assert vocab.get_token_index("I-ORG", namespace="tags") == 4
    assert vocab.get_token_from_index(0, namespace="tags") == "B-PERS"
    assert vocab.get_token_from_index(1, namespace="tags") == "I-PERS"
    assert vocab.get_token_from_index(2, namespace="tags") == "O"
    assert vocab.get_token_from_index(3, namespace="tags") == "B-ORG"
    assert vocab.get_token_from_index(4, namespace="tags") == "I-ORG"
def __init__(
    self,
    span_encoder: Seq2SeqEncoder,
    input_dropout: float = 0.3,
    class_embs: bool = True,
    initializer: InitializerApplicator = InitializerApplicator(),
    learned_omcs: dict = {},
):
    vocab = Vocabulary()
    super(KeyValueAttention, self).__init__(vocab)
    self.trunk = KeyValueAttentionTrunk(
        span_encoder,
        input_dropout,
        class_embs,
        initializer,
        learned_omcs,
    )
    self._accuracy = BooleanAccuracy()
    self._loss = torch.nn.NLLLoss()
    initializer(self)
def test_as_array_produces_token_array(self):
    indexer = SpacyTokenIndexer()
    nlp = get_spacy_model("en_core_web_sm", pos_tags=True, parse=False, ner=False)
    tokens = [t for t in nlp("This is a sentence.")]
    field = TextField(tokens, token_indexers={"spacy": indexer})

    vocab = Vocabulary()
    field.index(vocab)

    # Indexer functionality
    array_dict = indexer.tokens_to_indices(tokens, vocab, "spacy")
    assert len(array_dict["spacy"]) == 5
    assert len(array_dict["spacy"][0]) == 96

    # Check it also works with field
    lengths = field.get_padding_lengths()
    array_dict = field.as_tensor(lengths)
    assert list(array_dict["spacy"].shape) == [5, 96]
def get_predictions(abert, reader, device):
    """Generates predictions from a trained model on a reader."""
    dev = reader.read('raw_data/drop/drop_dataset_dev.json')
    iterator = BasicIterator(batch_size=1)
    iterator.index_with(Vocabulary())
    dev_iter = iterator(dev, num_epochs=1)
    dev_batches = [batch for batch in dev_iter]
    dev_batches = move_to_device(dev_batches, device)

    predictions = {}
    with torch.no_grad():
        for batch in tqdm(dev_batches):
            out = abert(**batch)
            assert len(out['question_id']) == 1
            assert len(out['answer']) == 1
            query_id = out['question_id'][0]
            prediction = out['answer'][0]['value']
            predictions[query_id] = prediction
            torch.cuda.empty_cache()
    return predictions
def test_saving_and_loading(self):
    # pylint: disable=protected-access
    vocab_dir = os.path.join(self.TEST_DIR, 'vocab_save')

    vocab = Vocabulary(non_padded_namespaces=["a", "c"])
    vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
    vocab.add_token_to_namespace("a1", namespace="a")
    vocab.add_token_to_namespace("a2", namespace="a")
    vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
    vocab.add_token_to_namespace("b3", namespace="b")

    vocab.save_to_files(vocab_dir)
    vocab2 = Vocabulary.from_files(vocab_dir)

    assert vocab2._non_padded_namespaces == ["a", "c"]

    # Check namespace a.
    assert vocab2.get_vocab_size(namespace='a') == 3
    assert vocab2.get_token_from_index(0, namespace='a') == 'a0'
    assert vocab2.get_token_from_index(1, namespace='a') == 'a1'
    assert vocab2.get_token_from_index(2, namespace='a') == 'a2'
    assert vocab2.get_token_index('a0', namespace='a') == 0
    assert vocab2.get_token_index('a1', namespace='a') == 1
    assert vocab2.get_token_index('a2', namespace='a') == 2

    # Check namespace b.
    assert vocab2.get_vocab_size(namespace='b') == 4  # (unk + padding + two tokens)
    assert vocab2.get_token_from_index(0, namespace='b') == vocab._padding_token
    assert vocab2.get_token_from_index(1, namespace='b') == vocab._oov_token
    assert vocab2.get_token_from_index(2, namespace='b') == 'b2'
    assert vocab2.get_token_from_index(3, namespace='b') == 'b3'
    assert vocab2.get_token_index(vocab._padding_token, namespace='b') == 0
    assert vocab2.get_token_index(vocab._oov_token, namespace='b') == 1
    assert vocab2.get_token_index('b2', namespace='b') == 2
    assert vocab2.get_token_index('b3', namespace='b') == 3

    # Check the dictionaries containing the reverse mapping are identical.
    assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
    assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")
def eval_ner(learner, id2label, is_test=False):
    # set up AllenNLP evaluation metric
    mode = 'Test' if is_test else 'Validation'
    id2label = [f'B-{l}' if l in [PAD, BOS_LABEL] else l for l in id2label]
    namespace = 'ner_labels'
    label_vocab = Vocabulary(non_padded_namespaces=(namespace,),
                             tokens_to_add={namespace: id2label})  # create the tag vocabulary
    f1_metric = SpanBasedF1Measure(label_vocab,
                                   tag_namespace=namespace,
                                   ignore_classes=[PAD, BOS_LABEL])

    preds, y = learner.predict_with_targs(is_test=is_test)

    # convert to tensors, add a batch dimension
    preds_tensor = torch.from_numpy(preds).unsqueeze(0)
    y_tensor = torch.from_numpy(y).unsqueeze(0)

    f1_metric(preds_tensor, y_tensor)
    all_metrics = f1_metric.get_metric(reset=True)
    print(f'{mode} f1 measure overall:', all_metrics['f1-measure-overall'])
    print(all_metrics)

    preds_fwd_ids = [np.argmax(p) for p in preds]
    acc_fwd = accuracy_score(y, preds_fwd_ids)
    print(f'{mode} token-level accuracy of NER model: %.4f.' % acc_fwd)
def test_unlabeled():
    from pprint import pprint

    params = Params({
        'token_embedder': {
            'num_embeddings': 4,
            'embedding_dim': 300
        },
        'code_dist_type': 'gaussian'
    })
    vocab = Vocabulary()
    while True:
        vocab_size = vocab.get_vocab_size()
        if vocab_size == 4:
            break
        vocab.add_token_to_namespace('a' + str(vocab_size))
    model = DeconvSNLIModel(params=params, vocab=vocab)

    premise = {'tokens': torch.randint(low=0, high=4, size=(5, 29))}
    hypothesis = {'tokens': torch.randint(low=0, high=4, size=(5, 29))}
    output = model(premise=premise, hypothesis=hypothesis, label=None)
    pprint(output)
    pprint(model.get_metrics())
def test_token_type_ids(self):
    tokenizer = WordTokenizer()

    sentence = "the laziest fox"
    tokens = tokenizer.tokenize(sentence)
    #  2  15  10 11 6   17    2  15  10 11 6
    # the laziest fox [SEP] the laziest fox
    tokens = tokens + [Token("[SEP]")] + tokens  # have to do this b/c tokenizer splits `[SEP]` in three

    vocab = Vocabulary()
    vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
    token_indexer = PretrainedBertIndexer(str(vocab_path))

    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

    # [CLS] 2, 15, 10, 11, 6, 17, 2 15, 10, 11, 6, [SEP]
    assert indexed_tokens["bert-type-ids"] == [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]  # pylint: disable=bad-whitespace
def test_sliding_window_with_batch(self):
    tokenizer = BertPreTokenizer()

    sentence = "the quickest quick brown fox jumped over the lazy dog"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()
    vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
    token_indexer = PretrainedBertIndexer(
        str(vocab_path), truncate_long_sequences=False, max_pieces=8
    )

    config_path = self.FIXTURES_ROOT / "bert" / "config.json"
    config = BertConfig(str(config_path))
    bert_model = BertModel(config)
    token_embedder = BertEmbedder(bert_model, max_pieces=8)

    instance = Instance({"tokens": TextField(tokens, {"bert": token_indexer})})
    instance2 = Instance(
        {"tokens": TextField(tokens + tokens + tokens, {"bert": token_indexer})}
    )

    batch = Batch([instance, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]

    # Testing without token_type_ids
    bert_vectors = token_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
    assert bert_vectors is not None

    # Testing with token_type_ids
    bert_vectors = token_embedder(
        tokens["bert"], offsets=tokens["bert-offsets"], token_type_ids=tokens["bert-type-ids"]
    )
    assert bert_vectors is not None
def test_set_from_file_reads_non_padded_files(self):
    # pylint: disable=protected-access
    vocab_filename = self.TEST_DIR + 'vocab_file'
    with codecs.open(vocab_filename, 'w', 'utf-8') as vocab_file:
        vocab_file.write('B-PERS\n')
        vocab_file.write('I-PERS\n')
        vocab_file.write('O\n')
        vocab_file.write('B-ORG\n')
        vocab_file.write('I-ORG\n')

    vocab = Vocabulary()
    vocab.set_from_file(vocab_filename, is_padded=False, namespace='tags')

    assert vocab.get_token_index("B-PERS", namespace='tags') == 0
    assert vocab.get_token_index("I-PERS", namespace='tags') == 1
    assert vocab.get_token_index("O", namespace='tags') == 2
    assert vocab.get_token_index("B-ORG", namespace='tags') == 3
    assert vocab.get_token_index("I-ORG", namespace='tags') == 4
    assert vocab.get_token_from_index(0, namespace='tags') == "B-PERS"
    assert vocab.get_token_from_index(1, namespace='tags') == "I-PERS"
    assert vocab.get_token_from_index(2, namespace='tags') == "O"
    assert vocab.get_token_from_index(3, namespace='tags') == "B-ORG"
    assert vocab.get_token_from_index(4, namespace='tags') == "I-ORG"
def test_from_params_extend_config(self):
    vocab_dir = self.TEST_DIR / "vocab_save"
    original_vocab = Vocabulary(non_padded_namespaces=["tokens"])
    original_vocab.add_token_to_namespace("a", namespace="tokens")
    original_vocab.save_to_files(vocab_dir)

    text_field = TextField(
        [Token(t) for t in ["a", "b"]], {"tokens": SingleIdTokenIndexer("tokens")}
    )
    instances = Batch([Instance({"text": text_field})])

    # If you ask to extend vocab from `directory`, instances must be passed
    # in Vocabulary constructor, or else there is nothing to extend to.
    params = Params({"type": "extend", "directory": vocab_dir})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params)

    # If you ask to extend vocab, `directory` key must be present in params,
    # or else there is nothing to extend from.
    params = Params({"type": "extend"})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances=instances)
def test_saving_and_loading(self):
    vocab_dir = self.TEST_DIR / "vocab_save"

    vocab = Vocabulary(non_padded_namespaces=["a", "c"])
    vocab.add_tokens_to_namespace(["a0", "a1", "a2"], namespace="a")  # non-padded, should start at 0
    vocab.add_tokens_to_namespace(["b2", "b3"], namespace="b")  # padded, should start at 2

    vocab.save_to_files(vocab_dir)
    vocab2 = Vocabulary.from_files(vocab_dir)

    assert vocab2._non_padded_namespaces == {"a", "c"}

    # Check namespace a.
    assert vocab2.get_vocab_size(namespace="a") == 3
    assert vocab2.get_token_from_index(0, namespace="a") == "a0"
    assert vocab2.get_token_from_index(1, namespace="a") == "a1"
    assert vocab2.get_token_from_index(2, namespace="a") == "a2"
    assert vocab2.get_token_index("a0", namespace="a") == 0
    assert vocab2.get_token_index("a1", namespace="a") == 1
    assert vocab2.get_token_index("a2", namespace="a") == 2

    # Check namespace b.
    assert vocab2.get_vocab_size(namespace="b") == 4  # (unk + padding + two tokens)
    assert vocab2.get_token_from_index(0, namespace="b") == vocab._padding_token
    assert vocab2.get_token_from_index(1, namespace="b") == vocab._oov_token
    assert vocab2.get_token_from_index(2, namespace="b") == "b2"
    assert vocab2.get_token_from_index(3, namespace="b") == "b3"
    assert vocab2.get_token_index(vocab._padding_token, namespace="b") == 0
    assert vocab2.get_token_index(vocab._oov_token, namespace="b") == 1
    assert vocab2.get_token_index("b2", namespace="b") == 2
    assert vocab2.get_token_index("b3", namespace="b") == 3

    # Check the dictionaries containing the reverse mapping are identical.
    assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
    assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")
def test_starting_ending_offsets(self):
    tokenizer = BertPreTokenizer()

    #  2  3     5     6   8      9    2  15      10   11   14 1
    sentence = "the quick brown fox jumped over the laziest lazy elmo"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()
    vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
    token_indexer = PretrainedBertIndexer(str(vocab_path))

    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab)

    # 16 = [CLS], 17 = [SEP]
    assert indexed_tokens["input_ids"] == [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17]
    assert indexed_tokens["offsets"] == [1, 2, 3, 4, 5, 6, 7, 10, 11, 12]

    token_indexer = PretrainedBertIndexer(str(vocab_path), use_starting_offsets=True)
    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab)

    assert indexed_tokens["input_ids"] == [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17]
    assert indexed_tokens["offsets"] == [1, 2, 3, 4, 5, 6, 7, 8, 11, 12]
def main(weights_file, device):
    tokenizer = BertDropTokenizer('bert-base-uncased')
    token_indexer = BertDropTokenIndexer('bert-base-uncased')
    reader = BertDropReader(tokenizer, {'tokens': token_indexer},
                            extra_numbers=[100, 1], exp_search='template')

    abert = NumericallyAugmentedBERTT(Vocabulary(), 'bert-base-uncased', special_numbers=[100, 1])
    abert.load_state_dict(torch.load(weights_file, map_location='cpu'))
    abert.to(device).eval()

    predictions = get_predictions(abert, reader, device)

    # Write out predictions to file
    serialization_dir = '/'.join(weights_file.split('/')[:-1])
    predictions_file = weights_file.split('/')[-1].split('.')[0] + '_dev_pred.json'
    predictions_file = join(serialization_dir, predictions_file)
    with open(predictions_file, "w") as writer:
        writer.write(json.dumps(predictions, indent=4) + "\n")
def test_starting_ending_offsets(self):
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

    #  2  3     5     6   8      9    2  15      10   11   14 1
    sentence = "the quick brown fox jumped over the laziest lazy elmo"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()
    vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
    token_indexer = PretrainedBertIndexer(str(vocab_path))

    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

    # 16 = [CLS], 17 = [SEP]
    assert indexed_tokens["bert"] == [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17]
    assert indexed_tokens["bert-offsets"] == [1, 2, 3, 4, 5, 6, 7, 10, 11, 12]

    token_indexer = PretrainedBertIndexer(str(vocab_path), use_starting_offsets=True)
    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

    assert indexed_tokens["bert"] == [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17]
    assert indexed_tokens["bert-offsets"] == [1, 2, 3, 4, 5, 6, 7, 8, 11, 12]