Example #1
    def test_multilabel_field_empty_field_works(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("label1", namespace="test_empty_labels")
        vocab.add_token_to_namespace("label2", namespace="test_empty_labels")

        f = MultiLabelField([], label_namespace="test_empty_labels")
        f.index(vocab)
        tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0]))
        g = f.empty_field()
        g.index(vocab)
        tensor = g.as_tensor(g.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0]))

        h = MultiLabelField([0, 0, 1],
                            label_namespace="test_empty_labels",
                            num_labels=3,
                            skip_indexing=True)
        tensor = h.empty_field().as_tensor(None).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0, 0]))
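As a complement, a minimal sketch (a hypothetical extension of the test above, assuming "label1" was added to the namespace first and therefore received index 0) of the multi-hot tensor a non-empty MultiLabelField produces against the same vocabulary:

        # hypothetical continuation: "label1" -> index 0, "label2" -> index 1
        k = MultiLabelField(["label1"], label_namespace="test_empty_labels")
        k.index(vocab)
        tensor = k.as_tensor(k.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([1, 0]))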
Example #2
    def test_token_type_ids(self):
        tokenizer = SpacyTokenizer()

        sentence = "the laziest  fox"

        tokens = tokenizer.tokenize(sentence)
        #           2   15 10 11  6   17    2   15 10 11  6
        #           the laziest   fox [SEP] the laziest   fox
        tokens = (
            tokens + [Token("[SEP]")] + tokens
        )  # we have to do this because the tokenizer splits `[SEP]` into three pieces

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
        token_indexer = PretrainedBertIndexer(str(vocab_path))

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab)

        #                                          [CLS] 2, 15, 10, 11, 6, 17, 2  15, 10, 11, 6, [SEP]
        assert indexed_tokens["token_type_ids"] == [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
Example #3
def main():
    infilename = 'test/fixtures/bioul_to_span.json'
    with open(infilename) as f:
        d = json.load(f)

    docs = d['tag']
    vocab = Vocabulary()
    vocab.add_token_to_namespace(
        'O', namespace='span_labels')  # reserved label for no-entity
    for doc in docs:
        for label in doc:
            if label != 'O':
                # drop the first two characters (the BIOUL prefix); they are
                # not part of the span label
                span_label = label[2:]
                vocab.add_token_to_namespace(
                    span_label, namespace='span_labels'
                )  # TODO: is this the right namespace?

    # this function expects the vocab to already be initialized with span labels
    batched_bioul_to_span_tesnors(docs, vocab)
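For reference, a minimal self-contained sketch (not the project's batched_bioul_to_span_tesnors) of how BIOUL tags reduce to span labels by dropping the two-character prefix:

def sketch_span_labels(bioul_tags):
    """Illustrative only: collect the span labels referenced by a BIOUL tag sequence."""
    labels = set()
    for tag in bioul_tags:
        if tag != 'O':
            labels.add(tag[2:])  # "B-PER" / "I-PER" / "L-PER" / "U-PER" -> "PER"
    return labels

# sketch_span_labels(['B-PER', 'L-PER', 'O', 'U-ORG']) == {'PER', 'ORG'}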
Example #4
    def test_max_length(self):
        config = BertConfig(len(self.token_indexer.vocab))
        model = BertModel(config)
        embedder = BertEmbedder(model)

        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
        sentence = "the " * 1000
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})

        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]
        embedder(tokens["bert"], tokens["bert-offsets"])
Example #5
def load_elmo_model():
    elmo_embedders = ElmoTokenEmbedder(OPTION_FILE, WEIGHT_FILE)
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedders})

    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(word_embeddings.get_output_dim(),
                      HIDDEN_DIM,
                      bidirectional=True,
                      batch_first=True))

    vocabulary = Vocabulary()

    model = BaseModel(word_embeddings=word_embeddings,
                      encoder=encoder,
                      vocabulary=vocabulary)

    output_elmo_model_file = os.path.join(PRETRAINED_ELMO,
                                          "lstm_elmo_model.bin")
    model.load_state_dict(torch.load(output_elmo_model_file))
    return model
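A hypothetical usage sketch of the loader above (BaseModel, OPTION_FILE, WEIGHT_FILE, HIDDEN_DIM, and PRETRAINED_ELMO are module-level names the snippet assumes are defined):

# hypothetical call site
model = load_elmo_model()
model.eval()  # switch off dropout etc. before running inference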
Example #6
    def test_namespaces(self):
        vocab = Vocabulary()
        initial_vocab_size = vocab.get_vocab_size()
        word_index = vocab.add_token_to_namespace("word", namespace='1')
        assert "word" in vocab.get_index_to_token_vocabulary(namespace='1').values()
        assert vocab.get_token_index("word", namespace='1') == word_index
        assert vocab.get_token_from_index(word_index, namespace='1') == "word"
        assert vocab.get_vocab_size(namespace='1') == initial_vocab_size + 1

        # Now add the same word in a different namespace, along with a different word,
        # and make sure the new namespace behaves like a fresh one.
        word2_index = vocab.add_token_to_namespace("word2", namespace='2')
        word_index = vocab.add_token_to_namespace("word", namespace='2')
        assert "word" in vocab.get_index_to_token_vocabulary(namespace='2').values()
        assert "word2" in vocab.get_index_to_token_vocabulary(namespace='2').values()
        assert vocab.get_token_index("word", namespace='2') == word_index
        assert vocab.get_token_index("word2", namespace='2') == word2_index
        assert vocab.get_token_from_index(word_index, namespace='2') == "word"
        assert vocab.get_token_from_index(word2_index, namespace='2') == "word2"
        assert vocab.get_vocab_size(namespace='2') == initial_vocab_size + 2
Example #7
    def test_from_params(self):
        # Save a vocab to check we can load it from_params.
        vocab_dir = self.TEST_DIR / "vocab_save"
        vocab = Vocabulary(non_padded_namespaces=["a", "c"])
        vocab.add_tokens_to_namespace(
            ["a0", "a1", "a2"], namespace="a")  # non-padded, should start at 0
        vocab.add_tokens_to_namespace(
            ["b2", "b3"], namespace="b")  # padded, should start at 2
        vocab.save_to_files(vocab_dir)

        params = Params({"type": "from_files", "directory": vocab_dir})
        vocab2 = Vocabulary.from_params(params)
        assert vocab.get_index_to_token_vocabulary(
            "a") == vocab2.get_index_to_token_vocabulary("a")
        assert vocab.get_index_to_token_vocabulary(
            "b") == vocab2.get_index_to_token_vocabulary("b")

        # Test case where we build a vocab from a dataset.
        vocab2 = Vocabulary.from_params(Params({}), instances=self.dataset)
        assert vocab2.get_index_to_token_vocabulary("tokens") == {
            0: "@@PADDING@@",
            1: "@@UNKNOWN@@",
            2: "a",
            3: "c",
            4: "b",
        }
        # Test from_params raises when we have neither a dataset nor a vocab directory.
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(Params({}))

        # Test from_params raises when there are any other dict keys
        # present apart from 'directory' and we aren't calling from_dataset.
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(
                Params({
                    "type": "from_files",
                    "directory": vocab_dir,
                    "min_count": {
                        "tokens": 2
                    }
                }))
Example #8
    def test_from_params(self):
        # Save a vocab to check we can load it from_params.
        vocab_dir = os.path.join(self.TEST_DIR, 'vocab_save')
        vocab = Vocabulary(non_padded_namespaces=["a", "c"])
        vocab.add_token_to_namespace(
            "a0", namespace="a")  # non-padded, should start at 0
        vocab.add_token_to_namespace("a1", namespace="a")
        vocab.add_token_to_namespace("a2", namespace="a")
        vocab.add_token_to_namespace(
            "b2", namespace="b")  # padded, should start at 2
        vocab.add_token_to_namespace("b3", namespace="b")
        vocab.save_to_files(vocab_dir)

        params = Params({"directory_path": vocab_dir})
        vocab2 = Vocabulary.from_params(params)
        assert vocab.get_index_to_token_vocabulary(
            "a") == vocab2.get_index_to_token_vocabulary("a")
        assert vocab.get_index_to_token_vocabulary(
            "b") == vocab2.get_index_to_token_vocabulary("b")

        # Test case where we build a vocab from a dataset.
        vocab2 = Vocabulary.from_params(Params({}), self.dataset)
        assert vocab2.get_index_to_token_vocabulary("tokens") == {
            0: '@@PADDING@@',
            1: '@@UNKNOWN@@',
            2: 'a',
            3: 'c',
            4: 'b'
        }
        # Test from_params raises when we have neither a dataset nor a vocab directory.
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(Params({}))

        # Test from_params raises when there are any other dict keys
        # present apart from 'directory_path' and we aren't calling from_dataset.
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(
                Params({
                    "directory_path": vocab_dir,
                    "min_count": 2
                }))
Example #9
    def test_truncate_window_dont_split_wordpieces(self):
        """
        Tests that the sentence is not truncated inside a word that consists
        of 2 or more wordpieces.
        """

        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        sentence = "the quickest quick brown fox jumped over the quickest dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              truncate_long_sequences=True,
                                              use_starting_offsets=True,
                                              max_pieces=12)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # 16 = [CLS], 17 = [SEP]
        # 1 full window + 1 half window with start/end tokens
        assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 17]
        # We could fit one more wordpiece here, but we don't, to avoid cutting
        # in the middle of a word
        assert indexed_tokens["bert-offsets"] == [1, 2, 4, 5, 6, 7, 8, 9]
        assert indexed_tokens["bert-type-ids"] == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              truncate_long_sequences=True,
                                              use_starting_offsets=False,
                                              max_pieces=12)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # 16 = [CLS], 17 = [SEP]
        # 1 full window + 1 half window with start/end tokens
        assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 17]
        # We could fit one more wordpiece here, but we don't, to avoid cutting
        # in the middle of a word
        assert indexed_tokens["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8, 9]
Example #10
def test():
    from pprint import pprint

    params = Params(
        {'token_embedder': {
            'num_embeddings': 4,
            'embedding_dim': 3
        }})
    vocab = Vocabulary()
    while True:
        vocab_size = vocab.get_vocab_size()
        if vocab_size == 4:
            break
        vocab.add_token_to_namespace('a' + str(vocab_size))
    model = BaselineModel(params=params, vocab=vocab)
    premise = {'tokens': torch.randint(low=0, high=4, size=(5, 6))}
    hypothesis = {'tokens': torch.randint(low=0, high=4, size=(5, 7))}
    label = torch.randint(low=0, high=3, size=(5, ))
    output = model(premise=premise, hypothesis=hypothesis, label=label)
    pprint(output)
    pprint(model.get_metrics())
Example #11
    def test_padding_for_equal_length_indices(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #            2   3     5     6   8      9    2   14   12
        sentence = "the quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})

        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        assert tokens["bert"].tolist() == [[16, 2, 3, 5, 6, 8, 9, 2, 14, 12, 17]]

        assert tokens["bert-offsets"].tolist() == [[1, 2, 3, 4, 5, 6, 7, 8, 9]]
Example #12
def create_target_weight():
    vocab = Vocabulary.from_files("data/vocabulary")

    token_weight_list = []
    for index, token in vocab.get_index_to_token_vocabulary().items():
        token_weight = get_target_distribution(token, vocab)
        token_weight_list.append(token_weight)

    weight = torch.stack(token_weight_list)
    s = Score.score
    torch.save(
        weight,
        "data/targets/target_{}{}{}{}{}{}.th".format(
            s["token_name"],
            s["key_name"],
            s["key_number"],
            s["triad_form"],
            s["figbass"],
            s["note_pair"],
        ),
    )
Example #13
    def test_get_embedding_layer_uses_correct_embedding_dim(self):
        vocab = Vocabulary()
        embeddings_filename = self.TEST_DIR + "embeddings.gz"
        with gzip.open(embeddings_filename, 'wb') as embeddings_file:
            embeddings_file.write("word1 1.0 2.3 -1.0\n".encode('utf-8'))
            embeddings_file.write("word2 0.1 0.4 -4.0\n".encode('utf-8'))
        embedding_layer = get_pretrained_embedding_layer(
            embeddings_filename, vocab)
        assert embedding_layer.get_output_dim() == 3

        with gzip.open(embeddings_filename, 'wb') as embeddings_file:
            embeddings_file.write("word1 1.0 2.3 -1.0 3.1\n".encode('utf-8'))
            embeddings_file.write("word2 0.1 0.4 -4.0 -1.2\n".encode('utf-8'))
        embedding_layer = get_pretrained_embedding_layer(
            embeddings_filename, vocab)
        assert embedding_layer.get_output_dim() == 4

        embedding_layer = get_pretrained_embedding_layer(embeddings_filename,
                                                         vocab,
                                                         projection_dim=2)
        assert embedding_layer.get_output_dim() == 2
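A hedged sketch (a hypothetical helper, not AllenNLP's loader) of how the embedding dimension these assertions rely on can be read off the first line of the gzipped embeddings file:

import gzip

def sketch_infer_embedding_dim(embeddings_filename):
    """Illustrative only: a token followed by N floats means the dimension is N."""
    with gzip.open(embeddings_filename, 'rt', encoding='utf-8') as embeddings_file:
        first_line = embeddings_file.readline().strip()
    return len(first_line.split()) - 1

# For the first file written above ("word1 1.0 2.3 -1.0") this returns 3;
# for the second ("word1 1.0 2.3 -1.0 3.1") it returns 4.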
Example #14
    def __init__(self, split, only_use_relevant_dets=True, add_image_as_a_box=True, embs_to_load='bert_da',
                 conditioned_answer_choice=0):
        """

        :param split: train, val, or test
        :param mode: answer or rationale
        :param only_use_relevant_dets: True, if we will only use the detections mentioned in the question and answer.
                                       False, if we should use all detections.
        :param add_image_as_a_box:     True to add the image in as an additional 'detection'. It'll go first in the list
                                       of objects.
        :param embs_to_load: Which precomputed embeddings to load.
        :param conditioned_answer_choice: If you're in test mode, the answer labels aren't provided, which could be
                                          a problem for the QA->R task. Pass in 'conditioned_answer_choice=i'
                                          to always condition on the i-th answer. (What does this mean? Why is this
                                          needed at test time? See the explanation at
                                          https://groups.google.com/forum/?hl=en#!topic/visualcommonsense/lxEgFYRz5ho)
        """
        if split not in ('test', 'train', 'val'):
            raise ValueError("split must be one of 'test', 'train', or 'val'. Supplied {}".format(split))
        print("Loading {} embeddings".format(split), flush=True)
        self.split = split
        self.only_use_relevant_dets = only_use_relevant_dets
        print("Only relevant dets" if only_use_relevant_dets else "Using all detections", flush=True)

        self.add_image_as_a_box = add_image_as_a_box
        self.conditioned_answer_choice = conditioned_answer_choice

        with open(os.path.join(VCR_ANNOTS_DIR, split, '{}.jsonl'.format(split)), 'r') as f:
            self.items = np.array(list(f))

        self.token_indexers = {'elmo': ELMoTokenCharactersIndexer()}
        self.vocab = Vocabulary()

        with open(os.path.join(VCR_ANNOTS_DIR, 'dataloaders', 'cocoontology.json'), 'r') as f:
            coco = json.load(f)
        self.coco_objects = ['__background__'] + [x['name'] for k, x in sorted(coco.items(), key=lambda x: int(x[0]))]  # '__background__' is included here; think about how to make use of it later
        self.coco_obj_to_ind = {o: i for i, o in enumerate(self.coco_objects)}

        self.embs_to_load = embs_to_load
        self.h5fn_answer = os.path.join(VCR_ANNOTS_DIR, self.split, f'{self.embs_to_load}_answer_{self.split}.h5')
        self.h5fn_rationale = os.path.join(VCR_ANNOTS_DIR, self.split, f'{self.embs_to_load}_rationale_{self.split}.h5')
        self.h5fn_image = os.path.join(VCR_ANNOTS_DIR, self.split, f'attribute_features_{self.split}.h5')
Example #15
    def word_embeddings(self):
        # normalize whitespace and punctuation before tokenizing
        words = re.split(r'\W+', self.text)
        text = ' '.join(words)

        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
        tokens = tokenizer.tokenize(text)

        vocab = Vocabulary()
        token_indexer = PretrainedBertIndexer('bert-base-uncased')

        instance = Instance({"tokens": TextField(tokens, {'bert': token_indexer})})
        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        token_tensors = tensor_dict["tokens"]

        model = PretrainedBertEmbedder('bert-base-uncased')
        bert_vectors = model(token_tensors["bert"])
        return bert_vectors
Example #16
    def test_set_from_file_reads_non_padded_files(self):

        vocab_filename = self.TEST_DIR / "vocab_file"
        with codecs.open(vocab_filename, "w", "utf-8") as vocab_file:
            vocab_file.write("B-PERS\n")
            vocab_file.write("I-PERS\n")
            vocab_file.write("O\n")
            vocab_file.write("B-ORG\n")
            vocab_file.write("I-ORG\n")

        vocab = Vocabulary()
        vocab.set_from_file(vocab_filename, is_padded=False, namespace="tags")
        assert vocab.get_token_index("B-PERS", namespace="tags") == 0
        assert vocab.get_token_index("I-PERS", namespace="tags") == 1
        assert vocab.get_token_index("O", namespace="tags") == 2
        assert vocab.get_token_index("B-ORG", namespace="tags") == 3
        assert vocab.get_token_index("I-ORG", namespace="tags") == 4
        assert vocab.get_token_from_index(0, namespace="tags") == "B-PERS"
        assert vocab.get_token_from_index(1, namespace="tags") == "I-PERS"
        assert vocab.get_token_from_index(2, namespace="tags") == "O"
        assert vocab.get_token_from_index(3, namespace="tags") == "B-ORG"
        assert vocab.get_token_from_index(4, namespace="tags") == "I-ORG"
Example #17
    def __init__(
        self,
        span_encoder: Seq2SeqEncoder,
        input_dropout: float = 0.3,
        class_embs: bool = True,
        initializer: InitializerApplicator = InitializerApplicator(),
        learned_omcs: dict = {},
    ):
        vocab = Vocabulary()
        super(KeyValueAttention, self).__init__(vocab)

        self.trunk = KeyValueAttentionTrunk(
            span_encoder,
            input_dropout,
            class_embs,
            initializer,
            learned_omcs,
        )

        self._accuracy = BooleanAccuracy()
        self._loss = torch.nn.NLLLoss()
        initializer(self)
Example #18
    def test_as_array_produces_token_array(self):
        indexer = SpacyTokenIndexer()
        nlp = get_spacy_model("en_core_web_sm",
                              pos_tags=True,
                              parse=False,
                              ner=False)
        tokens = [t for t in nlp("This is a sentence.")]
        field = TextField(tokens, token_indexers={"spacy": indexer})

        vocab = Vocabulary()
        field.index(vocab)

        # Indexer functionality
        array_dict = indexer.tokens_to_indices(tokens, vocab, "spacy")
        assert len(array_dict["spacy"]) == 5
        assert len(array_dict["spacy"][0]) == 96

        # Check it also works with field
        lengths = field.get_padding_lengths()
        array_dict = field.as_tensor(lengths)

        assert list(array_dict["spacy"].shape) == [5, 96]
Example #19
def get_predictions(abert, reader, device):
    """ Generates predictions from a trained model on a reader """
    dev = reader.read('raw_data/drop/drop_dataset_dev.json')
    iterator = BasicIterator(batch_size=1)
    iterator.index_with(Vocabulary())

    dev_iter = iterator(dev, num_epochs=1)
    dev_batches = [batch for batch in dev_iter]
    dev_batches = move_to_device(dev_batches, device)

    predictions = {}
    with torch.no_grad():
        for batch in tqdm(dev_batches):
            out = abert(**batch)
            assert len(out['question_id']) == 1
            assert len(out['answer']) == 1

            query_id = out['question_id'][0]
            prediction = out['answer'][0]['value']
            predictions[query_id] = prediction
    torch.cuda.empty_cache()
    return predictions
Example #20
    def test_saving_and_loading(self):
        # pylint: disable=protected-access
        vocab_dir = os.path.join(self.TEST_DIR, 'vocab_save')

        vocab = Vocabulary(non_padded_namespaces=["a", "c"])
        vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
        vocab.add_token_to_namespace("a1", namespace="a")
        vocab.add_token_to_namespace("a2", namespace="a")
        vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
        vocab.add_token_to_namespace("b3", namespace="b")

        vocab.save_to_files(vocab_dir)
        vocab2 = Vocabulary.from_files(vocab_dir)

        assert vocab2._non_padded_namespaces == ["a", "c"]

        # Check namespace a.
        assert vocab2.get_vocab_size(namespace='a') == 3
        assert vocab2.get_token_from_index(0, namespace='a') == 'a0'
        assert vocab2.get_token_from_index(1, namespace='a') == 'a1'
        assert vocab2.get_token_from_index(2, namespace='a') == 'a2'
        assert vocab2.get_token_index('a0', namespace='a') == 0
        assert vocab2.get_token_index('a1', namespace='a') == 1
        assert vocab2.get_token_index('a2', namespace='a') == 2

        # Check namespace b.
        assert vocab2.get_vocab_size(namespace='b') == 4  # (unk + padding + two tokens)
        assert vocab2.get_token_from_index(0, namespace='b') == vocab._padding_token
        assert vocab2.get_token_from_index(1, namespace='b') == vocab._oov_token
        assert vocab2.get_token_from_index(2, namespace='b') == 'b2'
        assert vocab2.get_token_from_index(3, namespace='b') == 'b3'
        assert vocab2.get_token_index(vocab._padding_token, namespace='b') == 0
        assert vocab2.get_token_index(vocab._oov_token, namespace='b') == 1
        assert vocab2.get_token_index('b2', namespace='b') == 2
        assert vocab2.get_token_index('b3', namespace='b') == 3

        # Check the dictionaries containing the reverse mapping are identical.
        assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
        assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")
Example #21
def eval_ner(learner, id2label, is_test=False):
    # set up AllenNLP evaluation metric
    mode = 'Test' if is_test else 'Validation'
    id2label = [f'B-{l}' if l in [PAD, BOS_LABEL] else l for l in id2label]
    namespace = 'ner_labels'
    label_vocab = Vocabulary(non_padded_namespaces=(namespace, ),
                             tokens_to_add={namespace: id2label
                                            })  # create the tag vocabulary
    f1_metric = SpanBasedF1Measure(label_vocab,
                                   tag_namespace=namespace,
                                   ignore_classes=[PAD, BOS_LABEL])
    preds, y = learner.predict_with_targs(is_test=is_test)
    # convert to tensors, add a batch dimension
    preds_tensor = torch.from_numpy(preds).unsqueeze(0)
    y_tensor = torch.from_numpy(y).unsqueeze(0)
    f1_metric(preds_tensor, y_tensor)
    all_metrics = f1_metric.get_metric(reset=True)
    print(f'{mode} f1 measure overall:', all_metrics['f1-measure-overall'])
    print(all_metrics)
    preds_fwd_ids = [np.argmax(p) for p in preds]
    acc_fwd = accuracy_score(y, preds_fwd_ids)
    print(f'{mode} token-level accuracy of NER model: {acc_fwd:.4f}.')
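For context, a short sketch (dummy values, shapes only) of what SpanBasedF1Measure consumes, which is why the predictions and targets above get a batch dimension via unsqueeze(0):

import torch

# class scores: (batch, sequence_length, num_classes); gold label ids: (batch, sequence_length)
num_classes, seq_len = 4, 6
dummy_preds = torch.rand(1, seq_len, num_classes)
dummy_gold = torch.randint(0, num_classes, (1, seq_len))
# f1_metric(dummy_preds, dummy_gold)          # same call pattern as in eval_ner
# metrics = f1_metric.get_metric(reset=True)  # e.g. metrics['f1-measure-overall']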
Example #22
def test_unlabeled():
    from pprint import pprint

    params = Params({
        'token_embedder': {
            'num_embeddings': 4,
            'embedding_dim': 300
        },
        'code_dist_type': 'gaussian'
    })
    vocab = Vocabulary()
    while True:
        vocab_size = vocab.get_vocab_size()
        if vocab_size == 4:
            break
        vocab.add_token_to_namespace('a' + str(vocab_size))
    model = DeconvSNLIModel(params=params, vocab=vocab)
    premise = {'tokens': torch.randint(low=0, high=4, size=(5, 29))}
    hypothesis = {'tokens': torch.randint(low=0, high=4, size=(5, 29))}
    output = model(premise=premise, hypothesis=hypothesis, label=None)
    pprint(output)
    pprint(model.get_metrics())
Example #23
    def test_token_type_ids(self):
        tokenizer = WordTokenizer()

        sentence = "the laziest  fox"

        tokens = tokenizer.tokenize(sentence)
        #           2   15 10 11  6   17    2   15 10 11  6
        #           the laziest   fox [SEP] the laziest   fox
        tokens = tokens + [
            Token("[SEP]")
        ] + tokens  # we have to do this because the tokenizer splits `[SEP]` into three pieces

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path))

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        #                                          [CLS] 2, 15, 10, 11, 6, 17, 2  15, 10, 11, 6, [SEP]
        assert indexed_tokens["bert-type-ids"] == [
            0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1
        ]  #pylint: disable=bad-whitespace
Example #24
    def test_sliding_window_with_batch(self):
        tokenizer = BertPreTokenizer()

        sentence = "the quickest quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
        token_indexer = PretrainedBertIndexer(
            str(vocab_path), truncate_long_sequences=False, max_pieces=8
        )

        config_path = self.FIXTURES_ROOT / "bert" / "config.json"
        config = BertConfig(str(config_path))
        bert_model = BertModel(config)
        token_embedder = BertEmbedder(bert_model, max_pieces=8)

        instance = Instance({"tokens": TextField(tokens, {"bert": token_indexer})})
        instance2 = Instance(
            {"tokens": TextField(tokens + tokens + tokens, {"bert": token_indexer})}
        )

        batch = Batch([instance, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        # Testing without token_type_ids
        bert_vectors = token_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
        assert bert_vectors is not None

        # Testing with token_type_ids
        bert_vectors = token_embedder(
            tokens["bert"], offsets=tokens["bert-offsets"], token_type_ids=tokens["bert-type-ids"]
        )
        assert bert_vectors is not None
Example #25
    def test_set_from_file_reads_non_padded_files(self):
        # pylint: disable=protected-access
        vocab_filename = self.TEST_DIR + 'vocab_file'
        with codecs.open(vocab_filename, 'w', 'utf-8') as vocab_file:
            vocab_file.write('B-PERS\n')
            vocab_file.write('I-PERS\n')
            vocab_file.write('O\n')
            vocab_file.write('B-ORG\n')
            vocab_file.write('I-ORG\n')

        vocab = Vocabulary()
        vocab.set_from_file(vocab_filename, is_padded=False, namespace='tags')
        assert vocab.get_token_index("B-PERS", namespace='tags') == 0
        assert vocab.get_token_index("I-PERS", namespace='tags') == 1
        assert vocab.get_token_index("O", namespace='tags') == 2
        assert vocab.get_token_index("B-ORG", namespace='tags') == 3
        assert vocab.get_token_index("I-ORG", namespace='tags') == 4
        assert vocab.get_token_from_index(0, namespace='tags') == "B-PERS"
        assert vocab.get_token_from_index(1, namespace='tags') == "I-PERS"
        assert vocab.get_token_from_index(2, namespace='tags') == "O"
        assert vocab.get_token_from_index(3, namespace='tags') == "B-ORG"
        assert vocab.get_token_from_index(4, namespace='tags') == "I-ORG"
Example #26
    def test_from_params_extend_config(self):

        vocab_dir = self.TEST_DIR / "vocab_save"
        original_vocab = Vocabulary(non_padded_namespaces=["tokens"])
        original_vocab.add_token_to_namespace("a", namespace="tokens")
        original_vocab.save_to_files(vocab_dir)

        text_field = TextField([Token(t) for t in ["a", "b"]],
                               {"tokens": SingleIdTokenIndexer("tokens")})
        instances = Batch([Instance({"text": text_field})])

        # If you ask to extend vocab from `directory`, instances must be passed
        # to `Vocabulary.from_params`, or else there is nothing to extend to.
        params = Params({"type": "extend", "directory": vocab_dir})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params)

        # If you ask to extend vocab, `directory` key must be present in params,
        # or else there is nothing to extend from.
        params = Params({"type": "extend"})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances=instances)
Example #27
    def test_saving_and_loading(self):

        vocab_dir = self.TEST_DIR / "vocab_save"

        vocab = Vocabulary(non_padded_namespaces=["a", "c"])
        vocab.add_tokens_to_namespace(
            ["a0", "a1", "a2"], namespace="a"
        )  # non-padded, should start at 0
        vocab.add_tokens_to_namespace(["b2", "b3"], namespace="b")  # padded, should start at 2

        vocab.save_to_files(vocab_dir)
        vocab2 = Vocabulary.from_files(vocab_dir)

        assert vocab2._non_padded_namespaces == {"a", "c"}

        # Check namespace a.
        assert vocab2.get_vocab_size(namespace="a") == 3
        assert vocab2.get_token_from_index(0, namespace="a") == "a0"
        assert vocab2.get_token_from_index(1, namespace="a") == "a1"
        assert vocab2.get_token_from_index(2, namespace="a") == "a2"
        assert vocab2.get_token_index("a0", namespace="a") == 0
        assert vocab2.get_token_index("a1", namespace="a") == 1
        assert vocab2.get_token_index("a2", namespace="a") == 2

        # Check namespace b.
        assert vocab2.get_vocab_size(namespace="b") == 4  # (unk + padding + two tokens)
        assert vocab2.get_token_from_index(0, namespace="b") == vocab._padding_token
        assert vocab2.get_token_from_index(1, namespace="b") == vocab._oov_token
        assert vocab2.get_token_from_index(2, namespace="b") == "b2"
        assert vocab2.get_token_from_index(3, namespace="b") == "b3"
        assert vocab2.get_token_index(vocab._padding_token, namespace="b") == 0
        assert vocab2.get_token_index(vocab._oov_token, namespace="b") == 1
        assert vocab2.get_token_index("b2", namespace="b") == 2
        assert vocab2.get_token_index("b3", namespace="b") == 3

        # Check the dictionaries containing the reverse mapping are identical.
        assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
        assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")
Example #28
    def test_starting_ending_offsets(self):
        tokenizer = BertPreTokenizer()

        #           2   3     5     6   8      9    2  15 10 11 14   1
        sentence = "the quick brown fox jumped over the laziest lazy elmo"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
        token_indexer = PretrainedBertIndexer(str(vocab_path))

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab)

        # 16 = [CLS], 17 = [SEP]
        assert indexed_tokens["input_ids"] == [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17]
        assert indexed_tokens["offsets"] == [1, 2, 3, 4, 5, 6, 7, 10, 11, 12]

        token_indexer = PretrainedBertIndexer(str(vocab_path), use_starting_offsets=True)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab)

        assert indexed_tokens["input_ids"] == [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17]
        assert indexed_tokens["offsets"] == [1, 2, 3, 4, 5, 6, 7, 8, 11, 12]
Example #29
def main(weights_file, device):
    tokenizer = BertDropTokenizer('bert-base-uncased')
    token_indexer = BertDropTokenIndexer('bert-base-uncased')
    reader = BertDropReader(tokenizer, {'tokens': token_indexer},
                            extra_numbers=[100, 1],
                            exp_search='template')

    abert = NumericallyAugmentedBERTT(Vocabulary(),
                                      'bert-base-uncased',
                                      special_numbers=[100, 1])
    abert.load_state_dict(torch.load(weights_file, map_location='cpu'))
    abert.to(device).eval()

    predictions = get_predictions(abert, reader, device)

    # Write out predictions to file
    serialization_dir = '/'.join(weights_file.split('/')[:-1])
    predictions_file = weights_file.split('/')[-1].split(
        '.')[0] + '_dev_pred.json'
    predictions_file = join(serialization_dir, predictions_file)

    with open(predictions_file, "w") as writer:
        writer.write(json.dumps(predictions, indent=4) + "\n")
Example #30
    def test_starting_ending_offsets(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #           2   3     5     6   8      9    2  15 10 11 14   1
        sentence = "the quick brown fox jumped over the laziest lazy elmo"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path))

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # 16 = [CLS], 17 = [SEP]
        assert indexed_tokens["bert"] == [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17]
        assert indexed_tokens["bert-offsets"] == [1, 2, 3, 4, 5, 6, 7, 10, 11, 12]

        token_indexer = PretrainedBertIndexer(str(vocab_path), use_starting_offsets=True)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        assert indexed_tokens["bert"] == [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17]
        assert indexed_tokens["bert-offsets"] == [1, 2, 3, 4, 5, 6, 7, 8, 11, 12]