def test_elmo_as_array_produces_token_sequence(self):  # pylint: disable=invalid-name
    indexer = ELMoTokenCharactersIndexer()
    indices = [indexer.token_to_indices(Token(token), Vocabulary())
               for token in ['Second', '.']]
    padded_tokens = indexer.pad_token_sequence(indices,
                                               desired_num_tokens=3,
                                               padding_lengths={})
    expected_padded_tokens = [
            [259, 84, 102, 100, 112, 111, 101, 260] + [261] * 42,
            [259, 47, 260] + [261] * 47,
            [0] * 50
    ]
    assert padded_tokens == expected_padded_tokens
def test_elmo_as_array_produces_token_sequence(self):  # pylint: disable=invalid-name
    indexer = ELMoTokenCharactersIndexer()
    tokens = [Token('Second'), Token('.')]
    indices = indexer.tokens_to_indices(tokens, Vocabulary(), "test-elmo")["test-elmo"]
    padded_tokens = indexer.pad_token_sequence({'test-elmo': indices},
                                               desired_num_tokens={'test-elmo': 3},
                                               padding_lengths={})
    expected_padded_tokens = [
            [259, 84, 102, 100, 112, 111, 101, 260] + [261] * 42,
            [259, 47, 260] + [261] * 47,
            [0] * 50
    ]
    assert padded_tokens['test-elmo'] == expected_padded_tokens
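The magic numbers asserted in the two variants above follow directly from AllenNLP's ELMo character encoding: each UTF-8 byte of the token is shifted by +1, the result is wrapped in beginning-of-word and end-of-word markers, and 261 pads every token out to 50 character slots. The all-zero third row is sequence-level padding added by pad_token_sequence, not character padding. A minimal sketch (the helper name is ours, not AllenNLP's) that reproduces the expected values:

def elmo_char_ids(word, max_chars=50):
    ids = [259]                                    # beginning-of-word marker (258 + 1)
    ids += [b + 1 for b in word.encode('utf-8')]   # each UTF-8 byte, shifted by +1
    ids.append(260)                                # end-of-word marker (259 + 1)
    ids += [261] * (max_chars - len(ids))          # character padding (260 + 1)
    return ids

assert elmo_char_ids('Second') == [259, 84, 102, 100, 112, 111, 101, 260] + [261] * 42
assert elmo_char_ids('.') == [259, 47, 260] + [261] * 47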
def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
    super().__init__(lazy=False)
    self.tokenizer_space = WhitespaceTokenizer()
    self.tokenizer_spacy = SpacyTokenizer(language="en_core_web_md",
                                          pos_tags=True,
                                          split_on_spaces=True)
    # Note: the token_indexers argument is accepted but ignored; the indexers are hard-coded below.
    self.token_indexers = {
        'elmo_tokens': ELMoTokenCharactersIndexer(),
        'token_characters': TokenCharactersIndexer(namespace='character_vocab',
                                                   min_padding_length=6),
        'pos_tags': SingleIdTokenIndexer(namespace='pos_tag_vocab',
                                         feature_name='tag_'),
        'ner_tags': SingleIdTokenIndexer(namespace='ner_tag_vocab',
                                         feature_name='ent_type_')
    }
    self.slot_indexers = {
        'elmo_tokens': ELMoTokenCharactersIndexer(),
        'token_characters': TokenCharactersIndexer(namespace='character_vocab',
                                                   min_padding_length=6)
    }
def test_elmo_empty_token_list(self):
    # Basic test
    indexer = ELMoTokenCharactersIndexer()
    assert {"elmo_tokens": []} == indexer.get_empty_token_list()

    # Real world test
    indexer = {"elmo": indexer}
    tokens_1 = TextField([Token("Apple")], indexer)
    targets_1 = ListField([TextField([Token("Apple")], indexer)])
    tokens_2 = TextField([Token("Screen"), Token("device")], indexer)
    targets_2 = ListField([
        TextField([Token("Screen")], indexer),
        TextField([Token("Device")], indexer)
    ])
    instance_1 = Instance({"tokens": tokens_1, "targets": targets_1})
    instance_2 = Instance({"tokens": tokens_2, "targets": targets_2})
    a_batch = Batch([instance_1, instance_2])
    a_batch.index_instances(Vocabulary())
    batch_tensor = a_batch.as_tensor_dict()
    elmo_target_token_indices = batch_tensor["targets"]["elmo"]["elmo_tokens"]
    # The TextField that is empty should have been created using the
    # `get_empty_token_list` and then padded with zeros.
    empty_target = elmo_target_token_indices[0][1].numpy()
    np.testing.assert_array_equal(np.zeros((1, 50)), empty_target)
    non_empty_targets = [
        elmo_target_token_indices[0][0],
        elmo_target_token_indices[1][0],
        elmo_target_token_indices[1][1],
    ]
    for non_empty_target in non_empty_targets:
        with pytest.raises(AssertionError):
            np.testing.assert_array_equal(np.zeros((1, 50)), non_empty_target)
def test_elmo_indexer_with_additional_tokens(self):
    indexer = ELMoTokenCharactersIndexer(tokens_to_add={"<first>": 1})
    tokens = [Token("<first>")]
    indices = indexer.tokens_to_indices(tokens, Vocabulary())
    expected_indices = [[259, 2, 260] + [261] * 47]
    assert indices["tokens"] == expected_indices
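Worth noting about the test above: tokens_to_add bypasses the byte-level encoding entirely. The whole token is mapped to the single raw character id you supply (here 1, which surfaces as 2 after the +1 shift), so `<first>` encodes as [259, 2, 260] plus padding rather than as the UTF-8 bytes of the string '<first>'.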
def test_unicode_to_char_ids(self):
    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.tokens_to_indices([Token(chr(256) + "t")], Vocabulary())
    expected_indices = [259, 197, 129, 117, 260] + [261] * 45
    assert indices == {"elmo_tokens": [expected_indices]}
def test_eos_to_char_ids(self):
    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.tokens_to_indices([Token("</S>")], Vocabulary(), "test-eos")
    expected_indices = [259, 258, 260] + [261] * 47
    assert indices == {"test-eos": [expected_indices]}
def test_bos_to_char_ids(self):
    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.tokens_to_indices([Token("<S>")], Vocabulary())
    expected_indices = [259, 257, 260] + [261] * 47
    assert indices == {"elmo_tokens": [expected_indices]}
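The `<S>` and `</S>` tests above never touch the UTF-8 path: the character mapper reserves dedicated ids for the sentence-boundary tokens, which surface as 257 and 258 after the same +1 shift. For reference, a sketch of the reserved raw values (as in AllenNLP's ELMoCharacterMapper; the uppercase constant names here are descriptive, not the library's attribute names):

BEGINNING_OF_SENTENCE_CHARACTER = 256  # '<S>'  -> [259, 257, 260, 261, ...]
END_OF_SENTENCE_CHARACTER = 257        # '</S>' -> [259, 258, 260, 261, ...]
BEGINNING_OF_WORD_CHARACTER = 258      # 259 after the +1 shift
END_OF_WORD_CHARACTER = 259            # 260 after the +1 shift
PADDING_CHARACTER = 260                # 261 after the +1 shift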
def test_bos_to_char_ids(self):
    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.token_to_indices(Token('<S>'), Vocabulary())
    expected_indices = [259, 257, 260] + [261] * 47
    assert indices == expected_indices
def test_unicode_to_char_ids(self):
    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.token_to_indices(Token(chr(256) + 't'), Vocabulary())
    expected_indices = [259, 197, 129, 117, 260] + [261] * 45
    assert indices == expected_indices
def test_bos_to_char_ids(self):
    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.tokens_to_indices([Token('<S>')], Vocabulary(), "test-elmo")
    expected_indices = [259, 257, 260] + [261] * 47
    assert indices == {"test-elmo": [expected_indices]}
def test_elmo_indexer_with_additional_tokens(self):
    indexer = ELMoTokenCharactersIndexer(tokens_to_add={'<first>': 1})
    tokens = [Token('<first>')]
    indices = indexer.tokens_to_indices(tokens, Vocabulary(), "test-elmo")["test-elmo"]
    expected_indices = [[259, 2, 260] + [261] * 47]
    assert indices == expected_indices
def test_unicode_to_char_ids(self):
    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.tokens_to_indices([Token(chr(256) + 't')], Vocabulary(), "test-unicode")
    expected_indices = [259, 197, 129, 117, 260] + [261] * 45
    assert indices == {"test-unicode": [expected_indices]}
def test_unicode_to_char_ids(self):
    # Python 2 variant of the test above (unichr / u-prefixed literals).
    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.tokens_to_indices([Token(unichr(256) + u't')], Vocabulary(), u"test-unicode")
    expected_indices = [259, 197, 129, 117, 260] + [261] * 45
    assert indices == {u"test-unicode": [expected_indices]}
def initialize(self):
    print('Data reader initialization ...')
    self.cursor = fever_db.get_cursor()

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')
    }
    self.fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=cfg.lazy)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / 'vocab_cache' / 'nli_basic')

    # This is important
    ns = 'selection_labels'
    vocab.add_token_to_namespace('true', namespace=ns)
    vocab.add_token_to_namespace('false', namespace=ns)
    vocab.add_token_to_namespace('hidden', namespace=ns)
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace=ns)

    # Label value
    vocab.get_index_to_token_vocabulary(ns)

    self.vocab = vocab
    self.weight_dict = weight_dict
    self.initialized = True
def test_on_test_set(self):
    # model_weight_file = os.path.join(os.path.dirname(__file__), '..', "output", "201905290138", "weights_best.th")
    # vocab_dir_path = os.path.join(os.path.dirname(__file__), '..', "output", "201905290138", "vocabulary")
    model_weight_file = "C:\\Data\\rumourDNN_models\\output\\bostonbombings-201906241245\\weights_best.th"
    vocab_dir_path = "C:\\Data\\rumourDNN_models\\output\\bostonbombings-201906241245\\vocabulary"
    model, rumor_dnn_predictor = load_classifier_from_archive(vocab_dir_path=vocab_dir_path,
                                                              model_weight_file=model_weight_file)

    evaluation_data_path = os.path.join(os.path.dirname(__file__), '..', "data", "test", "charliehebdo.csv")

    elmo_token_indexer = ELMoTokenCharactersIndexer()
    rumor_train_set_reader = RumorTweetsDataReader(token_indexers={'elmo': elmo_token_indexer})
    test_instances = rumor_train_set_reader.read(evaluation_data_path)

    from training_util import evaluate

    data_iterator = BucketIterator(batch_size=200, sorting_keys=[("sentence", "num_tokens")])
    data_iterator.index_with(model.vocab)
    metrics = evaluate(model, test_instances, data_iterator, -1, "")

    timestamped_print("Finished evaluating.")
    timestamped_print("Metrics:")
    for key, metric in metrics.items():
        timestamped_print("%s: %s" % (key, metric))
def build_elmo_dataset_reader(lower=False) -> DatasetReader:
    tokenizer = WhitespaceTokenizer()
    token_indexers = {'bert_tokens': ELMoTokenCharactersIndexer()}
    return ClassificationDatasetReader(tokenizer=tokenizer,
                                       token_indexers=token_indexers,
                                       max_tokens=300,
                                       lower=lower)
def build_indexers(args):
    indexers = {}
    if args.input_module in ["scratch", "glove", "fastText"]:
        indexers["words"] = SingleIdTokenIndexer()
    elif args.input_module in ["elmo", "elmo-chars-only"]:
        indexers["elmo"] = ELMoTokenCharactersIndexer("elmo")
        assert args.tokenizer in {"", "MosesTokenizer"}

    if args.char_embs:
        indexers["chars"] = TokenCharactersIndexer("chars")
    if args.cove:
        assert args.tokenizer == "MosesTokenizer", (
            f"CoVe model expects Moses tokenization (MosesTokenizer);"
            f" you are using args.tokenizer = {args.tokenizer}")

    if input_module_uses_transformers(args.input_module):
        assert not indexers, (
            "transformers modules like BERT/XLNet are not supported alongside other "
            "indexers due to tokenization.")
        assert args.tokenizer == args.input_module, (
            "transformers models use custom tokenization for each model, so tokenizer "
            "must match the specified model.")
        tokenizer_name = input_module_tokenizer_name(args.input_module)
        indexers[tokenizer_name] = SingleIdTokenIndexer(tokenizer_name)
    return indexers
def build_indexers(args):
    indexers = {}
    if not args.input_module.startswith("bert") and args.input_module not in ["elmo", "gpt"]:
        indexers["words"] = SingleIdTokenIndexer()
    if args.input_module == "elmo":
        indexers["elmo"] = ELMoTokenCharactersIndexer("elmo")
        assert args.tokenizer in {"", "MosesTokenizer"}

    if args.char_embs:
        indexers["chars"] = TokenCharactersIndexer("chars")
    if args.cove:
        assert args.tokenizer == "MosesTokenizer", (
            f"CoVe model expects Moses tokenization (MosesTokenizer);"
            f" you are using args.tokenizer = {args.tokenizer}")

    if args.input_module == "gpt":
        assert not indexers, (
            "OpenAI transformer is not supported alongside other indexers due to tokenization.")
        assert args.tokenizer == "OpenAI.BPE", (
            "OpenAI transformer uses custom BPE tokenization. Set tokenizer=OpenAI.BPE.")
        indexers["openai_bpe_pretokenized"] = SingleIdTokenIndexer("openai_bpe")

    if args.input_module.startswith("bert"):
        assert not indexers, "BERT is not supported alongside other indexers due to tokenization."
        assert args.tokenizer == args.input_module, (
            "BERT models use custom WPM tokenization for "
            "each model, so tokenizer must match the "
            "specified BERT model.")
        indexers["bert_wpm_pretokenized"] = SingleIdTokenIndexer(args.input_module)
    return indexers
def __init__(self, split, mode, only_use_relevant_dets=True, add_image_as_a_box=True):
    self.only_use_relevant_dets = only_use_relevant_dets
    self.mode = mode
    self.split = split
    self.add_image_as_a_box = add_image_as_a_box

    with open(os.path.join(VCR_ANNOTS_DIR, '{}.jsonl'.format(split)), 'r') as f:
        self.items = [json.loads(s) for s in f]

    self.token_indexers = {'elmo': ELMoTokenCharactersIndexer()}
    self.vocab = Vocabulary()

    with open(os.path.join(os.path.dirname(VCR_ANNOTS_DIR), 'cocoontology.json'), 'r') as f:
        coco = json.load(f)
    self.coco_objects = ['__background__'] + [x['name'] for k, x in sorted(coco.items(), key=lambda x: int(x[0]))]
    self.coco_obj_to_ind = {o: i for i, o in enumerate(self.coco_objects)}
def __init__(self, split, mode, only_use_relevant_dets=True, add_image_as_a_box=True,
             embs_to_load='bert_da', conditioned_answer_choice=0):
    """
    :param split: train, val, or test
    :param mode: answer or rationale
    :param only_use_relevant_dets: True if we will only use the detections mentioned
        in the question and answer; False if we should use all detections.
    :param add_image_as_a_box: True to add the image in as an additional 'detection'.
        It'll go first in the list of objects.
    :param embs_to_load: Which precomputed embeddings to load.
    :param conditioned_answer_choice: If you're in test mode, the answer labels aren't
        provided, which could be a problem for the QA->R task. Pass in
        `conditioned_answer_choice=i` to always condition on the i-th answer.
    """
    self.split = split
    self.mode = mode
    self.only_use_relevant_dets = only_use_relevant_dets
    print("Only relevant dets" if only_use_relevant_dets else "Using all detections", flush=True)

    self.add_image_as_a_box = add_image_as_a_box
    self.conditioned_answer_choice = conditioned_answer_choice

    # Note: because the second argument is an absolute path, os.path.join discards VCR_ANNOTS_DIR here.
    with open(os.path.join(VCR_ANNOTS_DIR,
                           '/media/ailab/songyoungtak/vcr_new/new/add_keyword/{}.jsonl'.format(split)), 'r') as f:
        self.items = [json.loads(s) for s in f]

    if split not in ('test', 'train_scene_version', 'val_scene_version'):
        raise ValueError("split must be test, train_scene_version, or val_scene_version. "
                         "Supplied {}".format(split))
    if mode not in ('answer', 'rationale'):
        raise ValueError("mode must be answer or rationale")

    self.token_indexers = {'elmo': ELMoTokenCharactersIndexer()}
    self.vocab = Vocabulary()

    with open(os.path.join(os.path.dirname(VCR_ANNOTS_DIR), 'cocoontology.json'), 'r') as f:
        coco = json.load(f)
    self.coco_objects = ['__background__'] + [x['name'] for k, x in sorted(coco.items(), key=lambda x: int(x[0]))]
    self.coco_obj_to_ind = {o: i for i, o in enumerate(self.coco_objects)}

    self.embs_to_load = embs_to_load
    self.h5fn = os.path.join(VCR_ANNOTS_DIR, f'{self.embs_to_load}_{self.mode}_{self.split}.h5')
    print("Loading embeddings from {}".format(self.h5fn), flush=True)
def build_indexers(args):
    indexers = {}
    if not args.word_embs == "none":
        indexers["words"] = SingleIdTokenIndexer()
    if args.elmo:
        indexers["elmo"] = ELMoTokenCharactersIndexer("elmo")
        assert args.tokenizer in {"", "MosesTokenizer"}

    if args.char_embs:
        indexers["chars"] = TokenCharactersIndexer("chars")
    if args.cove:
        assert args.tokenizer == "MosesTokenizer", (
            f"CoVe model expects Moses tokenization (MosesTokenizer);"
            f" you are using args.tokenizer = {args.tokenizer}")

    if args.openai_transformer:
        assert not indexers, ("OpenAI transformer is not supported alongside"
                              " other indexers due to tokenization!")
        assert args.tokenizer == "OpenAI.BPE", ("OpenAI transformer is not supported alongside"
                                                " other indexers due to tokenization!")
        indexers["openai_bpe_pretokenized"] = SingleIdTokenIndexer("openai_bpe")

    if args.bert_model_name:
        assert not indexers, ("BERT is not supported alongside"
                              " other indexers due to tokenization!")
        assert args.tokenizer == args.bert_model_name, (
            "BERT models use custom WPM tokenization for "
            "each model, so tokenizer must match the "
            "specified BERT model.")
        indexers["bert_wpm_pretokenized"] = SingleIdTokenIndexer(args.bert_model_name)
    return indexers
def from_params(cls, params: Params) -> "PnetOntoDatasetReader":
    # token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    token_indexers = {
        "tokens": SingleIdTokenIndexer(lowercase_tokens=True),
        "token_characters": TokenCharactersIndexer(),
        "elmo": ELMoTokenCharactersIndexer(),
    }
    valid_class = params.pop("valid_class")
    random_seed = params.pop("random_seed")
    drop_empty = params.pop("drop_empty")
    valid_part = params.pop("valid_part")
    tag_label = params.pop("tag_label", None)
    feature_labels = params.pop("feature_labels", ())
    lazy = params.pop("lazy", False)
    params.assert_empty(cls.__name__)
    return PnetOntoDatasetReader(token_indexers=token_indexers,
                                 valid_class=valid_class,
                                 random_seed=random_seed,
                                 drop_empty=drop_empty,
                                 valid_part=valid_part,
                                 tag_label=tag_label,
                                 feature_labels=feature_labels,
                                 lazy=lazy)
def spectrum_eval_manual_check():
    batch_size = 64
    lazy = True

    SAVE_PATH = "/home/easonnie/projects/FunEver/saved_models/07-17-12:10:35_mesim_elmo/i(34800)_epoch(5)_dev(0.5563056305630563)_loss(1.6648460462434564)_seed(12)"
    # IN_FILE = config.RESULT_PATH / "sent_retri_nn/2018_07_17_15:52:19_r/dev_sent.jsonl"
    IN_FILE = config.RESULT_PATH / "sent_retri_nn/2018_07_17_16:34:19_r/dev_sent.jsonl"
    # IN_FILE = config.RESULT_PATH / "sent_retri_nn/2018_07_17_16-34-19_r/dev_sent.jsonl"
    dev_sent_result_list = common.load_jsonl(IN_FILE)

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)
    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')
    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))
    biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0
    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300)
    model.load_state_dict(torch.load(SAVE_PATH))
    model.display()
    model.to(device)

    for sc_prob in [0.5, 0.7, 0.8, 0.9, 0.95, 0.98]:
        upstream_dev_list = score_converter_scaled(config.T_FEVER_DEV_JSONL,
                                                   dev_sent_result_list,
                                                   scale_prob=sc_prob,
                                                   delete_prob=False)
        dev_fever_data_reader = BasicReader(token_indexers=token_indexers, lazy=lazy)
        complete_upstream_dev_data = get_actual_data(config.T_FEVER_DEV_JSONL, upstream_dev_list)
        dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)
        eval_iter = biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
        builded_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)

        print("------------------------------------")
        print("Scaling_prob:", sc_prob)
        eval_mode = {'check_sent_id_correct': True, 'standard': True}
        print(c_scorer.fever_score(builded_dev_data, config.T_FEVER_DEV_JSONL, mode=eval_mode))
        # del upstream_dev_list
        # del complete_upstream_dev_data
        del dev_fever_data_reader
        del dev_instances
        print("------------------------------------")
def eval_fever():
    # save_path = "/home/easonnie/projects/MiscEnc/saved_models/06-07-21:58:06_esim_elmo/i(60900)_epoch(4)_um_dev(80.03458096013019)_m_dev(79.174732552216)_seed(12)"
    save_path = "/home/easonnie/projects/MiscEnc/saved_models/07-02-14:40:01_esim_elmo_linear_amr_cs_score_filtering_0.5/i(5900)_epoch(3)_um_dev(39.73759153783564)_m_dev(40.18339276617422)_seed(12)"
    # save_path = "/home/easonnie/projects/MiscEnc/saved_models/07-02-14:42:34_esim_elmo_cs_score_filtering_0.7/i(1300)_epoch(4)_um_dev(32.55695687550855)_m_dev(32.42995415180846)_seed(12)"
    batch_size = 32

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    csnli_dataset_reader = CNLIReader(token_indexers=token_indexers,
                                      example_filter=lambda x: float(x['cs_score']) >= 0.7)

    # mnli_train_data_path = config.DATA_ROOT / "mnli/multinli_1.0_train.jsonl"
    mnli_m_dev_data_path = config.DATA_ROOT / "amrs/mnli_amr_ln/mnli_mdev.jsonl.cs"
    mnli_um_dev_data_path = config.DATA_ROOT / "amrs/mnli_amr_ln/mnli_umdev.jsonl.cs"

    # mnli_train_instances = csnli_dataset_reader.read(mnli_train_data_path)
    mnli_m_dev_instances = csnli_dataset_reader.read(mnli_m_dev_data_path)
    mnli_um_dev_instances = csnli_dataset_reader.read(mnli_um_dev_data_path)

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)
    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli")
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')
    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))
    biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0
    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300)
    model.load_state_dict(torch.load(save_path))
    model.display()
    model.to(device)

    # Create Log File
    criterion = nn.CrossEntropyLoss()

    eval_iter = biterator(mnli_m_dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    m_dev_score, m_dev_loss = eval_model(model, eval_iter, criterion)

    eval_iter = biterator(mnli_um_dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    um_dev_score, um_dev_loss = eval_model(model, eval_iter, criterion)

    print(f"Dev(M):{m_dev_score}/{m_dev_loss}")
    print(f"Dev(UM):{um_dev_score}/{um_dev_loss}")
def __init__(self, split, only_use_relevant_dets=True, add_image_as_a_box=True,
             embs_to_load='bert_da', conditioned_answer_choice=0):
    """
    :param split: train, val, or test
    :param only_use_relevant_dets: True if we will only use the detections mentioned
        in the question and answer; False if we should use all detections.
    :param add_image_as_a_box: True to add the image in as an additional 'detection'.
        It'll go first in the list of objects.
    :param embs_to_load: Which precomputed embeddings to load.
    :param conditioned_answer_choice: If you're in test mode, the answer labels aren't
        provided, which could be a problem for the QA->R task. Pass in
        `conditioned_answer_choice=i` to always condition on the i-th answer.
        What does this mean? Why is such a thing needed at test time? Explanation:
        https://groups.google.com/forum/?hl=en#!topic/visualcommonsense/lxEgFYRz5ho
    """
    if split not in ('test', 'train', 'val'):
        raise ValueError("split must be in test, train, or val. Supplied {}".format(split))
    print("Loading {} embeddings".format(split), flush=True)

    self.split = split
    self.only_use_relevant_dets = only_use_relevant_dets
    print("Only relevant dets" if only_use_relevant_dets else "Using all detections", flush=True)

    self.add_image_as_a_box = add_image_as_a_box
    self.conditioned_answer_choice = conditioned_answer_choice

    with open(os.path.join(VCR_ANNOTS_DIR, split, '{}.jsonl'.format(split)), 'r') as f:
        self.items = np.array(list(f))

    self.token_indexers = {'elmo': ELMoTokenCharactersIndexer()}
    self.vocab = Vocabulary()

    with open(os.path.join(VCR_ANNOTS_DIR, 'dataloaders', 'cocoontology.json'), 'r') as f:
        coco = json.load(f)
    self.coco_objects = ['__background__'] + [x['name'] for k, x in sorted(coco.items(), key=lambda x: int(x[0]))]
    # '__background__' shows up here; think about how to make use of the background class later.
    self.coco_obj_to_ind = {o: i for i, o in enumerate(self.coco_objects)}

    self.embs_to_load = embs_to_load
    self.h5fn_answer = os.path.join(VCR_ANNOTS_DIR, self.split,
                                    f'{self.embs_to_load}_answer_{self.split}.h5')
    self.h5fn_rationale = os.path.join(VCR_ANNOTS_DIR, self.split,
                                       f'{self.embs_to_load}_rationale_{self.split}.h5')
    self.h5fn_image = os.path.join(VCR_ANNOTS_DIR, self.split,
                                   f'image_feature_{self.split}.h5')
def run_ELMo_RSA(stim_file, header=False, filter_file=None):
    EXP = data.Stim(stim_file, header, filter_file, VOCAB_FILE)

    # Get tokenizer
    tokenizer = WhitespaceTokenizer()

    # Load model
    ## ELMo OG
    elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'
    elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json'
    ## ELMo Small
    # elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
    # elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    ## ELMo Medium
    # elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5'
    # elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json'
    ## ELMo OG (5.5B)
    # elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
    # elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'

    elmo_embedding = ElmoTokenEmbedder(options_file=elmo_options_file,
                                       weight_file=elmo_weight_file,
                                       dropout=0.0)
    embedder = BasicTextFieldEmbedder(token_embedders={'elmo_tokens': elmo_embedding})

    for x in range(len(EXP.SENTS)):
        sentences = list(EXP.SENTS[x])
        target = sentences[0]
        sentence = sentences[1]

        # GET BASELINE
        token_indexer = ELMoTokenCharactersIndexer()
        vocab = Vocabulary()

        target_tokens = tokenizer.tokenize(target)
        target_text_field = TextField(target_tokens, {'elmo_tokens': token_indexer})
        target_text_field.index(vocab)
        target_token_tensor = target_text_field.as_tensor(target_text_field.get_padding_lengths())
        target_tensor_dict = target_text_field.batch_tensors([target_token_tensor])

        target_embedding = embedder(target_tensor_dict)[0]
        baseline = target_embedding[-1].data.cpu().squeeze()

        # GET SIMS
        sims = get_ELMo_sims(sentence, baseline, tokenizer, embedder)
        values = get_dummy_values(sentence)

        EXP.load_IT('elmo', x, values, False, sims)

    return EXP
def main():
    elmo_token_indexer = ELMoTokenCharactersIndexer()

    reader = StanfordSentimentTreeBankDatasetReader(token_indexers={'tokens': elmo_token_indexer})

    train_dataset = reader.read('data/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset, min_count={'tokens': 3})

    # Use the 'Small' pre-trained model
    options_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
                    '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json')
    weight_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
                   '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5')

    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    # These two sizes were not defined in the original snippet, so they are assumed here:
    # the 'Small' ELMo model produces 256-dimensional word vectors.
    elmo_embedding_dim = 256
    HIDDEN_DIM = 128

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(elmo_embedding_dim, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=20)

    trainer.train()
def hidden_eval_fever():
    batch_size = 64
    lazy = True

    SAVE_PATH = "/home/easonnie/projects/FunEver/saved_models/07-18-21:07:28_m_esim_wn_elmo_sample_fixed/i(57000)_epoch(8)_dev(0.5755075507550755)_loss(1.7175163737963839)_seed(12)"

    dev_upstream_file = config.RESULT_PATH / "sent_retri/2018_07_05_17:17:50_r/dev.jsonl"

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    p_dict = wn_persistent_api.persistence_load()

    dev_fever_data_reader = WNReader(token_indexers=token_indexers, lazy=lazy, wn_p_dict=p_dict, max_l=360)

    complete_upstream_dev_data = get_actual_data(config.T_FEVER_DEV_JSONL, dev_upstream_file)
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)
    # dev_biterator = BasicIterator(batch_size=batch_size * 2)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(rnn_size_in=(1024 + 300 + dev_fever_data_reader.wn_feature_size, 1024 + 300),
                  weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300)
    print("Model Max length:", model.max_l)
    model.load_state_dict(torch.load(SAVE_PATH))
    model.display()
    model.to(device)

    eval_iter = biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    builded_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)

    eval_mode = {'check_sent_id_correct': True, 'standard': True}

    for item in builded_dev_data:
        del item['label']

    print(c_scorer.fever_score(builded_dev_data, common.load_jsonl(config.T_FEVER_DEV_JSONL), mode=eval_mode))
def __init__(self) -> None:
    # We use a simple word tokenizer to split sentences into words.
    self._tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())

    # Initialize indexers.
    singleIdIndexer = SingleIdTokenIndexer()
    elmoIndexer = ELMoTokenCharactersIndexer()
    self.indexers = {}
    self.indexers["tokens"] = singleIdIndexer
    self.indexers["elmo_characters"] = elmoIndexer
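A hedged sketch of how a reader like the one above typically consumes its indexers (the method name, field name, and sample sentence are assumed for illustration, not taken from the original project): both indexers attach to the same TextField, so a downstream model receives aligned word ids and ELMo character ids for identical tokens.

def text_to_instance(self, sentence: str) -> Instance:
    # Hypothetical companion method: one tokenization pass feeds both indexers.
    tokens = self._tokenizer.tokenize(sentence)
    return Instance({"sentence": TextField(tokens, self.indexers)})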
def get_score_multihop(t_data_file, additional_file, model_path,
                       item_key='prioritized_docids_aside', top_k=6):
    batch_size = 64
    lazy = True

    SAVE_PATH = model_path
    print("Model From:", SAVE_PATH)

    additional_sentence_list = get_additional_list(t_data_file, additional_file,
                                                   item_key=item_key, top_k=top_k)

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy)

    print("Additional Dev size:", len(additional_sentence_list))
    dev_instances = dev_fever_data_reader.read(additional_sentence_list)

    # Load Vocabulary
    dev_biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")

    # This is important
    vocab.add_token_to_namespace("true", namespace="selection_labels")
    vocab.add_token_to_namespace("false", namespace="selection_labels")
    vocab.add_token_to_namespace("hidden", namespace="selection_labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='selection_labels')

    # Label value
    vocab.get_index_to_token_vocabulary('selection_labels')

    print(vocab.get_token_to_index_vocabulary('selection_labels'))
    print(vocab.get_vocab_size('tokens'))

    dev_biterator.index_with(vocab)

    # exit(0)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300, num_of_class=2)
    model.load_state_dict(torch.load(SAVE_PATH))
    model.display()
    model.to(device)

    eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    additional_sentence_list = hidden_eval(model, eval_iter, additional_sentence_list)

    return additional_sentence_list
def main(vocab_path: str,
         elmo_config_path: str,
         elmo_weights_path: str,
         output_dir: str,
         batch_size: int,
         device: int,
         use_custom_oov_token: bool = False):
    """
    Creates ELMo word representations from a vocabulary file. These word
    representations are _independent_ - they are the result of running the CNN
    and Highway layers of the ELMo model, but not the Bidirectional LSTM.
    ELMo requires 2 additional tokens: <S> and </S>. The first token in this
    file is assumed to be an unknown token.

    This script produces two artifacts: a new vocabulary file with the <S> and
    </S> tokens inserted, and a glove-formatted embedding file containing
    word : vector pairs, one per line, with all values separated by a space.
    """
    # Load the vocabulary words and convert to char ids
    with open(vocab_path, 'r') as vocab_file:
        tokens = vocab_file.read().strip().split('\n')

    # Insert the sentence boundary tokens which elmo uses at positions 1 and 2.
    if tokens[0] != DEFAULT_OOV_TOKEN and not use_custom_oov_token:
        raise ConfigurationError("ELMo embeddings require the use of an OOV token.")
    tokens = [tokens[0]] + ["<S>", "</S>"] + tokens[1:]

    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.tokens_to_indices([Token(token) for token in tokens], Vocabulary(), "indices")["indices"]
    # Group the vocabulary into pseudo-sentences of 50 words each.
    sentences = []
    for k in range((len(indices) // 50) + 1):
        sentences.append(indexer.pad_token_sequence(indices[(k * 50):((k + 1) * 50)],
                                                    desired_num_tokens=50,
                                                    padding_lengths={}))

    last_batch_remainder = 50 - (len(indices) % 50)
    if device != -1:
        elmo_token_embedder = _ElmoCharacterEncoder(elmo_config_path, elmo_weights_path).cuda(device)
    else:
        elmo_token_embedder = _ElmoCharacterEncoder(elmo_config_path, elmo_weights_path)

    all_embeddings = []
    for i in range((len(sentences) // batch_size) + 1):
        array = numpy.array(sentences[i * batch_size: (i + 1) * batch_size])
        if device != -1:
            batch = torch.from_numpy(array).cuda(device)
        else:
            batch = torch.from_numpy(array)

        token_embedding = elmo_token_embedder(batch)['token_embedding'].data

        # Reshape back to a list of words of shape (batch_size * 50, encoding_dim)
        # We also need to remove the <S>, </S> tokens appended by the encoder.
        per_word_embeddings = token_embedding[:, 1:-1, :].contiguous().view(-1, token_embedding.size(-1))

        all_embeddings.append(per_word_embeddings)

    # Remove the embeddings associated with padding in the last batch.
    all_embeddings[-1] = all_embeddings[-1][:-last_batch_remainder, :]

    embedding_weight = torch.cat(all_embeddings, 0).cpu().numpy()

    # Write out the embedding in a glove format.
    os.makedirs(output_dir, exist_ok=True)
    with gzip.open(os.path.join(output_dir, "elmo_embeddings.txt.gz"), 'wb') as embeddings_file:
        for i, word in enumerate(tokens):
            string_array = " ".join([str(x) for x in list(embedding_weight[i, :])])
            embeddings_file.write(f"{word} {string_array}\n".encode('utf-8'))

    # Write out the new vocab with the <S> and </S> tokens.
    _, vocab_file_name = os.path.split(vocab_path)
    with open(os.path.join(output_dir, vocab_file_name), "w") as new_vocab_file:
        for word in tokens:
            new_vocab_file.write(f"{word}\n")