def test_read_embedding_file_inside_archive(self):
    token2vec = {
            "think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]),
            "make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]),
            "difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]),
            "àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0])
    }
    vocab = Vocabulary()
    for token in token2vec:
        vocab.add_token_to_namespace(token)

    params = Params({
            'pretrained_file': str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive.zip'),
            'embedding_dim': 5
    })
    # Pointing pretrained_file at a multi-file archive without naming a member is
    # ambiguous and must raise. (pytest removed the `message` kwarg of `raises`,
    # so the intent is recorded here instead.)
    with pytest.raises(ValueError):
        Embedding.from_params(vocab, params)

    for ext in ['.zip', '.tar.gz']:
        archive_path = str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive') + ext
        file_uri = format_embeddings_file_uri(archive_path, 'folder/fake_embeddings.5d.txt')
        params = Params({
                'pretrained_file': file_uri,
                'embedding_dim': 5
        })
        embeddings = Embedding.from_params(vocab, params).weight.data
        for tok, vec in token2vec.items():
            i = vocab.get_token_index(tok)
            assert torch.equal(embeddings[i], vec), 'Problem with format ' + archive_path
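# Hedged aside, not part of the original tests: format_embeddings_file_uri, used
# above, combines an archive path and a member path into the "(archive)#member"
# form that Embedding.from_params understands. The archive path below is a
# placeholder; the import location matches where the test's helper lives.
from allennlp.modules.token_embedders.embedding import format_embeddings_file_uri

uri = format_embeddings_file_uri('/data/multi-file-archive.zip',
                                 'folder/fake_embeddings.5d.txt')
assert uri == '(/data/multi-file-archive.zip)#folder/fake_embeddings.5d.txt'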
def test_tokens_to_indices_uses_dep_labels(self):
    # Renamed from the copy-pasted "uses_pos_tags": this test exercises the
    # dependency-label indexer, not POS tags.
    tokens = self.tokenizer.split_words("This is a sentence.")
    tokens = [t for t in tokens] + [Token("</S>")]
    vocab = Vocabulary()
    root_index = vocab.add_token_to_namespace('ROOT', namespace='dep_labels')
    none_index = vocab.add_token_to_namespace('NONE', namespace='dep_labels')
    indexer = DepLabelIndexer()
    assert indexer.tokens_to_indices([tokens[1]], vocab, "tokens1") == {"tokens1": [root_index]}
    assert indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1") == {"tokens-1": [none_index]}
def get_vocab(word2freq, max_v_sizes): '''Build vocabulary''' vocab = Vocabulary(counter=None, max_vocab_size=max_v_sizes['word']) words_by_freq = [(word, freq) for word, freq in word2freq.items()] words_by_freq.sort(key=lambda x: x[1], reverse=True) for word, _ in words_by_freq[:max_v_sizes['word']]: vocab.add_token_to_namespace(word, 'tokens') log.info("\tFinished building vocab. Using %d words", vocab.get_vocab_size('tokens')) return vocab
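# Hypothetical usage of get_vocab above: word2freq maps token -> frequency and
# max_v_sizes caps the vocabulary under its 'word' key, so only the most frequent
# words survive. The counts below are made up for illustration.
word2freq = {"the": 120, "cat": 7, "sat": 3}
vocab = get_vocab(word2freq, {"word": 2})  # keeps only "the" and "cat" in 'tokens'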
def test_token_to_indices_uses_ner_tags(self): tokens = self.tokenizer.split_words("Larry Page is CEO of Google.") tokens = [t for t in tokens] + [Token("</S>")] vocab = Vocabulary() person_index = vocab.add_token_to_namespace('PERSON', namespace='ner_tags') none_index = vocab.add_token_to_namespace('NONE', namespace='ner_tags') vocab.add_token_to_namespace('ORG', namespace='ner_tags') indexer = NerTagIndexer() assert indexer.token_to_indices(tokens[1], vocab) == person_index assert indexer.token_to_indices(tokens[-1], vocab) == none_index
def test_index_converts_field_correctly(self): vocab = Vocabulary() b_index = vocab.add_token_to_namespace("B", namespace='*labels') i_index = vocab.add_token_to_namespace("I", namespace='*labels') o_index = vocab.add_token_to_namespace("O", namespace='*labels') tags = ["B", "I", "O", "O", "O"] sequence_label_field = SequenceLabelField(tags, self.text, label_namespace="*labels") sequence_label_field.index(vocab) # pylint: disable=protected-access assert sequence_label_field._indexed_labels == [b_index, i_index, o_index, o_index, o_index]
def _get_vocab_index_mapping(self, archived_vocab: Vocabulary) -> List[Tuple[int, int]]: vocab_index_mapping: List[Tuple[int, int]] = [] for index in range(self.vocab.get_vocab_size(namespace='tokens')): token = self.vocab.get_token_from_index(index=index, namespace='tokens') archived_token_index = archived_vocab.get_token_index(token, namespace='tokens') # Checking if we got the UNK token index, because we don't want all new token # representations initialized to UNK token's representation. We do that by checking if # the two tokens are the same. They will not be if the token at the archived index is # UNK. if archived_vocab.get_token_from_index(archived_token_index, namespace="tokens") == token: vocab_index_mapping.append((index, archived_token_index)) return vocab_index_mapping
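# A sketch (an assumption, not from the source) of how the (new_index, archived_index)
# pairs returned by _get_vocab_index_mapping could be used: copy embedding rows from
# an archived weight matrix into a freshly initialized one, leaving rows for tokens
# that are genuinely new at their random initialization.
from typing import List, Tuple
import torch

def copy_archived_embedding_rows(new_weight: torch.Tensor,
                                 archived_weight: torch.Tensor,
                                 mapping: List[Tuple[int, int]]) -> None:
    for new_index, archived_index in mapping:
        new_weight[new_index] = archived_weight[archived_index]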
def test_token_to_indices_uses_pos_tags(self): tokens = self.tokenizer.split_words("This is a sentence.") tokens = [t for t in tokens] + [Token("</S>")] vocab = Vocabulary() verb_index = vocab.add_token_to_namespace('VERB', namespace='pos_tags') cop_index = vocab.add_token_to_namespace('VBZ', namespace='pos_tags') none_index = vocab.add_token_to_namespace('NONE', namespace='pos_tags') indexer = PosTagIndexer(coarse_tags=True) assert indexer.token_to_indices(tokens[1], vocab) == verb_index assert indexer.token_to_indices(tokens[-1], vocab) == none_index indexer._coarse_tags = False # pylint: disable=protected-access assert indexer.token_to_indices(tokens[1], vocab) == cop_index
def test_get_embedding_layer_uses_correct_embedding_dim(self): vocab = Vocabulary() vocab.add_token_to_namespace('word1') vocab.add_token_to_namespace('word2') embeddings_filename = self.TEST_DIR + "embeddings.gz" with gzip.open(embeddings_filename, 'wb') as embeddings_file: embeddings_file.write("word1 1.0 2.3 -1.0\n".encode('utf-8')) embeddings_file.write("word2 0.1 0.4 -4.0\n".encode('utf-8')) embedding_weights = _read_pretrained_embedding_file(embeddings_filename, 3, vocab) assert tuple(embedding_weights.size()) == (4, 3) # 4 because of padding and OOV with pytest.raises(ConfigurationError): _read_pretrained_embedding_file(embeddings_filename, 4, vocab)
def test_as_tensor_produces_integer_targets(self): vocab = Vocabulary() vocab.add_token_to_namespace("B", namespace='*labels') vocab.add_token_to_namespace("I", namespace='*labels') vocab.add_token_to_namespace("O", namespace='*labels') tags = ["B", "I", "O", "O", "O"] sequence_label_field = SequenceLabelField(tags, self.text, label_namespace="*labels") sequence_label_field.index(vocab) padding_lengths = sequence_label_field.get_padding_lengths() tensor = sequence_label_field.as_tensor(padding_lengths).detach().cpu().numpy() numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 1, 2, 2, 2]))
class TestDataset(AllenNlpTestCase): def setUp(self): self.vocab = Vocabulary() self.vocab.add_token_to_namespace("this") self.vocab.add_token_to_namespace("is") self.vocab.add_token_to_namespace("a") self.vocab.add_token_to_namespace("sentence") self.vocab.add_token_to_namespace(".") self.token_indexer = {"tokens": SingleIdTokenIndexer()} self.instances = self.get_instances() super(TestDataset, self).setUp() def test_instances_must_have_homogeneous_fields(self): instance1 = Instance({"tag": (LabelField(1, skip_indexing=True))}) instance2 = Instance({"words": TextField([Token("hello")], {})}) with pytest.raises(ConfigurationError): _ = Batch([instance1, instance2]) def test_padding_lengths_uses_max_instance_lengths(self): dataset = Batch(self.instances) dataset.index_instances(self.vocab) padding_lengths = dataset.get_padding_lengths() assert padding_lengths == {"text1": {"num_tokens": 5, "tokens_length": 5}, "text2": {"num_tokens": 6, "tokens_length": 6}} def test_as_tensor_dict(self): dataset = Batch(self.instances) dataset.index_instances(self.vocab) padding_lengths = dataset.get_padding_lengths() tensors = dataset.as_tensor_dict(padding_lengths) text1 = tensors["text1"]["tokens"].detach().cpu().numpy() text2 = tensors["text2"]["tokens"].detach().cpu().numpy() numpy.testing.assert_array_almost_equal(text1, numpy.array([[2, 3, 4, 5, 6], [1, 3, 4, 5, 6]])) numpy.testing.assert_array_almost_equal(text2, numpy.array([[2, 3, 4, 1, 5, 6], [2, 3, 1, 0, 0, 0]])) def get_instances(self): field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence", "."]], self.token_indexer) field2 = TextField([Token(t) for t in ["this", "is", "a", "different", "sentence", "."]], self.token_indexer) field3 = TextField([Token(t) for t in ["here", "is", "a", "sentence", "."]], self.token_indexer) field4 = TextField([Token(t) for t in ["this", "is", "short"]], self.token_indexer) instances = [Instance({"text1": field1, "text2": field2}), Instance({"text1": field3, "text2": field4})] return instances
def test_read_hdf5_raises_on_invalid_shape(self): vocab = Vocabulary() vocab.add_token_to_namespace("word") embeddings_filename = self.TEST_DIR + "embeddings.hdf5" embeddings = numpy.random.rand(vocab.get_vocab_size(), 10) with h5py.File(embeddings_filename, 'w') as fout: _ = fout.create_dataset( 'embedding', embeddings.shape, dtype='float32', data=embeddings ) params = Params({ 'pretrained_file': embeddings_filename, 'embedding_dim': 5, }) with pytest.raises(ConfigurationError): _ = Embedding.from_params(vocab, params)
def make_vocab_from_params(params: Params, serialization_dir: str):
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    # os.listdir never returns None; a non-empty list is the signal for "non-empty".
    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)

    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'datasets_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    instances = [instance for key, dataset in all_datasets.items()
                 for instance in dataset
                 if key in datasets_for_vocab_creation]

    vocab = Vocabulary.from_params(vocab_params, instances)

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")
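# Hedged usage sketch for make_vocab_from_params above; both the config path and
# the serialization directory are hypothetical.
from allennlp.common import Params

params = Params.from_file('experiment.jsonnet')
make_vocab_from_params(params, '/tmp/vocab_run')
# afterwards, /tmp/vocab_run/vocabulary/ holds one <namespace>.txt file per
# namespace plus non_padded_namespaces.txt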
def test_blank_pos_tag(self):
    tokens = [Token(token) for token in "allennlp is awesome .".split(" ")]
    for token in tokens:
        token.pos_ = ""
    indexer = PosTagIndexer()
    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)
    # spacy uses an empty string to indicate "no POS tag"; we convert it to "NONE"
    assert counter["pos_tokens"]["NONE"] == 4
    vocab = Vocabulary(counter)
    none_index = vocab.get_token_index('NONE', 'pos_tokens')
    # should raise no exception
    indices = indexer.tokens_to_indices(tokens, vocab, index_name="pos")
    assert {"pos": [none_index, none_index, none_index, none_index]} == indices
def setUp(self): super(IteratorTest, self).setUp() self.token_indexers = {"tokens": SingleIdTokenIndexer()} self.vocab = Vocabulary() self.this_index = self.vocab.add_token_to_namespace('this') self.is_index = self.vocab.add_token_to_namespace('is') self.a_index = self.vocab.add_token_to_namespace('a') self.sentence_index = self.vocab.add_token_to_namespace('sentence') self.another_index = self.vocab.add_token_to_namespace('another') self.yet_index = self.vocab.add_token_to_namespace('yet') self.very_index = self.vocab.add_token_to_namespace('very') self.long_index = self.vocab.add_token_to_namespace('long') instances = [ self.create_instance(["this", "is", "a", "sentence"]), self.create_instance(["this", "is", "another", "sentence"]), self.create_instance(["yet", "another", "sentence"]), self.create_instance(["this", "is", "a", "very", "very", "very", "very", "long", "sentence"]), self.create_instance(["sentence"]), ] class LazyIterable: def __iter__(self): return (instance for instance in instances) self.instances = instances self.lazy_instances = LazyIterable()
def test_adjacency_field_can_index_with_vocab(self): vocab = Vocabulary() vocab.add_token_to_namespace("a", namespace="labels") vocab.add_token_to_namespace("b", namespace="labels") vocab.add_token_to_namespace("c", namespace="labels") labels = ["a", "b"] indices = [(0, 1), (2, 1)] adjacency_field = AdjacencyField(indices, self.text, labels) adjacency_field.index(vocab) tensor = adjacency_field.as_tensor(adjacency_field.get_padding_lengths()) numpy.testing.assert_equal(tensor.numpy(), numpy.array([[-1, 0, -1, -1, -1], [-1, -1, -1, -1, -1], [-1, 1, -1, -1, -1], [-1, -1, -1, -1, -1], [-1, -1, -1, -1, -1]]))
def setUp(self): self.tokenizer = WordTokenizer(SpacyWordSplitter(pos_tags=True)) self.utterance = self.tokenizer.tokenize("where is mersin?") self.token_indexers = {"tokens": SingleIdTokenIndexer("tokens")} json = { 'question': self.utterance, 'columns': ['Name in English', 'Location in English'], 'cells': [['Paradeniz', 'Mersin'], ['Lake Gala', 'Edirne']] } self.graph = TableQuestionKnowledgeGraph.read_from_json(json) self.vocab = Vocabulary() self.name_index = self.vocab.add_token_to_namespace("name", namespace='tokens') self.in_index = self.vocab.add_token_to_namespace("in", namespace='tokens') self.english_index = self.vocab.add_token_to_namespace("english", namespace='tokens') self.location_index = self.vocab.add_token_to_namespace("location", namespace='tokens') self.paradeniz_index = self.vocab.add_token_to_namespace("paradeniz", namespace='tokens') self.mersin_index = self.vocab.add_token_to_namespace("mersin", namespace='tokens') self.lake_index = self.vocab.add_token_to_namespace("lake", namespace='tokens') self.gala_index = self.vocab.add_token_to_namespace("gala", namespace='tokens') self.negative_one_index = self.vocab.add_token_to_namespace("-1", namespace='tokens') self.zero_index = self.vocab.add_token_to_namespace("0", namespace='tokens') self.one_index = self.vocab.add_token_to_namespace("1", namespace='tokens') self.oov_index = self.vocab.get_token_index('random OOV string', namespace='tokens') self.edirne_index = self.oov_index self.field = KnowledgeGraphField(self.graph, self.utterance, self.token_indexers, self.tokenizer) super(KnowledgeGraphFieldTest, self).setUp()
def setUp(self): super(TestTokenCharactersEncoder, self).setUp() self.vocab = Vocabulary() self.vocab.add_token_to_namespace("1", "token_characters") self.vocab.add_token_to_namespace("2", "token_characters") self.vocab.add_token_to_namespace("3", "token_characters") self.vocab.add_token_to_namespace("4", "token_characters") params = Params({ "embedding": { "embedding_dim": 2, "vocab_namespace": "token_characters" }, "encoder": { "type": "cnn", "embedding_dim": 2, "num_filters": 4, "ngram_filter_sizes": [1, 2], "output_dim": 3 } }) self.encoder = TokenCharactersEncoder.from_params(vocab=self.vocab, params=deepcopy(params)) self.embedding = Embedding.from_params(vocab=self.vocab, params=params["embedding"]) self.inner_encoder = Seq2VecEncoder.from_params(params["encoder"]) constant_init = Initializer.from_params(Params({"type": "constant", "val": 1.})) initializer = InitializerApplicator([(".*", constant_init)]) initializer(self.encoder) initializer(self.embedding) initializer(self.inner_encoder)
def from_params(cls, vocab: Vocabulary, params: Params) -> 'ElmoTokenEmbedder': # type: ignore # pylint: disable=arguments-differ params.add_file_to_archive('options_file') params.add_file_to_archive('weight_file') options_file = params.pop('options_file') weight_file = params.pop('weight_file') requires_grad = params.pop('requires_grad', False) do_layer_norm = params.pop_bool('do_layer_norm', False) dropout = params.pop_float("dropout", 0.5) namespace_to_cache = params.pop("namespace_to_cache", None) if namespace_to_cache is not None: vocab_to_cache = list(vocab.get_token_to_index_vocabulary(namespace_to_cache).keys()) else: vocab_to_cache = None projection_dim = params.pop_int("projection_dim", None) scalar_mix_parameters = params.pop('scalar_mix_parameters', None) params.assert_empty(cls.__name__) return cls(options_file=options_file, weight_file=weight_file, do_layer_norm=do_layer_norm, dropout=dropout, requires_grad=requires_grad, projection_dim=projection_dim, vocab_to_cache=vocab_to_cache, scalar_mix_parameters=scalar_mix_parameters)
def make_vocab_from_params(params: Params):
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    vocab_dir = vocab_params.get('directory_path')
    if vocab_dir is None:
        raise ConfigurationError("To use `make-vocab` your configuration must contain a value "
                                 "at vocabulary.directory_path")

    os.makedirs(vocab_dir, exist_ok=True)

    all_datasets = datasets_from_params(params)

    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'datasets_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(Params({}),
                                   (instance for key, dataset in all_datasets.items()
                                    for instance in dataset
                                    if key in datasets_for_vocab_creation))
    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")
def setUp(self): super(TestBasicTextFieldEmbedder, self).setUp() self.vocab = Vocabulary() self.vocab.add_token_to_namespace("1") self.vocab.add_token_to_namespace("2") self.vocab.add_token_to_namespace("3") self.vocab.add_token_to_namespace("4") params = Params({ "words1": { "type": "embedding", "embedding_dim": 2 }, "words2": { "type": "embedding", "embedding_dim": 5 }, "words3": { "type": "embedding", "embedding_dim": 3 } }) self.token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params) self.inputs = { "words1": Variable(torch.LongTensor([[0, 2, 3, 5]])), "words2": Variable(torch.LongTensor([[1, 4, 3, 2]])), "words3": Variable(torch.LongTensor([[1, 5, 1, 2]])) }
def __init__(self, vocab: Vocabulary, question_embedder: TextFieldEmbedder, action_embedding_dim: int, encoder: Seq2SeqEncoder, entity_encoder: Seq2VecEncoder, max_decoding_steps: int, use_neighbor_similarity_for_linking: bool = False, dropout: float = 0.0, num_linking_features: int = 10, rule_namespace: str = 'rule_labels', tables_directory: str = '/wikitables/') -> None: super(WikiTablesSemanticParser, self).__init__(vocab) self._question_embedder = question_embedder self._encoder = encoder self._entity_encoder = TimeDistributed(entity_encoder) self._max_decoding_steps = max_decoding_steps self._use_neighbor_similarity_for_linking = use_neighbor_similarity_for_linking if dropout > 0: self._dropout = torch.nn.Dropout(p=dropout) else: self._dropout = lambda x: x self._rule_namespace = rule_namespace self._denotation_accuracy = WikiTablesAccuracy(tables_directory) self._action_sequence_accuracy = Average() self._has_logical_form = Average() self._action_padding_index = -1 # the padding value used by IndexField num_actions = vocab.get_vocab_size(self._rule_namespace) self._action_embedder = Embedding(num_embeddings=num_actions, embedding_dim=action_embedding_dim) self._output_action_embedder = Embedding(num_embeddings=num_actions, embedding_dim=action_embedding_dim) self._action_biases = Embedding(num_embeddings=num_actions, embedding_dim=1) # This is what we pass as input in the first step of decoding, when we don't have a # previous action, or a previous question attention. self._first_action_embedding = torch.nn.Parameter(torch.FloatTensor(action_embedding_dim)) self._first_attended_question = torch.nn.Parameter(torch.FloatTensor(encoder.get_output_dim())) torch.nn.init.normal_(self._first_action_embedding) torch.nn.init.normal_(self._first_attended_question) check_dimensions_match(entity_encoder.get_output_dim(), question_embedder.get_output_dim(), "entity word average embedding dim", "question embedding dim") self._num_entity_types = 4 # TODO(mattg): get this in a more principled way somehow? self._num_start_types = 5 # TODO(mattg): get this in a more principled way somehow? self._embedding_dim = question_embedder.get_output_dim() self._type_params = torch.nn.Linear(self._num_entity_types, self._embedding_dim) self._neighbor_params = torch.nn.Linear(self._embedding_dim, self._embedding_dim) if num_linking_features > 0: self._linking_params = torch.nn.Linear(num_linking_features, 1) else: self._linking_params = None if self._use_neighbor_similarity_for_linking: self._question_entity_params = torch.nn.Linear(1, 1) self._question_neighbor_params = torch.nn.Linear(1, 1) else: self._question_entity_params = None self._question_neighbor_params = None
def _load(cls, config: Params, serialization_dir: str, weights_file: str = None, cuda_device: int = -1) -> 'Model': """ Instantiates an already-trained model, based on the experiment configuration and some optional overrides. """ weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS) # Load vocabulary from file vocab_dir = os.path.join(serialization_dir, 'vocabulary') vocab = Vocabulary.from_files(vocab_dir) model_params = config.get('model') # The experiment config tells us how to _train_ a model, including where to get pre-trained # embeddings from. We're now _loading_ the model, so those embeddings will already be # stored in our weights. We don't need any pretrained weight file anymore, and we don't # want the code to look for it, so we remove it from the parameters here. remove_pretrained_embedding_params(model_params) model = Model.from_params(vocab=vocab, params=model_params) model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device)) model.load_state_dict(model_state) # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are # in sync with the weights if cuda_device >= 0: model.cuda(cuda_device) else: model.cpu() return model
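# Hedged usage sketch, not from the source: in practice one usually goes through
# load_archive, but a direct call to the classmethod above would look roughly like
# this, assuming a standard serialization directory containing a config.json.
# The directory path is a placeholder.
import os
from allennlp.common import Params
from allennlp.models import Model

serialization_dir = '/tmp/trained_model'  # hypothetical
config = Params.from_file(os.path.join(serialization_dir, 'config.json'))
model = Model._load(config, serialization_dir, cuda_device=-1)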
def setUp(self): self.vocab = Vocabulary() self.vocab.add_token_to_namespace("this", "words") self.vocab.add_token_to_namespace("is", "words") self.vocab.add_token_to_namespace("a", "words") self.vocab.add_token_to_namespace("sentence", 'words') self.vocab.add_token_to_namespace("s", 'characters') self.vocab.add_token_to_namespace("e", 'characters') self.vocab.add_token_to_namespace("n", 'characters') self.vocab.add_token_to_namespace("t", 'characters') self.vocab.add_token_to_namespace("c", 'characters') for label in ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']: self.vocab.add_token_to_namespace(label, 'labels') self.word_indexer = {"words": SingleIdTokenIndexer("words")} self.words_and_characters_indexers = {"words": SingleIdTokenIndexer("words"), "characters": TokenCharactersIndexer("characters")} self.field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence"]], self.word_indexer) self.field2 = TextField([Token(t) for t in ["this", "is", "a", "different", "sentence"]], self.word_indexer) self.field3 = TextField([Token(t) for t in ["this", "is", "another", "sentence"]], self.word_indexer) self.empty_text_field = self.field1.empty_field() self.index_field = IndexField(1, self.field1) self.empty_index_field = self.index_field.empty_field() self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1) self.empty_sequence_label_field = self.sequence_label_field.empty_field() super(TestListField, self).setUp()
def __init__(self, vocab: Vocabulary, text_field_embedder: TextFieldEmbedder, attend_feedforward: FeedForward, similarity_function: SimilarityFunction, compare_feedforward: FeedForward, aggregate_feedforward: FeedForward, premise_encoder: Optional[Seq2SeqEncoder] = None, hypothesis_encoder: Optional[Seq2SeqEncoder] = None, initializer: InitializerApplicator = InitializerApplicator(), regularizer: Optional[RegularizerApplicator] = None) -> None: super(DecomposableAttention, self).__init__(vocab, regularizer) self._text_field_embedder = text_field_embedder self._attend_feedforward = TimeDistributed(attend_feedforward) self._matrix_attention = LegacyMatrixAttention(similarity_function) self._compare_feedforward = TimeDistributed(compare_feedforward) self._aggregate_feedforward = aggregate_feedforward self._premise_encoder = premise_encoder self._hypothesis_encoder = hypothesis_encoder or premise_encoder self._num_labels = vocab.get_vocab_size(namespace="labels") check_dimensions_match(text_field_embedder.get_output_dim(), attend_feedforward.get_input_dim(), "text field embedding dim", "attend feedforward input dim") check_dimensions_match(aggregate_feedforward.get_output_dim(), self._num_labels, "final output dimension", "number of labels") self._accuracy = CategoricalAccuracy() self._loss = torch.nn.CrossEntropyLoss() initializer(self)
def test_batch_predictions_are_consistent(self): # The CNN encoder has problems with this kind of test - it's not properly masked yet, so # changing the amount of padding in the batch will result in small differences in the # output of the encoder. Because BiDAF is so deep, these differences get magnified through # the network and make this test impossible. So, we'll remove the CNN encoder entirely # from the model for this test. If/when we fix the CNN encoder to work correctly with # masking, we can change this back to how the other models run this test, with just a # single line. # pylint: disable=protected-access,attribute-defined-outside-init # Save some state. saved_model = self.model saved_instances = self.instances # Modify the state, run the test with modified state. params = Params.from_file(self.param_file) reader = DatasetReader.from_params(params['dataset_reader']) reader._token_indexers = {'tokens': reader._token_indexers['tokens']} self.instances = reader.read('tests/fixtures/data/squad.json') vocab = Vocabulary.from_instances(self.instances) for instance in self.instances: instance.index_fields(vocab) del params['model']['text_field_embedder']['token_characters'] params['model']['phrase_layer']['input_size'] = 2 self.model = Model.from_params(vocab, params['model']) self.ensure_batch_predictions_are_consistent() # Restore the state. self.model = saved_model self.instances = saved_instances
def test_forward_works_with_projection_layer(self): vocab = Vocabulary() vocab.add_token_to_namespace('the') vocab.add_token_to_namespace('a') params = Params({ 'pretrained_file': 'tests/fixtures/glove.6B.300d.sample.txt.gz', 'embedding_dim': 300, 'projection_dim': 20 }) embedding_layer = Embedding.from_params(vocab, params) input_tensor = Variable(torch.LongTensor([[3, 2, 1, 0]])) embedded = embedding_layer(input_tensor).data.numpy() assert embedded.shape == (1, 4, 20) input_tensor = Variable(torch.LongTensor([[[3, 2, 1, 0]]])) embedded = embedding_layer(input_tensor).data.numpy() assert embedded.shape == (1, 1, 4, 20)
def _read_embeddings_from_hdf5(embeddings_filename: str, embedding_dim: int, vocab: Vocabulary, namespace: str = "tokens") -> torch.FloatTensor: """ Reads from a hdf5 formatted file. The embedding matrix is assumed to be keyed by 'embedding' and of size ``(num_tokens, embedding_dim)``. """ with h5py.File(embeddings_filename, 'r') as fin: embeddings = fin['embedding'][...] if list(embeddings.shape) != [vocab.get_vocab_size(namespace), embedding_dim]: raise ConfigurationError( "Read shape {0} embeddings from the file, but expected {1}".format( list(embeddings.shape), [vocab.get_vocab_size(namespace), embedding_dim])) return torch.FloatTensor(embeddings)
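# A minimal sketch (assumed filename) of producing an hdf5 file this reader
# accepts: the only requirements visible above are the 'embedding' dataset key
# and the (num_tokens, embedding_dim) shape.
import h5py
import numpy

def write_hdf5_embeddings(filename: str, num_tokens: int, embedding_dim: int) -> None:
    matrix = numpy.random.rand(num_tokens, embedding_dim).astype('float32')
    with h5py.File(filename, 'w') as fout:
        fout.create_dataset('embedding', matrix.shape, dtype='float32', data=matrix)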
def test_dry_run_without_extension(self):
    existing_serialization_dir = self.TEST_DIR / 'existing'
    extended_serialization_dir = self.TEST_DIR / 'extended'
    existing_vocab_path = existing_serialization_dir / 'vocabulary'
    extended_vocab_path = extended_serialization_dir / 'vocabulary'

    vocab = Vocabulary()
    # If extend is False, it is the user's responsibility to make sure that the dataset
    # instances will be indexable by the provided vocabulary. At minimum, @@UNKNOWN@@ should
    # be present in every namespace that could have OOV entries during indexing.
    # The 'tokens' namespace will see new words, but it has an @@UNKNOWN@@ token;
    # the 'labels' namespace has no @@UNKNOWN@@, so 'N' and 'V' must be added up front.
    vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
    vocab.add_token_to_namespace('some_weird_token_2', namespace='tokens')
    vocab.add_token_to_namespace('N', namespace='labels')
    vocab.add_token_to_namespace('V', namespace='labels')
    os.makedirs(existing_serialization_dir, exist_ok=True)
    vocab.save_to_files(existing_vocab_path)

    self.params['vocabulary'] = {}
    self.params['vocabulary']['directory_path'] = existing_vocab_path
    self.params['vocabulary']['extend'] = False
    dry_run_from_params(self.params, extended_serialization_dir)

    with open(extended_vocab_path / 'tokens.txt') as f:
        tokens = [line.strip() for line in f]

    assert tokens[0] == '@@UNKNOWN@@'
    assert tokens[1] == 'some_weird_token_1'
    assert tokens[2] == 'some_weird_token_2'
    assert len(tokens) == 3
def test_index_converts_field_correctly(self): vocab = Vocabulary() sentence_index = vocab.add_token_to_namespace("sentence", namespace='words') capital_a_index = vocab.add_token_to_namespace("A", namespace='words') capital_a_char_index = vocab.add_token_to_namespace("A", namespace='characters') s_index = vocab.add_token_to_namespace("s", namespace='characters') e_index = vocab.add_token_to_namespace("e", namespace='characters') n_index = vocab.add_token_to_namespace("n", namespace='characters') t_index = vocab.add_token_to_namespace("t", namespace='characters') c_index = vocab.add_token_to_namespace("c", namespace='characters') field = TextField([Token(t) for t in ["A", "sentence"]], {"words": SingleIdTokenIndexer(namespace="words")}) field.index(vocab) # pylint: disable=protected-access assert field._indexed_tokens["words"] == [capital_a_index, sentence_index] field1 = TextField([Token(t) for t in ["A", "sentence"]], {"characters": TokenCharactersIndexer(namespace="characters")}) field1.index(vocab) assert field1._indexed_tokens["characters"] == [[capital_a_char_index], [s_index, e_index, n_index, t_index, e_index, n_index, c_index, e_index]] field2 = TextField([Token(t) for t in ["A", "sentence"]], token_indexers={"words": SingleIdTokenIndexer(namespace="words"), "characters": TokenCharactersIndexer(namespace="characters")}) field2.index(vocab) assert field2._indexed_tokens["words"] == [capital_a_index, sentence_index] assert field2._indexed_tokens["characters"] == [[capital_a_char_index], [s_index, e_index, n_index, t_index, e_index, n_index, c_index, e_index]]
def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary, index_name: str) -> Dict[str, List[int]]: # pylint: disable=unused-argument return { "token_ids": [10, 15] + \ [vocabulary.get_token_index(token.text, 'words') for token in tokens] + \ [25], "additional_key": [22, 29] }
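# Illustration (assumed, for clarity): for two tokens whose 'words' indices are
# ia and ib, the indexer above returns
#     {"token_ids": [10, 15, ia, ib, 25], "additional_key": [22, 29]}
# i.e. fixed sentinel ids wrapped around the real indices, plus a constant
# second key — which is what the dict-returning-indexer tests below rely on.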
def get_fixtures(include_gold_entities=False, include_lm_labels=True, include_contextual_embeddings=False): vocab = Vocabulary.from_params( Params({ "directory_path": "tests/fixtures/kg_embeddings/tucker_wordnet/vocabulary", })) batch = { 'next_sentence_label': torch.tensor([0, 1, 1]), 'tokens': { 'tokens': torch.tensor([[16, 16, 11, 1, 1, 1, 17, 1, 1, 1], [16, 16, 1, 12, 1, 17, 1, 1, 1, 1], [16, 16, 1, 1, 17, 1, 13, 17, 17, 0]]) }, 'segment_ids': torch.tensor([[0, 0, 0, 0, 0, 0, 1, 1, 1, 1], [0, 0, 0, 0, 0, 1, 1, 1, 1, 1], [0, 0, 0, 0, 1, 1, 1, 0, 0, 0]]), 'lm_label_ids': { 'lm_labels': torch.tensor([[0, 1, 0, 0, 13, 0, 1, 1, 13, 0], [0, 0, 1, 0, 0, 2, 1, 1, 13, 0], [0, 1, 1, 0, 1, 1, 0, 0, 0, 0]]) }, 'candidates': { 'wordnet': { 'candidate_entity_priors': torch.tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000, 0.0000, 0.0000]], [[0.2500, 0.2500, 0.2500, 0.2500, 0.0000], [0.2000, 0.2000, 0.2000, 0.2000, 0.2000]], [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]]), 'candidate_entities': { 'ids': torch.tensor([[[67, 0, 0, 0, 0], [0, 0, 0, 0, 0]], [[344, 349, 354, 122, 0], [101, 46, 445, 25, 28]], [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]]) }, 'candidate_segment_ids': torch.tensor([[0, 1], [0, 1], [0, 0]]), 'candidate_spans': torch.tensor([[[1, 1], [-1, -1]], [[1, 1], [4, 4]], [[-1, -1], [-1, -1]]]) } } } if include_gold_entities: batch['gold_entities'] = { 'wordnet': { 'ids': torch.tensor([[[67], [0]], [[349], [46]], [[0], [0]]]) } } if not include_lm_labels: del batch['next_sentence_label'] del batch['lm_label_ids'] if include_contextual_embeddings: batch_size, timesteps = batch['tokens']['tokens'].shape batch['contextual_embeddings'] = torch.rand(batch_size, timesteps, 12) batch['tokens_mask'] = batch['tokens']['tokens'] > 0 del batch['tokens'] return vocab, batch
class TestBasicTextFieldEmbedder(AllenNlpTestCase): def setUp(self): super(TestBasicTextFieldEmbedder, self).setUp() self.vocab = Vocabulary() self.vocab.add_token_to_namespace("1") self.vocab.add_token_to_namespace("2") self.vocab.add_token_to_namespace("3") self.vocab.add_token_to_namespace("4") params = Params({ "words1": { "type": "embedding", "embedding_dim": 2 }, "words2": { "type": "embedding", "embedding_dim": 5 }, "words3": { "type": "embedding", "embedding_dim": 3 } }) self.token_embedder = BasicTextFieldEmbedder.from_params( self.vocab, params) self.inputs = { "words1": torch.LongTensor([[0, 2, 3, 5]]), "words2": torch.LongTensor([[1, 4, 3, 2]]), "words3": torch.LongTensor([[1, 5, 1, 2]]) } def test_get_output_dim_aggregates_dimension_from_each_embedding(self): assert self.token_embedder.get_output_dim() == 10 def test_forward_asserts_input_field_match(self): self.inputs['words4'] = self.inputs['words3'] del self.inputs['words3'] with pytest.raises(ConfigurationError): self.token_embedder(self.inputs) self.inputs['words3'] = self.inputs['words4'] del self.inputs['words4'] def test_forward_concats_resultant_embeddings(self): assert self.token_embedder(self.inputs).size() == (1, 4, 10) def test_forward_works_on_higher_order_input(self): params = Params({ "words": { "type": "embedding", "num_embeddings": 20, "embedding_dim": 2, }, "characters": { "type": "character_encoding", "embedding": { "embedding_dim": 4, "num_embeddings": 15, }, "encoder": { "type": "cnn", "embedding_dim": 4, "num_filters": 10, "ngram_filter_sizes": [3], }, } }) token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params) inputs = { 'words': (torch.rand(3, 4, 5, 6) * 20).long(), 'characters': (torch.rand(3, 4, 5, 6, 7) * 15).long(), } assert token_embedder(inputs, num_wrapping_dims=2).size() == (3, 4, 5, 6, 12) def test_forward_runs_with_non_bijective_mapping(self): elmo_fixtures_path = self.FIXTURES_ROOT / 'elmo' options_file = str(elmo_fixtures_path / 'options.json') weight_file = str(elmo_fixtures_path / 'lm_weights.hdf5') params = Params({ "words": { "type": "embedding", "num_embeddings": 20, "embedding_dim": 2, }, "elmo": { "type": "elmo_token_embedder", "options_file": options_file, "weight_file": weight_file }, "embedder_to_indexer_map": { "words": ["words"], "elmo": ["elmo", "words"] } }) token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params) inputs = { 'words': (torch.rand(3, 6) * 20).long(), 'elmo': (torch.rand(3, 6, 50) * 15).long(), } token_embedder(inputs)
def index(self, vocab: Vocabulary): self._mapping_array = [ vocab.get_token_index(x.text, self._target_namespace) for x in self._source_tokens ]
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             att_question_to_choice: SimilarityFunction,
             question_encoder: Optional[Seq2SeqEncoder] = None,
             choice_encoder: Optional[Seq2SeqEncoder] = None,
             initializer: InitializerApplicator = InitializerApplicator(),
             aggregate_question: Optional[str] = "max",
             aggregate_choice: Optional[str] = "max",
             embeddings_dropout_value: Optional[float] = 0.0) -> None:
    super(QAMultiChoiceMaxAttention, self).__init__(vocab)

    self._use_cuda = (torch.cuda.is_available() and torch.cuda.current_device() >= 0)

    self._text_field_embedder = text_field_embedder
    if embeddings_dropout_value > 0.0:
        self._embeddings_dropout = torch.nn.Dropout(p=embeddings_dropout_value)
    else:
        self._embeddings_dropout = lambda x: x

    self._question_encoder = question_encoder

    # choices encoding
    self._choice_encoder = choice_encoder

    self._question_aggregate = aggregate_question
    self._choice_aggregate = aggregate_choice

    self._num_labels = vocab.get_vocab_size(namespace="labels")

    question_output_dim = self._text_field_embedder.get_output_dim()
    if self._question_encoder is not None:
        question_output_dim = self._question_encoder.get_output_dim()

    choice_output_dim = self._text_field_embedder.get_output_dim()
    if self._choice_encoder is not None:
        choice_output_dim = self._choice_encoder.get_output_dim()

    if question_output_dim != choice_output_dim:
        raise ConfigurationError("Output dimension of the question_encoder (dim: {}) "
                                 "and choice_encoder (dim: {}) "
                                 "must match!".format(question_output_dim,
                                                      choice_output_dim))

    # Check input tensor dimensions for the question-to-choices attention (similarity function)
    if hasattr(att_question_to_choice, "tensor_1_dim"):
        tensor_1_dim = att_question_to_choice.tensor_1_dim
        if tensor_1_dim != question_output_dim:
            raise ConfigurationError("Output dimension of the question_encoder (dim: {}) "
                                     "and tensor_1_dim (dim: {}) of att_question_to_choice "
                                     "must match!".format(question_output_dim,
                                                          tensor_1_dim))

    if hasattr(att_question_to_choice, "tensor_2_dim"):
        tensor_2_dim = att_question_to_choice.tensor_2_dim
        if tensor_2_dim != question_output_dim:
            raise ConfigurationError("Output dimension of the choice_encoder (dim: {}) "
                                     "and tensor_2_dim (dim: {}) of att_question_to_choice "
                                     "must match!".format(choice_output_dim,
                                                          tensor_2_dim))

    self._matrix_attention_question_to_choice = LegacyMatrixAttention(att_question_to_choice)

    self._accuracy = CategoricalAccuracy()
    self._loss = torch.nn.CrossEntropyLoss()

    initializer(self)
def __init__(self,
             vocab: Vocabulary,
             encoder: Seq2SeqEncoder,
             entity_encoder: Seq2VecEncoder,
             decoder_beam_search: BeamSearch,
             question_embedder: TextFieldEmbedder,
             schema_embedder: TextFieldEmbedder,
             input_attention: Attention,
             past_attention: Attention,
             max_decoding_steps: int,
             action_embedding_dim: int,
             gnn: bool = True,
             decoder_use_graph_entities: bool = True,
             decoder_self_attend: bool = True,
             gnn_timesteps: int = 2,
             parse_sql_on_decoding: bool = True,
             add_action_bias: bool = True,
             use_neighbor_similarity_for_linking: bool = True,
             dataset_path: str = 'dataset',
             training_beam_size: int = None,
             decoder_num_layers: int = 1,
             dropout: float = 0.0,
             rule_namespace: str = 'rule_labels',
             scoring_dev_params: dict = None,
             debug_parsing: bool = False) -> None:
    super().__init__(vocab)
    self.vocab = vocab
    self._encoder = encoder
    self._max_decoding_steps = max_decoding_steps
    if dropout > 0:
        self._dropout = torch.nn.Dropout(p=dropout)
    else:
        self._dropout = lambda x: x
    self._rule_namespace = rule_namespace
    self._question_embedder = question_embedder
    self._schema_embedder = schema_embedder
    self._add_action_bias = add_action_bias
    self._scoring_dev_params = scoring_dev_params or {}
    self.parse_sql_on_decoding = parse_sql_on_decoding
    self._entity_encoder = TimeDistributed(entity_encoder)
    self._use_neighbor_similarity_for_linking = use_neighbor_similarity_for_linking
    self._self_attend = decoder_self_attend
    self._decoder_use_graph_entities = decoder_use_graph_entities
    self._action_padding_index = -1  # the padding value used by IndexField

    self._exact_match = Average()
    self._sql_evaluator_match = Average()
    self._action_similarity = Average()
    self._acc_single = Average()
    self._acc_multi = Average()
    self._beam_hit = Average()

    self._action_embedding_dim = action_embedding_dim

    num_actions = vocab.get_vocab_size(self._rule_namespace)
    if self._add_action_bias:
        input_action_dim = action_embedding_dim + 1
    else:
        input_action_dim = action_embedding_dim
    self._action_embedder = Embedding(num_embeddings=num_actions, embedding_dim=input_action_dim)
    self._output_action_embedder = Embedding(num_embeddings=num_actions, embedding_dim=action_embedding_dim)

    encoder_output_dim = encoder.get_output_dim()
    if gnn:
        encoder_output_dim += action_embedding_dim

    self._first_action_embedding = torch.nn.Parameter(torch.FloatTensor(action_embedding_dim))
    self._first_attended_utterance = torch.nn.Parameter(torch.FloatTensor(encoder_output_dim))
    self._first_attended_output = torch.nn.Parameter(torch.FloatTensor(action_embedding_dim))
    torch.nn.init.normal_(self._first_action_embedding)
    torch.nn.init.normal_(self._first_attended_utterance)
    torch.nn.init.normal_(self._first_attended_output)

    self._num_entity_types = 9
    self._embedding_dim = question_embedder.get_output_dim()

    self._entity_type_encoder_embedding = Embedding(self._num_entity_types, self._embedding_dim)
    self._entity_type_decoder_embedding = Embedding(self._num_entity_types, action_embedding_dim)

    self._linking_params = torch.nn.Linear(16, 1)
    torch.nn.init.uniform_(self._linking_params.weight, 0, 1)

    num_edge_types = 3
    self._gnn = GatedGraphConv(self._embedding_dim, gnn_timesteps,
                               num_edge_types=num_edge_types, dropout=dropout)

    self._decoder_num_layers = decoder_num_layers

    self._beam_search = decoder_beam_search
    self._decoder_trainer = MaximumMarginalLikelihood(training_beam_size)

    if decoder_self_attend:
        self._transition_function = AttendPastSchemaItemsTransitionFunction(encoder_output_dim=encoder_output_dim,
                                                                            action_embedding_dim=action_embedding_dim,
                                                                            input_attention=input_attention,
past_attention=past_attention, predict_start_type_separately=False, add_action_bias=self._add_action_bias, dropout=dropout, num_layers=self._decoder_num_layers) else: self._transition_function = LinkingTransitionFunction(encoder_output_dim=encoder_output_dim, action_embedding_dim=action_embedding_dim, input_attention=input_attention, predict_start_type_separately=False, add_action_bias=self._add_action_bias, dropout=dropout, num_layers=self._decoder_num_layers) self._ent2ent_ff = FeedForward(action_embedding_dim, 1, action_embedding_dim, Activation.by_name('relu')()) self._neighbor_params = torch.nn.Linear(self._embedding_dim, self._embedding_dim) # TODO: Remove hard-coded dirs self._evaluate_func = partial(evaluate, db_dir=os.path.join(dataset_path, 'database'), table=os.path.join(dataset_path, 'tables.json'), check_valid=False) self.debug_parsing = debug_parsing
def from_params(cls, vocab: Vocabulary, params: Params) -> 'Embedding':  # type: ignore
    """
    We need the vocabulary here to know how many items we need to embed, and we look for a
    ``vocab_namespace`` key in the parameter dictionary to know which vocabulary to use.
    If you know beforehand exactly how many embeddings you need, or aren't using a vocabulary
    mapping for the things getting embedded here, then you can pass in the ``num_embeddings``
    key directly, and the vocabulary will be ignored.

    In the configuration file, a file containing pretrained embeddings can be specified
    using the parameter ``"pretrained_file"``.
    It can be the path to a local file or a URL of a (cached) remote file.
    Two formats are supported:

        * hdf5 file - containing an embedding matrix in the form of a torch.Tensor;

        * text file - a utf-8 encoded text file with space separated fields::

                [word] [dim 1] [dim 2] ...

          The text file may optionally be compressed with gzip, bz2, lzma or zip.
          You can even select a single file inside an archive containing multiple files
          using the URI::

                "(archive_uri)#file_path_inside_the_archive"

          where ``archive_uri`` can be a file system path or a URL. For example::

                "(http://nlp.stanford.edu/data/glove.twitter.27B.zip)#glove.twitter.27B.200d.txt"
    """
    # pylint: disable=arguments-differ
    num_embeddings = params.pop_int('num_embeddings', None)
    vocab_namespace = params.pop("vocab_namespace", "tokens")
    if num_embeddings is None:
        num_embeddings = vocab.get_vocab_size(vocab_namespace)
    embedding_dim = params.pop_int('embedding_dim')
    pretrained_file = params.pop("pretrained_file", None)
    projection_dim = params.pop_int("projection_dim", None)
    trainable = params.pop_bool("trainable", True)
    padding_index = params.pop_int('padding_index', None)
    max_norm = params.pop_float('max_norm', None)
    norm_type = params.pop_float('norm_type', 2.)
    scale_grad_by_freq = params.pop_bool('scale_grad_by_freq', False)
    sparse = params.pop_bool('sparse', False)
    params.assert_empty(cls.__name__)

    if pretrained_file:
        # If we're loading a saved model, we don't want to actually read a pre-trained
        # embedding file - the embeddings will just be in our saved weights, and we might not
        # have the original embedding file anymore, anyway.
        weight = _read_pretrained_embeddings_file(pretrained_file,
                                                  embedding_dim,
                                                  vocab,
                                                  vocab_namespace)
    else:
        weight = None

    return cls(num_embeddings=num_embeddings,
               embedding_dim=embedding_dim,
               projection_dim=projection_dim,
               weight=weight,
               padding_index=padding_index,
               trainable=trainable,
               max_norm=max_norm,
               norm_type=norm_type,
               scale_grad_by_freq=scale_grad_by_freq,
               sparse=sparse)
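# Illustrative parameter dicts only (the local path is a placeholder; the URL is
# the one from the docstring above): a plain compressed text file vs. a single
# member of a multi-file archive selected with the "(uri)#path" syntax.
params_local = Params({
        'pretrained_file': '/path/to/glove.6B.100d.txt.gz',
        'embedding_dim': 100,
})
params_archive = Params({
        'pretrained_file': '(http://nlp.stanford.edu/data/glove.twitter.27B.zip)#glove.twitter.27B.200d.txt',
        'embedding_dim': 200,
})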
def _read_embeddings_from_text_file(file_uri: str,
                                    embedding_dim: int,
                                    vocab: Vocabulary,
                                    namespace: str = "tokens") -> torch.FloatTensor:
    """
    Read pre-trained word vectors from a text file that may be compressed and may be
    contained inside an archive with multiple files. The text file is assumed to be utf-8
    encoded with space-separated fields: [word] [dim 1] [dim 2] ...

    Lines whose number of numerical fields does not match `embedding_dim` trigger a warning
    and are skipped.

    The remainder of the docstring is identical to `_read_pretrained_embeddings_file`.
    """
    tokens_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading pretrained embeddings from file")
    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            token = line.split(" ", 1)[0]
            if token in tokens_to_keep:
                fields = line.rstrip().split(" ")
                if len(fields) - 1 != embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # field lengths (e.g., a word with a unicode space character that splits
                    # into more than one column). We skip those lines. Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped. It's hard to check for that here; you just have to look in the
                    # embedding_misses_file and at the model summary to make sure things look
                    # like they are supposed to.
                    logger.warning(
                        "Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                        embedding_dim,
                        len(fields) - 1,
                        line,
                    )
                    continue

                vector = numpy.asarray(fields[1:], dtype="float32")
                embeddings[token] = vector

    if not embeddings:
        raise ConfigurationError(
            "No embeddings of correct dimension found; you probably "
            "misspecified your embedding_dim parameter, or didn't "
            "pre-populate your Vocabulary"
        )

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(
        embeddings_mean, embeddings_std)
    num_tokens_found = 0
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    for i in range(vocab_size):
        token = index_to_token[i]

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
            num_tokens_found += 1
        else:
            logger.debug("Token %s was not found in the embedding file. Initialising randomly.",
                         token)

    logger.info("Pretrained embeddings were found for %d out of %d tokens",
                num_tokens_found, vocab_size)

    return embedding_matrix
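# A small sketch of writing a gzip-compressed file in the expected
# "[word] [dim 1] ... [dim N]" format; it mirrors the fixture written in
# test_get_embedding_layer_uses_correct_embedding_dim above, with a
# placeholder filename.
import gzip

with gzip.open('embeddings.3d.txt.gz', 'wb') as fout:
    fout.write('word1 1.0 2.3 -1.0\n'.encode('utf-8'))
    fout.write('word2 0.1 0.4 -4.0\n'.encode('utf-8'))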
def _load(
    cls, config: Params, serialization_dir: str, weights_file: str = None, cuda_device: int = -1
) -> "Model":
    """
    Instantiates an already-trained model, based on the experiment
    configuration and some optional overrides.
    """
    weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

    # Load vocabulary from file
    vocab_dir = os.path.join(serialization_dir, "vocabulary")
    # If the config specifies a vocabulary subclass, we need to use it.
    vocab_params = config.get("vocabulary", Params({}))
    vocab_choice = vocab_params.pop_choice("type", Vocabulary.list_available(), True)
    vocab_class, _ = Vocabulary.resolve_class_name(vocab_choice)
    vocab = vocab_class.from_files(
        vocab_dir, vocab_params.get("padding_token"), vocab_params.get("oov_token")
    )

    model_params = config.get("model")

    training_params = config.get("trainer", Params({}))
    opt_level = training_params.get("opt_level")

    # The experiment config tells us how to _train_ a model, including where to get pre-trained
    # embeddings from. We're now _loading_ the model, so those embeddings will already be
    # stored in our weights. We don't need any pretrained weight file anymore, and we don't
    # want the code to look for it, so we remove it from the parameters here.
    remove_pretrained_embedding_params(model_params)
    model = Model.from_params(vocab=vocab, params=model_params)

    # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
    # in sync with the weights
    if cuda_device >= 0:
        model.cuda(cuda_device)
    else:
        model.cpu()

    # If the model was trained with amp and amp is available, we should re-initialize it with
    # the opt_level that was used. If the model was trained with amp but amp is not available,
    # log a warning so this doesn't pass silently.
    if opt_level is not None:
        if amp is None:
            logger.warning(
                (
                    f"This model was trained with amp (opt_level: {opt_level}) but amp is not available."
                    " Any further training or inference will happen at full-precision."
                )
            )
        else:
            model = amp.initialize(model, opt_level=opt_level)

    # If vocab+embedding extension was done, the model initialized from `from_params`
    # and the one defined by the state dict in weights_file might not have the same
    # embedding shapes. E.g., when an embedder module was transferred along with vocab
    # extension, the initialized embedding weight shape would be smaller than the one in
    # the state_dict. So calling model embedding extension is required before
    # load_state_dict. If vocab and model embeddings are in sync, the following is just
    # a no-op.
    model.extend_embedder_vocab()

    model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
    model.load_state_dict(model_state)

    return model
class TestTextField(AllenNlpTestCase): def setUp(self): self.vocab = Vocabulary() self.vocab.add_token_to_namespace(u"sentence", namespace=u'words') self.vocab.add_token_to_namespace(u"A", namespace=u'words') self.vocab.add_token_to_namespace(u"A", namespace=u'characters') self.vocab.add_token_to_namespace(u"s", namespace=u'characters') self.vocab.add_token_to_namespace(u"e", namespace=u'characters') self.vocab.add_token_to_namespace(u"n", namespace=u'characters') self.vocab.add_token_to_namespace(u"t", namespace=u'characters') self.vocab.add_token_to_namespace(u"c", namespace=u'characters') super(TestTextField, self).setUp() def test_field_counts_vocab_items_correctly(self): field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]], token_indexers={u"words": SingleIdTokenIndexer(u"words")}) namespace_token_counts = defaultdict(lambda: defaultdict(int)) field.count_vocab_items(namespace_token_counts) assert namespace_token_counts[u"words"][u"This"] == 1 assert namespace_token_counts[u"words"][u"is"] == 1 assert namespace_token_counts[u"words"][u"a"] == 1 assert namespace_token_counts[u"words"][u"sentence"] == 1 assert namespace_token_counts[u"words"][u"."] == 1 assert list(namespace_token_counts.keys()) == [u"words"] field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]], token_indexers={u"characters": TokenCharactersIndexer(u"characters")}) namespace_token_counts = defaultdict(lambda: defaultdict(int)) field.count_vocab_items(namespace_token_counts) assert namespace_token_counts[u"characters"][u"T"] == 1 assert namespace_token_counts[u"characters"][u"h"] == 1 assert namespace_token_counts[u"characters"][u"i"] == 2 assert namespace_token_counts[u"characters"][u"s"] == 3 assert namespace_token_counts[u"characters"][u"a"] == 1 assert namespace_token_counts[u"characters"][u"e"] == 3 assert namespace_token_counts[u"characters"][u"n"] == 2 assert namespace_token_counts[u"characters"][u"t"] == 1 assert namespace_token_counts[u"characters"][u"c"] == 1 assert namespace_token_counts[u"characters"][u"."] == 1 assert list(namespace_token_counts.keys()) == [u"characters"] field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]], token_indexers={u"words": SingleIdTokenIndexer(u"words"), u"characters": TokenCharactersIndexer(u"characters")}) namespace_token_counts = defaultdict(lambda: defaultdict(int)) field.count_vocab_items(namespace_token_counts) assert namespace_token_counts[u"characters"][u"T"] == 1 assert namespace_token_counts[u"characters"][u"h"] == 1 assert namespace_token_counts[u"characters"][u"i"] == 2 assert namespace_token_counts[u"characters"][u"s"] == 3 assert namespace_token_counts[u"characters"][u"a"] == 1 assert namespace_token_counts[u"characters"][u"e"] == 3 assert namespace_token_counts[u"characters"][u"n"] == 2 assert namespace_token_counts[u"characters"][u"t"] == 1 assert namespace_token_counts[u"characters"][u"c"] == 1 assert namespace_token_counts[u"characters"][u"."] == 1 assert namespace_token_counts[u"words"][u"This"] == 1 assert namespace_token_counts[u"words"][u"is"] == 1 assert namespace_token_counts[u"words"][u"a"] == 1 assert namespace_token_counts[u"words"][u"sentence"] == 1 assert namespace_token_counts[u"words"][u"."] == 1 assert set(namespace_token_counts.keys()) == set([u"words", u"characters"]) def test_index_converts_field_correctly(self): vocab = Vocabulary() sentence_index = vocab.add_token_to_namespace(u"sentence", namespace=u'words') capital_a_index = vocab.add_token_to_namespace(u"A", 
namespace=u'words') capital_a_char_index = vocab.add_token_to_namespace(u"A", namespace=u'characters') s_index = vocab.add_token_to_namespace(u"s", namespace=u'characters') e_index = vocab.add_token_to_namespace(u"e", namespace=u'characters') n_index = vocab.add_token_to_namespace(u"n", namespace=u'characters') t_index = vocab.add_token_to_namespace(u"t", namespace=u'characters') c_index = vocab.add_token_to_namespace(u"c", namespace=u'characters') field = TextField([Token(t) for t in [u"A", u"sentence"]], {u"words": SingleIdTokenIndexer(namespace=u"words")}) field.index(vocab) # pylint: disable=protected-access assert field._indexed_tokens[u"words"] == [capital_a_index, sentence_index] field1 = TextField([Token(t) for t in [u"A", u"sentence"]], {u"characters": TokenCharactersIndexer(namespace=u"characters")}) field1.index(vocab) assert field1._indexed_tokens[u"characters"] == [[capital_a_char_index], [s_index, e_index, n_index, t_index, e_index, n_index, c_index, e_index]] field2 = TextField([Token(t) for t in [u"A", u"sentence"]], token_indexers={u"words": SingleIdTokenIndexer(namespace=u"words"), u"characters": TokenCharactersIndexer(namespace=u"characters")}) field2.index(vocab) assert field2._indexed_tokens[u"words"] == [capital_a_index, sentence_index] assert field2._indexed_tokens[u"characters"] == [[capital_a_char_index], [s_index, e_index, n_index, t_index, e_index, n_index, c_index, e_index]] # pylint: enable=protected-access def test_get_padding_lengths_raises_if_no_indexed_tokens(self): field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]], token_indexers={u"words": SingleIdTokenIndexer(u"words")}) with pytest.raises(ConfigurationError): field.get_padding_lengths() def test_padding_lengths_are_computed_correctly(self): field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]], token_indexers={u"words": SingleIdTokenIndexer(u"words")}) field.index(self.vocab) padding_lengths = field.get_padding_lengths() assert padding_lengths == {u"num_tokens": 5} field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]], token_indexers={u"characters": TokenCharactersIndexer(u"characters")}) field.index(self.vocab) padding_lengths = field.get_padding_lengths() assert padding_lengths == {u"num_tokens": 5, u"num_token_characters": 8} field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]], token_indexers={u"characters": TokenCharactersIndexer(u"characters"), u"words": SingleIdTokenIndexer(u"words")}) field.index(self.vocab) padding_lengths = field.get_padding_lengths() assert padding_lengths == {u"num_tokens": 5, u"num_token_characters": 8} def test_as_tensor_handles_words(self): field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]], token_indexers={u"words": SingleIdTokenIndexer(u"words")}) field.index(self.vocab) padding_lengths = field.get_padding_lengths() tensor_dict = field.as_tensor(padding_lengths) numpy.testing.assert_array_almost_equal(tensor_dict[u"words"].detach().cpu().numpy(), numpy.array([1, 1, 1, 2, 1])) def test_as_tensor_handles_longer_lengths(self): field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]], token_indexers={u"words": SingleIdTokenIndexer(u"words")}) field.index(self.vocab) padding_lengths = field.get_padding_lengths() padding_lengths[u"num_tokens"] = 10 tensor_dict = field.as_tensor(padding_lengths) numpy.testing.assert_array_almost_equal(tensor_dict[u"words"].detach().cpu().numpy(), numpy.array([1, 1, 1, 2, 1, 0, 0, 0, 0, 
0])) def test_as_tensor_handles_characters(self): field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]], token_indexers={u"characters": TokenCharactersIndexer(u"characters")}) field.index(self.vocab) padding_lengths = field.get_padding_lengths() tensor_dict = field.as_tensor(padding_lengths) expected_character_array = numpy.array([[1, 1, 1, 3, 0, 0, 0, 0], [1, 3, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0], [3, 4, 5, 6, 4, 5, 7, 4], [1, 0, 0, 0, 0, 0, 0, 0]]) numpy.testing.assert_array_almost_equal(tensor_dict[u"characters"].detach().cpu().numpy(), expected_character_array) def test_as_tensor_handles_words_and_characters_with_longer_lengths(self): field = TextField([Token(t) for t in [u"a", u"sentence", u"."]], token_indexers={u"words": SingleIdTokenIndexer(u"words"), u"characters": TokenCharactersIndexer(u"characters")}) field.index(self.vocab) padding_lengths = field.get_padding_lengths() padding_lengths[u"num_tokens"] = 5 padding_lengths[u"num_token_characters"] = 10 tensor_dict = field.as_tensor(padding_lengths) numpy.testing.assert_array_almost_equal(tensor_dict[u"words"].detach().cpu().numpy(), numpy.array([1, 2, 1, 0, 0])) numpy.testing.assert_array_almost_equal(tensor_dict[u"characters"].detach().cpu().numpy(), numpy.array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [3, 4, 5, 6, 4, 5, 7, 4, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])) def test_printing_doesnt_crash(self): field = TextField([Token(t) for t in [u"A", u"sentence"]], {u"words": SingleIdTokenIndexer(namespace=u"words")}) print(field) def test_token_embedder_returns_dict(self): field = TextField([Token(t) for t in [u"A", u"sentence"]], token_indexers={u"field_with_dict": DictReturningTokenIndexer(), u"words": SingleIdTokenIndexer(u"words"), u"characters": TokenCharactersIndexer(u"characters")}) field.index(self.vocab) padding_lengths = field.get_padding_lengths() assert padding_lengths == { u'token_ids': 5, u'additional_key': 2, u'words': 2, u'characters': 2, u'num_token_characters': 8 } padding_lengths[u'token_ids'] = 7 padding_lengths[u'additional_key'] = 3 padding_lengths[u'words'] = 4 padding_lengths[u'characters'] = 4 tensors = field.as_tensor(padding_lengths) assert list(tensors[u'token_ids'].shape) == [7] assert list(tensors[u'additional_key'].shape) == [3] assert list(tensors[u'words'].shape) == [4] assert list(tensors[u'characters'].shape) == [4, 8]
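# The tests above exercise the full TextField pipeline. For reference, a
# condensed standalone sketch of the same index -> pad -> tensorize flow,
# assuming the pre-1.0 AllenNLP API used throughout these tests:
from allennlp.data import Token, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

vocab = Vocabulary()
for word in ["a", "sentence"]:
    vocab.add_token_to_namespace(word, namespace="words")

field = TextField([Token(t) for t in ["a", "sentence"]],
                  {"words": SingleIdTokenIndexer(namespace="words")})
field.index(vocab)                     # must be called before padding
lengths = field.get_padding_lengths()  # e.g. {"num_tokens": 2}
tensors = field.as_tensor(lengths)     # {"words": LongTensor of shape (2,)}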
## Tune these parameters to avoid running out of memory on a small server
cf_a.datareader_lazy = True  # Force laziness for RAM optimization
cf_a.batch_size_train = 30
cf_a.batch_size_validation = 30
cf_a.force_free_batch_memory = False
max_instances_in_memory = 1000
print_conf_params(cf_a)
folder_images += "Eta_" + str(cf_a.eta_KL) + "_DOr_" + str(cf_a.spans_output_dropout) + \
    "_sigma_" + str(round(np.exp(cf_a.VB_span_end_predictor_linear_prior["log_sigma1"]), 3))
"""
##################################################################
############ INSTANTIATE DATAREADER AND LOAD DATASET ############
##################################################################
"""
vocab = Vocabulary()
"""
########################################################
################# INSTANTIATE THE MODEL ###################
"""
if Experiments_instantiate_model:
    print("Initializing model architecture")
    model = BidirectionalAttentionFlow_1(vocab, cf_a)
    print("Loading previous model")
    model.load_state_dict(torch.load(model_file_path))
    # model.trim_model(4)

def plots_weights_layer(mu_W, sigma_W,
# coding=utf-8
# @Author: 莫冉
# @Date: 2020-08-08
"""
Test loading an AllenNLP vocabulary from files.
"""
from pathlib import Path
from allennlp.data import Vocabulary

basename = "/home/zs261988/"
save_path = "data/vocab/bert_vocabulary"
vocab_file = "models/ptms/albert_void_tiny/vocab.txt"

# vocab = Vocabulary(padding_token="[PAD]", oov_token="[UNK]")
#
# # Load the BERT vocab
# vocab.set_from_file(Path(basename) / vocab_file, oov_token="[UNK]")
#
# vocab.save_to_files(Path(basename) / save_path)

# Load the previously saved vocabulary
vocab = Vocabulary.from_files(Path(basename) / save_path,
                              padding_token="[PAD]",
                              oov_token="[UNK]")
print("oov_token: ", vocab._oov_token, vocab.get_token_index(vocab._oov_token))
print("padding_token: ", vocab._padding_token, vocab.get_token_index(vocab._padding_token))
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             question_encoder: Optional[Seq2SeqEncoder] = None,
             choice_encoder: Optional[Seq2SeqEncoder] = None,
             initializer: InitializerApplicator = InitializerApplicator(),
             aggregate_question: Optional[str] = "max",
             aggregate_choice: Optional[str] = "max",
             embeddings_dropout_value: Optional[float] = 0.0,
             share_encoders: Optional[bool] = False,
             choices_init_from_question_states: Optional[bool] = False,
             use_choice_sum_instead_of_question: Optional[bool] = False,
             params=Params) -> None:
    super(QAMultiChoice_OneVsRest_Choices_v1, self).__init__(vocab)

    # TODO: AllenNLP does not support stateful RNNs yet.
    init_is_supported = False
    if not init_is_supported and choices_init_from_question_states:
        raise ValueError(
            "choices_init_from_question_states=True or "
            "facts_init_from_question_states=True are not supported yet!")
    else:
        self._choices_init_from_question_states = choices_init_from_question_states

    self._use_cuda = (torch.cuda.is_available()
                      and torch.cuda.current_device() >= 0)

    self._return_question_to_choices_att = False
    self._use_choice_sum_instead_of_question = use_choice_sum_instead_of_question

    self._params = params

    self._text_field_embedder = text_field_embedder
    if embeddings_dropout_value > 0.0:
        self._embeddings_dropout = torch.nn.Dropout(p=embeddings_dropout_value)
    else:
        self._embeddings_dropout = lambda x: x

    self._question_encoder = question_encoder

    # Choices encoding
    self._choice_encoder = choice_encoder

    self._question_aggregate = aggregate_question
    self._choice_aggregate = aggregate_choice

    self._num_labels = vocab.get_vocab_size(namespace="labels")

    question_output_dim = self._text_field_embedder.get_output_dim()
    if self._question_encoder is not None:
        question_output_dim = self._question_encoder.get_output_dim()

    choice_output_dim = self._text_field_embedder.get_output_dim()
    if self._choice_encoder is not None:
        choice_output_dim = self._choice_encoder.get_output_dim()

    if question_output_dim != choice_output_dim:
        raise ConfigurationError(
            "Output dimensions of the question_encoder (dim: {}) "
            "and choice_encoder (dim: {}) must match!".format(
                question_output_dim, choice_output_dim))

    # Question-to-choice attention
    att_question_to_choice_params = params.get("att_question_to_choice")
    if "tensor_1_dim" in att_question_to_choice_params:
        att_question_to_choice_params = update_params(
            att_question_to_choice_params, {
                "tensor_1_dim": question_output_dim,
                "tensor_2_dim": choice_output_dim
            })
    self._matrix_attention_question_to_choice = LegacyMatrixAttention(
        SimilarityFunction.from_params(att_question_to_choice_params))

    self._accuracy = CategoricalAccuracy()
    self._loss = torch.nn.CrossEntropyLoss()

    initializer(self)
def setup_method(self): super().setup_method() self.reader = TransformerSquadReader(length_limit=50, stride=10) self.vocab = Vocabulary() self.model = TransformerQA(self.vocab) self.predictor = TransformerQAPredictor(self.model, self.reader)
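# A hedged sketch of exercising the predictor wired up in setup_method; the
# predict(question=..., passage=...) convenience method and the
# "best_span_str" output key are assumptions about the TransformerQA
# predictor's API rather than something verified here.
def test_predictor_runs(self):
    predictions = self.predictor.predict(
        question="Who stars in The Matrix?",
        passage="The Matrix is a 1999 film starring Keanu Reeves.",
    )
    assert "best_span_str" in predictions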
def __init__(self, vocab: Vocabulary, mention_feedforward: FeedForward, relation_feedforward: FeedForward, feature_size: int, spans_per_word: float, span_emb_dim: int, use_biaffine_rel: bool, rel_prop: int = 0, rel_prop_dropout_A: float = 0.0, rel_prop_dropout_f: float = 0.0, initializer: InitializerApplicator = InitializerApplicator(), positive_label_weight: float = 1.0, regularizer: Optional[RegularizerApplicator] = None) -> None: super(RelationExtractor, self).__init__(vocab, regularizer) # Need to hack this for cases where there's no relation data. It breaks Ulme's code. self._n_labels = max(vocab.get_vocab_size("relation_labels"), 1) # Span candidate scorer. # TODO(dwadden) make sure I've got the input dim right on this one. feedforward_scorer = torch.nn.Sequential( TimeDistributed(mention_feedforward), TimeDistributed( torch.nn.Linear(mention_feedforward.get_output_dim(), 1))) self._mention_pruner = Pruner(feedforward_scorer) # Relation scorer. self._use_biaffine_rel = use_biaffine_rel if self._use_biaffine_rel: self._biaffine = torch.nn.Linear(span_emb_dim, span_emb_dim) else: self._relation_feedforward = relation_feedforward self._relation_scorer = torch.nn.Linear( relation_feedforward.get_output_dim(), self._n_labels) self._spans_per_word = spans_per_word # TODO(dwadden) Add code to compute relation F1. # self._candidate_recall = CandidateRecall() self._relation_metrics = RelationMetrics1() class_weights = torch.cat([ torch.tensor([1.0]), positive_label_weight * torch.ones(self._n_labels) ]) self._loss = torch.nn.CrossEntropyLoss(reduction="sum", ignore_index=-1, weight=class_weights) self.rel_prop = rel_prop # Relation Propagation self._A_network = FeedForward(input_dim=self._n_labels, num_layers=1, hidden_dims=span_emb_dim, activations=lambda x: x, dropout=rel_prop_dropout_A) self._f_network = FeedForward(input_dim=2 * span_emb_dim, num_layers=1, hidden_dims=span_emb_dim, activations=torch.nn.Sigmoid(), dropout=rel_prop_dropout_f) initializer(self)
def __init__( self, vocab: Vocabulary, trigger_feedforward: FeedForward, trigger_candidate_feedforward: FeedForward, mention_feedforward: FeedForward, # Used if entity beam is off. argument_feedforward: FeedForward, context_attention: BilinearMatrixAttention, trigger_attention: Seq2SeqEncoder, span_prop: SpanProp, cls_projection: FeedForward, feature_size: int, trigger_spans_per_word: float, argument_spans_per_word: float, loss_weights, trigger_attention_context: bool, event_args_use_trigger_labels: bool, event_args_use_ner_labels: bool, event_args_label_emb: int, shared_attention_context: bool, label_embedding_method: str, event_args_label_predictor: str, event_args_gold_candidates: bool = False, # If True, use gold argument candidates. context_window: int = 0, softmax_correction: bool = False, initializer: InitializerApplicator = InitializerApplicator(), positive_label_weight: float = 1.0, entity_beam: bool = False, regularizer: Optional[RegularizerApplicator] = None) -> None: super(EventExtractor, self).__init__(vocab, regularizer) self._n_ner_labels = vocab.get_vocab_size("ner_labels") self._n_trigger_labels = vocab.get_vocab_size("trigger_labels") self._n_argument_labels = vocab.get_vocab_size("argument_labels") # Embeddings for trigger labels and ner labels, to be used by argument scorer. # These will be either one-hot encodings or learned embeddings, depending on "kind". self._ner_label_emb = make_embedder(kind=label_embedding_method, num_embeddings=self._n_ner_labels, embedding_dim=event_args_label_emb) self._trigger_label_emb = make_embedder( kind=label_embedding_method, num_embeddings=self._n_trigger_labels, embedding_dim=event_args_label_emb) self._label_embedding_method = label_embedding_method # Weight on trigger labeling and argument labeling. self._loss_weights = loss_weights # Trigger candidate scorer. null_label = vocab.get_token_index("", "trigger_labels") assert null_label == 0 # If not, the dummy class won't correspond to the null label. self._trigger_scorer = torch.nn.Sequential( TimeDistributed(trigger_feedforward), TimeDistributed( torch.nn.Linear(trigger_feedforward.get_output_dim(), self._n_trigger_labels - 1))) self._trigger_attention_context = trigger_attention_context if self._trigger_attention_context: self._trigger_attention = trigger_attention # Make pruners. If `entity_beam` is true, use NER and trigger scorers to construct the beam # and only keep candidates that the model predicts are actual entities or triggers. self._mention_pruner = make_pruner( mention_feedforward, entity_beam=entity_beam, gold_beam=event_args_gold_candidates) self._trigger_pruner = make_pruner(trigger_candidate_feedforward, entity_beam=entity_beam, gold_beam=False) # Argument scorer. self._event_args_use_trigger_labels = event_args_use_trigger_labels # If True, use trigger labels. self._event_args_use_ner_labels = event_args_use_ner_labels # If True, use ner labels to predict args. assert event_args_label_predictor in [ "hard", "softmax", "gold" ] # Method for predicting labels at test time. self._event_args_label_predictor = event_args_label_predictor self._event_args_gold_candidates = event_args_gold_candidates # If set to True, then construct a context vector from a bilinear attention over the trigger # / argument pair embeddings and the text. self._context_window = context_window # If greater than 0, concatenate context as features. 
self._argument_feedforward = argument_feedforward self._argument_scorer = torch.nn.Linear( argument_feedforward.get_output_dim(), self._n_argument_labels) # Distance embeddings. self._num_distance_buckets = 10 # Just use 10 which is the default. self._distance_embedding = Embedding(self._num_distance_buckets, feature_size) # Class token projection. self._cls_projection = cls_projection self._cls_n_triggers = torch.nn.Linear( self._cls_projection.get_output_dim(), 5) self._cls_event_types = torch.nn.Linear( self._cls_projection.get_output_dim(), self._n_trigger_labels - 1) self._trigger_spans_per_word = trigger_spans_per_word self._argument_spans_per_word = argument_spans_per_word # Context attention for event argument scorer. self._shared_attention_context = shared_attention_context if self._shared_attention_context: self._shared_attention_context_module = context_attention # Span propagation object. # TODO(dwadden) initialize with `from_params` instead if this ends up working. self._span_prop = span_prop self._span_prop._trig_arg_embedder = self._compute_trig_arg_embeddings self._span_prop._argument_scorer = self._compute_argument_scores # Softmax correction parameters. self._softmax_correction = softmax_correction self._softmax_log_temp = torch.nn.Parameter( torch.zeros([1, 1, 1, self._n_argument_labels])) self._softmax_log_multiplier = torch.nn.Parameter( torch.zeros([1, 1, 1, self._n_argument_labels])) # TODO(dwadden) Add metrics. self._metrics = EventMetrics() self._argument_stats = ArgumentStats() self._trigger_loss = torch.nn.CrossEntropyLoss(reduction="sum") # TODO(dwadden) add loss weights. self._argument_loss = torch.nn.CrossEntropyLoss(reduction="sum", ignore_index=-1) initializer(self)
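# `make_embedder` (used for the label embeddings in the constructor above) is
# not shown in this file; a minimal sketch consistent with the comment that
# labels are "either one-hot encodings or learned embeddings, depending on
# 'kind'". The exact kind strings here are assumptions.
def make_embedder(kind: str, num_embeddings: int, embedding_dim: int):
    if kind == "one_hot":
        # Fixed one-hot rows; the effective embedding dim is num_embeddings.
        return torch.nn.Embedding.from_pretrained(torch.eye(num_embeddings),
                                                  freeze=True)
    elif kind == "learned":
        return torch.nn.Embedding(num_embeddings, embedding_dim)
    else:
        raise ValueError(f"Unknown embedder kind: {kind}")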
def run_evaluation(evaluation_file, model_archive_file, is_wordnet_and_wiki=False): archive = load_archive(model_archive_file) params = archive.config vocab = Vocabulary.from_params(params.pop('vocabulary')) model = archive.model #model.cuda() model.eval() if is_wordnet_and_wiki: reader_params = Params({ "type": "aida_wiki_linking", "entity_disambiguation_only": False, "entity_indexer": { "type": "characters_tokenizer", "namespace": "entity_wiki", "tokenizer": { "type": "word", "word_splitter": { "type": "just_spaces" } } }, "extra_candidate_generators": { "wordnet": { "type": "wordnet_mention_generator", "entity_file": "s3://allennlp/knowbert/wordnet/entities.jsonl" } }, "should_remap_span_indices": True, "token_indexers": { "tokens": { "type": "bert-pretrained", "do_lowercase": True, "max_pieces": 512, "pretrained_model": "bert-base-uncased", "use_starting_offsets": True, } } }) else: reader_params = Params({ "type": "aida_wiki_linking", "entity_disambiguation_only": False, "token_indexers": { "tokens": { "type": "bert-pretrained", "pretrained_model": "bert-base-uncased", "do_lowercase": True, "use_starting_offsets": True, "max_pieces": 512, }, }, "entity_indexer": { "type": "characters_tokenizer", "tokenizer": { "type": "word", "word_splitter": { "type": "just_spaces" }, }, "namespace": "entity", }, "should_remap_span_indices": True, }) if is_wordnet_and_wiki: cg_params = Params({ "type": "bert_tokenizer_and_candidate_generator", "bert_model_type": "bert-base-uncased", "do_lower_case": True, "entity_candidate_generators": { "wordnet": { "type": "wordnet_mention_generator", "entity_file": "s3://allennlp/knowbert/wordnet/entities.jsonl" } }, "entity_indexers": { "wordnet": { "type": "characters_tokenizer", "namespace": "entity_wordnet", "tokenizer": { "type": "word", "word_splitter": { "type": "just_spaces" } } } } }) candidate_generator = TokenizerAndCandidateGenerator.from_params( cg_params) reader = DatasetReader.from_params(Params(reader_params)) iterator = DataIterator.from_params( Params({ "type": "basic", "batch_size": 16 })) iterator.index_with(vocab) instances = reader.read(evaluation_file) for batch_no, batch in enumerate( iterator(instances, shuffle=False, num_epochs=1)): b = move_to_device(batch, -1) b['candidates'] = { 'wiki': { 'candidate_entities': b.pop('candidate_entities'), 'candidate_entity_priors': b.pop('candidate_entity_prior'), 'candidate_segment_ids': b.pop('candidate_segment_ids'), 'candidate_spans': b.pop('candidate_spans') } } gold_entities = b.pop('gold_entities') b['gold_entities'] = {'wiki': gold_entities} if is_wordnet_and_wiki: extra_candidates = b.pop('extra_candidates') seq_len = b['tokens']['tokens'].shape[1] bbb = [] for e in extra_candidates: for k in e.keys(): e[k]['candidate_segment_ids'] = [0] * len( e[k]['candidate_spans']) ee = { 'tokens': ['[CLS]'] * seq_len, 'segment_ids': [0] * seq_len, 'candidates': e } ee_fields = candidate_generator.convert_tokens_candidates_to_fields( ee) bbb.append(Instance(ee_fields)) eb = Batch(bbb) eb.index_instances(vocab) padding_lengths = eb.get_padding_lengths() tensor_dict = eb.as_tensor_dict(padding_lengths) b['candidates'].update(tensor_dict['candidates']) bb = move_to_device(b, -1) else: bb = b loss = model(**bb) if batch_no % 100 == 0: print(model.get_metrics()) print(model.get_metrics())
class TestBasicTextFieldEmbedder(AllenNlpTestCase): def setUp(self): super(TestBasicTextFieldEmbedder, self).setUp() self.vocab = Vocabulary() self.vocab.add_token_to_namespace("1") self.vocab.add_token_to_namespace("2") self.vocab.add_token_to_namespace("3") self.vocab.add_token_to_namespace("4") params = Params({ "token_embedders": { "words1": { "type": "embedding", "embedding_dim": 2 }, "words2": { "type": "embedding", "embedding_dim": 5 }, "words3": { "type": "embedding", "embedding_dim": 3 } } }) self.token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab, params=params) self.inputs = { "words1": torch.LongTensor([[0, 2, 3, 5]]), "words2": torch.LongTensor([[1, 4, 3, 2]]), "words3": torch.LongTensor([[1, 5, 1, 2]]) } def test_get_output_dim_aggregates_dimension_from_each_embedding(self): assert self.token_embedder.get_output_dim() == 10 def test_forward_asserts_input_field_match(self): # Total mismatch self.inputs['words4'] = self.inputs['words3'] del self.inputs['words3'] with pytest.raises(ConfigurationError) as exc: self.token_embedder(self.inputs) assert exc.match("Mismatched token keys") self.inputs['words3'] = self.inputs['words4'] # Text field has too many inputs with pytest.raises(ConfigurationError) as exc: self.token_embedder(self.inputs) assert exc.match("is generating more keys") del self.inputs['words4'] def test_forward_concats_resultant_embeddings(self): assert self.token_embedder(self.inputs).size() == (1, 4, 10) def test_forward_works_on_higher_order_input(self): params = Params({ "token_embedders": { "words": { "type": "embedding", "num_embeddings": 20, "embedding_dim": 2, }, "characters": { "type": "character_encoding", "embedding": { "embedding_dim": 4, "num_embeddings": 15, }, "encoder": { "type": "cnn", "embedding_dim": 4, "num_filters": 10, "ngram_filter_sizes": [3], }, } } }) token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab, params=params) inputs = { 'words': (torch.rand(3, 4, 5, 6) * 20).long(), 'characters': (torch.rand(3, 4, 5, 6, 7) * 15).long(), } assert token_embedder(inputs, num_wrapping_dims=2).size() == (3, 4, 5, 6, 12) def test_forward_runs_with_non_bijective_mapping(self): elmo_fixtures_path = self.FIXTURES_ROOT / 'elmo' options_file = str(elmo_fixtures_path / 'options.json') weight_file = str(elmo_fixtures_path / 'lm_weights.hdf5') params = Params({ "token_embedders": { "words": { "type": "embedding", "num_embeddings": 20, "embedding_dim": 2, }, "elmo": { "type": "elmo_token_embedder", "options_file": options_file, "weight_file": weight_file }, }, "embedder_to_indexer_map": {"words": ["words"], "elmo": ["elmo", "words"]} }) token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params) inputs = { 'words': (torch.rand(3, 6) * 20).long(), 'elmo': (torch.rand(3, 6, 50) * 15).long(), } token_embedder(inputs) def test_forward_runs_with_non_bijective_mapping_with_null(self): elmo_fixtures_path = self.FIXTURES_ROOT / 'elmo' options_file = str(elmo_fixtures_path / 'options.json') weight_file = str(elmo_fixtures_path / 'lm_weights.hdf5') params = Params({ "token_embedders": { "elmo": { "type": "elmo_token_embedder", "options_file": options_file, "weight_file": weight_file }, }, "embedder_to_indexer_map": { # ignore `word_inputs` in `ElmoTokenEmbedder.forward` "elmo": ["elmo", None] } }) token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params) inputs = { 'elmo': (torch.rand(3, 6, 50) * 15).long(), } token_embedder(inputs) def test_forward_runs_with_non_bijective_mapping_with_dict(self): 
elmo_fixtures_path = self.FIXTURES_ROOT / 'elmo' options_file = str(elmo_fixtures_path / 'options.json') weight_file = str(elmo_fixtures_path / 'lm_weights.hdf5') params = Params({ "token_embedders": { "words": { "type": "embedding", "num_embeddings": 20, "embedding_dim": 2, }, "elmo": { "type": "elmo_token_embedder", "options_file": options_file, "weight_file": weight_file }, }, "embedder_to_indexer_map": { # pass arguments to `ElmoTokenEmbedder.forward` by dict "elmo": { "inputs": "elmo", "word_inputs": "words" }, "words": ["words"] } }) token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params) inputs = { 'words': (torch.rand(3, 6) * 20).long(), 'elmo': (torch.rand(3, 6, 50) * 15).long(), } token_embedder(inputs) def test_old_from_params_new_from_params(self): old_params = Params({ "words1": { "type": "embedding", "embedding_dim": 2 }, "words2": { "type": "embedding", "embedding_dim": 5 }, "words3": { "type": "embedding", "embedding_dim": 3 } }) # Allow loading the parameters in the old format with pytest.warns(DeprecationWarning): old_embedder = BasicTextFieldEmbedder.from_params(params=old_params, vocab=self.vocab) new_params = Params({ "token_embedders": { "words1": { "type": "embedding", "embedding_dim": 2 }, "words2": { "type": "embedding", "embedding_dim": 5 }, "words3": { "type": "embedding", "embedding_dim": 3 } } }) # But also allow loading the parameters in the new format new_embedder = BasicTextFieldEmbedder.from_params(params=new_params, vocab=self.vocab) assert old_embedder._token_embedders.keys() == new_embedder._token_embedders.keys() assert new_embedder(self.inputs).size() == (1, 4, 10)
class TestListField(AllenNlpTestCase): def setUp(self): self.vocab = Vocabulary() self.vocab.add_token_to_namespace("this", "words") self.vocab.add_token_to_namespace("is", "words") self.vocab.add_token_to_namespace("a", "words") self.vocab.add_token_to_namespace("sentence", 'words') self.vocab.add_token_to_namespace("s", 'characters') self.vocab.add_token_to_namespace("e", 'characters') self.vocab.add_token_to_namespace("n", 'characters') self.vocab.add_token_to_namespace("t", 'characters') self.vocab.add_token_to_namespace("c", 'characters') for label in ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']: self.vocab.add_token_to_namespace(label, 'labels') self.word_indexer = {"words": SingleIdTokenIndexer("words")} self.words_and_characters_indexers = { "words": SingleIdTokenIndexer("words"), "characters": TokenCharactersIndexer("characters") } self.field1 = TextField( [Token(t) for t in ["this", "is", "a", "sentence"]], self.word_indexer) self.field2 = TextField( [Token(t) for t in ["this", "is", "a", "different", "sentence"]], self.word_indexer) self.field3 = TextField( [Token(t) for t in ["this", "is", "another", "sentence"]], self.word_indexer) self.empty_text_field = self.field1.empty_field() self.index_field = IndexField(1, self.field1) self.empty_index_field = self.index_field.empty_field() self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1) self.empty_sequence_label_field = self.sequence_label_field.empty_field( ) super(TestListField, self).setUp() def test_get_padding_lengths(self): list_field = ListField([self.field1, self.field2, self.field3]) list_field.index(self.vocab) lengths = list_field.get_padding_lengths() assert lengths == { "num_fields": 3, "list_words_length": 5, "list_num_tokens": 5 } def test_list_field_can_handle_empty_text_fields(self): list_field = ListField( [self.field1, self.field2, self.empty_text_field]) list_field.index(self.vocab) tensor_dict = list_field.as_tensor(list_field.get_padding_lengths()) numpy.testing.assert_array_equal( tensor_dict["words"].detach().cpu().numpy(), numpy.array([[2, 3, 4, 5, 0], [2, 3, 4, 1, 5], [0, 0, 0, 0, 0]])) def test_list_field_can_handle_empty_index_fields(self): list_field = ListField( [self.index_field, self.index_field, self.empty_index_field]) list_field.index(self.vocab) tensor = list_field.as_tensor(list_field.get_padding_lengths()) numpy.testing.assert_array_equal(tensor.detach().cpu().numpy(), numpy.array([[1], [1], [-1]])) def test_list_field_can_handle_empty_sequence_label_fields(self): list_field = ListField([ self.sequence_label_field, self.sequence_label_field, self.empty_sequence_label_field ]) list_field.index(self.vocab) tensor = list_field.as_tensor(list_field.get_padding_lengths()) numpy.testing.assert_array_equal( tensor.detach().cpu().numpy(), numpy.array([[1, 1, 0, 1], [1, 1, 0, 1], [0, 0, 0, 0]])) def test_all_fields_padded_to_max_length(self): list_field = ListField([self.field1, self.field2, self.field3]) list_field.index(self.vocab) tensor_dict = list_field.as_tensor(list_field.get_padding_lengths()) numpy.testing.assert_array_almost_equal( tensor_dict["words"][0].detach().cpu().numpy(), numpy.array([2, 3, 4, 5, 0])) numpy.testing.assert_array_almost_equal( tensor_dict["words"][1].detach().cpu().numpy(), numpy.array([2, 3, 4, 1, 5])) numpy.testing.assert_array_almost_equal( tensor_dict["words"][2].detach().cpu().numpy(), numpy.array([2, 3, 1, 5, 0])) def test_nested_list_fields_are_padded_correctly(self): nested_field1 = ListField( [LabelField(c) for c in ['a', 'b', 
'c', 'd', 'e']]) nested_field2 = ListField( [LabelField(c) for c in ['f', 'g', 'h', 'i', 'j', 'k']]) list_field = ListField( [nested_field1.empty_field(), nested_field1, nested_field2]) list_field.index(self.vocab) padding_lengths = list_field.get_padding_lengths() assert padding_lengths == {'num_fields': 3, 'list_num_fields': 6} tensor = list_field.as_tensor(padding_lengths).detach().cpu().numpy() numpy.testing.assert_almost_equal( tensor, [[-1, -1, -1, -1, -1, -1], [0, 1, 2, 3, 4, -1], [5, 6, 7, 8, 9, 10]]) def test_fields_can_pad_to_greater_than_max_length(self): list_field = ListField([self.field1, self.field2, self.field3]) list_field.index(self.vocab) padding_lengths = list_field.get_padding_lengths() padding_lengths["list_words_length"] = 7 padding_lengths["num_fields"] = 5 tensor_dict = list_field.as_tensor(padding_lengths) numpy.testing.assert_array_almost_equal( tensor_dict["words"][0].detach().cpu().numpy(), numpy.array([2, 3, 4, 5, 0, 0, 0])) numpy.testing.assert_array_almost_equal( tensor_dict["words"][1].detach().cpu().numpy(), numpy.array([2, 3, 4, 1, 5, 0, 0])) numpy.testing.assert_array_almost_equal( tensor_dict["words"][2].detach().cpu().numpy(), numpy.array([2, 3, 1, 5, 0, 0, 0])) numpy.testing.assert_array_almost_equal( tensor_dict["words"][3].detach().cpu().numpy(), numpy.array([0, 0, 0, 0, 0, 0, 0])) numpy.testing.assert_array_almost_equal( tensor_dict["words"][4].detach().cpu().numpy(), numpy.array([0, 0, 0, 0, 0, 0, 0])) def test_as_tensor_can_handle_multiple_token_indexers(self): # pylint: disable=protected-access self.field1._token_indexers = self.words_and_characters_indexers self.field2._token_indexers = self.words_and_characters_indexers self.field3._token_indexers = self.words_and_characters_indexers list_field = ListField([self.field1, self.field2, self.field3]) list_field.index(self.vocab) padding_lengths = list_field.get_padding_lengths() tensor_dict = list_field.as_tensor(padding_lengths) words = tensor_dict["words"].detach().cpu().numpy() characters = tensor_dict["characters"].detach().cpu().numpy() numpy.testing.assert_array_almost_equal( words, numpy.array([[2, 3, 4, 5, 0], [2, 3, 4, 1, 5], [2, 3, 1, 5, 0]])) numpy.testing.assert_array_almost_equal( characters[0], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0], [1, 2, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [2, 3, 4, 5, 3, 4, 6, 3, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0]])) numpy.testing.assert_array_almost_equal( characters[1], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0], [1, 2, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 3, 1, 3, 4, 5], [2, 3, 4, 5, 3, 4, 6, 3, 0]])) numpy.testing.assert_array_almost_equal( characters[2], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0], [1, 2, 0, 0, 0, 0, 0, 0, 0], [1, 4, 1, 5, 1, 3, 1, 0, 0], [2, 3, 4, 5, 3, 4, 6, 3, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0]])) def test_as_tensor_can_handle_multiple_token_indexers_and_empty_fields( self): # pylint: disable=protected-access self.field1._token_indexers = self.words_and_characters_indexers self.field2._token_indexers = self.words_and_characters_indexers self.field3._token_indexers = self.words_and_characters_indexers list_field = ListField( [self.field1.empty_field(), self.field1, self.field2]) list_field.index(self.vocab) padding_lengths = list_field.get_padding_lengths() tensor_dict = list_field.as_tensor(padding_lengths) words = tensor_dict["words"].detach().cpu().numpy() characters = tensor_dict["characters"].detach().cpu().numpy() numpy.testing.assert_array_almost_equal( words, numpy.array([[0, 0, 0, 0, 0], [2, 3, 4, 
5, 0], [2, 3, 4, 1, 5]])) numpy.testing.assert_array_almost_equal(characters[0], numpy.zeros([5, 9])) numpy.testing.assert_array_almost_equal( characters[1], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0], [1, 2, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [2, 3, 4, 5, 3, 4, 6, 3, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0]])) numpy.testing.assert_array_almost_equal( characters[2], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0], [1, 2, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 3, 1, 3, 4, 5], [2, 3, 4, 5, 3, 4, 6, 3, 0]])) def test_printing_doesnt_crash(self): list_field = ListField([self.field1, self.field2]) print(list_field) def test_sequence_methods(self): list_field = ListField([self.field1, self.field2, self.field3]) assert len(list_field) == 3 assert list_field[1] == self.field2 assert [f for f in list_field ] == [self.field1, self.field2, self.field3]
def extend_vocab(
    self,
    extended_vocab: Vocabulary,
    vocab_namespace: str = None,
    extension_pretrained_file: str = None,
    model_path: str = None,
):
    """
    Extends the embedding matrix according to the extended vocabulary.
    If extension_pretrained_file is available, it will be used for initializing the new word
    embeddings in the extended vocabulary; otherwise we will check if the _pretrained_file
    attribute is already available. If neither is available, the new embeddings will be
    initialized with xavier uniform.

    # Parameters

    extended_vocab : `Vocabulary`
        Vocabulary extended from the original vocabulary used to construct this `Embedding`.
    vocab_namespace : `str`, (optional, default=`None`)
        In case you know which vocab_namespace should be used for extension, you can pass it.
        If not passed, it will check if the vocab_namespace used at the time of `Embedding`
        construction is available. If so, this namespace will be used; otherwise extend_vocab
        will be a no-op.
    extension_pretrained_file : `str`, (optional, default=`None`)
        A file containing pretrained embeddings can be specified here. It can be the path to
        a local file or a URL of a (cached) remote file. Check format details in
        `from_params` of the `Embedding` class.
    model_path : `str`, (optional, default=`None`)
        Path traversing the model attributes up to this embedding module. E.g.
        "_text_field_embedder.token_embedder_tokens". This is only used to give a helpful
        error message when extend_vocab is implicitly called by train or any other command.
    """
    # Caveat: For allennlp v0.8.1 and below, we weren't storing vocab_namespace as an
    # attribute, which we need to know at the time of embedding vocab extension. So old
    # archived models are currently unextendable.

    vocab_namespace = vocab_namespace or self._vocab_namespace
    if not vocab_namespace:
        # It's not safe to default to "tokens" or any other namespace.
        logging.info(
            "Loading a model trained before embedding extension was implemented; "
            "pass an explicit vocab namespace if you want to extend the vocabulary.")
        return

    extended_num_embeddings = extended_vocab.get_vocab_size(vocab_namespace)
    if extended_num_embeddings == self.num_embeddings:
        # It's already been extended. No need to initialize / read a pretrained file (no-op).
        return

    if extended_num_embeddings < self.num_embeddings:
        raise ConfigurationError(
            f"Size of namespace, {vocab_namespace} for extended_vocab is smaller than "
            f"embedding. You likely passed an incorrect vocab or namespace for extension.")

    # Case 1: user passed extension_pretrained_file and it's available.
    if extension_pretrained_file and is_url_or_existing_file(extension_pretrained_file):
        # Don't have to do anything here, this is the happy case.
        pass
    # Case 2: user passed extension_pretrained_file and it's not available.
    elif extension_pretrained_file:
        raise ConfigurationError(
            f"You passed pretrained embedding file {extension_pretrained_file} "
            f"for model_path {model_path} but it's not available.")
    # Case 3: user didn't pass extension_pretrained_file, but the pretrained_file attribute
    # was saved during training and is available.
    elif is_url_or_existing_file(self._pretrained_file):
        extension_pretrained_file = self._pretrained_file
    # Case 4: no file is available; hope that pretrained embeddings weren't used in the
    # first place, and warn.
    else:
        extra_info = (f"Originally pretrained_file was at "
                      f"{self._pretrained_file}. " if self._pretrained_file else "")
        # It's better to warn here and not raise an error, because there is no way to
        # distinguish whether a pretrained file wasn't used during training or the user
        # forgot to pass / passed an incorrect mapping. Raising an error would prevent
        # fine-tuning in the former case.
        logging.warning(
            f"Embedding at model_path, {model_path} cannot locate the pretrained_file. "
            f"{extra_info} If you are fine-tuning and want to use a pretrained_file for "
            f"embedding extension, please pass the mapping with the --embedding-sources "
            f"argument.")

    embedding_dim = self.weight.data.shape[-1]
    if not extension_pretrained_file:
        extra_num_embeddings = extended_num_embeddings - self.num_embeddings
        extra_weight = torch.FloatTensor(extra_num_embeddings, embedding_dim)
        torch.nn.init.xavier_uniform_(extra_weight)
    else:
        # It's easiest to just reload the embeddings for the entire vocab,
        # then only keep the ones we need.
        whole_weight = _read_pretrained_embeddings_file(
            extension_pretrained_file, embedding_dim, extended_vocab, vocab_namespace)
        extra_weight = whole_weight[self.num_embeddings:, :]

    device = self.weight.data.device
    extended_weight = torch.cat([self.weight.data, extra_weight.to(device)], dim=0)
    self.weight = torch.nn.Parameter(extended_weight,
                                     requires_grad=self.weight.requires_grad)
    self.num_embeddings = extended_num_embeddings
class TestBasicTextFieldEmbedder(AllenNlpTestCase): def setUp(self): super().setUp() self.vocab = Vocabulary() self.vocab.add_token_to_namespace("1") self.vocab.add_token_to_namespace("2") self.vocab.add_token_to_namespace("3") self.vocab.add_token_to_namespace("4") params = Params({ "token_embedders": { "words1": { "type": "embedding", "embedding_dim": 2 }, "words2": { "type": "embedding", "embedding_dim": 5 }, "words3": { "type": "embedding", "embedding_dim": 3 }, } }) self.token_embedder = BasicTextFieldEmbedder.from_params( vocab=self.vocab, params=params) self.inputs = { "words1": { "tokens": torch.LongTensor([[0, 2, 3, 5]]) }, "words2": { "tokens": torch.LongTensor([[1, 4, 3, 2]]) }, "words3": { "tokens": torch.LongTensor([[1, 5, 1, 2]]) }, } def test_get_output_dim_aggregates_dimension_from_each_embedding(self): assert self.token_embedder.get_output_dim() == 10 def test_forward_asserts_input_field_match(self): # Total mismatch self.inputs["words4"] = self.inputs["words3"] del self.inputs["words3"] with pytest.raises(ConfigurationError) as exc: self.token_embedder(self.inputs) assert exc.match("Mismatched token keys") self.inputs["words3"] = self.inputs["words4"] # Text field has too many inputs with pytest.raises(ConfigurationError) as exc: self.token_embedder(self.inputs) assert exc.match("Mismatched token keys") del self.inputs["words4"] def test_forward_concats_resultant_embeddings(self): assert self.token_embedder(self.inputs).size() == (1, 4, 10) def test_forward_works_on_higher_order_input(self): params = Params({ "token_embedders": { "words": { "type": "embedding", "num_embeddings": 20, "embedding_dim": 2 }, "characters": { "type": "character_encoding", "embedding": { "embedding_dim": 4, "num_embeddings": 15 }, "encoder": { "type": "cnn", "embedding_dim": 4, "num_filters": 10, "ngram_filter_sizes": [3], }, }, } }) token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab, params=params) inputs = { "words": { "tokens": (torch.rand(3, 4, 5, 6) * 20).long() }, "characters": { "token_characters": (torch.rand(3, 4, 5, 6, 7) * 15).long() }, } assert token_embedder(inputs, num_wrapping_dims=2).size() == (3, 4, 5, 6, 12) def test_forward_runs_with_forward_params(self): class FakeEmbedder(torch.nn.Module): def __init__(self): super().__init__() def forward(self, tokens: torch.Tensor, extra_arg: int = None): assert tokens is not None assert extra_arg is not None return tokens token_embedder = BasicTextFieldEmbedder({"elmo": FakeEmbedder()}) inputs = {"elmo": {"tokens": (torch.rand(3, 6, 5) * 2).long()}} kwargs = {"extra_arg": 1} token_embedder(inputs, **kwargs) def test_forward_runs_with_non_bijective_mapping(self): elmo_fixtures_path = self.FIXTURES_ROOT / "elmo" options_file = str(elmo_fixtures_path / "options.json") weight_file = str(elmo_fixtures_path / "lm_weights.hdf5") params = Params({ "token_embedders": { "words": { "type": "embedding", "num_embeddings": 20, "embedding_dim": 2 }, "elmo": { "type": "elmo_token_embedder", "options_file": options_file, "weight_file": weight_file, }, } }) token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab, params=params) inputs = { "words": { "tokens": (torch.rand(3, 6) * 20).long() }, "elmo": { "tokens": (torch.rand(3, 6, 50) * 15).long() }, } token_embedder(inputs) def test_forward_runs_with_non_bijective_mapping_with_null(self): elmo_fixtures_path = self.FIXTURES_ROOT / "elmo" options_file = str(elmo_fixtures_path / "options.json") weight_file = str(elmo_fixtures_path / "lm_weights.hdf5") params = Params({ 
"token_embedders": { "elmo": { "type": "elmo_token_embedder", "options_file": options_file, "weight_file": weight_file, } } }) token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab, params=params) inputs = {"elmo": {"tokens": (torch.rand(3, 6, 50) * 15).long()}} token_embedder(inputs) def test_forward_runs_with_non_bijective_mapping_with_dict(self): elmo_fixtures_path = self.FIXTURES_ROOT / "elmo" options_file = str(elmo_fixtures_path / "options.json") weight_file = str(elmo_fixtures_path / "lm_weights.hdf5") params = Params({ "token_embedders": { "words": { "type": "embedding", "num_embeddings": 20, "embedding_dim": 2 }, "elmo": { "type": "elmo_token_embedder", "options_file": options_file, "weight_file": weight_file, }, } }) token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab, params=params) inputs = { "words": { "tokens": (torch.rand(3, 6) * 20).long() }, "elmo": { "tokens": (torch.rand(3, 6, 50) * 15).long() }, } token_embedder(inputs) def test_forward_runs_with_bijective_and_non_bijective_mapping(self): params = Params({ "token_embedders": { "bert": { "type": "pretrained_transformer", "model_name": "bert-base-uncased" }, "token_characters": { "type": "character_encoding", "embedding": { "embedding_dim": 5 }, "encoder": { "type": "cnn", "embedding_dim": 5, "num_filters": 5, "ngram_filter_sizes": [5], }, }, } }) token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab, params=params) inputs = { "bert": { "token_ids": (torch.rand(3, 5) * 10).long(), "mask": (torch.rand(3, 5) * 1).bool(), }, "token_characters": { "token_characters": (torch.rand(3, 5, 5) * 1).long() }, } token_embedder(inputs)
def __init__( self, embedding_dim: int, num_embeddings: int = None, projection_dim: int = None, weight: torch.FloatTensor = None, padding_index: int = None, trainable: bool = True, max_norm: float = None, norm_type: float = 2.0, scale_grad_by_freq: bool = False, sparse: bool = False, vocab_namespace: str = "tokens", pretrained_file: str = None, vocab: Vocabulary = None, ) -> None: super().__init__() if num_embeddings is None and vocab is None: raise ConfigurationError( "Embedding must be constructed with either num_embeddings or a vocabulary." ) if num_embeddings is None: num_embeddings = vocab.get_vocab_size(vocab_namespace) else: # If num_embeddings is present, set default namespace to None so that extend_vocab # call doesn't misinterpret that some namespace was originally used. vocab_namespace = None self.num_embeddings = num_embeddings self.padding_index = padding_index self.max_norm = max_norm self.norm_type = norm_type self.scale_grad_by_freq = scale_grad_by_freq self.sparse = sparse self._vocab_namespace = vocab_namespace self._pretrained_file = pretrained_file self.output_dim = projection_dim or embedding_dim if weight is not None and pretrained_file: raise ConfigurationError( "Embedding was constructed with both a weight and a pretrained file." ) elif pretrained_file is not None: if vocab is None: raise ConfigurationError( "To construct an Embedding from a pretrained file, you must also pass a vocabulary." ) # If we're loading a saved model, we don't want to actually read a pre-trained # embedding file - the embeddings will just be in our saved weights, and we might not # have the original embedding file anymore, anyway. # TODO: having to pass tokens here is SUPER gross, but otherwise this breaks the # extend_vocab method, which relies on the value of vocab_namespace being None # to infer at what stage the embedding has been constructed. Phew. weight = _read_pretrained_embeddings_file( pretrained_file, embedding_dim, vocab, vocab_namespace or "tokens") self.weight = torch.nn.Parameter(weight, requires_grad=trainable) elif weight is not None: self.weight = torch.nn.Parameter(weight, requires_grad=trainable) else: weight = torch.FloatTensor(num_embeddings, embedding_dim) self.weight = torch.nn.Parameter(weight, requires_grad=trainable) torch.nn.init.xavier_uniform_(self.weight) # Whatever way we have constructed the embedding, it should be consistent with # num_embeddings and embedding_dim. if self.weight.size() != (num_embeddings, embedding_dim): raise ConfigurationError( "A weight matrix was passed with contradictory embedding shapes." ) if self.padding_index is not None: self.weight.data[self.padding_index].fill_(0) if projection_dim: self._projection = torch.nn.Linear(embedding_dim, projection_dim) else: self._projection = None
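# Two ways to size the matrix with the constructor above: pass num_embeddings
# explicitly, or let it be derived from a vocabulary namespace. Only the
# second form records the namespace, so only it supports extend_vocab later
# (see the `vocab_namespace = None` branch above). `vocab` is a placeholder
# for an existing Vocabulary.
emb_explicit = Embedding(embedding_dim=50,
                         num_embeddings=vocab.get_vocab_size("tokens"))
emb_from_vocab = Embedding(embedding_dim=50, vocab=vocab,
                           vocab_namespace="tokens")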
def test_start_and_end_tokens(self): vocab = Vocabulary() vocab.add_token_to_namespace("A", namespace="characters") # 2 vocab.add_token_to_namespace("s", namespace="characters") # 3 vocab.add_token_to_namespace("e", namespace="characters") # 4 vocab.add_token_to_namespace("n", namespace="characters") # 5 vocab.add_token_to_namespace("t", namespace="characters") # 6 vocab.add_token_to_namespace("c", namespace="characters") # 7 vocab.add_token_to_namespace("<", namespace="characters") # 8 vocab.add_token_to_namespace(">", namespace="characters") # 9 vocab.add_token_to_namespace("/", namespace="characters") # 10 indexer = TokenCharactersIndexer("characters", start_tokens=["<s>"], end_tokens=["</s>"], min_padding_length=1) indices = indexer.tokens_to_indices([Token("sentential")], vocab) assert indices == { "token_characters": [[8, 3, 9], [3, 4, 5, 6, 4, 5, 6, 1, 1, 1], [8, 10, 3, 9]] }
from allennlp.commands.train import train_model_from_args
from mtl.dataset_readers.MTLDatasetReader import MTLDatasetReader
from mtl.models.MTLSharedClassifier import MTLSharedClassifier

reader = MTLDatasetReader(token_indexers={
    'tokens': SingleIdTokenIndexer(lowercase_tokens=True),
    'elmo': ELMoTokenCharactersIndexer()
}, max_sequence_len=100)

books_train_dataset = reader.read('./data/mtl-dataset/books.task.train')
books_validation_dataset = reader.read('./data/mtl-dataset/books.task.test')
imdb_train_dataset = reader.read('./data/mtl-dataset/imdb.task.train')
imdb_test_dataset = reader.read('./data/mtl-dataset/imdb.task.test')

vocab = Vocabulary.from_instances(books_train_dataset + books_validation_dataset)

iterator = BucketIterator(batch_size=128, sorting_keys=[("tokens", "num_tokens")])
iterator.index_with(vocab)

print(vocab._index_to_token)
# print(vocab.__getstate__()['_token_to_index']['labels'])
# for batch in iterator(books_train_dataset, num_epochs=1, shuffle=True):
#     print(batch['tokens']['tokens'], batch['label'])
print(iterator.get_num_batches(books_train_dataset))
books_iter = iter(iterator._create_batches(books_train_dataset, shuffle=True))
print(len(books_train_dataset))
print(next(books_iter).as_tensor_dict())
# These five lines control all the major sources of randomness. np.random.seed(_C.RANDOM_SEED) torch.manual_seed(_C.RANDOM_SEED) torch.cuda.manual_seed_all(_C.RANDOM_SEED) torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True # Set device according to specified GPU ids. device = torch.device( f"cuda:{_A.gpu_ids[0]}" if _A.gpu_ids[0] >= 0 else "cpu") # -------------------------------------------------------------------------------------------- # INSTANTIATE VOCABULARY, DATALOADER, MODEL, OPTIMIZER # -------------------------------------------------------------------------------------------- vocabulary = Vocabulary.from_files(_C.DATA.VOCABULARY) # If we wish to use CBS during evaluation or inference, expand the vocabulary and add # constraint words derived from Open Images classes. if _C.MODEL.USE_CBS: vocabulary = add_constraint_words_to_vocabulary( vocabulary, wordforms_tsvpath=_C.DATA.CBS.WORDFORMS) train_dataset = TrainingDataset.from_config(_C, vocabulary=vocabulary, in_memory=_A.in_memory) train_dataloader = DataLoader( train_dataset, batch_size=_C.OPTIM.BATCH_SIZE, shuffle=True, num_workers=_A.cpu_workers,
def __init__( self, vocab: Vocabulary, bert_model: Union[str, BertModel], span_extractor: SpanExtractor, tree_mapper: TreeMapper, domain_utils: DomainUtils, is_weak_supervision: bool, feedforward: FeedForward = None, dropout: float = 0.0, num_labels: int = None, index: str = "bert", label_namespace: str = "labels", trainable: bool = True, initializer: InitializerApplicator = InitializerApplicator(), denotation_based_metric: Metric = None, token_based_metric: Metric = None, **kwargs, ) -> None: super().__init__(vocab, **kwargs) if isinstance(bert_model, str): self.bert_model = PretrainedBertModel.load(bert_model) else: self.bert_model = bert_model for param in self.bert_model.parameters(): param.requires_grad = trainable in_features = self.bert_model.config.hidden_size self._label_namespace = label_namespace self.span_extractor = span_extractor self.feedforward_layer = TimeDistributed(feedforward) if feedforward else None self.num_classes = self.vocab.get_vocab_size("labels") if feedforward is not None: output_dim = feedforward.get_output_dim() else: output_dim = span_extractor.get_output_dim() self.tag_projection_layer = TimeDistributed(Linear(output_dim, self.num_classes)) if num_labels: out_features = num_labels else: out_features = vocab.get_vocab_size(namespace=self._label_namespace) self._dropout = torch.nn.Dropout(p=dropout) self._tree_mapper = tree_mapper labels = self.vocab.get_index_to_token_vocabulary(self._label_namespace) grammar = Grammar(labels) self._cky = CKY(grammar, tree_mapper, domain_utils) use_lexicon = True if use_lexicon: self.zero_shot_extractor = ZeroShotExtractor(labels, domain_utils) self._sim_weight = torch.nn.Parameter( torch.ones([1], dtype=torch.float32, requires_grad=True)) self._classification_layer = torch.nn.Linear(in_features, out_features) self._accuracy = CategoricalAccuracy() self._accuracy_all_no_span = CategoricalAccuracy() self._fmeasure = F1Measure(positive_label=1) self._denotation_based_metric = denotation_based_metric self._token_based_metric = token_based_metric self._loss = torch.nn.CrossEntropyLoss() self._index = index initializer(self._classification_layer) self._epoch_counter = 0 self._is_weak_supervision = is_weak_supervision if self._is_weak_supervision: self._weak_supervision_acc = WeakSupervisionAccuracy() self._label_preparer = LabelsPreparer(self.vocab.get_index_to_token_vocabulary(self._label_namespace)) self._sets_f1_metric = SetsF1() self._compute_spans_f1 = False
def load_lm_data(fold=None, mode='train'):
    """
    Turns the sequential data into instances.
    :param fold: which cross-validation fold to load; must be None when the
                 dataset is being built from scratch.
    :param mode: 'train' or 'val'.
    :return: (instances, vocab)
    """
    # Get or make vocab
    spacy_model = get_spacy_model("en_core_web_sm", pos_tags=False, parse=False, ner=False)
    if os.path.exists('vocabulary'):
        print("Loading cached vocab. Caution if you're building the dataset again!", flush=True)
        vocab = Vocabulary.from_files('vocabulary')
        with open(os.path.join(DATA_PATH, 'events-3.json'), 'r') as f:
            lm_data = json.load(f)
        lm_data = [data_item for s in ('train', 'val', 'test') for data_item in lm_data[s]]
    else:
        assert fold is None
        with open(os.path.join(DATA_PATH, 'events-3.json'), 'r') as f:
            lm_data = json.load(f)
        lm_data = [data_item for s in ('train', 'val', 'test') for data_item in lm_data[s]]
        # Manually doing this because I don't want to double count things
        vocab = Vocabulary.from_instances([
            Instance({
                'story': TextField(
                    [Token(x) for x in ['@@bos@@'] +
                     [x.orth_ for x in spacy_model(sent)] + ['@@eos@@']],
                    token_indexers={
                        'tokens': SingleIdTokenIndexer(namespace='tokens',
                                                       lowercase_tokens=True)
                    })
            }) for data_item in lm_data for sent in data_item['sentences']
        ], min_count={'tokens': 3})
        vocab.get_index_to_token_vocabulary('tokens')
        vocab.save_to_files('vocabulary')
    print("VOCABULARY HAS {} ITEMS".format(vocab.get_vocab_size(namespace='tokens')))

    if all([os.path.exists('lm-{}-of-{}.pkl'.format(i, NUM_FOLDS)) for i in range(NUM_FOLDS)]):
        print("LOADING CACHED DATASET", flush=True)
        if mode == 'val':
            with open('lm-{}-of-{}.pkl'.format(fold, NUM_FOLDS), 'rb') as f:
                print("Loading split {} for {}".format(fold, mode))
                instances = pkl.load(f)
        else:
            instances = []
            for other_fold in range(NUM_FOLDS):
                if other_fold != fold:
                    with open('lm-{}-of-{}.pkl'.format(other_fold, NUM_FOLDS), 'rb') as f:
                        print("Loading split {} for {}".format(other_fold, mode))
                        instances += pkl.load(f)
        return instances, vocab

    print("MAKING THE DATASET", flush=True)
    assert fold is None
    for item in tqdm(lm_data):
        item['sentences_tokenized'] = [[st.orth_ for st in spacy_model(sent)]
                                       for sent in item['sentences']]

    def _to_instances(data):
        # Flatten this: one instance per consecutive sentence pair.
        instances = []
        for item in data:
            for s1, s2 in pairwise(item['sentences_tokenized']):
                instances.append((
                    Instance({
                        'story': TextField(
                            [Token(x) for x in ['@@bos@@'] + s1 + s2 + ['@@eos@@']],
                            token_indexers={
                                'tokens': SingleIdTokenIndexer(namespace='tokens',
                                                               lowercase_tokens=True)
                            })
                    }),
                    s1,
                    s2,
                    item,
                ))
        return instances

    random.seed(123456)
    random.shuffle(lm_data)
    all_sets = []
    for fold_ in range(NUM_FOLDS):
        val_set = _to_instances(lm_data[len(lm_data) * fold_ // NUM_FOLDS:
                                        len(lm_data) * (fold_ + 1) // NUM_FOLDS])
        with open('lm-{}-of-{}.pkl'.format(fold_, NUM_FOLDS), 'wb') as f:
            pkl.dump(val_set, f)
        all_sets.extend(val_set)
    return all_sets, vocab
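# `pairwise` (used by _to_instances above) is assumed to be the standard
# itertools recipe, yielding consecutive sentence pairs (s0, s1), (s1, s2), ...
import itertools

def pairwise(iterable):
    a, b = itertools.tee(iterable)
    next(b, None)
    return zip(a, b)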
class TestDataset(AllenNlpTestCase): def setUp(self): self.vocab = Vocabulary() self.vocab.add_token_to_namespace("this") self.vocab.add_token_to_namespace("is") self.vocab.add_token_to_namespace("a") self.vocab.add_token_to_namespace("sentence") self.vocab.add_token_to_namespace(".") self.token_indexer = {"tokens": SingleIdTokenIndexer()} self.instances = self.get_instances() super().setUp() def test_instances_must_have_homogeneous_fields(self): instance1 = Instance({"tag": (LabelField(1, skip_indexing=True))}) instance2 = Instance({"words": TextField([Token("hello")], {})}) with pytest.raises(ConfigurationError): _ = Batch([instance1, instance2]) def test_padding_lengths_uses_max_instance_lengths(self): dataset = Batch(self.instances) dataset.index_instances(self.vocab) padding_lengths = dataset.get_padding_lengths() assert padding_lengths == { "text1": { "num_tokens": 5, "tokens_length": 5 }, "text2": { "num_tokens": 6, "tokens_length": 6 } } def test_as_tensor_dict(self): dataset = Batch(self.instances) dataset.index_instances(self.vocab) padding_lengths = dataset.get_padding_lengths() tensors = dataset.as_tensor_dict(padding_lengths) text1 = tensors["text1"]["tokens"].detach().cpu().numpy() text2 = tensors["text2"]["tokens"].detach().cpu().numpy() numpy.testing.assert_array_almost_equal( text1, numpy.array([[2, 3, 4, 5, 6], [1, 3, 4, 5, 6]])) numpy.testing.assert_array_almost_equal( text2, numpy.array([[2, 3, 4, 1, 5, 6], [2, 3, 1, 0, 0, 0]])) def get_instances(self): field1 = TextField( [Token(t) for t in ["this", "is", "a", "sentence", "."]], self.token_indexer) field2 = TextField([ Token(t) for t in ["this", "is", "a", "different", "sentence", "."] ], self.token_indexer) field3 = TextField( [Token(t) for t in ["here", "is", "a", "sentence", "."]], self.token_indexer) field4 = TextField([Token(t) for t in ["this", "is", "short"]], self.token_indexer) instances = [ Instance({ "text1": field1, "text2": field2 }), Instance({ "text1": field3, "text2": field4 }) ] return instances
def build_vocab(instances: Iterable[Instance]) -> Vocabulary: print("Building the vocabulary") return Vocabulary.from_instances(instances)
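# Typical usage of build_vocab; `reader` and the data path are placeholders
# for a DatasetReader and its training file.
instances = list(reader.read("path/to/train_data"))
vocab = build_vocab(instances)
print(vocab.get_vocab_size("tokens"))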
def _read_pretrained_word2vec_format_embedding_file(
        embeddings_filename: str,  # pylint: disable=invalid-name
        embedding_dim: int,
        vocab: Vocabulary,
        namespace: str = "tokens") -> torch.FloatTensor:
    """
    Read from a gzipped word2vec-format file. The embeddings file is assumed to be gzipped and
    space delimited, e.g. [word] [dim 1] [dim 2] ...

    The remainder of the docstring is identical to ``_read_pretrained_embedding_file``.
    """
    words_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading embeddings from file")
    with gzip.open(cached_path(embeddings_filename), 'rb') as embeddings_file:
        for line in embeddings_file:
            fields = line.decode('utf-8').strip().split(' ')
            if len(fields) - 1 != embedding_dim:
                # Sometimes there are funny unicode parsing problems that lead to different
                # field lengths (e.g., a word with a unicode space character that splits
                # into more than one column). We skip those lines. Note that if you have
                # some kind of long header, this could result in all of your lines getting
                # skipped. It's hard to check for that here; you just have to look in the
                # embedding_misses_file and at the model summary to make sure things look
                # like they are supposed to.
                logger.warning(
                    "Found line with wrong number of dimensions (expected %d, was %d): %s ...",
                    embedding_dim, len(fields) - 1, line[:15])
                continue
            word = fields[0]
            if word in words_to_keep:
                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[word] = vector

    if not embeddings:
        raise ConfigurationError("No embeddings of correct dimension found; you probably "
                                 "misspecified your embedding_dim parameter, or didn't "
                                 "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(
        embeddings_mean, embeddings_std)

    for i in range(0, vocab_size):
        word = vocab.get_token_from_index(i, namespace)
        # If we don't have a pre-trained vector for this word, we'll just leave this row
        # alone, so the word has a random initialization.
        if word in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[word])
        else:
            logger.debug("Word %s was not found in the embedding file. Initializing randomly.",
                         word)

    # The weight matrix is initialized, so we construct and return the actual Embedding.
    return embedding_matrix
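# The space-delimited format the reader above expects, as a tiny hypothetical
# 3-dimensional example (the file must be gzipped before being passed in):
#
#     the 0.2 0.1 0.4
#     cat 0.9 0.3 0.7
#     dog 0.8 0.2 0.6
import gzip

with gzip.open("tiny_embeddings.vec.gz", "wt", encoding="utf-8") as f:
    f.write("the 0.2 0.1 0.4\ncat 0.9 0.3 0.7\ndog 0.8 0.2 0.6\n")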
def test_min_padding_length(self): sentence = "AllenNLP is awesome ." tokens = [Token(token) for token in sentence.split(" ")] vocab = Vocabulary() vocab.add_token_to_namespace("A", namespace="characters") # 2 vocab.add_token_to_namespace("l", namespace="characters") # 3 vocab.add_token_to_namespace("e", namespace="characters") # 4 vocab.add_token_to_namespace("n", namespace="characters") # 5 vocab.add_token_to_namespace("N", namespace="characters") # 6 vocab.add_token_to_namespace("L", namespace="characters") # 7 vocab.add_token_to_namespace("P", namespace="characters") # 8 vocab.add_token_to_namespace("i", namespace="characters") # 9 vocab.add_token_to_namespace("s", namespace="characters") # 10 vocab.add_token_to_namespace("a", namespace="characters") # 11 vocab.add_token_to_namespace("w", namespace="characters") # 12 vocab.add_token_to_namespace("o", namespace="characters") # 13 vocab.add_token_to_namespace("m", namespace="characters") # 14 vocab.add_token_to_namespace(".", namespace="characters") # 15 indexer = TokenCharactersIndexer("characters", min_padding_length=10) indices = indexer.tokens_to_indices(tokens, vocab) padded = indexer.as_padded_tensor_dict( indices, indexer.get_padding_lengths(indices)) assert padded["token_characters"].tolist() == [ [2, 3, 3, 4, 5, 6, 7, 8, 0, 0], [9, 10, 0, 0, 0, 0, 0, 0, 0, 0], [11, 12, 4, 10, 13, 14, 4, 0, 0, 0], [15, 0, 0, 0, 0, 0, 0, 0, 0, 0], ]
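# For contrast with the test above: with the default min_padding_length of 0,
# each token would be padded only to the longest token in this batch
# ("AllenNLP", 8 characters) rather than to 10, and AllenNLP warns that you
# should set min_padding_length to at least the width of your largest
# character-CNN filter.
indexer = TokenCharactersIndexer("characters")  # min_padding_length defaults to 0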