def test_elmo_token_representation(self):
    # Load the test words and convert to char ids
    with open(os.path.join(self.elmo_fixtures_path, 'vocab_test.txt'), 'r') as fin:
        tokens = fin.read().strip().split('\n')

    indexer = ELMoTokenCharactersIndexer()
    indices = [indexer.token_to_indices(Token(token), Vocabulary()) for token in tokens]
    # There are 457 tokens. Reshape into 10 batches of 50 tokens.
    sentences = []
    for k in range(10):
        sentences.append(
            indexer.pad_token_sequence(
                indices[(k * 50):((k + 1) * 50)], desired_num_tokens=50, padding_lengths={}
            )
        )
    batch = torch.from_numpy(numpy.array(sentences))

    elmo_token_embedder = _ElmoCharacterEncoder(self.options_file, self.weight_file)
    elmo_token_embedder_output = elmo_token_embedder(batch)

    # Reshape back to a list of words and compare with ground truth. Need to also
    # remove <S>, </S>
    actual_embeddings = remove_sentence_boundaries(
        elmo_token_embedder_output['token_embedding'],
        elmo_token_embedder_output['mask']
    )[0].data.numpy()
    actual_embeddings = actual_embeddings.reshape(-1, actual_embeddings.shape[-1])

    embedding_file = os.path.join(self.elmo_fixtures_path, 'elmo_token_embeddings.hdf5')
    with h5py.File(embedding_file, 'r') as fin:
        expected_embeddings = fin['embedding'][...]

    assert numpy.allclose(actual_embeddings[:len(tokens)], expected_embeddings, atol=1e-6)

def test_elmo_token_representation(self):
    # Load the test words and convert to char ids
    with open(os.path.join(self.fixtures_path, 'vocab_test.txt'), 'r') as fin:
        tokens = fin.read().strip().split('\n')

    indexer = ELMoTokenCharactersIndexer()
    indices = [indexer.token_to_indices(Token(token), Vocabulary()) for token in tokens]
    # There are 457 tokens. Reshape into 10 batches of 50 tokens.
    sentences = []
    for k in range(10):
        sentences.append(
            indexer.pad_token_sequence(
                indices[(k * 50):((k + 1) * 50)], desired_num_tokens=50, padding_lengths={}
            )
        )
    batch = Variable(torch.from_numpy(numpy.array(sentences)))

    elmo_token_embedder = _ElmoCharacterEncoder(self.options_file, self.weight_file)
    elmo_token_embedder_output = elmo_token_embedder(batch)

    # Reshape back to a list of words and compare with ground truth. Need to also
    # remove <S>, </S>
    actual_embeddings = remove_sentence_boundaries(
        elmo_token_embedder_output['token_embedding'],
        elmo_token_embedder_output['mask']
    )[0].data.numpy()
    actual_embeddings = actual_embeddings.reshape(-1, actual_embeddings.shape[-1])

    embedding_file = os.path.join(self.fixtures_path, 'elmo_token_embeddings.hdf5')
    with h5py.File(embedding_file, 'r') as fin:
        expected_embeddings = fin['embedding'][...]

    assert numpy.allclose(actual_embeddings[:len(tokens)], expected_embeddings, atol=1e-6)

def __init__(self):
    from allennlp.modules.elmo import _ElmoCharacterEncoder
    if not path.isdir(self.path('elmo')):
        makedirs(self.path('elmo'))
    self.fweights = self.ensure_file(path.join('elmo', 'weights.hdf5'), url=self.settings['weights'])
    self.foptions = self.ensure_file(path.join('elmo', 'options.json'), url=self.settings['options'])
    self.embeddings = _ElmoCharacterEncoder(self.foptions, self.fweights)

def test_elmo_token_representation_bos_eos(self):
    # The additional <S> and </S> embeddings added by the embedder should be as expected.
    indexer = ELMoTokenCharactersIndexer()

    elmo_token_embedder = _ElmoCharacterEncoder(self.options_file, self.weight_file)

    for correct_index, token in [[0, '<S>'], [2, '</S>']]:
        indices = indexer.token_to_indices(Token(token), Vocabulary())
        indices = torch.from_numpy(numpy.array(indices)).view(1, 1, -1)
        embeddings = elmo_token_embedder(indices)['token_embedding']
        assert numpy.allclose(embeddings[0, correct_index, :].data.numpy(),
                              embeddings[0, 1, :].data.numpy())

def test_elmo_token_representation_bos_eos(self):
    # The additional <S> and </S> embeddings added by the embedder should be as expected.
    indexer = ELMoTokenCharactersIndexer()

    elmo_token_embedder = _ElmoCharacterEncoder(self.options_file, self.weight_file)

    for correct_index, token in [[0, '<S>'], [2, '</S>']]:
        indices = indexer.tokens_to_indices([Token(token)], Vocabulary(), "correct")
        indices = torch.from_numpy(numpy.array(indices["correct"])).view(1, 1, -1)
        embeddings = elmo_token_embedder(indices)['token_embedding']
        assert numpy.allclose(embeddings[0, correct_index, :].data.numpy(),
                              embeddings[0, 1, :].data.numpy())

def __init__(
    self,
    options_file: str,
    weight_file: str,
    requires_grad: bool = False,
    vocab_to_cache: List[str] = None,
) -> None:
    super(_ElmoBiLm, self).__init__()

    self._token_embedder = _ElmoCharacterEncoder(
        options_file, weight_file, requires_grad=requires_grad
    )

    self._requires_grad = requires_grad
    if requires_grad and vocab_to_cache:
        logging.warning(
            "You are fine tuning ELMo and caching char CNN word vectors. "
            "This behaviour is not guaranteed to be well defined, particularly "
            "if not all of your inputs will occur in the vocabulary cache."
        )
    # This is an embedding, used to look up cached
    # word vectors built from character level cnn embeddings.
    self._word_embedding = None
    self._bos_embedding: torch.Tensor = None
    self._eos_embedding: torch.Tensor = None
    if vocab_to_cache:
        logging.info("Caching character cnn layers for words in vocabulary.")
        # This sets 3 attributes, _word_embedding, _bos_embedding and _eos_embedding.
        # They are set in the method so they can be accessed from outside the
        # constructor.
        self.create_cached_cnn_embeddings(vocab_to_cache)

    with open(cached_path(options_file), "r") as fin:
        options = json.load(fin)
    if not options["lstm"].get("use_skip_connections"):
        raise ConfigurationError("We only support pretrained biLMs with residual connections")
    self._elmo_lstm = ElmoLstm(
        input_size=options["lstm"]["projection_dim"],
        hidden_size=options["lstm"]["projection_dim"],
        cell_size=options["lstm"]["dim"],
        num_layers=options["lstm"]["n_layers"],
        memory_cell_clip_value=options["lstm"]["cell_clip"],
        state_projection_clip_value=options["lstm"]["proj_clip"],
        requires_grad=requires_grad,
    )
    self._elmo_lstm.load_weights(weight_file)
    # Number of representation layers including context independent layer
    self.num_layers = options["lstm"]["n_layers"] + 1

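# Added for reference (not part of the original examples): the constructor above
# reads only a few fields from the ELMo options JSON. The values below are
# illustrative and correspond to the widely used 2-layer, 512-dim-projection
# configuration; check your own options file rather than relying on them.
illustrative_lstm_options = {
    "use_skip_connections": True,  # required, or a ConfigurationError is raised
    "projection_dim": 512,         # becomes input_size and hidden_size of ElmoLstm
    "dim": 4096,                   # becomes cell_size
    "n_layers": 2,                 # num_layers; the biLM exposes n_layers + 1 representation layers
    "cell_clip": 3,                # memory_cell_clip_value
    "proj_clip": 3,                # state_projection_clip_value
}
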
def test_elmo_token_representation(self):
    # Load the test words and convert to char ids
    with open(os.path.join(self.elmo_fixtures_path, "vocab_test.txt"), "r") as fin:
        words = fin.read().strip().split("\n")

    vocab = Vocabulary()
    indexer = ELMoTokenCharactersIndexer()
    tokens = [Token(word) for word in words]
    indices = indexer.tokens_to_indices(tokens, vocab)
    # There are 457 tokens. Reshape into 10 batches of 50 tokens.
    sentences = []
    for k in range(10):
        char_indices = indices["elmo_tokens"][(k * 50):((k + 1) * 50)]
        sentences.append(
            indexer.as_padded_tensor_dict(
                {"elmo_tokens": char_indices}, padding_lengths={"elmo_tokens": 50}
            )["elmo_tokens"]
        )
    batch = torch.stack(sentences)

    elmo_token_embedder = _ElmoCharacterEncoder(self.options_file, self.weight_file)
    elmo_token_embedder_output = elmo_token_embedder(batch)

    # Reshape back to a list of words and compare with ground truth. Need to also
    # remove <S>, </S>
    actual_embeddings = remove_sentence_boundaries(
        elmo_token_embedder_output["token_embedding"],
        elmo_token_embedder_output["mask"]
    )[0].data.numpy()
    actual_embeddings = actual_embeddings.reshape(-1, actual_embeddings.shape[-1])

    embedding_file = os.path.join(self.elmo_fixtures_path, "elmo_token_embeddings.hdf5")
    with h5py.File(embedding_file, "r") as fin:
        expected_embeddings = fin["embedding"][...]

    assert numpy.allclose(actual_embeddings[:len(tokens)], expected_embeddings, atol=1e-6)

def __init__(self,
             options_file: str,
             weight_file: str,
             requires_grad: bool = False,
             vocab_to_cache: List[str] = None,
             combine_method="weighted-sum",
             random_init=False) -> None:
    super(ElmoBiLm, self).__init__()

    self._token_embedder = _ElmoCharacterEncoder(
        options_file, weight_file, requires_grad=requires_grad)

    self._requires_grad = requires_grad
    self._word_embedding = None
    self._bos_embedding: torch.Tensor = None
    self._eos_embedding: torch.Tensor = None

    with open(cached_path(options_file), "r") as fin:
        options = json.load(fin)
    if not options["lstm"].get("use_skip_connections"):
        raise ConfigurationError(
            "We only support pretrained biLMs with residual connections")
    self._elmo_lstm = LatticeElmoLstm(
        input_size=options["lstm"]["projection_dim"],
        hidden_size=options["lstm"]["projection_dim"],
        cell_size=options["lstm"]["dim"],
        num_layers=options["lstm"]["n_layers"],
        memory_cell_clip_value=options["lstm"]["cell_clip"],
        state_projection_clip_value=options["lstm"]["proj_clip"],
        requires_grad=requires_grad,
        combine_method=combine_method)
    if not random_init:
        self._elmo_lstm.load_weights(weight_file)
    else:
        print("WARNING!!! ELMo weights will not be loaded!!!")
    # Number of representation layers including context independent layer
    self.num_layers = options["lstm"]["n_layers"] + 1

def main(
    vocab_path: str,
    elmo_config_path: str,
    elmo_weights_path: str,
    output_dir: str,
    batch_size: int,
    device: int,
    use_custom_oov_token: bool = False,
):
    """
    Creates ELMo word representations from a vocabulary file. These
    word representations are _independent_ - they are the result of running
    the CNN and Highway layers of the ELMo model, but not the Bidirectional LSTM.
    ELMo requires 2 additional tokens: <S> and </S>. The first token
    in this file is assumed to be an unknown token.

    This script produces two artifacts: A new vocabulary file
    with the <S> and </S> tokens inserted and a glove formatted embedding
    file containing word : vector pairs, one per line, with all values
    separated by a space.
    """
    # Load the vocabulary words and convert to char ids
    with open(vocab_path, "r") as vocab_file:
        tokens = vocab_file.read().strip().split("\n")

    # Insert the sentence boundary tokens which elmo uses at positions 1 and 2.
    if tokens[0] != DEFAULT_OOV_TOKEN and not use_custom_oov_token:
        raise ConfigurationError("ELMo embeddings require the use of an OOV token.")

    tokens = [tokens[0]] + ["<S>", "</S>"] + tokens[1:]

    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.tokens_to_indices([Token(token) for token in tokens], Vocabulary())["tokens"]
    sentences = []
    for k in range((len(indices) // 50) + 1):
        sentences.append(
            indexer.as_padded_tensor_dict(
                indices[(k * 50):((k + 1) * 50)], padding_lengths={"tokens": 50}
            )
        )

    last_batch_remainder = 50 - (len(indices) % 50)
    if device != -1:
        elmo_token_embedder = _ElmoCharacterEncoder(
            elmo_config_path, elmo_weights_path).cuda(device)
    else:
        elmo_token_embedder = _ElmoCharacterEncoder(elmo_config_path, elmo_weights_path)

    all_embeddings = []
    for i in range((len(sentences) // batch_size) + 1):
        batch = torch.stack(sentences[i * batch_size:(i + 1) * batch_size])
        if device != -1:
            batch = batch.cuda(device)

        token_embedding = elmo_token_embedder(batch)["token_embedding"].data

        # Reshape back to a list of words of shape (batch_size * 50, encoding_dim)
        # We also need to remove the <S>, </S> tokens appended by the encoder.
        per_word_embeddings = (
            token_embedding[:, 1:-1, :].contiguous().view(-1, token_embedding.size(-1))
        )

        all_embeddings.append(per_word_embeddings)

    # Remove the embeddings associated with padding in the last batch.
    all_embeddings[-1] = all_embeddings[-1][:-last_batch_remainder, :]

    embedding_weight = torch.cat(all_embeddings, 0).cpu().numpy()

    # Write out the embedding in a glove format.
    os.makedirs(output_dir, exist_ok=True)
    with gzip.open(os.path.join(output_dir, "elmo_embeddings.txt.gz"), "wb") as embeddings_file:
        for i, word in enumerate(tokens):
            string_array = " ".join(str(x) for x in list(embedding_weight[i, :]))
            embeddings_file.write(f"{word} {string_array}\n".encode("utf-8"))

    # Write out the new vocab with the <S> and </S> tokens.
    _, vocab_file_name = os.path.split(vocab_path)
    with open(os.path.join(output_dir, vocab_file_name), "w") as new_vocab_file:
        for word in tokens:
            new_vocab_file.write(f"{word}\n")

def __init__(self):
    super(Model, self).__init__()
    options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
    weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
    self._token_embedder = _ElmoCharacterEncoder(options_file, weight_file)

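# A minimal end-to-end sketch (added here, not from the original examples): it shows
# the pattern the snippets above follow -- build character ids for a batch of
# sentences, run them through _ElmoCharacterEncoder, then strip the <S>/</S>
# positions the encoder adds. It assumes an allennlp version that exposes
# batch_to_ids and remove_sentence_boundaries, and hypothetical local copies of
# the options/weights files.
import torch
from allennlp.modules.elmo import _ElmoCharacterEncoder, batch_to_ids
from allennlp.nn.util import remove_sentence_boundaries

options_file = "elmo_options.json"   # hypothetical local paths
weight_file = "elmo_weights.hdf5"

encoder = _ElmoCharacterEncoder(options_file, weight_file)

# Character ids with shape (batch_size, num_tokens, 50) for two sentences.
character_ids = batch_to_ids([["ELMo", "is", "contextual", "."], ["Short", "."]])

with torch.no_grad():
    output = encoder(character_ids)

# The encoder prepends <S> and appends </S>, so remove them to get one vector
# per input token: shape (batch_size, num_tokens, encoding_dim).
token_embeddings, mask = remove_sentence_boundaries(output["token_embedding"], output["mask"])
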
def test_elmo_character_encoder_with_allennlp():
    allennlp_embedder = _ElmoCharacterEncoder(
        ELMO_OPTIONS_FILE,
        ELMO_WEIGHT_FILE,
    )
    embedder = ElmoCharacterEncoderFactory(
        ELMO_OPTIONS_FILE,
        ELMO_WEIGHT_FILE,
    ).create()

    allennlp_parameters = [
        '_char_embedding_weights',
        'char_conv_0.bias',
        'char_conv_0.weight',
        'char_conv_1.bias',
        'char_conv_1.weight',
        'char_conv_2.bias',
        'char_conv_2.weight',
        'char_conv_3.bias',
        'char_conv_3.weight',
        'char_conv_4.bias',
        'char_conv_4.weight',
        '_projection.bias',
        '_projection.weight',
    ]
    embedder_parameters = [
        'char_embedding.weight',
        'char_conv_0.bias',
        'char_conv_0.weight',
        'char_conv_1.bias',
        'char_conv_1.weight',
        'char_conv_2.bias',
        'char_conv_2.weight',
        'char_conv_3.bias',
        'char_conv_3.weight',
        'char_conv_4.bias',
        'char_conv_4.weight',
        'output_proj.bias',
        'output_proj.weight',
    ]
    allennlp_parameters_diff = [
        '_highways._layers.0.bias',
        '_highways._layers.0.weight',
        '_highways._layers.1.bias',
        '_highways._layers.1.weight',
    ]
    embedder_parameters_diff = [
        'highway.layers_0.bias',
        'highway.layers_0.weight',
        'highway.layers_1.bias',
        'highway.layers_1.weight',
    ]
    assert len(allennlp_parameters) == len(embedder_parameters)
    assert len(allennlp_parameters_diff) == len(embedder_parameters_diff)

    allennlp_embedder_named_parameters = dict(allennlp_embedder.named_parameters())

    # Same.
    for allennlp_param, embedder_param in zip(allennlp_parameters, embedder_parameters):
        allennlp_w = allennlp_embedder_named_parameters[allennlp_param].data
        embedder_w = embedder.named_parameters()[embedder_param].data

        np.testing.assert_array_equal(embedder_w.numpy(), allennlp_w.numpy())
        assert embedder_w.dtype == allennlp_w.dtype

    # Diff on highway.
    for allennlp_param, embedder_param in zip(allennlp_parameters_diff, embedder_parameters_diff):
        allennlp_w = allennlp_embedder_named_parameters[allennlp_param].data
        embedder_w = embedder.named_parameters()[embedder_param].data

        assert embedder_w.dtype == allennlp_w.dtype
        np.testing.assert_raises(
            AssertionError,
            np.testing.assert_array_equal,
            embedder_w.numpy(),
            allennlp_w.numpy(),
        )

    sentences = [
        ['ELMo', 'helps', 'disambiguate', 'ELMo', 'from', 'Elmo', '.'],
        ['The', 'sentence', '.'],
    ]
    # `(2, 7, 50)`
    character_ids = _sentences_to_ids(sentences)

    # AllenNLP.
    out = allennlp_embedder(character_ids)
    allennlp_token_embedding, _ = remove_sentence_boundaries(out['token_embedding'], out['mask'])
    assert list(allennlp_token_embedding.shape) == [2, 7, 16]

    # Ours.
    inputs = pack_padded_sequence(character_ids, [7, 3], batch_first=True)
    out = embedder(inputs.data)
    ours_token_embedding = _unpack(out, inputs.batch_sizes)
    assert list(ours_token_embedding.shape) == [2, 7, 16]

    np.testing.assert_array_almost_equal(
        ours_token_embedding.data.numpy(),
        allennlp_token_embedding.data.numpy(),
    )

def main(vocab_path: str,
         elmo_config_path: str,
         elmo_weights_path: str,
         output_dir: str,
         batch_size: int,
         device: int,
         use_custom_oov_token: bool = False):
    """
    Creates ELMo word representations from a vocabulary file. These
    word representations are _independent_ - they are the result of running
    the CNN and Highway layers of the ELMo model, but not the Bidirectional LSTM.
    ELMo requires 2 additional tokens: <S> and </S>. The first token
    in this file is assumed to be an unknown token.

    This script produces two artifacts: A new vocabulary file
    with the <S> and </S> tokens inserted and a glove formatted embedding
    file containing word : vector pairs, one per line, with all values
    separated by a space.
    """
    # Load the vocabulary words and convert to char ids
    with open(vocab_path, 'r') as vocab_file:
        tokens = vocab_file.read().strip().split('\n')

    # Insert the sentence boundary tokens which elmo uses at positions 1 and 2.
    if tokens[0] != DEFAULT_OOV_TOKEN and not use_custom_oov_token:
        raise ConfigurationError("ELMo embeddings require the use of an OOV token.")

    tokens = [tokens[0]] + ["<S>", "</S>"] + tokens[1:]

    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.tokens_to_indices([Token(token) for token in tokens], Vocabulary(), "indices")["indices"]
    sentences = []
    for k in range((len(indices) // 50) + 1):
        sentences.append(indexer.pad_token_sequence(indices[(k * 50):((k + 1) * 50)],
                                                    desired_num_tokens=50,
                                                    padding_lengths={}))

    last_batch_remainder = 50 - (len(indices) % 50)
    if device != -1:
        elmo_token_embedder = _ElmoCharacterEncoder(elmo_config_path,
                                                    elmo_weights_path).cuda(device)
    else:
        elmo_token_embedder = _ElmoCharacterEncoder(elmo_config_path,
                                                    elmo_weights_path)

    all_embeddings = []
    for i in range((len(sentences) // batch_size) + 1):
        array = numpy.array(sentences[i * batch_size: (i + 1) * batch_size])
        if device != -1:
            batch = torch.from_numpy(array).cuda(device)
        else:
            batch = torch.from_numpy(array)

        token_embedding = elmo_token_embedder(batch)['token_embedding'].data

        # Reshape back to a list of words of shape (batch_size * 50, encoding_dim)
        # We also need to remove the <S>, </S> tokens appended by the encoder.
        per_word_embeddings = token_embedding[:, 1:-1, :].contiguous().view(-1, token_embedding.size(-1))

        all_embeddings.append(per_word_embeddings)

    # Remove the embeddings associated with padding in the last batch.
    all_embeddings[-1] = all_embeddings[-1][:-last_batch_remainder, :]

    embedding_weight = torch.cat(all_embeddings, 0).cpu().numpy()

    # Write out the embedding in a glove format.
    os.makedirs(output_dir, exist_ok=True)
    with gzip.open(os.path.join(output_dir, "elmo_embeddings.txt.gz"), 'wb') as embeddings_file:
        for i, word in enumerate(tokens):
            string_array = " ".join([str(x) for x in list(embedding_weight[i, :])])
            embeddings_file.write(f"{word} {string_array}\n".encode('utf-8'))

    # Write out the new vocab with the <S> and </S> tokens.
    _, vocab_file_name = os.path.split(vocab_path)
    with open(os.path.join(output_dir, vocab_file_name), "w") as new_vocab_file:
        for word in tokens:
            new_vocab_file.write(f"{word}\n")

def main(
    vocab_path: str,
    elmo_config_path: str,
    elmo_weights_path: str,
    output_dir: str,
    batch_size: int,
    device: int,
    use_custom_oov_token: bool = False,
):
    with open(vocab_path, "r") as vocab_file:
        tokens = vocab_file.read().strip().split("\n")

    if tokens[0] != DEFAULT_OOV_TOKEN and not use_custom_oov_token:
        raise ConfigurationError("ELMo embeddings require the use of an OOV token.")

    tokens = [tokens[0]] + ["<S>", "</S>"] + tokens[1:]

    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.tokens_to_indices([Token(token) for token in tokens], Vocabulary())["tokens"]
    sentences = []
    for k in range((len(indices) // 50) + 1):
        sentences.append(
            indexer.as_padded_tensor_dict(
                indices[(k * 50):((k + 1) * 50)], padding_lengths={"tokens": 50}
            )
        )

    last_batch_remainder = 50 - (len(indices) % 50)
    if device != -1:
        elmo_token_embedder = _ElmoCharacterEncoder(elmo_config_path, elmo_weights_path).cuda(
            device
        )
    else:
        elmo_token_embedder = _ElmoCharacterEncoder(elmo_config_path, elmo_weights_path)

    all_embeddings = []
    for i in range((len(sentences) // batch_size) + 1):
        batch = torch.stack(sentences[i * batch_size:(i + 1) * batch_size])
        if device != -1:
            batch = batch.cuda(device)

        token_embedding = elmo_token_embedder(batch)["token_embedding"].data

        per_word_embeddings = (
            token_embedding[:, 1:-1, :].contiguous().view(-1, token_embedding.size(-1))
        )

        all_embeddings.append(per_word_embeddings)

    all_embeddings[-1] = all_embeddings[-1][:-last_batch_remainder, :]

    embedding_weight = torch.cat(all_embeddings, 0).cpu().numpy()

    os.makedirs(output_dir, exist_ok=True)
    with gzip.open(os.path.join(output_dir, "elmo_embeddings.txt.gz"), "wb") as embeddings_file:
        for i, word in enumerate(tokens):
            string_array = " ".join(str(x) for x in list(embedding_weight[i, :]))
            embeddings_file.write(f"{word} {string_array}\n".encode("utf-8"))

    _, vocab_file_name = os.path.split(vocab_path)
    with open(os.path.join(output_dir, vocab_file_name), "w") as new_vocab_file:
        for word in tokens:
            new_vocab_file.write(f"{word}\n")

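# A hypothetical invocation of main() above (the paths and batch size are
# placeholders, not from the original script). The vocabulary file is expected to
# start with the OOV token (allennlp's DEFAULT_OOV_TOKEN, "@@UNKNOWN@@") unless
# use_custom_oov_token=True; device=-1 keeps everything on the CPU.
main(
    vocab_path="vocab.txt",
    elmo_config_path="elmo_options.json",
    elmo_weights_path="elmo_weights.hdf5",
    output_dir="elmo_vocab_embeddings",
    batch_size=64,
    device=-1,
)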