def __init__(self,
             options_file: str,
             weight_file: str,
             requires_grad: bool = False) -> None:
    super(_ElmoBiLm, self).__init__()

    self._token_embedder = _ElmoCharacterEncoder(
        options_file, weight_file, requires_grad=requires_grad)

    with open(cached_path(options_file), 'r') as fin:
        options = json.load(fin)
    if not options['lstm'].get('use_skip_connections'):
        raise ConfigurationError(
            'We only support pretrained biLMs with residual connections')
    self._elmo_lstm = ElmoLstm(
        input_size=options['lstm']['projection_dim'],
        hidden_size=options['lstm']['projection_dim'],
        cell_size=options['lstm']['dim'],
        num_layers=options['lstm']['n_layers'],
        memory_cell_clip_value=options['lstm']['cell_clip'],
        state_projection_clip_value=options['lstm']['proj_clip'],
        requires_grad=requires_grad)
    self._elmo_lstm.load_weights(weight_file)
    # Number of representation layers including context independent layer
    self.num_layers = options['lstm']['n_layers'] + 1
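# A sketch of the slice of the ELMo options JSON that the constructor above reads.
# Only the keys accessed via options['lstm'][...] are shown; the values mirror the
# commonly distributed pretrained biLM but are illustrative, not required.
options = {
    'lstm': {
        'use_skip_connections': True,   # must be truthy, or ConfigurationError is raised
        'projection_dim': 512,          # ElmoLstm input and hidden size
        'dim': 4096,                    # ElmoLstm memory cell size
        'n_layers': 2,
        'cell_clip': 3,
        'proj_clip': 3,
    },
}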
def __init__(self,
             options_file: str,
             weight_file: str,
             requires_grad: bool = False,
             vocab_to_cache: List[str] = None) -> None:
    super(_ElmoBiLm, self).__init__()

    self._token_embedder = _ElmoCharacterEncoder(
        options_file, weight_file,
        requires_grad=False if vocab_to_cache is not None else requires_grad)

    self._requires_grad = requires_grad
    if requires_grad and vocab_to_cache:
        logging.warning(
            "You are fine tuning ELMo and caching char CNN word vectors. "
            "This behaviour is not guaranteed to be well defined, particularly "
            "if not all of your inputs will occur in the vocabulary cache. "
            "_ElmoCharacterEncoder will be frozen because "
            "it is not used after word embedding caching.")
    # This is an embedding, used to look up cached
    # word vectors built from character level cnn embeddings.
    self._word_embedding = None
    self._bos_embedding: torch.Tensor = None
    self._eos_embedding: torch.Tensor = None
    if vocab_to_cache:
        logging.info("Caching character cnn layers for words in vocabulary.")
        # This sets 3 attributes, _word_embedding, _bos_embedding and _eos_embedding.
        # They are set in the method so they can be accessed from outside the
        # constructor.
        self.create_cached_cnn_embeddings(vocab_to_cache)

    with open(cached_path(options_file), 'r') as fin:
        options = json.load(fin)
    if not options['lstm'].get('use_skip_connections'):
        raise ConfigurationError(
            'We only support pretrained biLMs with residual connections')
    self._elmo_lstm = ElmoLstm(
        input_size=options['lstm']['projection_dim'],
        hidden_size=options['lstm']['projection_dim'],
        cell_size=options['lstm']['dim'],
        num_layers=options['lstm']['n_layers'],
        memory_cell_clip_value=options['lstm']['cell_clip'],
        state_projection_clip_value=options['lstm']['proj_clip'],
        requires_grad=requires_grad)
    self._elmo_lstm.load_weights(weight_file)
    # Number of representation layers including context independent layer
    self.num_layers = options['lstm']['n_layers'] + 1
def test_elmo_lstm(self):
    input_tensor = torch.rand(4, 5, 3)
    input_tensor[1, 4:, :] = 0.
    input_tensor[2, 2:, :] = 0.
    input_tensor[3, 1:, :] = 0.
    mask = torch.ones([4, 5])
    mask[1, 4:] = 0.
    mask[2, 2:] = 0.
    mask[3, 1:] = 0.

    lstm = ElmoLstm(num_layers=2,
                    input_size=3,
                    hidden_size=5,
                    cell_size=7,
                    memory_cell_clip_value=2,
                    state_projection_clip_value=1)
    output_sequence = lstm(input_tensor, mask)

    # Check all the layer outputs are masked properly.
    numpy.testing.assert_array_equal(output_sequence.data[:, 1, 4:, :].numpy(), 0.0)
    numpy.testing.assert_array_equal(output_sequence.data[:, 2, 2:, :].numpy(), 0.0)
    numpy.testing.assert_array_equal(output_sequence.data[:, 3, 1:, :].numpy(), 0.0)

    # LSTM hidden state should be (num_layers, batch_size, 2 * hidden_size),
    # since the forward and backward directions are concatenated.
    assert list(lstm._states[0].size()) == [2, 4, 10]
    # LSTM memory cell should be (num_layers, batch_size, 2 * cell_size).
    assert list(lstm._states[1].size()) == [2, 4, 14]
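# The hand-built mask in the test above encodes sequence lengths [5, 4, 2, 1] for the
# four batch entries. A minimal sketch (an illustration, not part of the test) of
# building the same mask from a lengths tensor with plain PyTorch:
import torch

lengths = torch.tensor([5, 4, 2, 1])    # valid timesteps per sequence
max_len = 5
# Compare a (1, max_len) position row against (batch, 1) lengths to get a
# (batch, max_len) mask: 1 for valid positions, 0 for padding.
mask = (torch.arange(max_len).unsqueeze(0) < lengths.unsqueeze(1)).float()
# mask[1, 4:] == 0, mask[2, 2:] == 0 and mask[3, 1:] == 0, matching the test.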
def __init__(self,
             options_file: str,
             weight_file: str,
             requires_grad: bool = False,
             vocab_to_cache: List[str] = None) -> None:
    super(_ElmoBiLm, self).__init__()

    self._token_embedder = _ElmoCharacterEncoder(options_file, weight_file,
                                                 requires_grad=requires_grad)

    self._requires_grad = requires_grad
    if requires_grad and vocab_to_cache:
        logging.warning("You are fine tuning ELMo and caching char CNN word vectors. "
                        "This behaviour is not guaranteed to be well defined, particularly "
                        "if not all of your inputs will occur in the vocabulary cache.")
    # This is an embedding, used to look up cached
    # word vectors built from character level cnn embeddings.
    self._word_embedding = None
    self._bos_embedding: torch.Tensor = None
    self._eos_embedding: torch.Tensor = None
    if vocab_to_cache:
        logging.info("Caching character cnn layers for words in vocabulary.")
        # This sets 3 attributes, _word_embedding, _bos_embedding and _eos_embedding.
        # They are set in the method so they can be accessed from outside the
        # constructor.
        self.create_cached_cnn_embeddings(vocab_to_cache)

    with open(cached_path(options_file), 'r') as fin:
        options = json.load(fin)
    if not options['lstm'].get('use_skip_connections'):
        raise ConfigurationError('We only support pretrained biLMs with residual connections')
    self._elmo_lstm = ElmoLstm(input_size=options['lstm']['projection_dim'],
                               hidden_size=options['lstm']['projection_dim'],
                               cell_size=options['lstm']['dim'],
                               num_layers=options['lstm']['n_layers'],
                               memory_cell_clip_value=options['lstm']['cell_clip'],
                               state_projection_clip_value=options['lstm']['proj_clip'],
                               requires_grad=requires_grad)
    self._elmo_lstm.load_weights(weight_file)
    # Number of representation layers including context independent layer
    self.num_layers = options['lstm']['n_layers'] + 1
def __init__(self,
             options_file: str,
             weight_file: str,
             requires_grad: bool = False) -> None:
    super(_ElmoBiLm, self).__init__()

    self._token_embedder = _ElmoCharacterEncoder(options_file, weight_file,
                                                 requires_grad=requires_grad)

    with open(cached_path(options_file), 'r') as fin:
        options = json.load(fin)
    if not options['lstm'].get('use_skip_connections'):
        raise ConfigurationError('We only support pretrained biLMs with residual connections')
    self._elmo_lstm = ElmoLstm(input_size=options['lstm']['projection_dim'],
                               hidden_size=options['lstm']['projection_dim'],
                               cell_size=options['lstm']['dim'],
                               num_layers=options['lstm']['n_layers'],
                               memory_cell_clip_value=options['lstm']['cell_clip'],
                               state_projection_clip_value=options['lstm']['proj_clip'],
                               requires_grad=requires_grad)
    self._elmo_lstm.load_weights(weight_file)
    # Number of representation layers including context independent layer
    self.num_layers = options['lstm']['n_layers'] + 1
class _ElmoBiLm(torch.nn.Module): u""" Run a pre-trained bidirectional language model, outputing the activations at each layer for weighting together into an ELMo representation (with ``allennlp.modules.seq2seq_encoders.Elmo``). This is a lower level class, useful for advanced uses, but most users should use ``allennlp.modules.seq2seq_encoders.Elmo`` directly. Parameters ---------- options_file : ``str`` ELMo JSON options file weight_file : ``str`` ELMo hdf5 weight file requires_grad: ``bool``, optional If True, compute gradient of ELMo parameters for fine tuning. vocab_to_cache : ``List[str]``, optional, (default = 0.5). A list of words to pre-compute and cache character convolutions for. If you use this option, _ElmoBiLm expects that you pass word indices of shape (batch_size, timesteps) to forward, instead of character indices. If you use this option and pass a word which wasn't pre-cached, this will break. """ def __init__(self, options_file , weight_file , requires_grad = False, vocab_to_cache = None) : super(_ElmoBiLm, self).__init__() self._token_embedder = _ElmoCharacterEncoder(options_file, weight_file, requires_grad=requires_grad) self._requires_grad = requires_grad if requires_grad and vocab_to_cache: logging.warning(u"You are fine tuning ELMo and caching char CNN word vectors. " u"This behaviour is not guaranteed to be well defined, particularly. " u"if not all of your inputs will occur in the vocabulary cache.") # This is an embedding, used to look up cached # word vectors built from character level cnn embeddings. self._word_embedding = None self._bos_embedding = None self._eos_embedding = None if vocab_to_cache: logging.info(u"Caching character cnn layers for words in vocabulary.") # This sets 3 attributes, _word_embedding, _bos_embedding and _eos_embedding. # They are set in the method so they can be accessed from outside the # constructor. self.create_cached_cnn_embeddings(vocab_to_cache) with open(cached_path(options_file), u'r') as fin: options = json.load(fin) if not options[u'lstm'].get(u'use_skip_connections'): raise ConfigurationError(u'We only support pretrained biLMs with residual connections') self._elmo_lstm = ElmoLstm(input_size=options[u'lstm'][u'projection_dim'], hidden_size=options[u'lstm'][u'projection_dim'], cell_size=options[u'lstm'][u'dim'], num_layers=options[u'lstm'][u'n_layers'], memory_cell_clip_value=options[u'lstm'][u'cell_clip'], state_projection_clip_value=options[u'lstm'][u'proj_clip'], requires_grad=requires_grad) self._elmo_lstm.load_weights(weight_file) # Number of representation layers including context independent layer self.num_layers = options[u'lstm'][u'n_layers'] + 1 def get_output_dim(self): return 2 * self._token_embedder.get_output_dim() def forward(self, # pylint: disable=arguments-differ inputs , word_inputs = None) : u""" Parameters ---------- inputs: ``torch.Tensor``, required. Shape ``(batch_size, timesteps, 50)`` of character ids representing the current batch. word_inputs : ``torch.Tensor``, required. If you passed a cached vocab, you can in addition pass a tensor of shape ``(batch_size, timesteps)``, which represent word ids which have been pre-cached. Returns ------- Dict with keys: ``'activations'``: ``List[torch.Tensor]`` A list of activations at each layer of the network, each of shape ``(batch_size, timesteps + 2, embedding_dim)`` ``'mask'``: ``torch.Tensor`` Shape ``(batch_size, timesteps + 2)`` long tensor with sequence mask. Note that the output tensors all include additional special begin and end of sequence markers. 
""" if self._word_embedding is not None and word_inputs is not None: try: mask_without_bos_eos = (word_inputs > 0).long() # The character cnn part is cached - just look it up. embedded_inputs = self._word_embedding(word_inputs) # type: ignore # shape (batch_size, timesteps + 2, embedding_dim) type_representation, mask = add_sentence_boundary_token_ids( embedded_inputs, mask_without_bos_eos, self._bos_embedding, self._eos_embedding ) except RuntimeError: # Back off to running the character convolutions, # as we might not have the words in the cache. token_embedding = self._token_embedder(inputs) mask = token_embedding[u'mask'] type_representation = token_embedding[u'token_embedding'] else: token_embedding = self._token_embedder(inputs) mask = token_embedding[u'mask'] type_representation = token_embedding[u'token_embedding'] lstm_outputs = self._elmo_lstm(type_representation, mask) # Prepare the output. The first layer is duplicated. # Because of minor differences in how masking is applied depending # on whether the char cnn layers are cached, we'll be defensive and # multiply by the mask here. It's not strictly necessary, as the # mask passed on is correct, but the values in the padded areas # of the char cnn representations can change. output_tensors = [ torch.cat([type_representation, type_representation], dim=-1) * mask.float().unsqueeze(-1) ] for layer_activations in torch.chunk(lstm_outputs, lstm_outputs.size(0), dim=0): output_tensors.append(layer_activations.squeeze(0)) return { u'activations': output_tensors, u'mask': mask, } def create_cached_cnn_embeddings(self, tokens ) : u""" Given a list of tokens, this method precomputes word representations by running just the character convolutions and highway layers of elmo, essentially creating uncontextual word vectors. On subsequent forward passes, the word ids are looked up from an embedding, rather than being computed on the fly via the CNN encoder. This function sets 3 attributes: _word_embedding : ``torch.Tensor`` The word embedding for each word in the tokens passed to this method. _bos_embedding : ``torch.Tensor`` The embedding for the BOS token. _eos_embedding : ``torch.Tensor`` The embedding for the EOS token. Parameters ---------- tokens : ``List[str]``, required. A list of tokens to precompute character convolutions for. """ tokens = [ELMoCharacterMapper.bos_token, ELMoCharacterMapper.eos_token] + tokens timesteps = 32 batch_size = 32 chunked_tokens = lazy_groups_of(iter(tokens), timesteps) all_embeddings = [] device = get_device_of(next(self.parameters())) for batch in lazy_groups_of(chunked_tokens, batch_size): # Shape (batch_size, timesteps, 50) batched_tensor = batch_to_ids(batch) # NOTE: This device check is for when a user calls this method having # already placed the model on a device. If this is called in the # constructor, it will probably happen on the CPU. This isn't too bad, # because it's only a few convolutions and will likely be very fast. if device >= 0: batched_tensor = batched_tensor.cuda(device) output = self._token_embedder(batched_tensor) token_embedding = output[u"token_embedding"] mask = output[u"mask"] token_embedding, _ = remove_sentence_boundaries(token_embedding, mask) all_embeddings.append(token_embedding.view(-1, token_embedding.size(-1))) full_embedding = torch.cat(all_embeddings, 0) # We might have some trailing embeddings from padding in the batch, so # we clip the embedding and lookup to the right size. 
        full_embedding = full_embedding[:len(tokens), :]
        embedding = full_embedding[2:len(tokens), :]
        vocab_size, embedding_dim = list(embedding.size())

        from allennlp.modules.token_embedders import Embedding  # type: ignore
        self._bos_embedding = full_embedding[0, :]
        self._eos_embedding = full_embedding[1, :]
        self._word_embedding = Embedding(vocab_size,  # type: ignore
                                         embedding_dim,
                                         weight=embedding.data,
                                         trainable=self._requires_grad,
                                         padding_index=0)
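# A minimal usage sketch for the class above, assuming locally downloaded option and
# weight files (the paths below are placeholders, not real files): character ids go
# in, a list of per-layer activations plus a mask comes out.
from allennlp.modules.elmo import batch_to_ids

options_file = 'elmo_options.json'   # hypothetical local path
weight_file = 'elmo_weights.hdf5'    # hypothetical local path

bilm = _ElmoBiLm(options_file, weight_file)
# batch_to_ids turns tokenized sentences into the (batch_size, timesteps, 50)
# character-id tensor expected by forward().
character_ids = batch_to_ids([['I', 'saw', 'a', 'dog'], ['It', 'ran']])
output = bilm(character_ids)
activations = output['activations']  # num_layers tensors, each (batch_size, timesteps + 2, embedding_dim)
mask = output['mask']                # (batch_size, timesteps + 2)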
class ElmoBilmDebias(torch.nn.Module):
    """
    A customized version of the ELMo biLM with debiasing applied at the first
    (context-independent) layer.
    """
    def __init__(self,
                 options_file: str,
                 weight_file: str,
                 requires_grad: bool = False,
                 vocab_to_cache: List[str] = None) -> None:
        super(ElmoBilmDebias, self).__init__()

        self._token_embedder = _ElmoCharacterEncoder(
            options_file, weight_file, requires_grad=requires_grad)

        self._requires_grad = requires_grad
        if requires_grad and vocab_to_cache:
            logging.warning(
                "You are fine tuning ELMo and caching char CNN word vectors. "
                "This behaviour is not guaranteed to be well defined, particularly "
                "if not all of your inputs will occur in the vocabulary cache.")
        # This is an embedding, used to look up cached
        # word vectors built from character level cnn embeddings.
        self._word_embedding = None
        self._bos_embedding: torch.Tensor = None
        self._eos_embedding: torch.Tensor = None
        if vocab_to_cache:
            logging.info("Caching character cnn layers for words in vocabulary.")
            # This sets 3 attributes, _word_embedding, _bos_embedding and _eos_embedding.
            # They are set in the method so they can be accessed from outside the
            # constructor.
            self.create_cached_cnn_embeddings(vocab_to_cache)

        with open(cached_path(options_file), 'r') as fin:
            options = json.load(fin)
        if not options['lstm'].get('use_skip_connections'):
            raise ConfigurationError(
                'We only support pretrained biLMs with residual connections')
        self._elmo_lstm = ElmoLstm(
            input_size=options['lstm']['projection_dim'],
            hidden_size=options['lstm']['projection_dim'],
            cell_size=options['lstm']['dim'],
            num_layers=options['lstm']['n_layers'],
            memory_cell_clip_value=options['lstm']['cell_clip'],
            state_projection_clip_value=options['lstm']['proj_clip'],
            requires_grad=requires_grad)
        self._elmo_lstm.load_weights(weight_file)
        # Number of representation layers including context independent layer
        self.num_layers = options['lstm']['n_layers'] + 1

    def get_output_dim(self):
        return 2 * self._token_embedder.get_output_dim()

    def forward(self,  # pylint: disable=arguments-differ
                inputs: torch.Tensor,
                bias: torch.Tensor = None,
                num_bias: int = 1,
                contraction: (torch.Tensor, torch.Tensor) = None,
                word_inputs: torch.Tensor = None) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]:
        """
        Parameters
        ----------
        inputs : ``torch.Tensor``, required.
            Shape ``(batch_size, timesteps, 50)`` of character ids representing the
            current batch.
        word_inputs : ``torch.Tensor``, optional.
            If you passed a cached vocab, you can in addition pass a tensor of shape
            ``(batch_size, timesteps)``, which represent word ids which have been pre-cached.

        Returns
        -------
        Dict with keys:

        ``'activations'`` : ``List[torch.Tensor]``
            A list of activations at each layer of the network, each of shape
            ``(batch_size, timesteps + 2, embedding_dim)``
        ``'mask'`` : ``torch.Tensor``
            Shape ``(batch_size, timesteps + 2)`` long tensor with sequence mask.

        Note that the output tensors all include additional special begin and end of sequence
        markers.
        """
        if self._word_embedding is not None and word_inputs is not None:
            try:
                mask_without_bos_eos = (word_inputs > 0).long()
                # The character cnn part is cached - just look it up.
                embedded_inputs = self._word_embedding(word_inputs)  # type: ignore
                # shape (batch_size, timesteps + 2, embedding_dim)
                type_representation, mask = add_sentence_boundary_token_ids(
                    embedded_inputs,
                    mask_without_bos_eos,
                    self._bos_embedding,
                    self._eos_embedding)
            except RuntimeError:
                # Back off to running the character convolutions,
                # as we might not have the words in the cache.
                token_embedding = self._token_embedder(inputs)
                mask = token_embedding['mask']
                type_representation = token_embedding['token_embedding']
        else:
            token_embedding = self._token_embedder(inputs)
            mask = token_embedding['mask']
            type_representation = token_embedding['token_embedding']

        # debiasing the input embeddings
        # 1. take out the boundaries, i.e. len - 2
        batch_l, seq_l, elmo_size = type_representation.shape
        l0 = type_representation[:, 1:-1, :]

        # 2. debiasing
        if bias is not None:
            if num_bias == 1:
                bias = bias.expand(batch_l, 1, elmo_size)
                proj = l0.bmm(bias.transpose(1, 2))
                l0 = l0 - (proj * bias)
            elif num_bias == 2:
                bias = bias.expand(batch_l, 2, elmo_size)
                bias1 = bias[:, 0:1, :]
                bias2 = bias[:, 1:2, :]
                proj1 = l0.bmm(bias1.transpose(1, 2))
                proj2 = l0.bmm(bias2.transpose(1, 2))
                l0 = l0 - (proj1 * bias1) - (proj2 * bias2)
            else:
                raise Exception('unrecognized num_bias: {0}'.format(num_bias))

        # 3. contraction
        if contraction is not None:
            if not hasattr(self, 'contract_U'):
                v1 = contraction[0].view(-1, elmo_size).cpu().numpy()
                v2 = contraction[1].view(-1, elmo_size).cpu().numpy()
                v1, v2 = maxSpan(v1, v2)
                U = np.identity(elmo_size)
                U = gsConstrained(U, v1, basis(np.vstack((v1, v2))))
                self.contract_v1 = torch.from_numpy(v1).view(1, 1, elmo_size)
                self.contract_v2 = torch.from_numpy(v2).view(1, 1, elmo_size)
                self.contract_U = torch.from_numpy(U).view(1, elmo_size, elmo_size).float()

                gpuid = contraction[0].get_device()
                if gpuid != -1:
                    self.contract_v1 = self.contract_v1.cuda(gpuid)
                    self.contract_v2 = self.contract_v2.cuda(gpuid)
                    self.contract_U = self.contract_U.cuda(gpuid)

            opt = Holder()
            opt.gpuid = contraction[0].get_device()
            l0 = correction(opt, self.contract_U, self.contract_v1, self.contract_v2,
                            l0.contiguous())

        # 4. reconcat with boundaries
        type_representation = torch.cat([
            type_representation[:, 0:1, :],
            l0,
            type_representation[:, -1:, :]
        ], 1)

        # continue the lm
        lstm_outputs = self._elmo_lstm(type_representation, mask)

        # Prepare the output. The first layer is duplicated.
        # Because of minor differences in how masking is applied depending
        # on whether the char cnn layers are cached, we'll be defensive and
        # multiply by the mask here. It's not strictly necessary, as the
        # mask passed on is correct, but the values in the padded areas
        # of the char cnn representations can change.
        output_tensors = [
            torch.cat([type_representation, type_representation], dim=-1)
            * mask.float().unsqueeze(-1)
        ]
        for layer_activations in torch.chunk(lstm_outputs, lstm_outputs.size(0), dim=0):
            output_tensors.append(layer_activations.squeeze(0))

        return {
            'activations': output_tensors,
            'mask': mask,
        }
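# A toy, self-contained sketch of the num_bias == 1 branch above: removing the
# component of each token vector that lies along a single bias direction. The
# shapes mirror what forward() expects; the unit-norm assumption on bias is mine
# (the projection subtraction only removes the full component for a unit vector),
# and the numbers are made up.
import torch

batch_l, seq_l, elmo_size = 2, 3, 4
l0 = torch.randn(batch_l, seq_l, elmo_size)

bias = torch.randn(1, 1, elmo_size)
bias = bias / bias.norm()                  # assumed unit-norm bias direction
bias = bias.expand(batch_l, 1, elmo_size)

proj = l0.bmm(bias.transpose(1, 2))        # (batch_l, seq_l, 1): <token, bias>
debiased = l0 - proj * bias                # subtract the component along bias

# After removal, every token is (numerically) orthogonal to the bias direction.
assert torch.allclose(debiased.bmm(bias.transpose(1, 2)),
                      torch.zeros(batch_l, seq_l, 1), atol=1e-5)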
class _ElmoBiLm(torch.nn.Module): """ Run a pre-trained bidirectional language model, outputing the activations at each layer for weighting together into an ELMo representation (with ``allennlp.modules.seq2seq_encoders.Elmo``). This is a lower level class, useful for advanced uses, but most users should use ``allennlp.modules.seq2seq_encoders.Elmo`` directly. Parameters ---------- options_file : ``str`` ELMo JSON options file weight_file : ``str`` ELMo hdf5 weight file requires_grad: ``bool``, optional If True, compute gradient of ELMo parameters for fine tuning. """ def __init__(self, options_file: str, weight_file: str, requires_grad: bool = False) -> None: super(_ElmoBiLm, self).__init__() self._token_embedder = _ElmoCharacterEncoder(options_file, weight_file, requires_grad=requires_grad) with open(cached_path(options_file), 'r') as fin: options = json.load(fin) if not options['lstm'].get('use_skip_connections'): raise ConfigurationError('We only support pretrained biLMs with residual connections') self._elmo_lstm = ElmoLstm(input_size=options['lstm']['projection_dim'], hidden_size=options['lstm']['projection_dim'], cell_size=options['lstm']['dim'], num_layers=options['lstm']['n_layers'], memory_cell_clip_value=options['lstm']['cell_clip'], state_projection_clip_value=options['lstm']['proj_clip'], requires_grad=requires_grad) self._elmo_lstm.load_weights(weight_file) # Number of representation layers including context independent layer self.num_layers = options['lstm']['n_layers'] + 1 def forward(self, # pylint: disable=arguments-differ inputs: torch.Tensor) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]: """ Parameters ---------- inputs: ``torch.autograd.Variable`` Shape ``(batch_size, timesteps, 50)`` of character ids representing the current batch. Returns ------- Dict with keys: ``'activations'``: ``List[torch.autograd.Variable]`` A list of activations at each layer of the network, each of shape ``(batch_size, timesteps + 2, embedding_dim)`` ``'mask'``: ``torch.autograd.Variable`` Shape ``(batch_size, timesteps + 2)`` long tensor with sequence mask. Note that the output tensors all include additional special begin and end of sequence markers. """ token_embedding = self._token_embedder(inputs) type_representation = token_embedding['token_embedding'] mask = token_embedding['mask'] lstm_outputs = self._elmo_lstm(type_representation, mask) # Prepare the output. The first layer is duplicated. output_tensors = [ torch.cat([type_representation, type_representation], dim=-1) ] for layer_activations in torch.chunk(lstm_outputs, lstm_outputs.size(0), dim=0): output_tensors.append(layer_activations.squeeze(0)) return { 'activations': output_tensors, 'mask': mask, }
class _ElmoBiLm(torch.nn.Module): """ Run a pre-trained bidirectional language model, outputing the activations at each layer for weighting together into an ELMo representation (with ``allennlp.modules.seq2seq_encoders.Elmo``). This is a lower level class, useful for advanced uses, but most users should use ``allennlp.modules.seq2seq_encoders.Elmo`` directly. Parameters ---------- options_file : ``str`` ELMo JSON options file weight_file : ``str`` ELMo hdf5 weight file requires_grad: ``bool``, optional If True, compute gradient of ELMo parameters for fine tuning. vocab_to_cache : ``List[str]``, optional, (default = 0.5). A list of words to pre-compute and cache character convolutions for. If you use this option, _ElmoBiLm expects that you pass word indices of shape (batch_size, timesteps) to forward, instead of character indices. If you use this option and pass a word which wasn't pre-cached, this will break. """ def __init__(self, options_file: str, weight_file: str, requires_grad: bool = False, vocab_to_cache: List[str] = None) -> None: super(_ElmoBiLm, self).__init__() self._token_embedder = _ElmoCharacterEncoder(options_file, weight_file, requires_grad=requires_grad) self._requires_grad = requires_grad if requires_grad and vocab_to_cache: logging.warning("You are fine tuning ELMo and caching char CNN word vectors. " "This behaviour is not guaranteed to be well defined, particularly. " "if not all of your inputs will occur in the vocabulary cache.") # This is an embedding, used to look up cached # word vectors built from character level cnn embeddings. self._word_embedding = None self._bos_embedding: torch.Tensor = None self._eos_embedding: torch.Tensor = None if vocab_to_cache: logging.info("Caching character cnn layers for words in vocabulary.") # This sets 3 attributes, _word_embedding, _bos_embedding and _eos_embedding. # They are set in the method so they can be accessed from outside the # constructor. self.create_cached_cnn_embeddings(vocab_to_cache) with open(cached_path(options_file), 'r') as fin: options = json.load(fin) if not options['lstm'].get('use_skip_connections'): raise ConfigurationError('We only support pretrained biLMs with residual connections') self._elmo_lstm = ElmoLstm(input_size=options['lstm']['projection_dim'], hidden_size=options['lstm']['projection_dim'], cell_size=options['lstm']['dim'], num_layers=options['lstm']['n_layers'], memory_cell_clip_value=options['lstm']['cell_clip'], state_projection_clip_value=options['lstm']['proj_clip'], requires_grad=requires_grad) self._elmo_lstm.load_weights(weight_file) # Number of representation layers including context independent layer self.num_layers = options['lstm']['n_layers'] + 1 def get_output_dim(self): return 2 * self._token_embedder.get_output_dim() def forward(self, # pylint: disable=arguments-differ inputs: torch.Tensor, word_inputs: torch.Tensor = None) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]: """ Parameters ---------- inputs: ``torch.Tensor``, required. Shape ``(batch_size, timesteps, 50)`` of character ids representing the current batch. word_inputs : ``torch.Tensor``, required. If you passed a cached vocab, you can in addition pass a tensor of shape ``(batch_size, timesteps)``, which represent word ids which have been pre-cached. 
        Returns
        -------
        Dict with keys:

        ``'activations'`` : ``List[torch.Tensor]``
            A list of activations at each layer of the network, each of shape
            ``(batch_size, timesteps + 2, embedding_dim)``
        ``'mask'`` : ``torch.Tensor``
            Shape ``(batch_size, timesteps + 2)`` long tensor with sequence mask.

        Note that the output tensors all include additional special begin and end of sequence
        markers.
        """
        if self._word_embedding is not None and word_inputs is not None:
            try:
                mask_without_bos_eos = (word_inputs > 0).long()
                # The character cnn part is cached - just look it up.
                embedded_inputs = self._word_embedding(word_inputs)  # type: ignore
                # shape (batch_size, timesteps + 2, embedding_dim)
                type_representation, mask = add_sentence_boundary_token_ids(
                        embedded_inputs,
                        mask_without_bos_eos,
                        self._bos_embedding,
                        self._eos_embedding
                )
            except RuntimeError:
                # Back off to running the character convolutions,
                # as we might not have the words in the cache.
                token_embedding = self._token_embedder(inputs)
                mask = token_embedding['mask']
                type_representation = token_embedding['token_embedding']
        else:
            token_embedding = self._token_embedder(inputs)
            mask = token_embedding['mask']
            type_representation = token_embedding['token_embedding']
        lstm_outputs = self._elmo_lstm(type_representation, mask)

        # Prepare the output. The first layer is duplicated.
        # Because of minor differences in how masking is applied depending
        # on whether the char cnn layers are cached, we'll be defensive and
        # multiply by the mask here. It's not strictly necessary, as the
        # mask passed on is correct, but the values in the padded areas
        # of the char cnn representations can change.
        output_tensors = [
                torch.cat([type_representation, type_representation], dim=-1)
                * mask.float().unsqueeze(-1)
        ]
        for layer_activations in torch.chunk(lstm_outputs, lstm_outputs.size(0), dim=0):
            output_tensors.append(layer_activations.squeeze(0))

        return {
                'activations': output_tensors,
                'mask': mask,
        }

    def create_cached_cnn_embeddings(self, tokens: List[str]) -> None:
        """
        Given a list of tokens, this method precomputes word representations
        by running just the character convolutions and highway layers of elmo,
        essentially creating uncontextual word vectors. On subsequent forward passes,
        the word ids are looked up from an embedding, rather than being computed on
        the fly via the CNN encoder.

        This function sets 3 attributes:

        _word_embedding : ``torch.Tensor``
            The word embedding for each word in the tokens passed to this method.
        _bos_embedding : ``torch.Tensor``
            The embedding for the BOS token.
        _eos_embedding : ``torch.Tensor``
            The embedding for the EOS token.

        Parameters
        ----------
        tokens : ``List[str]``, required.
            A list of tokens to precompute character convolutions for.
        """
        tokens = [ELMoCharacterMapper.bos_token, ELMoCharacterMapper.eos_token] + tokens
        timesteps = 32
        batch_size = 32
        chunked_tokens = lazy_groups_of(iter(tokens), timesteps)

        all_embeddings = []
        device = get_device_of(next(self.parameters()))
        for batch in lazy_groups_of(chunked_tokens, batch_size):
            # Shape (batch_size, timesteps, 50)
            batched_tensor = batch_to_ids(batch)
            # NOTE: This device check is for when a user calls this method having
            # already placed the model on a device. If this is called in the
            # constructor, it will probably happen on the CPU. This isn't too bad,
            # because it's only a few convolutions and will likely be very fast.
            if device >= 0:
                batched_tensor = batched_tensor.cuda(device)
            output = self._token_embedder(batched_tensor)
            token_embedding = output["token_embedding"]
            mask = output["mask"]
            token_embedding, _ = remove_sentence_boundaries(token_embedding, mask)
            all_embeddings.append(token_embedding.view(-1, token_embedding.size(-1)))
        full_embedding = torch.cat(all_embeddings, 0)

        # We might have some trailing embeddings from padding in the batch, so
        # we clip the embedding and lookup to the right size.
        full_embedding = full_embedding[:len(tokens), :]
        embedding = full_embedding[2:len(tokens), :]
        vocab_size, embedding_dim = list(embedding.size())

        from allennlp.modules.token_embedders import Embedding  # type: ignore
        self._bos_embedding = full_embedding[0, :]
        self._eos_embedding = full_embedding[1, :]
        self._word_embedding = Embedding(vocab_size,  # type: ignore
                                         embedding_dim,
                                         weight=embedding.data,
                                         trainable=self._requires_grad,
                                         padding_index=0)
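# The chunking in create_cached_cnn_embeddings above is easy to misread: the
# vocabulary is first grouped into pseudo-"sentences" of `timesteps` words, and
# those groups are then batched `batch_size` at a time before going through the
# char CNN. A tiny self-contained stand-in for lazy_groups_of, shown only to make
# the resulting shapes concrete:
from itertools import islice

def groups_of(iterator, size):
    iterator = iter(iterator)
    while True:
        group = list(islice(iterator, size))
        if not group:
            return
        yield group

vocab = ['<S>', '</S>'] + ['word%d' % i for i in range(70)]   # 72 tokens in total
chunks = list(groups_of(vocab, 32))      # 3 pseudo-sentences: 32 + 32 + 8 words
batches = list(groups_of(chunks, 32))    # 1 batch holding those 3 pseudo-sentences
# batch_to_ids(batches[0]) is then shaped (3, 32, 50); the trailing rows produced
# by padding the short last chunk are what full_embedding[:len(tokens), :] clips off.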
def __init__(self, conf: Dict, word_batch: WordBatch, char_batch: CharacterBatch):
    super(BiLMBase, self).__init__()
    self.conf = conf

    c = conf['token_embedder']
    if word_batch is not None:
        if 'pretrained' in c:
            embs = load_embedding_txt(c['pretrained'], c['has_header'])
            logger.info('loaded {0} embedding entries.'.format(len(embs[0])))
        else:
            embs = None
        word_embedder = Embeddings(c['word_dim'], word_batch.mapping, embs=embs,
                                   fix_emb=False, normalize=False)
    else:
        word_embedder = None

    if char_batch is not None:
        dim = c.get('char_dim') if c.get('char_dim', 0) > 0 else c.get('wordpiece_dim')
        char_embedder = Embeddings(dim, char_batch.mapping, embs=None,
                                   fix_emb=False, normalize=False)
    else:
        char_embedder = None

    token_embedder_name = c['name'].lower()
    if token_embedder_name == 'cnn':
        self.token_embedder = ConvTokenEmbedder(output_dim=conf['encoder']['projection_dim'],
                                                word_embedder=word_embedder,
                                                char_embedder=char_embedder,
                                                filters=c['filters'],
                                                n_highway=c['n_highway'],
                                                activation=c['activation'])
    elif token_embedder_name == 'lstm':
        self.token_embedder = LstmTokenEmbedder(output_dim=conf['encoder']['projection_dim'],
                                                word_embedder=word_embedder,
                                                char_embedder=char_embedder,
                                                dropout=conf['dropout'])
    elif token_embedder_name == 'grecnn':
        self.token_embedder = GatedRecNNTokenEmbedder(output_dim=conf['encoder']['projection_dim'],
                                                      word_embedder=word_embedder,
                                                      char_embedder=char_embedder)
    elif token_embedder_name == 'sum':
        self.token_embedder = SumTokenEmbedder(output_dim=conf['encoder']['projection_dim'],
                                               word_embedder=word_embedder,
                                               char_embedder=char_embedder)
    else:
        raise ValueError('Unknown token embedder name: {}'.format(token_embedder_name))

    self.add_sentence_boundary = c.get('add_sentence_boundary', False)
    self.add_sentence_boundary_ids = c.get('add_sentence_boundary_ids', False)
    assert not (self.add_sentence_boundary and self.add_sentence_boundary_ids)

    if self.add_sentence_boundary:
        dim = self.token_embedder.get_output_dim()
        self.bos_embeddings = torch.nn.Parameter(torch.randn(dim) / math.sqrt(dim))
        self.eos_embeddings = torch.nn.Parameter(torch.randn(dim) / math.sqrt(dim))

    c = conf['encoder']
    encoder_name = c['name'].lower()
    if encoder_name == 'elmo':
        # NOTE: for fair comparison, we set stateful to false
        self.encoder = ElmoLstm(input_size=c['projection_dim'],
                                hidden_size=c['projection_dim'],
                                cell_size=c['dim'],
                                requires_grad=True,
                                num_layers=c['n_layers'],
                                recurrent_dropout_probability=conf['dropout'],
                                memory_cell_clip_value=c['cell_clip'],
                                state_projection_clip_value=c['proj_clip'],
                                stateful=False)
    elif encoder_name == 'lstm':
        self.encoder = LstmbiLm(input_size=c['projection_dim'],
                                hidden_size=c['projection_dim'],
                                num_layers=c['n_layers'],
                                dropout=conf['dropout'])
    elif encoder_name == 'bengio03highway':
        self.encoder = Bengio03HighwayBiLm(width=c['width'],
                                           input_size=c['projection_dim'],
                                           hidden_size=c['projection_dim'],
                                           n_layers=c['n_layers'],
                                           n_highway=c['n_highway'],
                                           use_position=c.get('position', False),
                                           dropout=conf['dropout'])
    elif encoder_name == 'bengio03highway_v2':
        self.encoder = Bengio03HighwayBiLmV2(width=c['width'],
                                             input_size=c['projection_dim'],
                                             hidden_size=c['projection_dim'],
                                             n_layers=c['n_layers'],
                                             n_highway=c['n_highway'],
                                             use_position=c.get('position', False),
                                             dropout=conf['dropout'])
    elif encoder_name == 'bengio03resnet':
        self.encoder = Bengio03ResNetBiLm(width=c['width'],
                                          input_size=c['projection_dim'],
                                          hidden_size=c['projection_dim'],
                                          n_layers=c['n_layers'],
                                          use_position=c.get('position', False),
                                          dropout=conf['dropout'])
    elif encoder_name == 'lblhighway':
        self.encoder = LBLHighwayBiLm(width=c['width'],
                                      input_size=c['projection_dim'],
                                      hidden_size=c['projection_dim'],
                                      n_layers=c['n_layers'],
                                      n_highway=c['n_highway'],
                                      use_position=c.get('position', False),
                                      dropout=conf['dropout'])
    elif encoder_name == 'lblhighway_v2':
        self.encoder = LBLHighwayBiLmV2(width=c['width'],
                                        input_size=c['projection_dim'],
                                        hidden_size=c['projection_dim'],
                                        n_layers=c['n_layers'],
                                        n_highway=c['n_highway'],
                                        use_position=c.get('position', False),
                                        dropout=conf['dropout'])
    elif encoder_name == 'lblresnet':
        self.encoder = LBLResNetBiLm(width=c['width'],
                                     input_size=c['projection_dim'],
                                     hidden_size=c['projection_dim'],
                                     n_layers=c['n_layers'],
                                     use_position=c.get('position', False),
                                     dropout=conf['dropout'])
    elif encoder_name == 'selfattn':
        self.encoder = SelfAttentiveLBLBiLM(width=c['width'],
                                            input_size=c['projection_dim'],
                                            hidden_size=c['projection_dim'],
                                            n_heads=c['n_heads'],
                                            n_layers=c['n_layers'],
                                            n_highway=c['n_highway'],
                                            use_position=c.get('position', False),
                                            use_relative_position=c.get('relative_position_weights', False),
                                            dropout=conf['dropout'])
    elif encoder_name == 'selfattn_v2':
        self.encoder = SelfAttentiveLBLBiLMV2(width=c['width'],
                                              input_size=c['projection_dim'],
                                              hidden_size=c['projection_dim'],
                                              n_heads=c['n_heads'],
                                              n_layers=c['n_layers'],
                                              n_highway=c['n_highway'],
                                              use_position=c.get('position', False),
                                              use_relative_position=c.get('relative_position_weights', False),
                                              dropout=conf['dropout'])
    elif encoder_name == 'selfattn_v3':
        self.encoder = SelfAttentiveLBLBiLMV3(width=c['width'],
                                              input_size=c['projection_dim'],
                                              hidden_size=c['projection_dim'],
                                              n_heads=c['n_heads'],
                                              n_layers=c['n_layers'],
                                              n_highway=c['n_highway'],
                                              use_position=c.get('position', False),
                                              use_relative_position=c.get('relative_position_weights', False),
                                              dropout=conf['dropout'])
    elif encoder_name == 'cnn':
        self.encoder = GatedCnnLm(input_size=c['projection_dim'],
                                  layers=c['layers'],
                                  dropout=conf['dropout'])
    else:
        raise ValueError('Unknown encoder name: {}'.format(encoder_name))

    self.output_dim = conf['encoder']['projection_dim']
    self.token_embedding_time = 0
    self.encoding_time = 0
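# A sketch of the minimal `conf` dictionary this constructor reads, assembled only
# from the keys accessed above. Values are illustrative guesses, not recommended
# settings, and the filters format in particular is an assumption; fields not
# touched by __init__ are omitted.
conf = {
    'token_embedder': {
        'name': 'cnn',                    # or 'lstm', 'grecnn', 'sum'
        'word_dim': 100,
        'char_dim': 50,
        'filters': [[1, 32], [2, 32], [3, 64]],   # assumed [width, num_filters] pairs
        'n_highway': 2,
        'activation': 'relu',
        # 'pretrained': 'embeddings.txt', 'has_header': True,   # optional pretrained word vectors
        'add_sentence_boundary': False,
        'add_sentence_boundary_ids': False,
    },
    'encoder': {
        'name': 'elmo',                   # or 'lstm', 'bengio03highway', 'selfattn', 'cnn', ...
        'projection_dim': 512,
        'dim': 2048,                      # ElmoLstm cell size
        'n_layers': 2,
        'cell_clip': 3,
        'proj_clip': 3,
    },
    'dropout': 0.1,
}
# model = BiLMBase(conf, word_batch, char_batch)   # word_batch / char_batch come from the caller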