def __init__(
    self,
    model_name: str,
    *,
    max_length: Optional[int] = None,
    sub_module: Optional[str] = None,
    train_parameters: bool = True,
    last_layer_only: bool = True,
    override_weights_file: Optional[str] = None,
    override_weights_strip_prefix: Optional[str] = None,
    masked_language_modeling: bool = True,
) -> None:
    """Wrap a HuggingFace transformer as an AllenNLP ``TokenEmbedder``.

    Parameters
    ----------
    model_name : HuggingFace model identifier passed to ``AutoConfig`` /
        ``AutoModelForMaskedLM`` and to ``PretrainedTransformerTokenizer``.
    max_length : stored on ``self._max_length``; semantics depend on the
        (not visible here) forward pass — presumably folding of long sequences.
    sub_module : if given, replace the loaded model with this named attribute
        of it (e.g. just the encoder).
    train_parameters : when False, freeze every transformer parameter.
    last_layer_only : when False, mix all hidden layers with a ``ScalarMix``.
    override_weights_file / override_weights_strip_prefix : forwarded to
        ``cached_transformers.get`` (non-MLM branch only).
    masked_language_modeling : selects ``AutoModelForMaskedLM`` (with hidden
        states exposed) instead of the cached plain transformer.
    """
    TokenEmbedder.__init__(self)  # Call the base class constructor
    tokenizer = PretrainedTransformerTokenizer(model_name)
    self.masked_language_modeling = masked_language_modeling
    if self.masked_language_modeling:
        self.config = AutoConfig.from_pretrained(model_name, output_hidden_states=True)
        # We only need access to the HF tokenizer if we are masked language modeling
        self.tokenizer = tokenizer.tokenizer
        # The only differences when masked language modeling are:
        # 1) `output_hidden_states` must be True to get access to token embeddings.
        # 2) We need to use `AutoModelForMaskedLM` to get the correct model
        self.transformer_model = AutoModelForMaskedLM.from_pretrained(
            model_name, config=self.config)
    # Everything after the if statement (including the else) is copied directly from:
    # https://github.com/allenai/allennlp/blob/master/allennlp/modules/token_embedders/pretrained_transformer_embedder.py
    else:
        from allennlp.common import cached_transformers
        self.transformer_model = cached_transformers.get(
            model_name, True, override_weights_file, override_weights_strip_prefix)
        self.config = self.transformer_model.config
    if sub_module:
        assert hasattr(self.transformer_model, sub_module)
        self.transformer_model = getattr(self.transformer_model, sub_module)
    self._max_length = max_length
    # I'm not sure if this works for all models; open an issue on github if you find a case
    # where it doesn't work.
    self.output_dim = self.config.hidden_size
    self._scalar_mix: Optional[ScalarMix] = None
    if not last_layer_only:
        self._scalar_mix = ScalarMix(self.config.num_hidden_layers)
        self.config.output_hidden_states = True
    # Counts of special tokens the tokenizer adds, used (elsewhere) to strip them.
    self._num_added_start_tokens = len(
        tokenizer.single_sequence_start_tokens)
    self._num_added_end_tokens = len(tokenizer.single_sequence_end_tokens)
    self._num_added_tokens = self._num_added_start_tokens + self._num_added_end_tokens
    if not train_parameters:
        # Freeze the transformer: no gradients flow into the pretrained weights.
        for param in self.transformer_model.parameters():
            param.requires_grad = False
def from_params(
    cls, vocab: Vocabulary, params: Params) -> 'BasicTextFieldEmbedder':  # type: ignore
    # pylint: disable=arguments-differ,bad-super-call
    """Build a ``BasicTextFieldEmbedder`` from configuration.

    The constructor takes its embedders as a ``Dict[str, TokenEmbedder]``
    under the 'token_embedders' key; the historical layout with the embedders
    as top-level key-value pairs is still accepted, but triggers a
    DeprecationWarning (it breaks the configuration wizard / config checks).
    """
    embedder_to_indexer_map = params.pop("embedder_to_indexer_map", None)
    if embedder_to_indexer_map is not None:
        embedder_to_indexer_map = embedder_to_indexer_map.as_dict(quiet=True)
    allow_unmatched_keys = params.pop_bool("allow_unmatched_keys", False)
    use_fp16 = params.pop_bool("use_fp16", False)

    token_embedder_params = params.pop('token_embedders', None)
    if token_embedder_params is None:
        # Deprecated layout: every remaining top-level key is an embedder spec.
        warnings.warn(
            DeprecationWarning(
                "the token embedders for BasicTextFieldEmbedder should now "
                "be specified as a dict under the 'token_embedders' key, "
                "not as top-level key-value pairs"))
        token_embedders = {}
        for name in list(params.keys()):
            token_embedders[name] = TokenEmbedder.from_params(
                vocab=vocab, params=params.pop(name))
    else:
        # Preferred layout: explicit 'token_embedders' dict.
        token_embedders = {
            name: TokenEmbedder.from_params(subparams, vocab=vocab)
            for name, subparams in token_embedder_params.items()
        }
    params.assert_empty(cls.__name__)
    return cls(use_fp16, token_embedders, embedder_to_indexer_map, allow_unmatched_keys)
def from_params(cls, vocab: Vocabulary, params: Params) -> 'BasicTextFieldEmbedder':
    """Construct one ``TokenEmbedder`` per top-level key and wrap them all."""
    token_embedders = {
        name: TokenEmbedder.from_params(vocab, params.pop(name))
        for name in list(params.keys())
    }
    params.assert_empty(cls.__name__)
    return cls(token_embedders)
def from_params(cls, vocab: Vocabulary, params: Params) -> 'BasicTextFieldEmbedder':
    """Treat every top-level key in ``params`` as a named ``TokenEmbedder`` spec."""
    token_embedders = {}
    for name in list(params.keys()):
        subparams = params.pop(name)
        token_embedders[name] = TokenEmbedder.from_params(vocab, subparams)
    params.assert_empty(cls.__name__)
    return cls(token_embedders)
def from_params(cls, vocab: Vocabulary, params: Params) -> 'BasicTextFieldEmbedder':  # type: ignore
    # pylint: disable=arguments-differ,bad-super-call
    """Build a ``BasicTextFieldEmbedder`` from ``params``.

    The constructor wants a ``Dict[str, TokenEmbedder]`` under the
    'token_embedders' key. The old format — embedder specs as top-level
    key-value pairs — broke the configuration wizard, so it is still
    supported only with a DeprecationWarning.
    """
    embedder_to_indexer_map = params.pop("embedder_to_indexer_map", None)
    if embedder_to_indexer_map is not None:
        embedder_to_indexer_map = embedder_to_indexer_map.as_dict(quiet=True)
    allow_unmatched_keys = params.pop_bool("allow_unmatched_keys", False)

    token_embedder_params = params.pop('token_embedders', None)
    if token_embedder_params is None:
        # Deprecated path: embedders given as top-level keys.
        warnings.warn(DeprecationWarning("the token embedders for BasicTextFieldEmbedder should now "
                                         "be specified as a dict under the 'token_embedders' key, "
                                         "not as top-level key-value pairs"))
        token_embedders = {}
        for name in list(params.keys()):
            token_embedders[name] = TokenEmbedder.from_params(vocab=vocab, params=params.pop(name))
    else:
        # New, explicit path.
        token_embedders = {
            name: TokenEmbedder.from_params(subparams, vocab=vocab)
            for name, subparams in token_embedder_params.items()
        }
    params.assert_empty(cls.__name__)
    return cls(token_embedders, embedder_to_indexer_map, allow_unmatched_keys)
def from_params(cls, vocab: Vocabulary, params: Params) -> 'BasicTextFieldEmbedder':
    """Build the embedder, with an optional embedder-name -> indexer mapping."""
    indexer_map = params.pop("embedder_to_indexer_map", None)
    if indexer_map is not None:
        indexer_map = indexer_map.as_dict(quiet=True)
    token_embedders = {
        name: TokenEmbedder.from_params(vocab, params.pop(name))
        for name in list(params.keys())
    }
    params.assert_empty(cls.__name__)
    return cls(token_embedders, indexer_map)
def from_params(cls, vocab: Vocabulary, params: Params) -> 'OptionalTextFieldEmbedder':
    """Build an ``OptionalTextFieldEmbedder``, skipping disabled embedders.

    Every top-level key of ``params`` is an embedder spec; a spec whose
    ``embedding_dim`` is 0 is treated as "embedder switched off" and is
    dropped entirely (its params are still popped so ``assert_empty`` passes).
    """
    token_embedders = {}
    for key in list(params.keys()):
        embedder_params = params.pop(key)
        # `.get` returns None for a missing key, and None never equals 0,
        # so the previous explicit `"embedding_dim" in ...` membership test
        # was redundant.
        if embedder_params.get("embedding_dim") == 0:
            continue  # skip 0-dimensional embedders
        token_embedders[key] = TokenEmbedder.from_params(vocab, embedder_params)
    params.assert_empty(cls.__name__)
    return cls(token_embedders)
def from_params(cls, vocab: Vocabulary, params: Params) -> 'BasicTextFieldEmbedder':  # type: ignore
    # pylint: disable=arguments-differ
    """Build the embedder; supports an optional indexer map and unmatched keys."""
    embedder_to_indexer_map = params.pop("embedder_to_indexer_map", None)
    if embedder_to_indexer_map is not None:
        embedder_to_indexer_map = embedder_to_indexer_map.as_dict(quiet=True)
    allow_unmatched_keys = params.pop_bool("allow_unmatched_keys", False)
    token_embedders = {
        name: TokenEmbedder.from_params(vocab=vocab, params=params.pop(name))
        for name in list(params.keys())
    }
    params.assert_empty(cls.__name__)
    return cls(token_embedders, embedder_to_indexer_map, allow_unmatched_keys)
def test_registry_has_builtin_token_embedders(self):
    """The default registry resolves the built-in names to their classes."""
    expected = {
        "embedding": 'Embedding',
        "character_encoding": 'TokenCharactersEncoder',
    }
    for registered_name, class_name in expected.items():
        assert TokenEmbedder.by_name(registered_name).__name__ == class_name
# Custom vocab_to_cache logic requires a from_params implementation.
@classmethod
def from_params(cls, vocab, params):  # type: ignore
    # pylint: disable=arguments-differ
    """Build an ``ElmoTokenEmbedder``; can pre-cache word embeddings for a namespace."""
    # Record both weight files in the model archive so the model is portable.
    params.add_file_to_archive(u'options_file')
    params.add_file_to_archive(u'weight_file')
    options_file = params.pop(u'options_file')
    weight_file = params.pop(u'weight_file')
    requires_grad = params.pop(u'requires_grad', False)
    do_layer_norm = params.pop_bool(u'do_layer_norm', False)
    dropout = params.pop_float(u"dropout", 0.5)
    namespace_to_cache = params.pop(u"namespace_to_cache", None)
    if namespace_to_cache is None:
        vocab_to_cache = None
    else:
        # Cache embeddings for every token of the requested vocabulary namespace.
        vocab_to_cache = list(
            vocab.get_token_to_index_vocabulary(namespace_to_cache).keys())
    projection_dim = params.pop_int(u"projection_dim", None)
    params.assert_empty(cls.__name__)
    return cls(options_file=options_file,
               weight_file=weight_file,
               do_layer_norm=do_layer_norm,
               dropout=dropout,
               requires_grad=requires_grad,
               projection_dim=projection_dim,
               vocab_to_cache=vocab_to_cache)

ElmoTokenEmbedder = TokenEmbedder.register(u"elmo_token_embedder")(
    ElmoTokenEmbedder)
def test_registry_has_builtin_token_embedders(self):
    """The registry maps the built-in names to their registered constructors."""
    # NOTE(review): "embedding" resolves to a named constructor here,
    # not the class itself — presumably registered with a constructor name.
    by_name = TokenEmbedder.by_name
    assert by_name("embedding").__name__ == "from_vocab_or_file"
    assert by_name("character_encoding").__name__ == "TokenCharactersEncoder"
def test_registry_has_builtin_token_embedders(self):
    """Built-in embedders are registered under their conventional names."""
    embedding_cls = TokenEmbedder.by_name("embedding")
    char_encoding_cls = TokenEmbedder.by_name("character_encoding")
    assert embedding_cls.__name__ == 'Embedding'
    assert char_encoding_cls.__name__ == 'TokenCharactersEncoder'
def __init__(
    self,
    model_name: str,
    *,
    max_length: Optional[int] = None,
    sub_module: Optional[str] = None,
    train_parameters: bool = True,
    last_layer_only: bool = True,
    override_weights_file: Optional[str] = None,
    override_weights_strip_prefix: Optional[str] = None,
    gradient_checkpointing: Optional[bool] = None,
    tokenizer_kwargs: Optional[Dict[str, Any]] = None,
    transformer_kwargs: Optional[Dict[str, Any]] = None,
    masked_language_modeling: bool = True,
    load_directory: Optional[str] = None
) -> None:
    """Wrap a HuggingFace transformer as an AllenNLP ``TokenEmbedder``,
    optionally restoring weights from a local checkpoint.

    Parameters
    ----------
    model_name : HuggingFace model identifier.
    max_length : stored on ``self._max_length``; used by the forward pass
        (not visible in this chunk).
    sub_module : if given, replace the loaded model with this named attribute.
    train_parameters : when False, freeze all transformer parameters.
    last_layer_only : when False, mix all hidden layers via ``ScalarMix``.
    override_weights_file / override_weights_strip_prefix : forwarded to
        ``cached_transformers.get`` (non-MLM branch only).
    gradient_checkpointing : if not None, written into the model config.
    tokenizer_kwargs / transformer_kwargs : extra kwargs for the tokenizer
        and ``from_pretrained`` respectively.
    masked_language_modeling : selects ``AutoModelForMaskedLM`` with hidden
        states exposed, instead of the cached plain transformer.
    load_directory : path to a ``torch.save``'d state dict to load
        (non-strict, filtered to matching keys) over the pretrained weights.
    """
    TokenEmbedder.__init__(self)  # Call the base class constructor
    tokenizer = PretrainedTransformerTokenizer(model_name, tokenizer_kwargs=tokenizer_kwargs)
    self.masked_language_modeling = masked_language_modeling
    if self.masked_language_modeling:
        self.config = AutoConfig.from_pretrained(model_name, output_hidden_states=True)
        # We only need access to the HF tokenizer if we are masked language modeling
        self.tokenizer = tokenizer.tokenizer
        # The only differences when masked language modeling are:
        # 1) `output_hidden_states` must be True to get access to token embeddings.
        # 2) We need to use `AutoModelForMaskedLM` to get the correct model
        self.transformer_model = AutoModelForMaskedLM.from_pretrained(
            model_name, config=self.config, **(transformer_kwargs or {})
        )
        if load_directory is not None:
            print("Loading Model from:", load_directory)
            # assumes `load_directory` is a flat state dict saved with
            # torch.save — TODO confirm it is not a training checkpoint
            # wrapping the weights under a 'state_dict' key.
            state = torch.load(load_directory)
            model_dict = self.transformer_model.state_dict()
            # Keep only checkpoint entries whose names exist in the current
            # model; extra or renamed keys are silently dropped.
            state = {k: v for k, v in state.items() if k in model_dict}
            model_dict.update(state)
            self.transformer_model.load_state_dict(model_dict, strict=False)
            print("Loading Model from:", load_directory, "...Finished.")
    # Everything after the if statement (including the else) is copied directly from:
    # https://github.com/allenai/allennlp/blob/master/allennlp/modules/token_embedders/pretrained_transformer_embedder.py
    else:
        from allennlp.common import cached_transformers
        self.transformer_model = cached_transformers.get(
            model_name, True, override_weights_file, override_weights_strip_prefix
        )
        self.config = self.transformer_model.config
    if gradient_checkpointing is not None:
        self.transformer_model.config.update({"gradient_checkpointing": gradient_checkpointing})
    if sub_module:
        assert hasattr(self.transformer_model, sub_module)
        self.transformer_model = getattr(self.transformer_model, sub_module)
    self._max_length = max_length
    # I'm not sure if this works for all models; open an issue on github if you find a case
    # where it doesn't work.
    self.output_dim = self.config.hidden_size
    self._scalar_mix: Optional[ScalarMix] = None
    if not last_layer_only:
        self._scalar_mix = ScalarMix(self.config.num_hidden_layers)
        self.config.output_hidden_states = True
    # Counts of special tokens the tokenizer adds, used (elsewhere) to strip them.
    self._num_added_start_tokens = len(tokenizer.single_sequence_start_tokens)
    self._num_added_end_tokens = len(tokenizer.single_sequence_end_tokens)
    self._num_added_tokens = self._num_added_start_tokens + self._num_added_end_tokens
    # NOTE(review): these freshly-initialized BERT submodules duplicate parts of
    # `transformer_model` and assume a BERT-style config — confirm they are
    # actually used (and trained) by the forward pass before relying on them.
    self.encoder = BertEncoder(self.config)
    self.layer = torch.nn.ModuleList([BertLayer(self.config) for _ in range(self.config.num_hidden_layers)])
    self.embeddings = BertEmbeddings(self.config)
    self.output_hidden_states = self.config.output_hidden_states
    if not train_parameters:
        # Freeze only `transformer_model`; the BERT submodules above stay trainable.
        for param in self.transformer_model.parameters():
            param.requires_grad = False
self._dropout = torch.nn.Dropout(p=dropout)
        # NOTE(review): the `if` header guarding this branch (presumably
        # `if dropout > 0:`) lies outside this chunk.
        else:
            # No-op dropout so the forward path can call it unconditionally.
            self._dropout = lambda x: x

    def get_output_dim(self):
        """Return the output dimension of the wrapped Seq2Vec encoder."""
        return self._encoder._module.get_output_dim()  # pylint: disable=protected-access

    def forward(self, token_characters):  # pylint: disable=arguments-differ
        """Embed character ids, encode per token, and apply dropout.

        Character id 0 is treated as padding and masked out.
        """
        mask = (token_characters != 0).long()
        return self._dropout(
            self._encoder(self._embedding(token_characters), mask))

    # The setdefault requires a custom from_params
    @classmethod
    def from_params(cls, vocab, params):  # type: ignore
        # pylint: disable=arguments-differ
        embedding_params = params.pop(u"embedding")
        # Embedding.from_params() uses "tokens" as the default namespace, but we need to change
        # that to be "token_characters" by default.
        embedding_params.setdefault(u"vocab_namespace", u"token_characters")
        embedding = Embedding.from_params(vocab, embedding_params)
        encoder_params = params.pop(u"encoder")
        encoder = Seq2VecEncoder.from_params(encoder_params)
        dropout = params.pop_float(u"dropout", 0.0)
        params.assert_empty(cls.__name__)
        return cls(embedding, encoder, dropout)

TokenCharactersEncoder = TokenEmbedder.register(u"character_encoding")(
    TokenCharactersEncoder)
else: weight = None return cls(num_embeddings=num_embeddings, embedding_dim=embedding_dim, projection_dim=projection_dim, weight=weight, padding_index=padding_index, trainable=trainable, max_norm=max_norm, norm_type=norm_type, scale_grad_by_freq=scale_grad_by_freq, sparse=sparse) Embedding = TokenEmbedder.register(u"embedding")(Embedding) def _read_pretrained_embeddings_file(file_uri, embedding_dim, vocab, namespace=u"tokens"): u""" Returns and embedding matrix for the given vocabulary using the pretrained embeddings contained in the given file. Embeddings for tokens not found in the pretrained embedding file are randomly initialized using a normal distribution with mean and standard deviation equal to those of the pretrained embeddings. We support two file formats: * text format - utf-8 encoded text file with space separated fields: [word] [dim 1] [dim 2] ...
num_timesteps, device=get_device_of(inputs)) + vocab_size # Combine the inputs with positional encodings batch_tensor = torch.stack( [ inputs, # (batch_size, num_timesteps) positional_encodings.expand(batch_size, num_timesteps) ], dim=-1) byte_pairs_mask = inputs != 0 # Embeddings is num_output_layers x (batch_size, num_timesteps, embedding_dim) layer_activations = self._transformer(batch_tensor) # Output of scalar_mix is (batch_size, num_timesteps, embedding_dim) mix = self._scalar_mix(layer_activations, byte_pairs_mask) # These embeddings are one per byte-pair, but we want one per original _word_. # So we choose the embedding corresponding to the last byte pair for each word, # which is captured by the ``offsets`` input. range_vector = get_range_vector(batch_size, device=get_device_of(mix)).unsqueeze(1) last_byte_pair_embeddings = mix[range_vector, offsets] return last_byte_pair_embeddings OpenaiTransformerEmbedder = TokenEmbedder.register( u"openai_transformer_embedder")(OpenaiTransformerEmbedder)