def __init__(self,
             archive_file: str,
             dropout: float = None,
             bos_eos_tokens: Tuple[str, str] = ("<S>", "</S>"),
             remove_bos_eos: bool = True,
             requires_grad: bool = False) -> None:
    super().__init__()

    overrides = {"model": {"contextualizer": {"return_all_layers": True}}}

    # Import here to avoid circular dependency.
    from allennlp.models.archival import load_archive

    # Load LM and the associated config.
    archive = load_archive(archive_file, overrides=json.dumps(overrides))
    self._lm: LanguageModel = archive.model
    self._lm.delete_softmax()
    config = archive.config
    dict_config = config.as_dict(quiet=True)

    # Extract the name of the tokens that the LM was trained on.
    text_field_embedder = dict_config["model"]["text_field_embedder"]
    token_names = list(text_field_embedder["token_embedders"].keys())
    if len(token_names) != 1:
        # We don't currently support embedding with language models trained with multiple
        # embedded indices.
        #
        # Note: We only care about embedded indices. This does not include "tokens" which
        # is just used to compute the loss in LanguageModel.
        raise ConfigurationError(f"LM from {archive_file} trained with multiple embedders!")
    if "embedder_to_indexer_map" in text_field_embedder:
        # Similarly we don't support multiple indexers per embedder.
        raise ConfigurationError(f"LM from {archive_file} trained with embedder_to_indexer_map!")
    self._token_name = token_names[0]

    # TODO(brendanr): Find a way to remove this hack. The issue fundamentally is that the
    # BasicTextFieldEmbedder concatenates multiple embedded representations. When a
    # downstream model uses both, tokens and token characters, say, and only adds bos/eos
    # tokens to the token characters, the dimensions don't match. See:
    # https://github.com/allenai/allennlp/blob/eff25a3085aa9976a7650d30d8961c3626ddc411/allennlp/modules/text_field_embedders/basic_text_field_embedder.py#L109
    #
    # For the equivalent hack in the ELMo embedder see:
    # https://github.com/allenai/allennlp/blob/eff25a3085aa9976a7650d30d8961c3626ddc411/allennlp/modules/elmo.py#L590
    if bos_eos_tokens:
        dataset_reader_config = config.get("dataset_reader")
        if dataset_reader_config.get("type") == "multiprocess":
            dataset_reader_config = dataset_reader_config.get("base_reader")
        token_indexer_config = dataset_reader_config.get("token_indexers").get(self._token_name)
        token_indexer: TokenIndexer = TokenIndexer.from_params(token_indexer_config)
        token_list = [Token(token) for token in bos_eos_tokens]
        # TODO(brendanr): Obtain these indices from the vocab once the
        # ELMoTokenCharactersIndexer adds the mappings.
        bos_eos_indices = token_indexer.tokens_to_indices(token_list, self._lm.vocab, "key")["key"]
        self._bos_indices = torch.Tensor(bos_eos_indices[0])
        self._eos_indices = torch.Tensor(bos_eos_indices[1])
    else:
        self._bos_indices = None
        self._eos_indices = None

    if dropout:
        self._dropout = torch.nn.Dropout(dropout)
    else:
        self._dropout = lambda x: x

    self._remove_bos_eos = remove_bos_eos
    num_layers = self._lm.num_layers()
    # TODO(brendanr): Consider passing our LM as a custom module to `Elmo` instead.
    # See https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo.py#L76
    self._scalar_mix = ScalarMix(mixture_size=num_layers, do_layer_norm=False, trainable=True)

    # pylint: disable=protected-access
    character_dim = self._lm._text_field_embedder.get_output_dim()
    contextual_dim = self._lm._contextualizer.get_output_dim()

    if contextual_dim % character_dim != 0:
        raise ConfigurationError(
            "The output dimensions for the text_field_embedder " +
            f"({character_dim}) and the contextualizer ({contextual_dim})" +
            f" from the language model loaded from {archive_file} are " +
            "not compatible. Please check the config used to train that " +
            "model and ensure that the output dimension of the " +
            "text_field_embedder divides the output dimension of the " +
            "contextualizer.")
    self._character_embedding_duplication_count = contextual_dim // character_dim

    for param in self._lm.parameters():
        param.requires_grad = requires_grad
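# --- Illustrative sketch (not part of the original module) ---
# The constructor above only builds the ScalarMix; the mixing itself happens later, in the
# forward pass. A minimal, hedged sketch of how a ScalarMix combines per-layer activations,
# assuming `layer_activations` is a list of `num_layers` tensors of identical shape
# (batch, timesteps, dim) such as those returned by the contextualizer:
import torch
from allennlp.modules.scalar_mix import ScalarMix

num_layers, batch, timesteps, dim = 3, 2, 5, 16
layer_activations = [torch.randn(batch, timesteps, dim) for _ in range(num_layers)]

scalar_mix = ScalarMix(mixture_size=num_layers, do_layer_norm=False, trainable=True)
# Softmax-normalized learned weights over layers, followed by a learned scalar (gamma).
mixed = scalar_mix(layer_activations)
assert mixed.shape == (batch, timesteps, dim)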
def __init__(self, transformer: OpenaiTransformer) -> None:
    super().__init__()

    self._transformer = transformer
    self._scalar_mix = ScalarMix(transformer.num_output_layers, do_layer_norm=False)
def __init__(self,
             vocab: Vocabulary,
             pretrained_model: str = None,
             requires_grad: bool = True,
             top_layer_only: bool = True,
             bert_weights_model: str = None,
             per_choice_loss: bool = False,
             layer_freeze_regexes: List[str] = None,
             regularizer: Optional[RegularizerApplicator] = None,
             use_comparative_bert: bool = True,
             use_bilinear_classifier: bool = False,
             train_comparison_layer: bool = False,
             number_of_choices_compared: int = 0,
             comparison_layer_hidden_size: int = -1,
             comparison_layer_use_relu: bool = True) -> None:
    super().__init__(vocab, regularizer)

    self._use_comparative_bert = use_comparative_bert
    self._use_bilinear_classifier = use_bilinear_classifier
    self._train_comparison_layer = train_comparison_layer
    if train_comparison_layer:
        assert number_of_choices_compared > 1
        self._num_choices = number_of_choices_compared
        self._comparison_layer_hidden_size = comparison_layer_hidden_size
        self._comparison_layer_use_relu = comparison_layer_use_relu

    # Bert weights and config
    if bert_weights_model:
        logging.info(f"Loading BERT weights model from {bert_weights_model}")
        bert_model_loaded = load_archive(bert_weights_model)
        self._bert_model = bert_model_loaded.model._bert_model
    else:
        self._bert_model = BertModel.from_pretrained(pretrained_model)

    for param in self._bert_model.parameters():
        param.requires_grad = requires_grad
    # for name, param in self._bert_model.named_parameters():
    #     grad = requires_grad
    #     if layer_freeze_regexes and grad:
    #         grad = not any([bool(re.search(r, name)) for r in layer_freeze_regexes])
    #     param.requires_grad = grad

    bert_config = self._bert_model.config
    self._output_dim = bert_config.hidden_size
    self._dropout = torch.nn.Dropout(bert_config.hidden_dropout_prob)
    self._per_choice_loss = per_choice_loss

    # Bert Classifier selector
    final_output_dim = 1
    if not use_comparative_bert:
        if bert_weights_model and hasattr(bert_model_loaded.model, "_classifier"):
            self._classifier = bert_model_loaded.model._classifier
        else:
            self._classifier = Linear(self._output_dim, final_output_dim)
    else:
        if use_bilinear_classifier:
            self._classifier = Bilinear(self._output_dim, self._output_dim, final_output_dim)
        else:
            self._classifier = Linear(self._output_dim * 2, final_output_dim)
        self._classifier.apply(self._bert_model.init_bert_weights)

    # Comparison layer setup
    if self._train_comparison_layer:
        number_of_pairs = self._num_choices * (self._num_choices - 1)
        if self._comparison_layer_hidden_size == -1:
            self._comparison_layer_hidden_size = number_of_pairs * number_of_pairs

        self._comparison_layer_1 = Linear(number_of_pairs, self._comparison_layer_hidden_size)
        if self._comparison_layer_use_relu:
            self._comparison_layer_1_activation = torch.nn.LeakyReLU()
        else:
            self._comparison_layer_1_activation = torch.nn.Tanh()
        self._comparison_layer_2 = Linear(self._comparison_layer_hidden_size, self._num_choices)
        self._comparison_layer_2_activation = torch.nn.Softmax()

    # Scalar mix, if necessary
    self._all_layers = not top_layer_only
    if self._all_layers:
        if bert_weights_model and hasattr(bert_model_loaded.model, "_scalar_mix") \
                and bert_model_loaded.model._scalar_mix is not None:
            self._scalar_mix = bert_model_loaded.model._scalar_mix
        else:
            num_layers = bert_config.num_hidden_layers
            initial_scalar_parameters = num_layers * [0.0]
            initial_scalar_parameters[-1] = 5.0  # Starts with most mass on last layer
            self._scalar_mix = ScalarMix(bert_config.num_hidden_layers,
                                         initial_scalar_parameters=initial_scalar_parameters,
                                         do_layer_norm=False)
    else:
        self._scalar_mix = None

    # Accuracy and loss setup
    if self._train_comparison_layer:
        self._accuracy = CategoricalAccuracy()
        self._loss = torch.nn.CrossEntropyLoss()
    else:
        self._accuracy = BooleanAccuracy()
        self._loss = torch.nn.BCEWithLogitsLoss()
    self._debug = -1
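# --- Illustrative sketch (not part of the original model) ---
# A hedged sketch of how the comparison layers built above fit together dimensionally.
# The `pairwise_scores` tensor and the forward logic are assumptions for illustration only;
# the constructor itself only fixes the shapes, with number_of_pairs = n * (n - 1) ordered
# choice pairs feeding a two-layer classifier over the n choices.
import torch
from torch.nn import Linear

batch_size, num_choices, hidden_size = 4, 5, 20
number_of_pairs = num_choices * (num_choices - 1)  # 20 ordered choice pairs

comparison_layer_1 = Linear(number_of_pairs, hidden_size)
comparison_layer_2 = Linear(hidden_size, num_choices)

pairwise_scores = torch.randn(batch_size, number_of_pairs)  # hypothetical per-pair scores
hidden = torch.nn.LeakyReLU()(comparison_layer_1(pairwise_scores))
choice_probs = torch.nn.Softmax(dim=-1)(comparison_layer_2(hidden))
assert choice_probs.shape == (batch_size, num_choices)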
def __init__(self, transformer):
    super(OpenaiTransformerEmbedder, self).__init__()

    self._transformer = transformer
    self._scalar_mix = ScalarMix(transformer.num_output_layers, do_layer_norm=False)
def __init__(
    self,
    model_name: str,
    *,
    max_length: int = None,
    sub_module: str = None,
    train_parameters: bool = True,
    last_layer_only: bool = True,
    override_weights_file: Optional[str] = None,
    override_weights_strip_prefix: Optional[str] = None,
    gradient_checkpointing: Optional[bool] = None,
    masked_language_modeling: bool = True,
) -> None:
    TokenEmbedder.__init__(self)  # Call the base class constructor

    tokenizer = PretrainedTransformerTokenizer(model_name)
    self.masked_language_modeling = masked_language_modeling

    if self.masked_language_modeling:
        self.config = AutoConfig.from_pretrained(model_name, output_hidden_states=True)
        # We only need access to the HF tokenizer if we are masked language modeling
        self.tokenizer = tokenizer.tokenizer
        # The only differences when masked language modeling are:
        # 1) `output_hidden_states` must be True to get access to token embeddings.
        # 2) We need to use `AutoModelForMaskedLM` to get the correct model
        self.transformer_model = AutoModelForMaskedLM.from_pretrained(model_name, config=self.config)
    # Everything after the if statement (including the else) is copied directly from:
    # https://github.com/allenai/allennlp/blob/master/allennlp/modules/token_embedders/pretrained_transformer_embedder.py
    else:
        from allennlp.common import cached_transformers

        self.transformer_model = cached_transformers.get(
            model_name, True, override_weights_file, override_weights_strip_prefix
        )
        self.config = self.transformer_model.config

    if gradient_checkpointing is not None:
        self.transformer_model.config.update({"gradient_checkpointing": gradient_checkpointing})

    if sub_module:
        assert hasattr(self.transformer_model, sub_module)
        self.transformer_model = getattr(self.transformer_model, sub_module)
    self._max_length = max_length

    # I'm not sure if this works for all models; open an issue on github if you find a case
    # where it doesn't work.
    self.output_dim = self.config.hidden_size

    self._scalar_mix: Optional[ScalarMix] = None
    if not last_layer_only:
        self._scalar_mix = ScalarMix(self.config.num_hidden_layers)
        self.config.output_hidden_states = True

    self._num_added_start_tokens = len(tokenizer.single_sequence_start_tokens)
    self._num_added_end_tokens = len(tokenizer.single_sequence_end_tokens)
    self._num_added_tokens = self._num_added_start_tokens + self._num_added_end_tokens

    if not train_parameters:
        for param in self.transformer_model.parameters():
            param.requires_grad = False
def __init__(
    self,
    model_name: str,
    *,
    max_length: int = None,
    sub_module: str = None,
    train_parameters: bool = True,
    last_layer_only: bool = True,
    override_weights_file: Optional[str] = None,
    override_weights_strip_prefix: Optional[str] = None,
    gradient_checkpointing: Optional[bool] = None,
    tokenizer_kwargs: Optional[Dict[str, Any]] = None,
    transformer_kwargs: Optional[Dict[str, Any]] = None,
) -> None:
    super().__init__()

    from allennlp.common import cached_transformers

    self.transformer_model = cached_transformers.get(
        model_name,
        True,
        override_weights_file=override_weights_file,
        override_weights_strip_prefix=override_weights_strip_prefix,
        **(transformer_kwargs or {}),
    )

    if gradient_checkpointing is not None:
        self.transformer_model.config.update({"gradient_checkpointing": gradient_checkpointing})

    self.config = self.transformer_model.config
    if sub_module:
        assert hasattr(self.transformer_model, sub_module)
        self.transformer_model = getattr(self.transformer_model, sub_module)
    self._max_length = max_length

    # I'm not sure if this works for all models; open an issue on github if you find a case
    # where it doesn't work.
    self.output_dim = self.config.hidden_size

    self._scalar_mix: Optional[ScalarMix] = None
    if not last_layer_only:
        self._scalar_mix = ScalarMix(self.config.num_hidden_layers)
        self.config.output_hidden_states = True

    tokenizer = PretrainedTransformerTokenizer(
        model_name,
        tokenizer_kwargs=tokenizer_kwargs,
    )

    try:
        if self.transformer_model.get_input_embeddings().num_embeddings != len(tokenizer.tokenizer):
            self.transformer_model.resize_token_embeddings(len(tokenizer.tokenizer))
    except NotImplementedError:
        # Can't resize for transformers models that don't implement base_model.get_input_embeddings()
        logger.warning(
            "Could not resize the token embedding matrix of the transformer model. "
            "This model does not support resizing."
        )

    self._num_added_start_tokens = len(tokenizer.single_sequence_start_tokens)
    self._num_added_end_tokens = len(tokenizer.single_sequence_end_tokens)
    self._num_added_tokens = self._num_added_start_tokens + self._num_added_end_tokens

    self.train_parameters = train_parameters
    if not train_parameters:
        self.transformer_model.eval()
        for param in self.transformer_model.parameters():
            param.requires_grad = False
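# --- Illustrative usage (assumption: the constructor above is AllenNLP's
# PretrainedTransformerEmbedder) ---
# A minimal sketch of constructing the embedder with a scalar mix over all hidden layers
# and frozen weights. The model name and the token ids are arbitrary examples.
import torch
from allennlp.modules.token_embedders import PretrainedTransformerEmbedder

embedder = PretrainedTransformerEmbedder(
    "bert-base-uncased",
    last_layer_only=False,   # builds a ScalarMix over all hidden layers
    train_parameters=False,  # freezes the transformer and puts it in eval mode
)
token_ids = torch.tensor([[101, 7592, 2088, 102]])  # example wordpiece ids
mask = torch.ones_like(token_ids).bool()
embeddings = embedder(token_ids, mask)  # shape: (1, 4, embedder.get_output_dim())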