def parameter_setup(self, args):
    # Set trainability of this module.
    for param in self.model.parameters():
        param.requires_grad = bool(args.transfer_paradigm == "finetune")

    self.num_layers = self.model.config.num_hidden_layers
    if args.pytorch_transformers_max_layer >= 0:
        self.max_layer = args.pytorch_transformers_max_layer
        assert self.max_layer <= self.num_layers
    else:
        self.max_layer = self.num_layers

    # Configure scalar mixing, ELMo-style.
    if self.embeddings_mode == "mix":
        if args.transfer_paradigm == "frozen":
            log.warning(
                "NOTE: pytorch_transformers_output_mode='mix', so scalar "
                "mixing weights will be fine-tuned even if BERT "
                "model is frozen."
            )
        # TODO: if doing multiple target tasks, allow for multiple sets of
        # scalars. See the ELMo implementation here:
        # https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo.py#L115
        assert len(parse_task_list_arg(args.target_tasks)) <= 1, (
            "pytorch_transformers_output_mode='mix' only supports a single set of "
            "scalars (but if you need this feature, see the TODO in "
            "the code!)"
        )
        # Always have one more mixing weight, for lexical layer.
        self.scalar_mix = scalar_mix.ScalarMix(self.max_layer + 1, do_layer_norm=False)
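# --- Illustrative only: not part of the repository code above. ---
# A minimal sketch of how a ScalarMix like the one configured in parameter_setup()
# is typically applied downstream, assuming the encoder returns the lexical
# (embedding-layer) output plus one hidden state per encoder layer. The helper
# name `apply_scalar_mix` and its arguments are assumptions for illustration.
import torch
from allennlp.modules import scalar_mix


def apply_scalar_mix(mix, all_hidden_states, max_layer, mask=None):
    # `all_hidden_states`: list of tensors of shape (batch, seq_len, hidden_dim),
    # ordered [lexical/embedding layer, layer_1, ..., layer_N].
    # Keep the lexical layer plus encoder layers 1..max_layer, matching the
    # `max_layer + 1` mixture size used above, and return their learned weighted sum.
    layers = list(all_hidden_states[: max_layer + 1])
    return mix(layers, mask)


# Example with random activations standing in for real encoder output
# (BERT-base-like shapes: 12 encoder layers plus the lexical layer, hidden size 768).
mix = scalar_mix.ScalarMix(13, do_layer_norm=False)
fake_states = [torch.randn(2, 5, 768) for _ in range(13)]
mixed = apply_scalar_mix(mix, fake_states, max_layer=12)
print(mixed.shape)  # torch.Size([2, 5, 768])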
def __init__(self, args, cache_dir=None):
    super(BertEmbedderModule, self).__init__()
    self.model = pytorch_pretrained_bert.BertModel.from_pretrained(
        args.bert_model_name, cache_dir=cache_dir
    )
    self.embeddings_mode = args.bert_embeddings_mode

    # Set trainability of this module.
    for param in self.model.parameters():
        param.requires_grad = bool(args.bert_fine_tune)

    # Configure scalar mixing, ELMo-style.
    if self.embeddings_mode == "mix":
        if not args.bert_fine_tune:
            log.warning(
                "NOTE: bert_embeddings_mode='mix', so scalar "
                "mixing weights will be fine-tuned even if BERT "
                "model is frozen."
            )
        # TODO: if doing multiple target tasks, allow for multiple sets of
        # scalars. See the ELMo implementation here:
        # https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo.py#L115
        assert len(parse_task_list_arg(args.target_tasks)) <= 1, (
            "bert_embeddings_mode='mix' only supports a single set of "
            "scalars (but if you need this feature, see the TODO in "
            "the code!)"
        )
        num_layers = self.model.config.num_hidden_layers
        self.scalar_mix = scalar_mix.ScalarMix(num_layers + 1, do_layer_norm=False)
def __init__(self, args, cache_dir=None):
    super(BertEmbedderModule, self).__init__()

    if "bert_model_file" in args:
        if "bert_classification" in args and args.bert_classification == 1:
            log.info("Loading fine-tuned BERT classification model from file.")
            self.model = PretrainedBertForSequenceClassification.from_pretrained(
                args.bert_model_name, num_labels=192
            )
        else:
            log.info("Loading fine-tuned BERT QA model from file.")
            self.model = PretrainedBertForQuestionAnswering.from_pretrained(
                args.bert_model_name
            )
        self.model.load_state_dict(torch.load(args.bert_model_file))
    else:
        log.info("Loading pretrained BERT model without fine-tuning.")
        self.model = pytorch_pretrained_bert.BertModel.from_pretrained(
            args.bert_model_name, cache_dir=cache_dir
        )

    self.embeddings_mode = args.bert_embeddings_mode
    self.embedding_layer = args.bert_embedding_layer

    tokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(
        args.bert_model_name, cache_dir=cache_dir
    )
    self._sep_id = tokenizer.vocab["[SEP]"]
    self._pad_id = tokenizer.vocab["[PAD]"]

    # Set trainability of this module.
    for param in self.model.parameters():
        param.requires_grad = bool(args.transfer_paradigm == "finetune")

    # Configure scalar mixing, ELMo-style.
    if self.embeddings_mode == "mix":
        if args.transfer_paradigm == "frozen":
            log.warning(
                "NOTE: bert_embeddings_mode='mix', so scalar "
                "mixing weights will be fine-tuned even if BERT "
                "model is frozen."
            )
        # TODO: if doing multiple target tasks, allow for multiple sets of
        # scalars. See the ELMo implementation here:
        # https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo.py#L115
        assert len(parse_task_list_arg(args.target_tasks)) <= 1, (
            "bert_embeddings_mode='mix' only supports a single set of "
            "scalars (but if you need this feature, see the TODO in "
            "the code!)"
        )
        num_layers = self.model.config.num_hidden_layers
        self.scalar_mix = scalar_mix.ScalarMix(num_layers + 1, do_layer_norm=False)
def __init__(self, cfg, vocab=40990, n_ctx=512, embeddings_mode='none'):
    super(TransformerModel, self).__init__()
    self.embeddings_mode = embeddings_mode
    self.n_embd = cfg.n_embd
    self.vocab = vocab
    self.embed = nn.Embedding(vocab, cfg.n_embd)
    self.drop = nn.Dropout(cfg.embd_pdrop)
    block = model_pytorch.Block(n_ctx, cfg, scale=True)
    self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(cfg.n_layer)])

    nn.init.normal_(self.embed.weight, std=0.02)

    if self.embeddings_mode == "mix":
        self.scalar_mix = scalar_mix.ScalarMix(cfg.n_layer + 1, do_layer_norm=False)
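# --- Illustrative only: a hedged sketch, not the repository's actual forward(). ---
# Shows how a forward pass for the TransformerModel above might collect per-block
# activations and feed them to self.scalar_mix when embeddings_mode == "mix".
# Each Block is assumed to map a (batch, seq_len, n_embd) tensor to a tensor of
# the same shape; the real OpenAI GPT port also adds position embeddings, which
# is omitted here for brevity.
def forward(self, x):
    # The dropped-out embedding output serves as the "lexical" layer.
    h = self.drop(self.embed(x))
    all_layers = [h]
    for block in self.h:
        h = block(h)
        all_layers.append(h)

    if self.embeddings_mode == "mix":
        # One mixing weight per block plus one for the lexical layer
        # (cfg.n_layer + 1 total, matching the ScalarMix size in __init__).
        return self.scalar_mix(all_layers)
    # Otherwise return only the top layer.
    return all_layers[-1]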
def __init__(self, args, cache_dir=None):
    super(BertEmbedderModule, self).__init__()

    if args.bert_use_pretrain:
        self.model = BertModel.from_pretrained(args.input_module, cache_dir=cache_dir)
    else:
        self.config = BertConfig(args.bert_config_file)
        self.model = BertModel(self.config)
    self.embeddings_mode = args.bert_embeddings_mode

    tokenizer = BertTokenizer.from_pretrained(args.input_module, cache_dir=cache_dir)
    self._cls_id = tokenizer.vocab["[CLS]"]
    self._sep_id = tokenizer.vocab["[SEP]"]
    self._pad_id = tokenizer.vocab["[PAD]"]

    # Set trainability of this module.
    for param in self.model.parameters():
        param.requires_grad = bool(args.transfer_paradigm == "finetune")

    # Configure scalar mixing, ELMo-style.
    if self.embeddings_mode == "mix":
        if args.transfer_paradigm == "frozen":
            log.warning(
                "NOTE: bert_embeddings_mode='mix', so scalar "
                "mixing weights will be fine-tuned even if BERT "
                "model is frozen."
            )
        # TODO: if doing multiple target tasks, allow for multiple sets of
        # scalars. See the ELMo implementation here:
        # https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo.py#L115
        assert len(parse_task_list_arg(args.target_tasks)) <= 1, (
            "bert_embeddings_mode='mix' only supports a single set of "
            "scalars (but if you need this feature, see the TODO in "
            "the code!)"
        )
        num_layers = self.model.config.num_hidden_layers
        self.scalar_mix = scalar_mix.ScalarMix(num_layers + 1, do_layer_norm=False)
def parameter_setup(self, args):
    # Set trainability of this module.
    for param in self.model.parameters():
        param.requires_grad = bool(args.transfer_paradigm == "finetune")

    self.num_layers = FLAGS.nb_encoder_layers
    if args.transformers_max_layer >= 0:
        self.max_layer = args.transformers_max_layer
        assert self.max_layer <= self.num_layers
    else:
        self.max_layer = self.num_layers

    if args.transfer_paradigm == "frozen":
        if isinstance(
            self, (OpenAIGPTEmbedderModule, GPT2EmbedderModule, TransfoXLEmbedderModule)
        ):
            log.warning(
                "NOTE: OpenAI GPT, GPT-2 and Transformer-XL add new tokens for "
                "classification tasks; under the 'frozen' transfer_paradigm, their "
                "embeddings will not be trained."
            )

    # Configure scalar mixing, ELMo-style.
    if self.output_mode == "mix":
        if args.transfer_paradigm == "frozen":
            log.warning(
                "NOTE: transformers_output_mode='mix', so scalar "
                "mixing weights will be fine-tuned even if BERT "
                "model is frozen."
            )
        # TODO: if doing multiple target tasks, allow for multiple sets of
        # scalars. See the ELMo implementation here:
        # https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo.py#L115
        assert len(parse_task_list_arg(args.target_tasks)) <= 1, (
            "transformers_output_mode='mix' only supports a single set of "
            "scalars (but if you need this feature, see the TODO in "
            "the code!)"
        )
        # Always have one more mixing weight, for lexical layer.
        self.scalar_mix = scalar_mix.ScalarMix(self.max_layer + 1, do_layer_norm=False)