def parameter_setup(self, args):
    # Set trainability of this module.
    for param in self.model.parameters():
        param.requires_grad = bool(args.transfer_paradigm == "finetune")

    self.num_layers = self.model.config.num_hidden_layers
    if args.pytorch_transformers_max_layer >= 0:
        self.max_layer = args.pytorch_transformers_max_layer
        assert self.max_layer <= self.num_layers
    else:
        self.max_layer = self.num_layers

    # Configure scalar mixing, ELMo-style.
    if self.embeddings_mode == "mix":
        if args.transfer_paradigm == "frozen":
            log.warning(
                "NOTE: pytorch_transformers_output_mode='mix', so scalar "
                "mixing weights will be fine-tuned even if BERT "
                "model is frozen."
            )
        # TODO: if doing multiple target tasks, allow for multiple sets of
        # scalars. See the ELMo implementation here:
        # https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo.py#L115
        assert len(parse_task_list_arg(args.target_tasks)) <= 1, (
            "pytorch_transformers_output_mode='mix' only supports a single set of "
            "scalars (but if you need this feature, see the TODO in "
            "the code!)"
        )
        # Always have one more mixing weight, for lexical layer.
        self.scalar_mix = scalar_mix.ScalarMix(self.max_layer + 1, do_layer_norm=False)
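# Illustrative sketch (not part of the original module): how a ScalarMix built
# as above is typically consumed. AllenNLP's ScalarMix takes a list of
# equally-shaped tensors (here: the lexical/embedding-layer output followed by
# each transformer layer, max_layer + 1 tensors in total) and returns their
# learned weighted sum. Variable names below are hypothetical.
#
#     import torch
#     from allennlp.modules import scalar_mix
#
#     num_layers = 12
#     mix = scalar_mix.ScalarMix(num_layers + 1, do_layer_norm=False)
#     # One (batch, seq_len, hidden_dim) tensor per layer, lexical layer first.
#     layer_outputs = [torch.randn(2, 5, 768) for _ in range(num_layers + 1)]
#     mixed = mix(layer_outputs)  # -> (2, 5, 768)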
def __init__(self, args, n_special=3, n_ctx=512):
    super(OpenAIEmbedderModule, self).__init__()
    self.model_cfg = model_pytorch.DEFAULT_CONFIG
    self.n_special = n_special  # number of special tokens
    self.n_ctx = n_ctx  # max context width (seq len)

    full_emb_vocab = N_VOCAB + self.n_special + self.n_ctx
    self.model = TransformerModel(
        self.model_cfg, vocab=full_emb_vocab, embeddings_mode=args.openai_embeddings_mode
    )

    # Need specific seed to reproduce results.
    seed = 42
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if args.openai_transformer_ckpt:
        assert n_special == 3
        log.info(
            "Loading OpenAI transformer model from %s", args.openai_transformer_ckpt
        )
        load_from_tf_checkpoint(self.model, args.openai_transformer_ckpt)
    else:
        loader_args = dict(n_special=n_special)
        # Path to model weights
        loader_args["path"] = OPENAI_DATA_DIR + "/"
        # Path to variable name mapping
        loader_args["path_names"] = os.path.dirname(model_pytorch.__file__) + "/"
        # Load pretrained weights from disk
        log.info("Loading OpenAI transformer model from %s", loader_args["path"])
        model_pytorch.load_openai_pretrained_model(self.model, **loader_args)
    log.info("Loaded OpenAI transformer model.")

    # Set trainability of this module.
    for param in self.model.parameters():
        param.requires_grad = bool(args.transfer_paradigm == "finetune")

    # Configure scalar mixing, ELMo-style.
    if args.openai_embeddings_mode == "mix":
        # TODO: if doing multiple target tasks, allow for multiple sets of
        # scalars. See the ELMo implementation here:
        # https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo.py#L115
        assert len(parse_task_list_arg(args.target_tasks)) <= 1, (
            "openai_embeddings_mode='mix' only supports a single set of "
            "scalars (but if you need this feature, see the TODO in "
            "the code!)"
        )
        if args.transfer_paradigm == "frozen":
            log.warning(
                "NOTE: openai_embeddings_mode='mix', so scalar "
                "mixing weights will be fine-tuned even if "
                "transformer weights are frozen."
            )
        # Make sure scalar mix is always tunable.
        for param in self.model.scalar_mix.parameters():
            param.requires_grad = True
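# Hypothetical usage sketch (not from the original source): the argument
# fields this constructor reads, spelled out as a minimal namespace. The
# concrete values shown are placeholders, not defaults of the real config.
#
#     from types import SimpleNamespace
#
#     args = SimpleNamespace(
#         openai_embeddings_mode="mix",   # embeddings mode passed to the model
#         transfer_paradigm="frozen",     # "frozen" or "finetune"
#         openai_transformer_ckpt=None,   # optional TF checkpoint to load instead
#         target_tasks="sst",             # checked when embeddings mode is "mix"
#     )
#     module = OpenAIEmbedderModule(args)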
def __init__(self, args, cache_dir=None):
    super(BertEmbedderModule, self).__init__()
    self.model = pytorch_pretrained_bert.BertModel.from_pretrained(
        args.input_module, cache_dir=cache_dir
    )
    self.embeddings_mode = args.bert_embeddings_mode
    self.num_layers = self.model.config.num_hidden_layers
    if args.bert_max_layer >= 0:
        self.max_layer = args.bert_max_layer
    else:
        self.max_layer = self.num_layers
    assert self.max_layer <= self.num_layers

    tokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(
        args.input_module, cache_dir=cache_dir
    )
    self._sep_id = tokenizer.vocab["[SEP]"]
    self._pad_id = tokenizer.vocab["[PAD]"]

    # Set trainability of this module.
    for param in self.model.parameters():
        param.requires_grad = bool(args.transfer_paradigm == "finetune")

    # Configure scalar mixing, ELMo-style.
    if self.embeddings_mode == "mix":
        if args.transfer_paradigm == "frozen":
            log.warning(
                "NOTE: bert_embeddings_mode='mix', so scalar "
                "mixing weights will be fine-tuned even if BERT "
                "model is frozen."
            )
        # TODO: if doing multiple target tasks, allow for multiple sets of
        # scalars. See the ELMo implementation here:
        # https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo.py#L115
        assert len(parse_task_list_arg(args.target_tasks)) <= 1, (
            "bert_embeddings_mode='mix' only supports a single set of "
            "scalars (but if you need this feature, see the TODO in "
            "the code!)"
        )
        # Always have one more mixing weight, for lexical layer.
        self.scalar_mix = scalar_mix.ScalarMix(self.max_layer + 1, do_layer_norm=False)
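# Illustrative check (hypothetical, not in the original file): with
# transfer_paradigm="frozen" and bert_embeddings_mode="mix", the BERT encoder
# weights end up frozen while the ScalarMix weights stay trainable, since the
# freezing loop above only touches self.model.parameters().
#
#     embedder = BertEmbedderModule(args)  # args configured as described above
#     assert not any(p.requires_grad for p in embedder.model.parameters())
#     assert all(p.requires_grad for p in embedder.scalar_mix.parameters())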