def xlnetModel(*args, **kwargs):
    """
    xlnetModel is the basic XLNet Transformer model from
    "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
    by Zhilin Yang, Zihang Dai, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le

    Example:
        # Load the tokenizer
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')

        # Prepare tokenized input
        >>> text_1 = "Who was Jim Henson ?"
        >>> text_2 = "Jim Henson was a puppeteer"
        >>> indexed_tokens_1 = tokenizer.encode(text_1)
        >>> indexed_tokens_2 = tokenizer.encode(text_2)
        >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
        >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])

        # Load xlnetModel
        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetModel', 'xlnet-large-cased')
        >>> model.eval()

        # Predict hidden states features for each layer
        >>> with torch.no_grad():
                hidden_states_1, mems = model(tokens_tensor_1)
                hidden_states_2, mems = model(tokens_tensor_2, mems=mems)
    """
    model = XLNetModel.from_pretrained(*args, **kwargs)
    return model
def load(cls, pretrained_model_name_or_path, language=None, **kwargs):
    """
    Load a language model by supplying either

    * the name of a remote model on s3 ("xlnet-base-cased" ...)
    * a local path of a model trained via transformers ("some_dir/huggingface_model")
    * a local path of a model trained via FARM ("some_dir/farm_model")

    :param pretrained_model_name_or_path: name or path of a model
    :param language: (Optional) Name of the language the model was trained for (e.g. "german").
                     If not supplied, FARM will try to infer it from the model name.
    :return: Language Model
    """
    xlnet = cls()
    if "farm_lm_name" in kwargs:
        xlnet.name = kwargs["farm_lm_name"]
    else:
        xlnet.name = pretrained_model_name_or_path
    # We need to differentiate between loading a model in FARM format and in pytorch-transformers format
    farm_lm_config = os.path.join(pretrained_model_name_or_path, "language_model_config.json")
    if os.path.exists(farm_lm_config):
        # FARM style
        config = XLNetConfig.from_pretrained(farm_lm_config)
        farm_lm_model = os.path.join(pretrained_model_name_or_path, "language_model.bin")
        xlnet.model = XLNetModel.from_pretrained(farm_lm_model, config=config, **kwargs)
        xlnet.language = xlnet.model.config.language
    else:
        # pytorch-transformers style
        xlnet.model = XLNetModel.from_pretrained(pretrained_model_name_or_path, **kwargs)
        xlnet.language = cls._infer_language_from_name(pretrained_model_name_or_path)
    config = xlnet.model.config
    # XLNet does not provide a pooled_output by default. Therefore, we need to initialize an extra pooler.
    # The pooler takes the last hidden representation & feeds it to a dense layer of (hidden_dim x hidden_dim).
    # We don't want dropout at the end of the pooler, since we already apply dropout in the adaptive model
    # before everything is fed to the prediction head.
    config.summary_last_dropout = 0
    xlnet.pooler = SequenceSummary(config)
    xlnet.pooler.apply(xlnet.model._init_weights)
    return xlnet
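A minimal usage sketch for the loader above, assuming it is the `load` classmethod of FARM's XLNet language-model wrapper and that FARM's top-level `LanguageModel.load` dispatches to it (the import path is illustrative):

from farm.modeling.language_model import LanguageModel

# Remote checkpoint by name: resolved through pytorch-transformers.
xlnet_lm = LanguageModel.load("xlnet-base-cased")
print(xlnet_lm.language)       # inferred from the model name
print(type(xlnet_lm.pooler))   # SequenceSummary pooler, added because XLNet has no pooled output

# Local FARM checkpoint: detected via its language_model_config.json.
# xlnet_lm = LanguageModel.load("some_dir/farm_model")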
def __init__(self, data):
    super(GazLSTM, self).__init__()
    self.gpu = data.HP_gpu
    self.use_biword = data.use_bigram
    self.hidden_dim = data.HP_hidden_dim
    self.word_emb_dim = data.word_emb_dim
    self.biword_emb_dim = data.biword_emb_dim
    self.bilstm_flag = data.HP_bilstm
    self.lstm_layer = data.HP_lstm_layer
    self.num_layer = data.HP_num_layer
    self.model_type = data.model_type
    self.use_bert = data.use_bert
    self.device = data.device

    self.word_embedding = nn.Embedding(data.word_alphabet.size(), self.word_emb_dim, padding_idx=0)
    if data.pretrain_word_embedding is not None:
        self.word_embedding.weight.data.copy_(torch.from_numpy(data.pretrain_word_embedding))

    if self.use_biword:
        self.biword_embedding = nn.Embedding(data.biword_alphabet.size(), self.biword_emb_dim, padding_idx=0)
        if data.pretrain_biword_embedding is not None:
            self.biword_embedding.weight.data.copy_(torch.from_numpy(data.pretrain_biword_embedding))

    # Character feature dimension: word embedding (+ bigram embedding) (+ BERT and BERT-wwm hidden states)
    char_feature_dim = self.word_emb_dim
    if self.use_biword:
        char_feature_dim += self.biword_emb_dim
    if self.use_bert:
        char_feature_dim = char_feature_dim + 768 * 2
    print('total char_feature_dim is {}'.format(char_feature_dim))
    print('bert + bert_wwm multi feature')

    ## lstm model
    if self.model_type == 'lstm':
        lstm_hidden = self.hidden_dim
        if self.bilstm_flag:
            self.hidden_dim *= 2
        self.NERmodel = NERmodel(model_type='lstm', input_dim=char_feature_dim, hidden_dim=lstm_hidden,
                                 num_layer=self.lstm_layer, biflag=self.bilstm_flag)
        self.hidden2tag = nn.Linear(self.hidden_dim, data.label_alphabet_size + 2)

    # ## cnn model
    # if self.model_type == 'cnn':
    #     self.NERmodel = NERmodel(model_type='cnn', input_dim=char_feature_dim, hidden_dim=self.hidden_dim,
    #                              num_layer=self.num_layer, dropout=data.HP_dropout, gpu=self.gpu)

    ## attention model
    if self.model_type == 'transformer':
        self.NERmodel = NERmodel(model_type='transformer', input_dim=char_feature_dim, hidden_dim=self.hidden_dim,
                                 num_layer=self.num_layer, dropout=data.HP_dropout)
        self.hidden2tag = nn.Linear(480, data.label_alphabet_size + 2)

    self.drop = nn.Dropout(p=data.HP_dropout)
    self.crf = CRF(data.label_alphabet_size, self.gpu, self.device)

    if self.use_bert:
        self.bert_encoder = BertModel.from_pretrained('transformer_cpt/bert/')
        self.xlnet_encoder = XLNetModel.from_pretrained('transformer_cpt/chinese_xlnet_base_pytorch')
        self.bert_encoder_wwm = BertModel.from_pretrained('transformer_cpt/chinese_roberta_wwm_ext_pytorch/')
        for p in self.bert_encoder.parameters():
            p.requires_grad = False
        # for p in self.xlnet_encoder.parameters():
        #     p.requires_grad = False
        for p in self.bert_encoder_wwm.parameters():
            p.requires_grad = False

    if self.gpu:
        self.word_embedding = self.word_embedding.cuda(self.device)
        if self.use_biword:
            self.biword_embedding = self.biword_embedding.cuda(self.device)
        self.NERmodel = self.NERmodel.cuda(self.device)
        self.hidden2tag = self.hidden2tag.cuda(self.device)
        self.crf = self.crf.cuda(self.device)
        if self.use_bert:
            self.bert_encoder = self.bert_encoder.cuda(self.device)
            # self.xlnet_encoder = self.xlnet_encoder.cuda(self.device)
            self.bert_encoder_wwm = self.bert_encoder_wwm.cuda(self.device)
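To make the feature-dimension bookkeeping above concrete, here is a small self-contained sketch of how the frozen BERT and BERT-wwm encoders contribute the extra 768 * 2 dimensions; the tensor names and embedding sizes are hypothetical, not the repository's actual forward() code:

import torch

# Hypothetical sizes: batch of 2 sentences, 16 characters, word_emb_dim=100, biword_emb_dim=50.
word_feats = torch.randn(2, 16, 100)
biword_feats = torch.randn(2, 16, 50)
bert_out = torch.randn(2, 16, 768)      # stands in for self.bert_encoder(...)[0]
bert_wwm_out = torch.randn(2, 16, 768)  # stands in for self.bert_encoder_wwm(...)[0]

# Per-character features are concatenated along the last dimension.
char_features = torch.cat([word_feats, biword_feats, bert_out, bert_wwm_out], dim=-1)
print(char_features.shape)  # torch.Size([2, 16, 1686]) == 100 + 50 + 768 * 2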
def __init__(self,
             vocab: Vocabulary,
             pretrained_model: str = None,
             requires_grad: bool = True,
             transformer_weights_model: str = None,
             num_labels: int = 2,
             predictions_file=None,
             layer_freeze_regexes: List[str] = None,
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super().__init__(vocab, regularizer)

    self._predictions = []
    self._pretrained_model = pretrained_model

    if 't5' in pretrained_model:
        self._padding_value = 1  # Padding token index (value carried over from the RoBERTa branch)
        if transformer_weights_model:
            # Override for RoBERTa only for now
            logging.info(f"Loading Transformer weights model from {transformer_weights_model}")
            transformer_model_loaded = load_archive(transformer_weights_model)
            self._transformer_model = transformer_model_loaded.model._transformer_model
        else:
            self._transformer_model = T5Model.from_pretrained(pretrained_model)
        # T5 configs expose dropout_rate rather than hidden_dropout_prob
        self._dropout = torch.nn.Dropout(self._transformer_model.config.dropout_rate)
    elif 'roberta' in pretrained_model:
        self._padding_value = 1  # The index of the RoBERTa padding token
        if transformer_weights_model:
            # Override for RoBERTa only for now
            logging.info(f"Loading Transformer weights model from {transformer_weights_model}")
            transformer_model_loaded = load_archive(transformer_weights_model)
            self._transformer_model = transformer_model_loaded.model._transformer_model
        else:
            self._transformer_model = RobertaModel.from_pretrained(pretrained_model)
        self._dropout = torch.nn.Dropout(self._transformer_model.config.hidden_dropout_prob)
    elif 'xlnet' in pretrained_model:
        self._padding_value = 5  # The index of the XLNet padding token
        self._transformer_model = XLNetModel.from_pretrained(pretrained_model)
        self.sequence_summary = SequenceSummary(self._transformer_model.config)
    elif 'albert' in pretrained_model:
        self._transformer_model = AlbertModel.from_pretrained(pretrained_model)
        self._padding_value = 0  # The index of the BERT padding token
        self._dropout = torch.nn.Dropout(self._transformer_model.config.hidden_dropout_prob)
    elif 'bert' in pretrained_model:
        self._transformer_model = BertModel.from_pretrained(pretrained_model)
        self._padding_value = 0  # The index of the BERT padding token
        self._dropout = torch.nn.Dropout(self._transformer_model.config.hidden_dropout_prob)
    else:
        raise ValueError(f"Unsupported pretrained model: {pretrained_model}")

    for name, param in self._transformer_model.named_parameters():
        if layer_freeze_regexes and requires_grad:
            # Freeze any parameter whose name matches one of the freeze regexes
            param.requires_grad = not any(bool(re.search(r, name)) for r in layer_freeze_regexes)
        else:
            param.requires_grad = requires_grad

    transformer_config = self._transformer_model.config
    transformer_config.num_labels = num_labels
    self._output_dim = self._transformer_model.config.hidden_size

    # Unifying the classification layer across all models
    self._classifier = Linear(self._output_dim, num_labels)
    self._classifier.weight.data.normal_(mean=0.0, std=0.02)
    self._classifier.bias.data.zero_()

    self._accuracy = CategoricalAccuracy()
    self._loss = torch.nn.CrossEntropyLoss()
    self._debug = -1
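Since the forward pass is not shown, the following is only a sketch of how the per-model pooling set up above is typically consumed: XLNet goes through SequenceSummary because it exposes no pooled output, while the BERT-style models use their pooled [CLS] vector followed by dropout. The helper name `_pool` is hypothetical:

import torch

def _pool(self, last_hidden_state: torch.Tensor, pooled_output: torch.Tensor = None) -> torch.Tensor:
    # Hypothetical helper; mirrors the branches in __init__ above.
    if 'xlnet' in self._pretrained_model:
        # SequenceSummary condenses the sequence (summary_type='last' for XLNet configs).
        return self.sequence_summary(last_hidden_state)
    # RoBERTa/ALBERT/BERT paths: pooled [CLS] representation plus dropout.
    return self._dropout(pooled_output)

# The pooled vector then feeds the shared classifier head:
# logits = self._classifier(pooled); loss = self._loss(logits, label)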
def __init__(self,
             vocab: Vocabulary,
             pretrained_model: str = None,
             requires_grad: bool = True,
             probe_type: str = None,
             layer_freeze_regexes: List[str] = None,
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super().__init__(vocab, regularizer)

    self._pretrained_model = pretrained_model

    if 'roberta' in pretrained_model:
        self._padding_value = 1  # The index of the RoBERTa padding token
        self._transformer_model = RobertaModel.from_pretrained(pretrained_model)
        self._dropout = torch.nn.Dropout(self._transformer_model.config.hidden_dropout_prob)
    elif 'xlnet' in pretrained_model:
        self._padding_value = 5  # The index of the XLNet padding token
        self._transformer_model = XLNetModel.from_pretrained(pretrained_model)
        self.sequence_summary = SequenceSummary(self._transformer_model.config)
    elif 'albert' in pretrained_model:
        self._transformer_model = AlbertModel.from_pretrained(pretrained_model)
        self._padding_value = 0  # The index of the BERT padding token
        self._dropout = torch.nn.Dropout(self._transformer_model.config.hidden_dropout_prob)
    elif 'bert' in pretrained_model:
        self._transformer_model = BertModel.from_pretrained(pretrained_model)
        self._padding_value = 0  # The index of the BERT padding token
        self._dropout = torch.nn.Dropout(self._transformer_model.config.hidden_dropout_prob)
    else:
        raise ValueError(f"Unsupported pretrained model: {pretrained_model}")

    if probe_type == 'MLP':
        # For an MLP probe, freeze the whole transformer and train only the classifier head
        layer_freeze_regexes = ["embeddings", "encoder"]

    for name, param in self._transformer_model.named_parameters():
        if layer_freeze_regexes and requires_grad:
            param.requires_grad = not any(bool(re.search(r, name)) for r in layer_freeze_regexes)
        else:
            param.requires_grad = requires_grad

    transformer_config = self._transformer_model.config
    transformer_config.num_labels = 1
    self._output_dim = self._transformer_model.config.hidden_size

    # Unifying the classification layer across all models
    self._classifier = Linear(self._output_dim, 1)
    self._classifier.weight.data.normal_(mean=0.0, std=0.02)
    self._classifier.bias.data.zero_()

    self._accuracy = CategoricalAccuracy()
    self._loss = torch.nn.CrossEntropyLoss()
    self._debug = 2
def __init__(self,
             vocab: Vocabulary,
             model_name: str,
             k=12,
             output_dim=1,
             freeze_embeddings=False,
             temperature=1,
             train_with_regular_softmax=False,
             use_similarity=False,
             pass_probabilities_to_classifier=False,
             use_straight_through_gumbel_softmax=False,
             anneal_temperature=False,
             train_generator=True,
             use_kld_loss=False,
             generate_until_dot=False,
             lm_loss_coeff=1,
             use_cls=False,
             pass_only_generated=False,
             sim_coeff=1,
             dropout=0.1,
             train_with_just_sim_loss_for_epochs_num=-1,
             decouple_gen_and_cls_embs=False,
             initializer: InitializerApplicator = InitializerApplicator(),
             load_weights=False,
             zero_generated_out=False,
             output_several_results_on_every_step=False,
             results_each_step=0,
             use_repetition_loss=False,
             sequence_ngram_n=1,
             rep_coeff=1,
             use_similarity_btw_question_and_answers=False,
             anneal_repetition_loss=False,
             anneal_kld_loss=False,
             add_cls_after_epoch_num=-1,
             train_lm_generator=False,
             gen_lm_loss_coeff=1,
             train_cls_without_lm_loss=False):
    super(GeneralGenerationForClassfiication, self).__init__(vocab)

    # Generator: XLNet LM head; its word embedding is shared with the sampling step
    self.gen_model = XLNetLMHeadModel.from_pretrained(model_name, dropout=dropout)
    self.tokenizer = XLNetTokenizer.from_pretrained(model_name)
    self.gen_word_embedding = self.gen_model.transformer.word_embedding
    self.gen_embeddings_weight = self.gen_word_embedding.weight

    if use_cls:
        # Classifier: a separate XLNet encoder
        self.cls_model = XLNetModel.from_pretrained(model_name)
        self.cls_word_embedding = self.cls_model.word_embedding
        self.cls_embeddings_weight = self.cls_word_embedding.weight

    if use_kld_loss:
        # Frozen copy of the LM used as the reference distribution for the KLD loss
        self.freezed_lm = XLNetLMHeadModel.from_pretrained(model_name)
        self.freezed_lm.requires_grad_(False)

    n_embd = 768 if 'base' in model_name else 1024
    self.cls = nn.Linear(n_embd, output_dim, bias=True)

    self.use_cls = use_cls
    self.use_similarity = use_similarity
    self.train_generator = train_generator
    self.dropout = nn.Dropout(dropout)
    self.k = k
    self.use_kld_loss = use_kld_loss
    self.lm_loss_coeff = lm_loss_coeff
    self.anneal_kld_loss = anneal_kld_loss
    self.sim_coeff = sim_coeff
    self.use_repetition_loss = use_repetition_loss
    self.rep_coeff = rep_coeff
    self.anneal_repetition_loss = anneal_repetition_loss
    self.sequence_ngram_n = sequence_ngram_n

    if freeze_embeddings:
        self.gen_embeddings_weight.requires_grad = False
        self.gen_word_embedding.requires_grad_(False)
    if not train_generator:
        self.gen_model.requires_grad_(False)
        self.gen_embeddings_weight.requires_grad = False
        generate_until_dot = True

    self.temperature = temperature
    self.train_with_regular_softmax = train_with_regular_softmax
    self.use_straight_through_gumbel_softmax = use_straight_through_gumbel_softmax
    self.anneal_temperature = anneal_temperature
    self.topk_gs = output_several_results_on_every_step
    self.results_each_step = results_each_step
    self.generate_until_dot = generate_until_dot
    self.pass_only_generated = pass_only_generated
    self.train_with_just_sim_loss_for_epochs_num = train_with_just_sim_loss_for_epochs_num
    self.add_cls_after_epoch_num = add_cls_after_epoch_num
    self.use_similarity_btw_question_and_answers = use_similarity_btw_question_and_answers
    self.decouple_gen_and_cls_embs = decouple_gen_and_cls_embs
    self.pass_probabilities_to_classifier = pass_probabilities_to_classifier
    self.zero_generated_out = zero_generated_out
    self.supervised_generator = train_lm_generator
    self.gen_lm_loss_coeff = gen_lm_loss_coeff
    self.train_cls_without_sup_gen = train_cls_without_lm_loss

    if load_weights:
        initializer(self)

    self.metrics = {
        "accuracy": CategoricalAccuracy(),
        "sim_accuracy": CategoricalAccuracy(),
        "kld_loss": Average(),
        "repetition_loss": Average(),
        "classification_loss": Average(),
        "similarity_loss": Average(),
    }