def __init__(self, vocab: Vocabulary, model_name: str = "bert-base", multi_choice: bool = False):
    """Build a masked-LM wrapper, picking the architecture from the model name.

    Args:
        vocab: vocabulary forwarded to the base model.
        model_name: Hugging Face checkpoint id; its prefix/substring selects
            the family (bert / roberta / albert / xlnet).
        multi_choice: use the multi-choice MLM variant where one exists
            (BERT and RoBERTa only).

    Raises:
        ValueError: if ``model_name`` matches no supported family.
    """
    super().__init__(vocab)
    self._model = None
    self._loss = CrossEntropyLoss()
    self.is_multi_choice = multi_choice
    if model_name.startswith('bert'):
        if self.is_multi_choice:
            self._model = BertMultiChoiceMLM.from_pretrained(model_name)
        else:
            self._model = BertForMaskedLM.from_pretrained(model_name)
    elif 'roberta' in model_name:
        if self.is_multi_choice:
            self._model = RobertaMultiChoiceMLM.from_pretrained(model_name)
        else:
            self._model = RobertaForMaskedLM.from_pretrained(model_name)
    elif 'albert' in model_name:
        self._model = AlbertForMaskedLM.from_pretrained(model_name)
    elif 'xlnet' in model_name:
        self._model = XLNetLMHeadModel.from_pretrained(model_name)
    else:
        # The original `raise ("…")` raised a plain string, which is itself
        # a TypeError in Python 3; raise a real exception (typo fixed too).
        raise ValueError(f"Requested model '{model_name}' is not supported.")
def __init__(self, config, dataset):
    """Initialize the XLNet text-generation module.

    Loads the pretrained 'xlnet-base-cased' tokenizer/config/LM-head model,
    registers the dataset's special tokens, and resizes the embedding table
    to match the extended tokenizer vocabulary.
    """
    super(XLNet, self).__init__(config, dataset)
    self.eval_generate_num = config['eval_generate_num']
    # NOTE(review): unk_token is bound to dataset.eos_token — looks like a
    # copy/paste slip (a dedicated unk token seems intended); confirm.
    self.tokenizer = XLNetTokenizer.from_pretrained(
        'xlnet-base-cased',
        bos_token=dataset.sos_token,
        eos_token=dataset.eos_token,
        pad_token=dataset.padding_token,
        unk_token=dataset.eos_token)
    self.configuration = XLNetConfig.from_pretrained('xlnet-base-cased')
    self.decoder = XLNetLMHeadModel.from_pretrained(
        'xlnet-base-cased', config=self.configuration)
    # Embedding table must cover any special tokens added above.
    self.decoder.resize_token_embeddings(len(self.tokenizer))
    self.sos_token = dataset.sos_token
    self.eos_token = dataset.eos_token
    self.mask_token = '<mask>'
    self.padding_token_idx = self.tokenizer.pad_token_id
    self.max_seq_length = config['max_seq_length']
    self.device = config["device"]
    # Unreduced per-token loss; padding positions are excluded via ignore_index.
    self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx,
                                    reduction='none')
def __init__(self, args):
    """Wrap a pretrained language model (XLNet or any AutoModel LM) with a
    small two-layer classification head over its hidden states."""
    super().__init__()
    self.load_model = args.load_model
    # Both branches load the tokenizer the same way, so do it once.
    self.tokenizer = AutoTokenizer.from_pretrained(self.load_model)
    if "xlnet" in args.load_model:
        self.model = XLNetLMHeadModel.from_pretrained(
            self.load_model, mem_len=1024).to(args.device)
    else:
        config = AutoConfig.from_pretrained(self.load_model)
        config.output_hidden_states = True
        self.model = AutoModelWithLMHead.from_pretrained(
            self.load_model, config=config).to(args.device)
    # Large checkpoints (and gpt2-medium) expose 1024-d states, base 768-d.
    if "large" in self.load_model or self.load_model == "gpt2-medium":
        hidden_size = 1024
    else:
        hidden_size = 768
    self.hidden2label = nn.Sequential(
        nn.Linear(hidden_size, hidden_size // 2),
        nn.Sigmoid(),
        nn.Linear(hidden_size // 2, 2)).to(args.device)
    self.dropout = torch.nn.Dropout(args.dropout)
    self.layer = args.bert_layer
    self.eval()
    self.device = args.device
    self.args = args
def __init__(self, model_path='xlnet-base-cased', temperature=1.0, top_k=None,
             top_p=None, padding_text=None, optimize=None, device=None):
    """XLNet-based generator.

    Loads tokenizer and LM-head model from *model_path*, pre-encodes the
    padding text that is prepended to short prompts, and moves the model
    to the target device in eval mode.
    """
    super().__init__(device, temperature=temperature, top_k=top_k,
                     top_p=top_p, optimize=optimize)
    self.model_path = model_path
    self.tokenizer = XLNetTokenizer.from_pretrained(model_path)
    # TODO: mems gave odd results in evaluation, so external memory is off.
    self.optimize['external_memory'] = 0
    self.model = XLNetLMHeadModel.from_pretrained(
        model_path, mem_len=self.optimize['external_memory'])
    text = padding_text if padding_text else self.PADDING_TEXT
    self.padding_text_idxes = self.tokenizer.encode(text)
    self.model.to(self.device)
    self.model.eval()
def __init__(self, model_path='xlnet-base-cased', temperature=1.0, top_k=None,
             top_p=None, padding_text=None, optimize=None, device=None):
    """XLNet-based generator with a runtime check for the transformers package.

    Args:
        model_path: Hugging Face checkpoint id or local path.
        temperature / top_k / top_p: sampling controls forwarded to the base.
        padding_text: text prepended to short prompts (default PADDING_TEXT).
        optimize: optimization flags dict; external memory is forced off.
        device: target device forwarded to the base class.

    Raises:
        ModuleNotFoundError: if `transformers` is not installed.
    """
    super().__init__(device, temperature=temperature, top_k=top_k,
                     top_p=top_p, optimize=optimize)
    try:
        import transformers  # noqa: F401 -- presence check only
    except ModuleNotFoundError as err:
        # Fixed the "transfomers" typo and chain the original error so the
        # real import failure stays visible in the traceback.
        raise ModuleNotFoundError(
            'Missed transformers library. Install transformers by '
            '`pip install transformers`') from err
    self.model_path = model_path
    # TODO: Evaluated mems in XLNet but the result is quite weird.
    self.optimize['external_memory'] = 0
    self.tokenizer = XLNetTokenizer.from_pretrained(model_path)
    self.model = XLNetLMHeadModel.from_pretrained(
        model_path, mem_len=self.optimize['external_memory'])
    self.padding_text_idxes = self.tokenizer.encode(padding_text or self.PADDING_TEXT)
    self.model.to(self.device)
    self.model.eval()
def register_model(self) -> None:
    """
    If the model is not registered this method creates that model and
    places it to the model register. If the model is registered just
    increments model reference count.

    This method helps to save computational resources e.g. when combining
    model prediction with embedding similarity by not loading into memory
    same model twice.
    """
    # (annotation fixed: the method returns normally, so `None`, not NoReturn)
    if self.model_name not in XLNetProbEstimator.loaded:
        model = XLNetLMHeadModel.from_pretrained(self.model_name)
        model.to(self.device)
        model.eval()
        tokenizer = XLNetTokenizer.from_pretrained(self.model_name)
        word2id = self._get_word2id(tokenizer)
        # Ids of pieces marked as non-word-initial by the NON_START_SYMBOL
        # prefix in the vocabulary mapping.
        spiece_ids = [
            idx for word, idx in word2id.items()
            if word.startswith(self.NON_START_SYMBOL)
        ]
        all_special_ids = tokenizer.all_special_ids
        # Snapshot the input-embedding matrix on CPU as a numpy array for
        # later similarity computations.
        word_embeddings = model.transformer.word_embedding.weight.data.cpu(
        ).numpy()
        XLNetProbEstimator.loaded[self.model_name] = {
            "model": model,
            "tokenizer": tokenizer,
            "embeddings": word_embeddings,
            "word2id": word2id,
            "spiece_ids": spiece_ids,
            "all_special_ids": all_special_ids,
        }
        XLNetProbEstimator.loaded[self.model_name]["ref_count"] = 1
    else:
        # Already loaded: just bump the reference count.
        XLNetProbEstimator.loaded[self.model_name]["ref_count"] += 1
def main(raw_args=None):
    """Convert a PyTorch XLNet LM checkpoint into a TensorFlow checkpoint.

    NOTE(review): --pytorch_model_path is declared required but is never
    used — the state_dict loading path below is commented out and the model
    is loaded from --cache_dir instead; confirm which source is intended.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, required=True,
                        help="model name e.g. xlnet-tiny-chinese")
    parser.add_argument("--cache_dir", type=str, default=None, required=False,
                        help="Directory containing pytorch model")
    parser.add_argument("--pytorch_model_path", type=str, required=True,
                        help="/path/to/<pytorch-model-name>.bin")
    parser.add_argument("--tf_cache_dir", type=str, required=True,
                        help="Directory in which to save tensorflow model")
    args = parser.parse_args(raw_args)
    # model = XLNetLMHeadModel.from_pretrained(
    #     pretrained_model_name_or_path=args.model_name,
    #     state_dict=torch.load(args.pytorch_model_path),
    #     cache_dir=args.cache_dir
    # )
    model = XLNetLMHeadModel.from_pretrained(
        pretrained_model_name_or_path=args.cache_dir)
    convert_pytorch_checkpoint_to_tf(model=model,
                                     ckpt_dir=args.tf_cache_dir,
                                     model_name=args.model_name)
def __init__(self, config, dataset):
    """XLNet decoder initialized from a local checkpoint, with the dataset's
    special tokens wired into tokenizer, model config, and loss."""
    super(XLNet, self).__init__(config, dataset)
    path = config['pretrained_model_path']
    self.pretrained_model_path = path
    tokenizer = XLNetTokenizer.from_pretrained(
        path,
        bos_token=dataset.sos_token,
        eos_token=dataset.eos_token,
        pad_token=dataset.padding_token)
    self.tokenizer = tokenizer
    self.sos_token = tokenizer.bos_token
    self.eos_token = tokenizer.eos_token
    self.sos_token_idx = tokenizer.bos_token_id
    self.eos_token_idx = tokenizer.eos_token_id
    self.padding_token_idx = tokenizer.pad_token_id
    # Keep the model config's special-token ids in sync with the tokenizer.
    self.configuration = XLNetConfig.from_pretrained(
        path,
        bos_token_id=self.sos_token_idx,
        eos_token_id=self.eos_token_idx,
        pad_token_id=self.padding_token_idx)
    self.decoder = XLNetLMHeadModel.from_pretrained(
        path, config=self.configuration)
    # Embedding table must cover tokens added to the tokenizer above.
    self.decoder.resize_token_embeddings(len(tokenizer))
    # Unreduced per-token loss; padding positions are ignored.
    self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx,
                                    reduction='none')
def config():
    """Parse evaluation options and attach a frozen xlnet-large-cased
    acceptability model (on cuda:0) and its tokenizer to the namespace."""
    parser = ArgumentParser()
    # basic
    parser.add_argument('--file_dir', type=str, default=None,
                        help="data directory")
    parser.add_argument('--ids_file', type=str, default=None,
                        help="list of ids to eval")
    parser.add_argument('--id', type=str, default=None,
                        help="single setting to evaluate")
    parser.add_argument('--parsed_file', type=str, default=None, help='')
    parser.add_argument('--accept_name', type=str, default='xlnet',
                        help='bert or xlnet')
    args = parser.parse_args()

    checkpoint = 'xlnet-large-cased'
    args.tokenizer = XLNetTokenizer.from_pretrained(checkpoint)
    args.acpt_model = XLNetLMHeadModel.from_pretrained(checkpoint)
    args.device = torch.device('cuda:0')
    args.acpt_model.to(args.device)
    args.acpt_model.eval()
    return args
def main(train_epoch, batch_size, seq_length, lr, corpus_path, vocab_path,
         config_path, pretrain_model_path, output_record_path,
         model_save_path):
    """Pretrain (or continue pretraining) an XLNet LM with permutation LM.

    Args:
        train_epoch: number of training epochs.
        batch_size: per-device train batch size.
        seq_length: currently unused — the custom collator that consumed it
            is commented out; kept for interface compatibility.
        lr: learning rate.
        corpus_path: training corpus file.
        vocab_path: tokenizer vocabulary path.
        config_path: XLNetConfig path.
        pretrain_model_path: optional checkpoint to warm-start from.
        output_record_path: Trainer output directory.
        model_save_path: where the final model is saved.
    """
    # (removed the original run of no-op self-assignments, e.g. `lr = lr`)
    seed_everything(997)
    tokenizer = BertTokenizer.from_pretrained(vocab_path)
    train_dataset = OppoDataset(train_file_path=corpus_path,
                                tokenizer=tokenizer, maxlen=128)
    data_collator = DataCollatorForPermutationLanguageModeling(
        tokenizer=tokenizer)
    config = XLNetConfig.from_pretrained(
        pretrained_model_name_or_path=config_path)
    # Warm-start from an existing checkpoint when available, otherwise
    # train from randomly initialized weights.
    if os.path.exists(pretrain_model_path):
        model = XLNetLMHeadModel.from_pretrained(pretrain_model_path,
                                                 config=config)
    else:
        model = XLNetLMHeadModel(config=config)
    training_args = TrainingArguments(
        output_dir=output_record_path,
        overwrite_output_dir=True,
        num_train_epochs=train_epoch,
        learning_rate=lr,
        dataloader_num_workers=8,
        prediction_loss_only=True,
        fp16=True,
        fp16_backend='amp',
        per_device_train_batch_size=batch_size,
        save_strategy='no',
        seed=997)
    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=train_dataset)
    trainer.train()
    trainer.save_model(model_save_path)
def __init__(self):
    """Load the MeCab tokenizer (NEologd dictionary) and the Japanese XLNet
    generation model/tokenizer."""
    # Resolve the NEologd dictionary directory by invoking mecab-config
    # directly with an argument list instead of a shell pipeline: avoids
    # shell=True and the trailing newline that `echo` left in the path,
    # which ended up inside the -d argument in the old implementation.
    dicdir = subprocess.run(['mecab-config', '--dicdir'],
                            stdout=subprocess.PIPE,
                            check=True).stdout.decode('utf-8').strip()
    path = f"{dicdir}/mecab-ipadic-neologd"
    self.m = MeCab.Tagger(f"-Owakati -d {path}")
    logger.info("mecab loaded")
    self.model_dir = "hajime9652/xlnet-japanese"
    # self.model_dir = "./backend/PyTorch"
    self.gen_model = XLNetLMHeadModel.from_pretrained(self.model_dir)
    self.gen_tokenizer = XLNetTokenizer.from_pretrained(self.model_dir)
def test_embedding_lm(self):
    """The wrapped LM must match the stock Hugging Face model exactly."""
    # Reference: stock XLNet LM head on the shared test input.
    reference = XLNetLMHeadModel.from_pretrained('xlnet-base-cased')
    reference_out = reference(self.input)[0]  # last hidden state / logits
    # Candidate: our wrapper's LM path on the same input.
    wrapped_out = self.embed_model.lm(self.input)[0]
    assert torch.all(torch.eq(wrapped_out, reference_out)), \
        "LM embeddings were not the same"
def __init__(self, model_path='xlnet-base-cased', temperature=1.0, top_k=None,
             top_p=None, padding_text=None, device=None, return_past=False):
    """XLNet generator without external memory; can optionally expose past
    states from generation calls via *return_past*."""
    super().__init__(device, temperature=temperature, top_k=top_k,
                     top_p=top_p)
    self.model_path = model_path
    self.tokenizer = XLNetTokenizer.from_pretrained(model_path)
    self.model = XLNetLMHeadModel.from_pretrained(model_path)
    # Text prepended to short prompts so XLNet has enough left context.
    if padding_text:
        self.padding_text_idxes = self.tokenizer.encode(padding_text)
    else:
        self.padding_text_idxes = self.tokenizer.encode(self.PADDING_TEXT)
    self.model.to(self.device)
    self.model.eval()
    self.return_past = return_past
def run_mlm_mask_accuracy(model_name): tokenizer = AutoTokenizer.from_pretrained(model_name) # These do the same thing, except XLNet is a less popular model so it's not supported by # all AutoModel variants. if 'xlnet' in model_name: model = XLNetLMHeadModel.from_pretrained(model_name) else: model = AutoModelForMaskedLM.from_pretrained(model_name) # Make binary choice for a single sentence pair def mlm_sentence_pair(sent1, sent2): masked_toks, masked_ix, dtok1, dtok2 = get_masked_sequence( tokenizer, sent1, sent2) logit1 = model(torch.tensor([masked_toks])).logits[0, masked_ix, dtok1] logit2 = model(torch.tensor([masked_toks])).logits[0, masked_ix, dtok2] return bool(logit1 > logit2) sent_pairs = get_common_sentences() for task_name, sents in sent_pairs.items(): res = [mlm_sentence_pair(s1, s2) for (s1, s2) in sents] acc = sum(res) / len(sents) print(task_name, acc)
def test_lm_generate_xlnet_base_cased(self):
    """Regression test: greedy generation from xlnet-base-cased must
    reproduce a known continuation of the Rasputin passage.

    NOTE(review): `eos_token_ids` (plural) is a legacy generate() kwarg name;
    current transformers versions use `eos_token_id` — confirm against the
    pinned library version.
    """
    model = XLNetLMHeadModel.from_pretrained("xlnet-base-cased")
    # Encoded prompt: the Rasputin passage ("In 1991, the remains of Russian
    # Tsar Nicholas II ... begging for his blessing.") plus special tokens.
    input_ids = torch.Tensor(
        [
            [
                67, 2840, 19, 18, 1484, 20, 965, 29077, 8719, 1273, 21, 45,
                273, 17, 10, 15048, 28, 27511, 21, 4185, 11, 41, 2444, 9, 32,
                1025, 20, 8719, 26, 23, 673, 966, 19, 29077, 20643, 27511,
                20822, 20643, 19, 17, 6616, 17511, 18, 8978, 20, 18, 777, 9,
                19233, 1527, 17669, 19, 24, 673, 17, 28756, 150, 12943, 4354,
                153, 27, 442, 37, 45, 668, 21, 24, 256, 20, 416, 22, 2771,
                4901, 9, 12943, 4354, 153, 51, 24, 3004, 21, 28142, 23, 65,
                20, 18, 416, 34, 24, 2958, 22947, 9, 1177, 45, 668, 3097,
                13768, 23, 103, 28, 441, 148, 48, 20522, 19, 12943, 4354,
                153, 12860, 34, 18, 326, 27, 17492, 684, 21, 6709, 9, 8585,
                123, 266, 19, 12943, 4354, 153, 6872, 24, 3004, 20, 18, 9225,
                2198, 19, 12717, 103, 22, 401, 24, 6348, 9, 12943, 4354, 153,
                1068, 2768, 2286, 19, 33, 104, 19, 176, 24, 9313, 19, 20086,
                28, 45, 10292, 9, 4, 3,
            ]
        ]
    ).long()
    # Expected greedy output: the prompt ids followed by the model's
    # continuation ("1990, a priest who cannot even walk with his wife,
    # Maria, is asked to perform magic ...").
    expected_output_ids = [
        67, 2840, 19, 18, 1484, 20, 965, 29077, 8719, 1273, 21, 45, 273, 17,
        10, 15048, 28, 27511, 21, 4185, 11, 41, 2444, 9, 32, 1025, 20, 8719,
        26, 23, 673, 966, 19, 29077, 20643, 27511, 20822, 20643, 19, 17,
        6616, 17511, 18, 8978, 20, 18, 777, 9, 19233, 1527, 17669, 19, 24,
        673, 17, 28756, 150, 12943, 4354, 153, 27, 442, 37, 45, 668, 21, 24,
        256, 20, 416, 22, 2771, 4901, 9, 12943, 4354, 153, 51, 24, 3004, 21,
        28142, 23, 65, 20, 18, 416, 34, 24, 2958, 22947, 9, 1177, 45, 668,
        3097, 13768, 23, 103, 28, 441, 148, 48, 20522, 19, 12943, 4354, 153,
        12860, 34, 18, 326, 27, 17492, 684, 21, 6709, 9, 8585, 123, 266, 19,
        12943, 4354, 153, 6872, 24, 3004, 20, 18, 9225, 2198, 19, 12717, 103,
        22, 401, 24, 6348, 9, 12943, 4354, 153, 1068, 2768, 2286, 19, 33,
        104, 19, 176, 24, 9313, 19, 20086, 28, 45, 10292, 9, 4, 3, 1722, 19,
        24, 6348, 61, 977, 176, 1772, 33, 45, 970, 19, 4185, 19, 27, 442, 22,
        2771, 4901, 25, 18, 2059, 20, 24, 303, 1775, 691, 9, 1147, 19, 634,
        19, 43, 51, 54, 6157, 2999, 33, 4185,
    ]
    # Since, however, he has had difficulty walking with Maria
    torch.manual_seed(0)
    output_ids = model.generate(
        input_ids,
        bos_token_id=self.special_tokens["bos_token_id"],
        pad_token_id=self.special_tokens["pad_token_id"],
        eos_token_ids=self.special_tokens["eos_token_id"],
        max_length=200,
    )
    self.assertListEqual(output_ids[0].tolist(), expected_output_ids)
# Demo script: predict a masked token with XLNet using a permutation mask
# and an explicit target mapping (bi-directional context).
import torch
from transformers import XLNetTokenizer, XLNetLMHeadModel
import logging

logging.basicConfig(level=logging.INFO)

tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased')

# We show how to setup inputs to predict a next token using a bi-directional context.
input_ids = torch.tensor(
    tokenizer.encode("Hello, my dog is very <mask>")).unsqueeze(
        0)  # We will predict the masked token
perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]),
                        dtype=torch.float)
perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
target_mapping = torch.zeros(
    (1, 1, input_ids.shape[1]),
    dtype=torch.float)  # Shape [1, 1, seq_length] => let's predict one token
target_mapping[
    0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
outputs = model(input_ids, perm_mask=perm_mask,
                target_mapping=target_mapping)
next_token_logits = outputs[
    0]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
print(next_token_logits)
print(next_token_logits.shape)
# Highest-scoring vocabulary id for the masked position.
predicted_index = torch.argmax(next_token_logits).item()
print(predicted_index)
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])
print(predicted_token)
def _get_masked_language_model(self):
    """Load the pretrained XLNet LM head named by ``self.model`` and put it
    in evaluation mode."""
    mlm = XLNetLMHeadModel.from_pretrained(self.model)
    mlm.eval()
    self.mlm = mlm
sentence_best_word_probs.append(best_word_prob) best_words.append( model_tokenizer.convert_ids_to_tokens( predicted_prob.argmax().item())) return (sentence_word_probs, sentence_best_word_probs, best_words) ###################################################### ### Compute XLNet scores ###################################################### for XLNET_MODEL in tqdm(['xlnet-base-cased', 'xlnet-large-cased']): model_tokenizer = XLNetTokenizer.from_pretrained(XLNET_MODEL) model = XLNetLMHeadModel.from_pretrained(XLNET_MODEL) if torch.cuda.is_available(): model = model.cuda() model = model.eval() for dial in tqdm(itertools.chain(convai1_data, convai2_data), total=convai_data_len): utterances = dial['utterances'] sentences_word_probs = list() sentences_best_word_probs = list() sentences_best_words = list() for u1, u2 in zip(utterances[:-1], utterances[1:]):
# %% from transformers import RobertaTokenizer, RobertaForMaskedLM from transformers import ElectraTokenizer, ElectraForMaskedLM from transformers import BartTokenizer, BartForConditionalGeneration from transformers import XLMRobertaTokenizer, XLMRobertaForMaskedLM from transformers import XLNetTokenizer, XLNetLMHeadModel import torch import string from transformers import BertTokenizer, BertForMaskedLM bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') bert_model = BertForMaskedLM.from_pretrained('bert-base-uncased').eval() xlnet_tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') xlnet_model = XLNetLMHeadModel.from_pretrained('xlnet-base-cased').eval() xlmroberta_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base') xlmroberta_model = XLMRobertaForMaskedLM.from_pretrained( 'xlm-roberta-base').eval() bart_tokenizer = BartTokenizer.from_pretrained('bart-large') bart_model = BartForConditionalGeneration.from_pretrained('bart-large').eval() electra_tokenizer = ElectraTokenizer.from_pretrained( 'google/electra-small-generator') electra_model = ElectraForMaskedLM.from_pretrained( 'google/electra-small-generator').eval() roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base') roberta_model = RobertaForMaskedLM.from_pretrained('roberta-base').eval()
args = parser.parse_args() os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu model_path = f'../checkpoints/xlnet_maskedlm/{args.dataset}' args.model_path = model_path if args.generate_mode == 0: print('construct data with masked lm.') elif args.generate_mode == 1: print('construct data with random sampling.') else: print('construct data with masked lm and random sampling.') try: # load the pre-trained model and tokenizer tokenizer = XLNetTokenizer.from_pretrained(args.model_path) model = XLNetLMHeadModel.from_pretrained(args.model_path) print('Initialize XLNet from checkpoint {}.'.format(args.model_path)) except: tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') model = XLNetLMHeadModel.from_pretrained('xlnet-base-cased') print('Initialize XLNet with default parameters.') model.eval() model.to('cuda') for mode in ['validation', 'train']: if mode == 'train': dataset_size = args.train_dataset_size else: dataset_size = args.validation_dataset_size
parser.add_argument('--data_start', type=float, default=0, help='start point of data in 0-1 for DUC or TAC') parser.add_argument('--data_end', type=float, default=1, help='end point of data in 0-1 for DUC or TAC') parser.add_argument('--save_freq', type=int, default=1) parser.add_argument('--debug', action='store_true') args = parser.parse_args() return args if __name__ == '__main__': args = parse_args() # XLNet models tokenizer = XLNetTokenizer.from_pretrained(args.xlnet_model) model = XLNetLMHeadModel.from_pretrained(args.xlnet_model) if args.gpu_parallel: model = nn.DataParallel(model).cuda() else: cuda_dev = torch.device('cuda:{}'.format(args.gpu_id)) model = model.cuda(cuda_dev) model.train(False) # spaCy: used for merge noun chunks & name entities spacy.prefer_gpu() nlp = spacy.load(args.spacy_model) def merge_entities_and_nouns(doc, ret=True): assert doc.is_parsed with doc.retokenize() as retokenizer: seen_words = set()
# Demo: predict a masked Chinese token with a locally trained XLNet
# (sentencepiece-BPE tokenizer), using a permutation mask plus target
# mapping so only the final <mask> position is predicted.
from transformers import XLNetTokenizer, XLNetLMHeadModel
import torch
import torch.nn.functional as F

tokenizer = XLNetTokenizer.from_pretrained('./model/spbpe')
model = XLNetLMHeadModel.from_pretrained('./model/spbpe')
# Embedding table must match the local tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))

tokens = tokenizer.encode("在一件申请需要分案的情<mask>")
# We show how to setup inputs to predict a next token using a bi-directional context.
input_ids = torch.tensor(tokens).unsqueeze(0)  # We will predict the masked token
# print(input_ids)
perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]),
                        dtype=torch.float)
perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
target_mapping = torch.zeros(
    (1, 1, input_ids.shape[1]),
    dtype=torch.float)  # Shape [1, 1, seq_length] => let's predict one token
target_mapping[0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
outputs = model(input_ids, perm_mask=perm_mask,
                target_mapping=target_mapping)
# next_token_logits = outputs[0]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
# print(next_token_logits)
predicted_index = torch.argmax(outputs[0][0]).item()
predicted_token = tokenizer.convert_ids_to_tokens(predicted_index)
print(predicted_token)

# Second example sentence, same masking setup.
tokens = tokenizer.encode("被侵害人,是因自己的人身、财产、名<mask>")
# We show how to setup inputs to predict a next token using a bi-directional context.
input_ids = torch.tensor(tokens).unsqueeze(0)  # We will predict the masked token
masked_lm = None mode = 0 elif args.model_name =='XLNetLMGenerate': forward_lm_path = '../checkpoints/forward_xlnet/{}'.format(args.dataset) args.forward_lm_path = forward_lm_path backward_lm_path = '../checkpoints/backward_xlnet/{}'.format(args.dataset) args.backward_lm_path = backward_lm_path masked_lm_path = '../checkpoints/xlnet_maskedlm/{}'.format(args.dataset) args.masked_lm_path = masked_lm_path forward_lm_tokenizer = XLNetTokenizer.from_pretrained(forward_lm_path) forward_lm = XLNetLMHeadModel.from_pretrained(forward_lm_path) logger.logger.info('Initialize forward XLNet LM from checkpoint {}.'.format(forward_lm_path)) # backward_lm_tokenizer = XLNetTokenizer.from_pretrained(backward_lm_path) backward_lm = XLNetLMHeadModel.from_pretrained(backward_lm_path) logger.logger.info('Initialize backward XLNet LM from checkpoint {}.'.format(backward_lm_path)) if args.generate_candidate_method==3: masked_lm =XLNetLMHeadModel.from_pretrained(masked_lm_path) logger.logger.info('Initialize masked XLNet LM from checkpoint {}.'.format(masked_lm_path)) else: masked_lm = None mode = 1 else: raise ValueError('wrong model type.')
def add_arguments():
    """Build the full CLI for seq2seq training / classification /
    adversarial attack-and-defence experiments, parse it, and post-process
    the namespace (vocab checking, optional USE model, optional BERT/XLNet
    acceptability model).

    Returns:
        argparse.Namespace with extra attributes attached: vocab_file,
        vocab_size, (ae_vocab_file / ae_vocab_size when counter-fitted
        embeddings are used), use_model, and — when --accept_name is set —
        tokenizer, acpt_model and device.
    """
    parser = ArgumentParser()
    # basic
    parser.add_argument('--do_train', action='store_true', help="do training")
    parser.add_argument('--do_test', action='store_true',
                        help="do independent test")
    parser.add_argument('--do_cond_test', action='store_true',
                        help="do test for conditional generation")
    parser.add_argument('--input_file', type=str, default=None, help="")
    parser.add_argument('--dev_file', type=str, default=None, help="")
    parser.add_argument('--test_file', type=str, default=None, help="")
    parser.add_argument('--vocab_file', type=str, default=None, help="")
    parser.add_argument('--emb_file', type=str, default=None, help="")
    parser.add_argument('--output_dir', type=str, default=None, help="")
    parser.add_argument('--attention', action='store_true',
                        help='whether use attention in seq2seq')
    parser.add_argument('--cls_attention', action='store_true', help="")
    parser.add_argument('--cls_attention_size', type=int, default=300, help="")
    # hyper-parameters
    parser.add_argument('--batch_size', type=int, default=32, help="")
    parser.add_argument('--num_epochs', type=int, default=5, help="")
    parser.add_argument('--learning_rate', type=float, default=0.001, help="")
    parser.add_argument('--enc_type', type=str, default='bi', help="")
    parser.add_argument('--enc_num_units', type=int, default=512, help="")
    parser.add_argument('--enc_layers', type=int, default=2, help="")
    parser.add_argument('--dec_num_units', type=int, default=512, help="")
    parser.add_argument('--dec_layers', type=int, default=2, help="")
    parser.add_argument('--epochs', type=int, default=2, help="")
    parser.add_argument("--max_gradient_norm", type=float, default=5.0,
                        help="Clip gradients to this norm.")
    parser.add_argument('--max_to_keep', type=int, default=5, help="")
    parser.add_argument(
        '--lowest_bound_score', type=float, default=10.0,
        help="Stop the training once achieving the lowest_bound_score")
    parser.add_argument('--beam_width', type=int, default=0, help="")
    parser.add_argument("--num_buckets", type=int, default=5,
                        help="Put data into similar-length buckets.")
    parser.add_argument("--max_len", type=int, default=50,
                        help="Lenth max of input sentences")
    parser.add_argument('--tgt_min_len', type=int, default=0,
                        help='Length min of target sentences')
    # training control
    parser.add_argument('--print_every_steps', type=int, default=1, help="")
    parser.add_argument('--save_every_epoch', type=int, default=1, help="")
    parser.add_argument(
        '--stop_steps', type=int, default=20000,
        help="number of steps of non-improve to terminate training")
    parser.add_argument('--total_steps', type=int, default=None,
                        help="total number of steps for training")
    parser.add_argument('--random_seed', type=int, default=1, help="")
    parser.add_argument('--num_gpus', type=int, default=0, help="")
    parser.add_argument('--save_checkpoints', action='store_true',
                        help='Whether save models while training')
    # classification
    parser.add_argument('--classification', action='store_true',
                        help="Perform classification")
    parser.add_argument('--classification_model', type=str, default='RNN',
                        help='')
    parser.add_argument('--output_classes', type=int, default=2,
                        help="number of classes for classification")
    parser.add_argument('--output_file', type=str, default=None,
                        help="Classification output for train set")
    parser.add_argument('--dev_output', type=str, default=None,
                        help="Classification output for dev set")
    parser.add_argument('--test_output', type=str, default=None,
                        help="Classification output for test set")
    parser.add_argument('--filter_sizes', nargs='+', default=[5, 3], type=int,
                        help='filter sizes, only for CNN')
    parser.add_argument('--dropout_keep_prob', type=float, default=0.8,
                        help='dropout, only for CNN')
    parser.add_argument('--bert_config_file', type=str, default=None,
                        help='pretrained bert config file')
    parser.add_argument('--bert_init_chk', type=str, default=None,
                        help='checkpoint for pretrained Bert')
    # adversarial attack and defence
    parser.add_argument('--adv', action='store_true',
                        help="Perform adversarial attack training/testing")
    parser.add_argument('--cls_enc_type', type=str, default='bi', help="")
    parser.add_argument('--cls_enc_num_units', type=int, default=256, help="")
    parser.add_argument('--cls_enc_layers', type=int, default=2, help="")
    parser.add_argument('--gumbel_softmax_temporature', type=float,
                        default=0.1, help="")
    parser.add_argument('--load_model_cls', type=str, default=None,
                        help="Path to target classification model")
    parser.add_argument('--load_model_ae', type=str, default=None,
                        help="Path to pretrained AE")
    parser.add_argument('--load_model', type=str, default=None,
                        help="Trained model for testing")
    parser.add_argument('--load_model_pos', type=str, default=None,
                        help="PTN attack model for testing")
    parser.add_argument('--load_model_neg', type=str, default=None,
                        help="NTP attack model for testing")
    # balanced attack
    parser.add_argument('--balance', action='store_true',
                        help="Whether balance between pos/neg attack")
    # label smoothing
    parser.add_argument('--label_beta', type=float, default=None,
                        help='label smoother param, must be > 0.5')
    # use counter-fitted embedding for AE (AE embedding different from CLS embeddings)
    parser.add_argument('--ae_vocab_file', type=str, default=None,
                        help='Path to counter-fitted vocabulary')
    parser.add_argument('--ae_emb_file', type=str, default=None,
                        help='Path to counter-fitted embeddings')
    # gan auxiliary loss
    parser.add_argument('--gan', action='store_true',
                        help='Whether use GAN as regularization')
    # conditional generation (1 or 0)
    parser.add_argument(
        '--target_label', type=int, default=None,
        help="Target label for conditional generation, 0 (PTN) or 1 (NTP)")
    # include defending
    parser.add_argument(
        '--defending', action='store_true',
        help="whether train C* for more robust classification models")
    # train defending classifier with augmented dataset
    parser.add_argument(
        '--def_train_set', nargs='+', default=[], type=str,
        help='Set of adversarial examples to include in adv training')
    # attack an AE model using the augmented classifier as the target classifier
    parser.add_argument(
        '--use_defending_as_target', action='store_true',
        help='Use the defending component as the target classifier')
    # loss control
    parser.add_argument('--at_steps', type=int, default=1,
                        help='Alternative steps for GAN/Defending')
    parser.add_argument('--ae_lambda', type=float, default=0.8,
                        help='weighting ae_loss+sent_loss v.s. adv_loss')
    parser.add_argument('--seq_lambda', type=float, default=1.0,
                        help='weighting ae_loss v.s. sent_loss')
    parser.add_argument('--aux_lambda', type=float, default=1.0,
                        help='weighting ae_loss v.s. auxiliary losses')
    parser.add_argument('--sentiment_emb_dist', type=str, default='avgcos',
                        help="whether involve embedding distance as aux loss")
    parser.add_argument('--loss_attention', action='store_true',
                        help="whether weight emb dist")
    parser.add_argument('--loss_attention_norm', action='store_true',
                        help="whether apply minimax norm to ae_loss_attention")
    # copy mechanism
    parser.add_argument('--copy', action='store_true',
                        help="Whether use copy mechanism")
    parser.add_argument('--attention_copy_mask', action='store_true',
                        help="Whether use attention to calculate copy mask")
    parser.add_argument('--use_stop_words', action='store_true',
                        help="whether mask stop words")
    parser.add_argument(
        '--top_k_attack', type=int, default=None,
        help=
        "number of words to attack in copy mechanism, only set when args.copy is set to true."
    )
    parser.add_argument(
        '--load_copy_model', type=str, default=None,
        help="Pretrained attention layer from the bi_att model")
    # evaluation options
    parser.add_argument('--use_cache_dir', type=str, default=None,
                        help='cache dir for use (sem) eval')
    parser.add_argument(
        '--accept_name', type=str, default=None,
        help="model name for acceptibility scores (xlnet), only used when set")
    args = parser.parse_args()

    # Create the output directory only when checkpointing is requested.
    if args.save_checkpoints and not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    # Normalize the vocabulary (special tokens are skipped for BERT, which
    # ships its own) and record the resulting path/size on the namespace.
    vocab_size, vocab_file = input_data.check_vocab(
        args.vocab_file,
        args.output_dir,
        check_special_token=False if
        (args.classification_model == 'BERT') else True,
        vocab_base_name='vocab.txt')
    args.vocab_file = vocab_file
    args.vocab_size = vocab_size
    # Same treatment for the optional counter-fitted AE vocabulary.
    if args.ae_vocab_file is not None:
        ae_vocab_size, ae_vocab_file = input_data.check_vocab(
            args.ae_vocab_file,
            args.output_dir,
            check_special_token=False if
            (args.classification_model == 'BERT') else True,
            vocab_base_name='ae_vocab.txt')
        args.ae_vocab_size = ae_vocab_size
        args.ae_vocab_file = ae_vocab_file
    # Optional Universal Sentence Encoder for semantic evaluation.
    args.use_model = None
    if args.use_cache_dir is not None:
        args.use_model = USE(args.use_cache_dir)
    # Optional acceptability scorer: BERT or XLNet masked-LM in eval mode.
    if args.accept_name is not None:
        if args.accept_name == 'bert':
            args.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                           do_lower_case=True)
            args.acpt_model = BertForMaskedLM.from_pretrained(
                'bert-base-uncased')
        elif args.accept_name == 'xlnet':
            args.tokenizer = XLNetTokenizer.from_pretrained(
                'xlnet-large-cased')
            args.acpt_model = XLNetLMHeadModel.from_pretrained(
                'xlnet-large-cased')
        args.device = torch.device(
            'cpu') if args.num_gpus == 0 else torch.device('cuda:0')
        args.acpt_model.to(args.device)
        args.acpt_model.eval()
    return args
print("have keypoint") model_mask.eval() model_fast.eval() model_keypoint.eval() model_mask.cuda() model_fast.cuda() model_keypoint.cuda() print("Evaled all") print("GPT2 Time") tokenizerG = GPT2Tokenizer.from_pretrained("gpt2") modelG = GPT2LMHeadModel.from_pretrained("gpt2") modelG.to("cuda") print("Done") print("XLNet Time") tokenizerX = XLNetTokenizer.from_pretrained("xlnet-base-cased") modelX = XLNetLMHeadModel.from_pretrained("xlnet-base-cased") print("BigGan Time!") from pytorch_pretrained_biggan import ( BigGAN, one_hot_from_names, truncated_noise_sample, convert_to_images, ) modelBG = BigGAN.from_pretrained("biggan-deep-256") modelX.to("cuda") print("All prep complete!") labels = { int(key): value for (key, value) in requests.get(
def mlm_accuracy(sentpairs): res = [fill_one(s1, s2) for (s1, s2) in sentpairs] return sum(res) / len(sentpairs) for task_name, sents in sent_pairs.items(): print(task_name, mlm_accuracy(sents)) # ## XLNet needs to be done differently # In[7]: model_name = 'xlnet-base-cased' tokenizer = AutoTokenizer.from_pretrained(model_name) model = XLNetLMHeadModel.from_pretrained(model_name) # In[8]: def fill_one(sent1, sent2): toks1 = tokenizer(sent1, add_special_tokens=False)['input_ids'] toks2 = tokenizer(sent2, add_special_tokens=False)['input_ids'] masked_toks = [] masked_ix = None dtok1 = None dtok2 = None for ix in range(len(toks1)): if toks1[ix] != toks2[ix]:
def main():
    """Train and/or evaluate an XLNet permutation language model (PLM).

    Stages, driven entirely by the three parsed argument dataclasses:
      1. parse ModelArguments/DataTrainingArguments/TrainingArguments from
         the CLI or from a single JSON file,
      2. detect a resumable checkpoint in the output directory,
      3. set up logging and seed the RNGs,
      4. load the dataset (hub dataset name or local train/validation files),
      5. build config, tokenizer and XLNetLMHeadModel (pretrained or from
         scratch),
      6. tokenize line-by-line or concatenate-and-chunk into max_seq_length
         blocks,
      7. train/evaluate via Trainer with a permutation-LM data collator,
         log/save metrics, and optionally push the model to the hub.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            # Non-empty output dir with no checkpoint: refuse to clobber it.
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    # Only the main process logs at INFO; other ranks stay quiet at WARN.
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank
                                                    ) else logging.WARN)

    # Log on each process the small summary:
    # NOTE(review): the two f-strings below are concatenated without a
    # separator, so "n_gpu: N" runs straight into "distributed training:".
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        +
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name,
                                data_args.dataset_config_name,
                                cache_dir=model_args.cache_dir)
        if "validation" not in datasets.keys():
            # No published validation split: carve one off the head of the
            # train split and keep the remainder for training.
            datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
            )
    else:
        # Local files: the extension of the train file selects the loader.
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        datasets = load_dataset(extension,
                                data_files=data_files,
                                cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config_kwargs = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            **config_kwargs)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            **config_kwargs)
    else:
        config = XLNetConfig()
        logger.warning(
            "You are instantiating a new config instance from scratch.")
    tokenizer_kwargs = {
        "cache_dir": model_args.cache_dir,
        "use_fast": model_args.use_fast_tokenizer,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name,
                                                  **tokenizer_kwargs)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, **tokenizer_kwargs)
    else:
        # Unlike the config, a tokenizer cannot be built from scratch here.
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )
    if model_args.model_name_or_path:
        model = XLNetLMHeadModel.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        logger.info("Training new model from scratch")
        model = XLNetLMHeadModel.from_config(config)

    # Embedding matrix must cover the tokenizer's vocabulary.
    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if data_args.max_seq_length > tokenizer.model_max_length:
        # NOTE(review): these two f-strings also join without a space
        # ("...for the" + "model").
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    if data_args.line_by_line:
        # When using line_by_line, we just tokenize each nonempty line.
        padding = "max_length" if data_args.pad_to_max_length else False

        def tokenize_function(examples):
            # Remove empty lines
            examples["text"] = [
                line for line in examples["text"]
                if len(line) > 0 and not line.isspace()
            ]
            return tokenizer(examples["text"],
                             padding=padding,
                             truncation=True,
                             max_length=max_seq_length)

        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=[text_column_name],
            load_from_cache_file=not data_args.overwrite_cache,
        )
    else:
        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
        def tokenize_function(examples):
            return tokenizer(examples[text_column_name])

        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )

        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
        # max_seq_length.
        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {
                k: sum(examples[k], [])
                for k in examples.keys()
            }
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
            # customize this part to your needs.
            total_length = (total_length // max_seq_length) * max_seq_length
            # Split by chunks of max_len.
            result = {
                k: [
                    t[i:i + max_seq_length]
                    for i in range(0, total_length, max_seq_length)
                ]
                for k, t in concatenated_examples.items()
            }
            return result

        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
        # might be slower to preprocess.
        #
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
        tokenized_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    if training_args.do_train:
        if "train" not in tokenized_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = tokenized_datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(
                range(data_args.max_train_samples))

    if training_args.do_eval:
        if "validation" not in tokenized_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = tokenized_datasets["validation"]
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(
                range(data_args.max_eval_samples))

    # Data collator: builds the permutation masks XLNet trains on.
    data_collator = DataCollatorForPermutationLanguageModeling(
        tokenizer=tokenizer,
        plm_probability=data_args.plm_probability,
        max_span_length=data_args.max_span_length,
    )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        # Explicit --resume_from_checkpoint wins over the auto-detected one.
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload
        metrics = train_result.metrics

        max_train_samples = (data_args.max_train_samples
                             if data_args.max_train_samples is not None else
                             len(train_dataset))
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate()

        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(
            eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
        # Perplexity is exp of the mean eval cross-entropy loss.
        perplexity = math.exp(metrics["eval_loss"])
        metrics["perplexity"] = perplexity

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    if training_args.push_to_hub:
        kwargs = {
            "finetuned_from": model_args.model_name_or_path,
            "tags": "language-modeling"
        }
        if data_args.dataset_name is not None:
            kwargs["dataset_tags"] = data_args.dataset_name
            if data_args.dataset_config_name is not None:
                kwargs["dataset_args"] = data_args.dataset_config_name
                kwargs[
                    "dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
            else:
                kwargs["dataset"] = data_args.dataset_name

        trainer.push_to_hub(**kwargs)
def main():
    """Train and/or evaluate an XLNet permutation language model, with
    optional FASTA input support.

    Differences from the stock HF PLM script visible here:
      * a ``fasta`` train-file extension routes loading through
        ``load_dataset_fasta`` and wraps encodings in ``FastaDataset``;
      * with a plain model path the tokenizer is loaded via
        ``XLNetTokenizer`` rather than ``AutoTokenizer``;
      * eval results are also written to ``eval_results_plm.txt``.

    Returns:
        dict: ``{"perplexity": float}`` when ``--do_eval`` is set,
        otherwise an empty dict.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Refuse to train into a non-empty output dir unless overwriting is allowed.
    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty."
            "Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    # NOTE(review): the two f-strings below are concatenated without a
    # separator, so "n_gpu: N" runs straight into "distributed training:".
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        if extension == "fasta":
            # NOTE(review): FASTA_DATASET is assigned but never read in this
            # function; presumably it is meant to set a module-level flag —
            # confirm whether a `global FASTA_DATASET` declaration is missing.
            FASTA_DATASET = True
            datasets = load_dataset_fasta(data_files, data_args.max_seq_length)
        else:
            if extension == "txt":
                extension = "text"
            datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = XLNetConfig()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
        )
    elif model_args.model_name_or_path:
        tokenizer = XLNetTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
        )
    else:
        # Unlike the config, a tokenizer cannot be built from scratch here.
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.model_name_or_path:
        model = XLNetLMHeadModel.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = XLNetLMHeadModel.from_config(config)

    # Embedding matrix must cover the tokenizer's vocabulary.
    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    tokenized_datasets = dict()
    for dataset_key, dataset in datasets.items():
        # Tokenize
        # assumes each split exposes a 'sequences' column — the FASTA loader
        # path; TODO confirm for non-FASTA inputs.
        encodings = tokenizer(
            dataset['sequences'],
            truncation=True,
            padding='max_length',  # TODO get from args passed in
            max_length=data_args.max_seq_length,
            return_special_tokens_mask=True,
            return_token_type_ids=False,
            return_attention_mask=False
        )
        torch_dataset = FastaDataset(encodings)
        tokenized_datasets[dataset_key] = torch_dataset

    # Data collator: builds the permutation masks XLNet trains on.
    data_collator = DataCollatorForPermutationLanguageModeling(
        tokenizer=tokenizer,
        plm_probability=data_args.plm_probability,
        max_span_length=data_args.max_span_length,
    )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        # Resume in place only when the model path is a local checkpoint dir.
        model_path = (
            model_args.model_name_or_path
            if (model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path))
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        eval_output = trainer.evaluate()
        # Perplexity is exp of the mean eval cross-entropy loss.
        perplexity = math.exp(eval_output["eval_loss"])
        results["perplexity"] = perplexity

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_plm.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f" {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    return results
def test_lm_generate_xlnet_base_cased(self):
    """Regression test: greedy generation from ``xlnet-base-cased`` must
    reproduce a fixed, previously recorded token sequence.

    Loads the pretrained checkpoint, feeds a fixed prompt, generates up to
    200 tokens with ``do_sample=False`` (deterministic greedy decoding) and
    asserts the full output ids equal the recorded expectation.

    Fix applied: the original block contained an orphaned triple-quote and a
    bare prose line ("He is asked to perform") that made the method
    unparsable; the prose is folded back into the explanatory comments.
    """
    model = XLNetLMHeadModel.from_pretrained("xlnet-base-cased")
    model.to(torch_device)
    input_ids = torch.tensor(
        [[
            67, 2840, 19, 18, 1484, 20, 965, 29077, 8719, 1273, 21, 45,
            273, 17, 10, 15048, 28, 27511, 21, 4185, 11, 41, 2444, 9, 32,
            1025, 20, 8719, 26, 23, 673, 966, 19, 29077, 20643, 27511,
            20822, 20643, 19, 17, 6616, 17511, 18, 8978, 20, 18, 777, 9,
            19233, 1527, 17669, 19, 24, 673, 17, 28756, 150, 12943, 4354,
            153, 27, 442, 37, 45, 668, 21, 24, 256, 20, 416, 22, 2771,
            4901, 9, 12943, 4354, 153, 51, 24, 3004, 21, 28142, 23, 65,
            20, 18, 416, 34, 24, 2958, 22947, 9, 1177, 45, 668, 3097,
            13768, 23, 103, 28, 441, 148, 48, 20522, 19, 12943, 4354,
            153, 12860, 34, 18, 326, 27, 17492, 684, 21, 6709, 9, 8585,
            123, 266, 19, 12943, 4354, 153, 6872, 24, 3004, 20, 18, 9225,
            2198, 19, 12717, 103, 22, 401, 24, 6348, 9, 12943, 4354, 153,
            1068, 2768, 2286, 19, 33, 104, 19, 176, 24, 9313, 19, 20086,
            28, 45, 10292, 9, 4, 3,
        ]],
        dtype=torch.long,
        device=torch_device,
    )
    # Prompt, decoded:
    # In 1991, the remains of Russian Tsar Nicholas II and his family
    # (except for Alexei and Maria) are discovered.
    # The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
    # remainder of the story. 1883 Western Siberia,
    # a young Grigori Rasputin is asked by his father and a group of men to perform magic.
    # Rasputin has a vision and denounces one of the men as a horse thief. Although his
    # father initially slaps him for making such an accusation, Rasputin watches as the
    # man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
    # the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
    # with people, even a bishop, begging for his blessing.

    expected_output_ids = [
        67, 2840, 19, 18, 1484, 20, 965, 29077, 8719, 1273, 21, 45, 273,
        17, 10, 15048, 28, 27511, 21, 4185, 11, 41, 2444, 9, 32, 1025,
        20, 8719, 26, 23, 673, 966, 19, 29077, 20643, 27511, 20822,
        20643, 19, 17, 6616, 17511, 18, 8978, 20, 18, 777, 9, 19233,
        1527, 17669, 19, 24, 673, 17, 28756, 150, 12943, 4354, 153, 27,
        442, 37, 45, 668, 21, 24, 256, 20, 416, 22, 2771, 4901, 9, 12943,
        4354, 153, 51, 24, 3004, 21, 28142, 23, 65, 20, 18, 416, 34, 24,
        2958, 22947, 9, 1177, 45, 668, 3097, 13768, 23, 103, 28, 441,
        148, 48, 20522, 19, 12943, 4354, 153, 12860, 34, 18, 326, 27,
        17492, 684, 21, 6709, 9, 8585, 123, 266, 19, 12943, 4354, 153,
        6872, 24, 3004, 20, 18, 9225, 2198, 19, 12717, 103, 22, 401, 24,
        6348, 9, 12943, 4354, 153, 1068, 2768, 2286, 19, 33, 104, 19,
        176, 24, 9313, 19, 20086, 28, 45, 10292, 9, 4, 3, 19, 12943,
        4354, 153, 27, 442, 22, 2771, 4901, 9, 69, 27, 442, 22, 2771, 24,
        11335, 20, 18, 9225, 2198, 9, 69, 27, 442, 22, 2771, 24, 11335,
        20, 18, 9225, 2198, 9, 69, 27, 442, 22, 2771,
    ]
    # Expected continuation, decoded (prompt text elided):
    # In 1991, the remains of Russian Tsar Nicholas II and his family (except for Alexei and Maria)
    # are discovered. The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich,
    # narrates the remainder of the story. 1883 Western Siberia, a young Grigori Rasputin
    # is asked by his father and a group of men to perform magic. Rasputin has a vision and
    # denounces one of the men as a horse thief. Although his father initially slaps
    # him for making such an accusation, Rasputin watches as the man is chased outside and beaten.
    # Twenty years later, Rasputin sees a vision of the Virgin Mary, prompting him to become a priest.
    # Rasputin quickly becomes famous, with people, even a bishop, begging for his blessing.
    # <sep><cls>, Rasputin is asked to perform magic. He is asked to perform a ritual of the Virgin Mary.
    # He is asked to perform a ritual of the Virgin Mary. He is asked to perform

    output_ids = model.generate(input_ids, max_length=200, do_sample=False)
    self.assertListEqual(output_ids[0].tolist(), expected_output_ids)
default=50, help='the max length of sentences for training language models.') parser.add_argument('--gpu', type=str, default='0') parser.add_argument('--dataset', type=str, default='one-billion-words', choices=['yelp', 'amazon', 'one-billion-words']) args = parser.parse_args() os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu forward_model_path = '../checkpoints/forward_xlnet/{}'.format(args.dataset) backward_model_path = '../checkpoints/backward_xlnet/{}'.format( args.dataset) forward_model = XLNetLMHeadModel.from_pretrained(forward_model_path) backward_model = XLNetLMHeadModel.from_pretrained(backward_model_path) forward_tokenizer = XLNetTokenizer.from_pretrained(forward_model_path) backward_tokenizer = XLNetTokenizer.from_pretrained(backward_model_path) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") print("device:", device) forward_model = forward_model.to(device) backward_model = backward_model.to(device) forward_testset = XLNetDataset( args.dataset, "test", tokenizer=forward_tokenizer, max_sentence_length=args.max_sentence_length,