def reload(self, bert_model, gpu):
    """Load an XLNet tokenizer/model pair and freeze the model.

    :param bert_model: model name, ``*.tar.gz`` archive path, or directory
    :param gpu: when truthy, move the model to CUDA
    """
    from pytorch_transformers import XLNetTokenizer, XLNetModel
    if bert_model.endswith('.tar.gz'):
        # Archive checkpoints ship their vocab alongside as *-vocab.txt.
        self.tokenizer = NoPickle(
            XLNetTokenizer.from_pretrained(
                bert_model.replace('.tar.gz', '-vocab.txt'),
                do_lower_case=self.lower))
    else:
        self.tokenizer = NoPickle(
            XLNetTokenizer.from_pretrained(bert_model,
                                           do_lower_case=self.lower))
    self.xlnet = NoPickle(XLNetModel.from_pretrained(bert_model))
    if gpu:
        self.xlnet = self.xlnet.cuda()
    self.output_dim = self.xlnet.d_model
    # self.max_len = self.xlnet.embeddings.position_embeddings.num_embeddings
    for p in self.xlnet.parameters():
        p.requires_grad = False
    if self.finetune_tune_last_n > 0:
        # BUG FIX: XLNetModel stores its transformer blocks in ``.layer``;
        # ``.encoder.layer`` is BERT's layout and raised AttributeError here.
        self.finetune_layers = self.xlnet.layer[-self.finetune_tune_last_n:]
        for p in self.finetune_layers.parameters():
            p.requires_grad = True
def get_dataloader(myPath, max_len=128, batch_size=50):
    """Build a sequential DataLoader over the reviews found at *myPath*.

    The first half of the corpus is labelled 1 and the second half 0
    (assumes a pos/neg-ordered prediction corpus).
    """
    reviews = get_data(myPath)

    # Tokenize and map to vocabulary ids, padded/truncated to max_len.
    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased',
                                               do_lower_case=False)
    ids = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(rev))
           for rev in reviews]
    ids = pad_sequences(ids, maxlen=max_len, dtype="long",
                        truncating="post", padding="post")
    print('tokenized inputs')

    # 1.0 over real tokens, 0.0 over padding.
    # NOTE(review): treats id 0 as padding — XLNet's actual pad id may
    # differ; confirm against the tokenizer.
    masks = [[float(token_id > 0) for token_id in seq] for seq in ids]

    inputs = torch.tensor(ids, dtype=torch.long)
    mask_tensor = torch.tensor(masks)
    labels = torch.zeros([len(reviews)], dtype=torch.long)
    labels[:len(reviews) // 2] = 1
    print("loaded tensors")

    dataset = TensorDataset(inputs, mask_tensor, labels)
    sampler = SequentialSampler(dataset)
    return DataLoader(dataset, sampler=sampler, batch_size=batch_size)
def __init__(
    self,
    pretrained_model_name_or_path: str = "xlnet-large-cased",
    layers: str = "1",
    pooling_operation: str = "first_last",
    use_scalar_mix: bool = False,
):
    """XLNet embeddings, as proposed in Yang et al., 2019.

    :param pretrained_model_name_or_path: name or path of XLNet model
    :param layers: comma-separated list of layers
    :param pooling_operation: defines pooling operation for subwords
    :param use_scalar_mix: defines the usage of scalar mix for specified layer(s)
    """
    super().__init__()

    self.name = pretrained_model_name_or_path
    self.tokenizer = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path)
    # All hidden states are needed for layer selection / scalar mix.
    self.model = XLNetModel.from_pretrained(
        pretrained_model_name_or_path=pretrained_model_name_or_path,
        output_hidden_states=True,
    )
    self.layers: List[int] = [int(layer_id) for layer_id in layers.split(",")]
    self.pooling_operation = pooling_operation
    self.use_scalar_mix = use_scalar_mix
    self.static_embeddings = True

    # Embed a throwaway one-token sentence to discover the output width.
    probe: Sentence = Sentence()
    probe.add_token(Token("hello"))
    embedded = self.embed(probe)
    self.__embedding_length: int = len(embedded[0].get_token(1).get_embedding())
def xlnet_feature_extractor(examples):
    """Run *examples* through XLNet and return the last-position features."""
    config = XLNetConfig.from_pretrained(model_name)
    tokenizer = XLNetTokenizer.from_pretrained(model_name)
    model = XLNetForFeatureExtraction(config)

    features = convert_examples_to_features(
        examples, MAX_SEQ_LEN, tokenizer,
        cls_token_at_end=True,      # XLNet has its cls token at the end
        cls_token=tokenizer.cls_token,
        sep_token=tokenizer.sep_token,
        cls_token_segment_id=2,     # == ``2 if True else 1`` (XLNet branch)
        pad_on_left=True,           # XLNet pads on the left
        pad_token_segment_id=4)     # == ``4 if True else 0`` (XLNet branch)

    input_ids_tensor = torch.tensor([f.input_ids for f in features])
    input_mask_tensor = torch.tensor([f.input_mask for f in features])
    segment_ids_tensor = torch.tensor([f.segment_ids for f in features])

    transformer_outputs = model(input_ids=input_ids_tensor,
                                attention_mask=input_mask_tensor,
                                token_type_ids=segment_ids_tensor)
    # NOTE(review): assumes the model returns a tensor (not a tuple) so
    # 2-D indexing is valid — confirm XLNetForFeatureExtraction's contract.
    return transformer_outputs[:, -1]
def extractVocab(model_path=post_rec.XLNetBaseCased):
    """Dump token ids 0..31999 of the XLNet vocabulary to ``vocab.txt``.

    Stops early at the first id the tokenizer cannot decode.
    NOTE(review): the file deliberately(?) contains the string *ids*, not
    the decoded pieces — the decoded piece is only printed. Confirm intent.
    """
    tokenizer = XLNetTokenizer.from_pretrained(model_path)
    tokenizer = Seq2SeqAdapterTokenizer(tokenizer)
    vocab_file = os.path.join(model_path, "vocab.txt")
    tokens = []
    token_id = 0  # renamed from ``id`` — avoid shadowing the builtin
    while token_id < 32000:
        try:
            word = tokenizer.decode([token_id])
        except Exception:  # was a bare ``except:`` (swallowed SystemExit too)
            print(token_id, "exceeded!")
            break
        print(word, token_id)
        tokens.append(str(token_id))
        token_id += 1
    with open(vocab_file, "w", encoding="utf-8") as f:
        f.writelines(w + "\n" for w in tokens)
    print("**" * 20)
    print("write vocab.txt")
def get_tokenizer(model_path=None, name="bert"):
    """Return a tokenizer instance for *name*, loading from *model_path*.

    :raises RuntimeError: when *name* is not a supported tokenizer.
    """
    if name == "bert":
        from pytorch_transformers import BertTokenizer
        return Seq2SeqAdapterTokenizer(BertTokenizer.from_pretrained(model_path))
    if name == "gpt2":
        from pytorch_transformers import GPT2Tokenizer
        return Seq2SeqAdapterTokenizer(GPT2Tokenizer.from_pretrained(model_path))
    if name == "xlnet":
        from pytorch_transformers import XLNetTokenizer
        return Seq2SeqAdapterTokenizer(XLNetTokenizer.from_pretrained(model_path))
    if name == "roberta":
        return RoBertaTokenizer(model_path)
    if name == "simple":
        return SimpleTokenizer()
    if name == "spacy":
        return SpacyTokenizer()
    if name == "corenlp":
        return CoreNLPTokenizer()
    raise RuntimeError("tokenizer:{} is not supported!".format(name))
def get_tokenizer(tokenizer_name):
    """Resolve *tokenizer_name* to a tokenizer instance (None if unknown)."""
    log.info(f"\tLoading Tokenizer {tokenizer_name}")
    if tokenizer_name.startswith("bert-"):
        return BertTokenizer.from_pretrained(
            tokenizer_name, do_lower_case=tokenizer_name.endswith("uncased"))
    if tokenizer_name.startswith("roberta-"):
        return RobertaTokenizer.from_pretrained(tokenizer_name)
    if tokenizer_name.startswith("xlnet-"):
        return XLNetTokenizer.from_pretrained(
            tokenizer_name, do_lower_case=tokenizer_name.endswith("uncased"))
    if tokenizer_name.startswith("openai-gpt"):
        return OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    if tokenizer_name.startswith("gpt2"):
        return GPT2Tokenizer.from_pretrained(tokenizer_name)
    if tokenizer_name.startswith("transfo-xl-"):
        # TransformerXL is trained on data pretokenized with MosesTokenizer.
        return MosesTokenizer()
    if tokenizer_name.startswith("xlm-"):
        return XLMTokenizer.from_pretrained(tokenizer_name)
    if tokenizer_name == "MosesTokenizer":
        return MosesTokenizer()
    if tokenizer_name == "SplitChars":
        return SplitCharsTokenizer()
    if tokenizer_name == "":
        return SpaceTokenizer()
    return None
def train(args, device):
    """Fine-tune XLNet for sequence classification on MNLI."""
    args.dataset_name = "MNLI"  # TODO: parametrize
    model_name = args.model_name
    log = get_train_logger(args)

    # Fixed seed for reproducibility across python / numpy / torch.
    seed = 42
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    log.info(f'Using device {device}')

    tokenizer = XLNetTokenizer.from_pretrained(model_name, do_lower_case=True)
    xlnet_config = XLNetConfig.from_pretrained(
        model_name,
        output_hidden_states=True,
        output_attentions=True,
        num_labels=3,
        finetuning_task=args.dataset_name)
    model = XLNetForSequenceClassification.from_pretrained(model_name,
                                                           config=xlnet_config)
    model.to(device)

    # Build the train/validation dataloaders from the MNLI files.
    reader = MNLIDatasetReader(args, tokenizer, log)
    train_loader = reader.load_train_dataloader(
        os.path.join(args.base_path, args.train_file))
    val_loader = reader.load_val_dataloader(
        os.path.join(args.base_path, args.val_file))

    TrainModel(train_loader, val_loader, log).train(model, device, args)
def TextPrep(sentimentData):
    """Tokenize ``sentimentData.Text`` for XLNet and build attention masks.

    Mutates ``sentimentData.Text`` in place (sentence-split + xlnetPrep),
    then returns ``(input_ids, attention_masks)``.
    """
    sentimentData.Text = sentimentData.Text.apply(nltk.tokenize.sent_tokenize)
    sentimentData.Text = sentimentData.Text.apply(xlnetPrep)

    # Turns the string into a sequence of words.
    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased',
                                               do_lower_case=True)
    tokenizedText = sentimentData.Text.apply(tokenizer.tokenize)

    MAX_LEN = 128
    # NOTE(review): '[SEP]'/'[CLS]' are BERT-style strings; XLNet's own
    # special tokens are '<sep>'/'<cls>' — confirm these are intended.
    SEPToken = tokenizer.tokenize(' [SEP]')
    CLSToken = tokenizer.tokenize(' [CLS]')
    tokenizedText = tokenizedText.apply(
        lambda x: x[:MAX_LEN - 8] + SEPToken + CLSToken
        if len(x) > MAX_LEN - 4 else x + CLSToken)

    # Create token IDs.
    # BUG FIX: removed the duplicated ``input_ids = input_ids = ...`` target.
    input_ids = tokenizedText.apply(tokenizer.convert_tokens_to_ids)
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype='long',
                              truncating='post', padding='post')

    # Attention masks mark how many tokens are in a sentence (pad id == 0).
    attention_masks = []
    for seq in input_ids:
        attention_masks.append([float(i > 0) for i in seq])

    return input_ids, attention_masks
def main():
    """End-to-end training entry point for the MedSTS text+graph model."""
    torch.cuda.empty_cache()
    args = setup_parser().parse_args()

    # Refuse to clobber a populated output directory unless asked to.
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory already exists and is not empty.")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    args.n_gpu = torch.cuda.device_count()
    args.device = device
    set_seed(args)

    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: {}".format(args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # ---- model assembly ------------------------------------------------
    config = XLNetConfig.from_pretrained(args.config_name)
    print('config: {}'.format(config))
    tokenizer = XLNetTokenizer.from_pretrained(
        args.text_encoder_checkpoint, do_lower_case=args.do_lower_case)
    text_encoder = XLNetModel.from_pretrained(args.text_encoder_checkpoint,
                                              config=config)
    graph_encoder = GraphEncoder(args.n_hidden, args.min_score)
    if args.graph_encoder_checkpoint:
        graph_encoder.gcnnet.load_state_dict(
            torch.load(args.graph_encoder_checkpoint))

    # One classification head per sub-task (regression, 5-way, 4-way).
    joint_dim = config.hidden_size + args.n_hidden
    medsts_classifier = PairClassifier(joint_dim, 1)
    medsts_c_classifier = PairClassifier(joint_dim, 5)
    medsts_type_classifier = PairClassifier(joint_dim, 4)
    model = MedstsNet(text_encoder, graph_encoder, medsts_classifier,
                      medsts_c_classifier, medsts_type_classifier, config)
    model.to(args.device)
    args.n_gpu = 1

    if args.do_train:
        train_dataset = load_and_cache_examples(args, args.task_name,
                                                tokenizer, evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info('global step = {}, average loss = {}'.format(
            global_step, tr_loss))

        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        logger.info("saving model checkpoint to {}".format(args.output_dir))
        # Unwrap DataParallel before serializing.
        model_to_save = model.module if hasattr(model, 'module') else model
        torch.save(model_to_save.state_dict(),
                   os.path.join(args.output_dir, 'saved_model.pth'))
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
def load(cls, vocab_file: str, cache_model: bool = True) -> XLNetTokenizer:
    """Return an ``XLNetTokenizer`` built from *vocab_file*, with caching.

    :param vocab_file: path to the sentencepiece vocab file
    :param cache_model: when True, memoize the tokenizer by vocab path
    """
    # BUG FIX: the membership test used ``cls._cache`` but the value was
    # fetched from ``PretrainedXLNetTokenizer._cache``; for a subclass with
    # its own ``_cache`` these diverge and the lookup raised KeyError.
    if vocab_file in cls._cache:
        return cls._cache[vocab_file]
    model = XLNetTokenizer(vocab_file=vocab_file)
    if cache_model:
        cls._cache[vocab_file] = model
    return model
def __init__(
        self,
        gpu=-1,
        check_for_lowercase=True,
        embeddings_dim=0,
        verbose=True,
        path_to_pretrained="xlnet-base-cased",
        model_frozen=True,
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        sep_token="<sep>",
        pad_token="<pad>",
        cls_token="<cls>",
        mask_token="<mask>",
):
    """Sequence indexer backed by an (optionally frozen) XLNet encoder."""
    SeqIndexerBaseEmbeddings.__init__(
        self, gpu=gpu, check_for_lowercase=check_for_lowercase,
        zero_digits=True, bos_token=bos_token, eos_token=eos_token,
        pad=pad_token, unk=unk_token, sep_token=sep_token,
        cls_token=cls_token, mask_token=mask_token, load_embeddings=True,
        embeddings_dim=embeddings_dim, verbose=verbose, isBert=False,
        isXlNet=True)
    print("create seq indexer Transformers from Model {}".format(
        path_to_pretrained))

    self.xlnet = True
    self.path_to_pretrained = path_to_pretrained
    self.tokenizer = XLNetTokenizer.from_pretrained(path_to_pretrained)
    self.config = XLNetConfig.from_pretrained(path_to_pretrained)
    self.emb = XLNetModel.from_pretrained(path_to_pretrained)
    self.frozen = model_frozen

    # Freeze every parameter; the per-submodule loop below is redundant
    # with the blanket freeze but kept for parity with the original.
    for param in self.emb.parameters():
        param.requires_grad = False
    for submodule in (self.emb.word_embedding, self.emb.layer,
                      self.emb.dropout):
        for param in submodule.parameters():
            param.requires_grad = False
    if not self.frozen:
        # NOTE(review): relies on the model exposing ``pooler`` — confirm
        # for this XLNetModel version.
        for param in self.emb.pooler.parameters():
            param.requires_grad = True
    self.emb.eval()
    print("XLNET model loaded succesifully")
def load_tokenizer(self):
    """Instantiate ``self.tokenizer`` according to the model configuration."""
    cfg = self.model_configuration
    if cfg.is_xlnet:
        self.tokenizer = XLNetTokenizer.from_pretrained(
            cfg.bert_model, do_lower_case=cfg.do_lower)
    elif cfg.is_scibert:
        # SciBERT ships a raw vocab file rather than a hub model name.
        self.tokenizer = BertTokenizer(cfg.vocab_file,
                                       do_lower_case=cfg.do_lower)
    else:
        self.tokenizer = BertTokenizer.from_pretrained(
            cfg.bert_model, do_lower_case=cfg.do_lower)
def get_tokenizer(tokenizer_name):
    """Load an XLNet tokenizer by name.

    :param tokenizer_name: model name; must start with ``xlnet``
    :raises ValueError: for unsupported names (previously this path
        crashed with an UnboundLocalError on ``tokenizer``)
    """
    logger.info(f"Loading Tokenizer {tokenizer_name}")
    if tokenizer_name.startswith("xlnet"):
        do_lower_case = "uncased" in tokenizer_name
        tokenizer = XLNetTokenizer.from_pretrained(
            tokenizer_name, do_lower_case=do_lower_case
        )
        return tokenizer
    # BUG FIX: any non-xlnet name previously hit ``return tokenizer`` with
    # the local unbound, raising UnboundLocalError; fail with a clear error.
    raise ValueError(f"Unsupported tokenizer: {tokenizer_name}")
def __init__(self, model_name_or_path: str, max_seq_length: int = 128,
             do_lower_case: bool = False):
    """Wrap an XLNet encoder plus its tokenizer for sentence embedding."""
    super(XLNet, self).__init__()
    # Keys serialized into the module's saved config.
    self.config_keys = ['max_seq_length', 'do_lower_case']
    self.max_seq_length = max_seq_length
    self.do_lower_case = do_lower_case

    self.xlnet = XLNetModel.from_pretrained(model_name_or_path)
    self.tokenizer = XLNetTokenizer.from_pretrained(
        model_name_or_path, do_lower_case=do_lower_case)

    # Cache the ids of the special tokens appended during tokenization.
    cls_tok = self.tokenizer.cls_token
    sep_tok = self.tokenizer.sep_token
    self.cls_token_id, self.sep_token_id = \
        self.tokenizer.convert_tokens_to_ids([cls_tok, sep_tok])
def __init__(self, language=Language.ENGLISHCASED, cache_dir="."):
    """Initializes the underlying pretrained XLNet tokenizer.

    Args:
        language (Language, optional): The pretrained model's language.
            Defaults to Language.ENGLISHCASED.
        cache_dir (str, optional): Directory used to cache downloads.
    """
    self.language = language
    self.tokenizer = XLNetTokenizer.from_pretrained(language.value,
                                                    cache_dir=cache_dir)
def gen_dataloader(_train_path, _test_path, batch_size, preprocess_inputs=False,
                   tokenizer_type='bert-base-uncased', input_len=128, **kwargs):
    """Build the appropriate dataloader from raw data paths.

    With *_test_path* given, returns a ``TrainValDataloader`` over both
    splits; otherwise a ``TrainValSplitDataloader`` over the train split.

    kwargs are forwarded to preprocessing (sample_size, weak_supervision,
    max_len, filter_bad_rows, tokenizer) and to the dataloader classes
    (val_sample_dataloader, pin_memory, num_workers).
    """
    lowered = tokenizer_type.lower()
    if 'bert' in lowered:
        tokenizer = BertTokenizer.from_pretrained(tokenizer_type)
    elif 'xlnet' in lowered:
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_type)
    else:
        raise NotImplementedError(
            'model {} is not implemented'.format(tokenizer_type))

    df_train = read_data_to_dataframe(_train_path)
    if preprocess_inputs:
        df_train = preprocess_model_inputs(df_train, tokenizer=tokenizer,
                                           output_len=input_len, **kwargs)

    if _test_path:
        df_test = read_data_to_dataframe(_test_path)
        if preprocess_inputs:
            df_test = preprocess_model_inputs(df_test, tokenizer=tokenizer,
                                              **kwargs)
        return TrainValDataloader(df_train, df_test, batch_size, kwargs)

    return TrainValSplitDataloader(df_train, batch_size, kwargs)
def __init__(self, model_path='xlnet-base-cased', padding_text=None,
             device='cuda'):
    """Load an XLNet LM head for generation/scoring on *device*."""
    super().__init__()
    self.model_path = model_path
    self.device = device
    self.tokenizer = XLNetTokenizer.from_pretrained(model_path)
    self.model = XLNetLMHeadModel.from_pretrained(model_path)
    # XLNet works better with a long dummy context prepended; fall back to
    # the class default when no (truthy) padding text is supplied.
    self.padding_text_idxes = self.tokenizer.encode(
        padding_text or self.PADDING_TEXT)
    self.model.to(device)
    self.model.eval()
def run(args):
    """Score conduct/complaint pairs with a fine-tuned XLNet NLI model and
    write the results to a tsv file."""
    nli_model_path = 'saved_models/xlnet-base-cased/'
    model_file = os.path.join(nli_model_path, 'pytorch_model.bin')
    config_file = os.path.join(nli_model_path, 'config.json')
    log = get_logger('conduct_test')

    model_name = 'xlnet-base-cased'
    tokenizer = XLNetTokenizer.from_pretrained(model_name, do_lower_case=True)
    xlnet_config = XLNetConfig.from_pretrained(config_file)
    model = XLNetForSequenceClassification.from_pretrained(model_file,
                                                           config=xlnet_config)

    dataset_reader = ConductDatasetReader(args, tokenizer, log)
    file_lines = dataset_reader.get_file_lines('data/dados.tsv')

    softmax_fn = torch.nn.Softmax(dim=1)
    results = []
    model.eval()
    with torch.no_grad():
        for line in tqdm(file_lines):
            premise, hypothesys, conflict = dataset_reader.parse_line(line)
            word_ids, input_mask, segment_ids = \
                dataset_reader.convert_text_to_features(premise, hypothesys)

            # Each feature list becomes a batch of size 1 on args.device.
            model_input = {
                'input_ids': torch.tensor([word_ids], dtype=torch.long,
                                          device=args.device),
                'attention_mask': torch.tensor([input_mask], dtype=torch.long,
                                               device=args.device),
                'token_type_ids': torch.tensor([segment_ids], dtype=torch.long,
                                               device=args.device),
            }
            logits = model(**model_input)[0]
            nli_scores, nli_class = get_scores_and_class(logits, softmax_fn)
            nli_scores = nli_scores.detach().cpu().numpy()
            results.append({
                "conduct": premise,
                "complaint": hypothesys,
                "nli_class": nli_class,
                "nli_contradiction_score": nli_scores[0],
                "nli_entailment_score": nli_scores[1],
                "nli_neutral_score": nli_scores[2],
                "conflict": conflict
            })

    pd.DataFrame(results).to_csv('results/final_results.tsv', sep='\t',
                                 index=False)
def add_pytorch_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in pytorch_transformers for use with
    pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but this doesn't
    do anything special, so we can just use the standard indexers.
    """
    do_lower_case = "uncased" in tokenizer_name

    if tokenizer_name.startswith("bert-"):
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name,
                                                  do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name,
                                                   do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)

    if (tokenizer_name.startswith("openai-gpt")
            or tokenizer_name.startswith("gpt2")
            # BUG FIX: was "transo-xl-" (typo), so TransfoXL tokenizers
            # never had their special tokens registered.
            or tokenizer_name.startswith("transfo-xl-")):
        tokenizer.add_special_tokens({
            "bos_token": "<start>",
            "sep_token": "<delim>",
            "cls_token": "<extract>"
        })
    # TODO: this is another place can be simplified by
    # "model-before-preprocess" reorganization; we can pass tokenizer
    # created in model here, see issue <TBD>

    # Do not use tokenizer.vocab_size: it does not include newly added tokens.
    vocab_size = len(tokenizer)
    if tokenizer_name.startswith("roberta-"):
        # Due to a quirk in huggingface's file, the last token of
        # RobertaTokenizer is None; remove this when they fix the problem.
        if tokenizer.convert_ids_to_tokens(vocab_size - 1) is None:
            vocab_size -= 1
        else:
            log.info("Time to delete vocab_size-1 in preprocess.py !!!")

    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added pytorch_transformers vocab (%s): %d tokens",
             tokenizer_name, len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(
            word, input_module_tokenizer_name(tokenizer_name))
def __init__(self, chunck_size=64, max_length=35,
             device=torch.device('cuda:0')):
    """Encoder client around a frozen xlnet-large-cased model."""
    super(XLNetClient, self).__init__()
    self.chunck_size = chunck_size
    self.max_length = max_length
    self.device = device
    self.tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
    # Load the encoder in inference mode and move it to the target device.
    self.model = XLNetModel.from_pretrained('xlnet-large-cased')
    self.model.eval()
    self.model.to(self.device)
def __init__(self, args):
    """Tokenizer wrapper exposing XLNet special-token ids for
    BertSum-style data preparation."""
    self.args = args
    self.tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased',
                                                    do_lower_case=False)
    # Special tokens used when flattening documents.
    self.sep_token = '<sep>'
    self.cls_token = '<cls>'
    self.pad_token = '<pad>'
    self.tgt_bos = '<s>'
    self.tgt_eos = '</s>'
    self.tgt_sent_split = '<sep>'
    # NOTE(review): indexes the SentencePieceProcessor by piece string;
    # relies on sp_model.__getitem__ mapping piece -> id — confirm for
    # the installed sentencepiece version.
    self.sep_vid = self.tokenizer.sp_model[self.sep_token]
    self.cls_vid = self.tokenizer.sp_model[self.cls_token]
    self.pad_vid = self.tokenizer.sp_model[self.pad_token]
def load_wirte_xlnet_vocab(model_name, save_vocab_path):
    """Write the XLNet vocabulary (one token per line, in id order) to
    *save_vocab_path*.

    :param model_name: XLNet model name or path for ``from_pretrained``
    :param save_vocab_path: destination text file
    """
    from pytorch_transformers import XLNetTokenizer
    tokenizer = XLNetTokenizer.from_pretrained(model_name)
    vocab_size = tokenizer.vocab_size
    vocab_dict = OrderedDict(
        (i, tokenizer.convert_ids_to_tokens(i)) for i in range(vocab_size))
    print(len(vocab_dict))
    print(tokenizer.vocab_size)
    # BUG FIX: sentencepiece tokens are not ASCII; without an explicit
    # encoding this crashed on platforms whose default codec cannot encode
    # them (e.g. cp1252 on Windows).
    with open(save_vocab_path, 'w', encoding='utf-8') as writer:
        for v in vocab_dict.values():
            writer.write('{0}\n'.format(v))
def __init__(self, opt):
    """Build tokenizer, model and datasets according to *opt*."""
    self.opt = opt

    if 'roberta' in opt.pretrained_bert_name:
        tokenizer = RobertaTokenizer.from_pretrained(
            opt.pretrained_bert_name)
        transformer = RobertaModel.from_pretrained(
            opt.pretrained_bert_name, output_attentions=True)
    elif 'bert' in opt.pretrained_bert_name:
        tokenizer = BertTokenizer.from_pretrained(opt.pretrained_bert_name)
        transformer = BertModel.from_pretrained(opt.pretrained_bert_name,
                                                output_attentions=True)
    elif 'xlnet' in opt.pretrained_bert_name:
        tokenizer = XLNetTokenizer.from_pretrained(
            opt.pretrained_bert_name)
        transformer = XLNetModel.from_pretrained(opt.pretrained_bert_name,
                                                 output_attentions=True)

    # BUG FIX: the original test was ``if 'bert' or 'xlnet' in
    # opt.model_name`` which is always True ('bert' is a truthy literal),
    # so the embedding-matrix branch below was unreachable.
    if 'bert' in opt.model_name or 'xlnet' in opt.model_name:
        tokenizer = Tokenizer4Pretrain(tokenizer, opt.max_seq_len)
        self.model = opt.model_class(transformer, opt).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)

    self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
    self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)

    # Carve a validation split out of the training set when requested.
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset

    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def validate(args, device_id, pt, step):
    """Run validation for one checkpoint and return its cross-entropy."""
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    test_from = pt if pt != '' else args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)

    # Restore the flags the model was trained with.
    saved_opt = vars(checkpoint['opt'])
    for key in saved_opt.keys():
        if key in model_flags:
            setattr(args, key, saved_opt[key])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    valid_iter = data_loader.Dataloader(
        args, load_dataset(args, 'valid', shuffle=False),
        args.batch_size, device, shuffle=False, is_test=False)

    tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased',
                                               do_lower_case=False,
                                               cache_dir=args.temp_dir)
    # Map the generator's control tokens to sentencepiece ids.
    symbols = {
        'BOS': tokenizer.sp_model['<s>'],
        'EOS': tokenizer.sp_model['</s>'],
        'PAD': tokenizer.sp_model['<pad>'],
        'EOQ': tokenizer.sp_model['<unk>']
    }
    valid_loss = abs_loss(model.generator, symbols, model.vocab_size,
                          train=False, device=device)
    trainer = build_trainer(args, device_id, model, None, valid_loss)
    return trainer.validate(valid_iter, step).xent()
def __init__(self, tokenizer_name="xlnet-large-cased", max_seq_len=384,
             doc_stride=128, max_query_len=64, is_training=True):
    """Reader for squad 2 dataset.

    Input:
        - tokenizer_name: tokenizer model name or path
          (default xlnet-large-cased)
        - max_seq_len: maximum total input sequence length after WordPiece
          tokenization; longer sequences are truncated, shorter ones padded
        - doc_stride: stride between chunks when splitting long documents
        - max_query_len: maximum number of question tokens (longer
          questions are truncated)
        - is_training: whether features are built for training
    """
    self.tokenizer_name = tokenizer_name
    self.max_seq_len = max_seq_len
    self.doc_stride = doc_stride
    self.max_query_len = max_query_len
    self.is_training = is_training
    # NOTE(review): do_lower_case=True on a *cased* model — confirm intent.
    self.tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name,
                                                    do_lower_case=True)
def __init__(self, args, model, vocab, symbols, global_scorer=None,
             logger=None, dump_beam=""):
    """Beam-search translator over an abstractive summarization model."""
    self.logger = logger
    self.args = args
    self.cuda = args.visible_gpus != '-1'
    self.model = model
    self.generator = self.model.generator
    self.tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased',
                                                    do_lower_case=False,
                                                    cache_dir="../temp")
    self.vocab = vocab
    self.symbols = symbols
    self.start_token = symbols['BOS']
    self.end_token = symbols['EOS']
    self.global_scorer = global_scorer
    self.beam_size = args.beam_size
    self.min_length = args.min_length
    self.max_length = args.max_length

    # A non-empty dump path enables beam tracing (debugging only).
    self.dump_beam = dump_beam
    self.beam_trace = self.dump_beam != ""
    self.beam_accum = None

    self.tensorboard_writer = SummaryWriter(args.model_path, comment="Unmt")

    if self.beam_trace:
        self.beam_accum = {
            "predicted_ids": [],
            "beam_parent_ids": [],
            "scores": [],
            "log_probs": [],
        }
def init_params(self, model_name, pre_trained_model, f_lr=5e-5, f_eps=1e-8):
    """Instantiate the XLNet QA model, config, tokenizer and AdamW optimizer.

    :param model_name: model family key (only "xlnet" is mapped)
    :param pre_trained_model: checkpoint name/path for ``from_pretrained``
    :param f_lr: optimizer learning rate
    :param f_eps: optimizer epsilon
    """
    MODEL_CLASSES = {
        "xlnet": (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer)
    }
    self._tokenizer = XLNetTokenizer.from_pretrained(pre_trained_model,
                                                     do_lower_case=True)
    self._config = XLNetConfig.from_pretrained(pre_trained_model,
                                               do_lower_case=True)
    self._model = XLNetForQuestionAnswering.from_pretrained(
        pre_trained_model, config=self._config)
    self._model.to(self._device)

    # Decoupled weight decay: biases and LayerNorm weights are exempt.
    no_decay = ('bias', 'LayerNorm.weight')
    weight_decay = 0.0  # author's default parameter
    decay_params, exempt_params = [], []
    for name, param in self._model.named_parameters():
        if any(marker in name for marker in no_decay):
            exempt_params.append(param)
        else:
            decay_params.append(param)
    optimizer_grouped_parameters = [
        {'params': decay_params, 'weight_decay': weight_decay},
        {'params': exempt_params, 'weight_decay': 0.0},
    ]
    self._optimizer = AdamW(optimizer_grouped_parameters, lr=f_lr, eps=f_eps)
def get_xlnet(xlnet_model):
    """Return ``(tokenizer, lm_head_model)`` for *xlnet_model*.

    The import is kept local so pytorch_transformers is only a hard
    dependency when XLNet is actually used.
    """
    from pytorch_transformers import (WEIGHTS_NAME, XLNetModel, XLMConfig,
                                      XLMForSequenceClassification,
                                      XLMTokenizer, XLNetConfig,
                                      XLNetLMHeadModel,
                                      XLNetForSequenceClassification,
                                      XLNetTokenizer)
    print(xlnet_model)
    tokenizer = XLNetTokenizer.from_pretrained(xlnet_model)
    lm_model = XLNetLMHeadModel.from_pretrained(xlnet_model)
    return tokenizer, lm_model
def __init__(self):
    """Training/eval configuration for the XLNet idiom-cloze model."""
    # --- file locations -------------------------------------------------
    self.vocab_root = "../data/vocab.txt"
    self.xlnet_config_root = "../data/config.json"
    self.pretrained_xlnet_root = "../data/pytorch_model.bin"
    self.tokenizer_root = "../data/spiece.model"
    self.raw_train_data_root = "../data/train.txt"
    self.split_train_data_root = "../data/split_train_data.json"
    self.raw_train_label_root = "../data/train_answer.csv"
    self.raw_test_data_root = "../data/dev.txt"
    self.model_root = "../model/"
    self.data_root = "../data/"
    self.idiom_vocab_root = "../data/idiomList.txt"
    self.prob_file = "../data/prob.csv"
    self.result_file = "../data/result.csv"
    self.raw_result_file = "../data/raw_result.csv"
    # --- optimisation hyper-parameters ----------------------------------
    self.xlnet_learning_rate = 2e-5
    self.other_learning_rate = 1e-3
    self.max_seq_length = 128
    self.num_train_epochs = 100
    self.warmup_proportion = 0.01
    self.hidden_dropout_prob = 0.5
    self.num_workers = 8
    self.eval_ratio = 0.02
    # Idiom <-> index lookup tables (pickled dicts produced elsewhere).
    with open(self.data_root + "idiom2index", mode="rb") as f1:
        self.idiom2index = pickle.load(f1)
    with open(self.data_root + "index2idiom", mode="rb") as f2:
        self.index2idiom = pickle.load(f2)
    # --- device setup ---------------------------------------------------
    self.use_gpu = t.cuda.is_available()
    self.device = t.device("cuda" if self.use_gpu else "cpu")
    self.n_gpu = t.cuda.device_count()
    # Batch sizes scale with GPU count and inversely with sequence length.
    # NOTE(review): with n_gpu == 0 both become 0 — confirm CPU-only runs.
    self.train_batch_size = 10 * self.n_gpu * int(256 / self.max_seq_length)
    self.test_batch_size = 32 * self.n_gpu * int(256 / self.max_seq_length)
    # --- logging / bookkeeping ------------------------------------------
    self.logger = logging.getLogger("xlnetCloze_train")
    self.logger.setLevel(logging.INFO)
    self.writer = SummaryWriter('tensorlog')
    # LR-scheduler knobs (reduce-on-plateau style).
    self.decay = 0.3
    self.min_lr = 5e-7
    self.patience = 1
    self.seed = 42
    self.show_loss_step = 200
    self.version = 30
    self.tokenizer = XLNetTokenizer.from_pretrained(self.tokenizer_root)