def build_corpus(args):
    logger = logging.getLogger(args.logger_name)
    cls_add_idx = 20000 if args.mode != "base" else None

    # Load tokenizer
    old = "_old" if args.mode == "base" else ""
    tokenizer_path = os.path.join(args.data_dir, f"tokenizer{old}")
    if os.path.exists(os.path.join(tokenizer_path, "tokenizer_config.json")) and not args.rebuild_corpus:
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_path)
    else:
        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
        tokenizer = add_cls_token_to_tokenizer(tokenizer, args.logger_name, add_idx=cls_add_idx)
        tokenizer.save_pretrained(tokenizer_path)
    assert tokenizer.cls_token_id == cls_add_idx or tokenizer.cls_token_id == len(tokenizer) - 1

    if args.mode == "moses":
        postfix = "_moses"
    elif args.mode != "base":
        postfix = "_move"
    else:
        postfix = ""
    cached_file = os.path.join(args.data_dir, f"data_cache_{args.corpus_pct}pct{postfix}.pt")

    if os.path.exists(cached_file) and not args.rebuild_corpus:
        # Load cached dataset
        logger.info(f"Loading cached dataset {cached_file}")
        corpus = torch.load(cached_file)
        if getattr(corpus, "use_moses", -1) == -1:
            corpus.set_use_moses(args.mode == "moses")
        for split in Splits:
            if len(corpus.data[split]) == 0:
                logger.warning(f"The split {split} does not contain data!")
    else:
        # Load data
        logger.info(f"Selecting {args.corpus_pct}% of the CNN/DM dataset as corpus")
        cnndm_train = get_cnndm_dataset(Splits.TRAIN, args.corpus_pct)
        cnndm_valid = get_cnndm_dataset(Splits.VALID, args.corpus_pct)
        cnndm_test = get_cnndm_dataset(Splits.TEST, args.corpus_pct)
        logger.info("=" * 100)

        # Build corpus
        corpus = Corpus(tokenizer, args.mode == "moses")
        start_time = time.time()
        corpus.encode_dataset(cnndm_train, split=Splits.TRAIN)
        corpus.encode_dataset(cnndm_valid, split=Splits.VALID)
        corpus.encode_dataset(cnndm_test, split=Splits.TEST)
        elapsed = time.time() - start_time
        logger.info(f"Elapsed time for encoding {(elapsed / 60):5.2f} min")
        logger.info(f"Saving corpus to '{cached_file}'")
        torch.save(corpus, cached_file)

    corpus.tokenizer = tokenizer
    assert corpus.use_moses == (args.mode == "moses")
    return corpus
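# Hypothetical invocation sketch (not from the source): build_corpus only reads
# the args fields used above, so a SimpleNamespace is enough to drive it.
#
# from types import SimpleNamespace
# args = SimpleNamespace(logger_name="corpus", mode="base", data_dir="data",
#                        corpus_pct=10, rebuild_corpus=False)
# corpus = build_corpus(args)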
def tokenize_samples(genes):
    k = len(genes[0][0])
    if k == 2:
        kmer_filepath = '/Users/camillo_stuff/Downloads/fourmersXL.txt'
    elif k == 6:
        kmer_filepath = '/Users/camillo_stuff/Downloads/hexamersXL.txt'
    else:
        # guard added: the original fell through with kmer_filepath unbound
        raise ValueError(f"No k-mer vocab file available for k={k}")
    tokenizer = TransfoXLTokenizer(vocab_file=kmer_filepath)
    print("TOKENIZER LENGTH", len(tokenizer))
    seq_ids = [tokenizer.convert_tokens_to_ids(gene) for gene in genes]
    return seq_ids
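# Hedged usage sketch (commented out because the vocab files above are local
# paths): each gene is a list of k-mer tokens, and ids come back per gene.
#
# genes = [["AAAAAA", "AAAAAC", "CCCCGG"], ["GGGGGG", "TTTTTT"]]
# seq_ids = tokenize_samples(genes)   # one list of ids per gene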
def __init__(self):
    super().__init__()
    self.tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl-wt103", eos_token='<eos>')
    self.tokenizer.add_special_tokens({'bos_token': '<sos>'})
    self.model = TransfoXLLMHeadModel.from_pretrained("transfo-xl-wt103")
    self.softmax = nn.Softmax(dim=0)
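# Assumption worth noting (not from the source): add_special_tokens assigns
# '<sos>' an id past the pretrained wt103 vocabulary, so the model's embedding
# table would need to grow before that id is ever fed in, e.g.:
#
# self.model.resize_token_embeddings(len(self.tokenizer))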
def test_transfoxl(self): for tokenizer_name in TransfoXLTokenizer.pretrained_vocab_files_map["pretrained_vocab_file"].keys(): tokenizer_p = TransfoXLTokenizer.from_pretrained(tokenizer_name) tokenizer_r = TransfoXLTokenizerFast.from_pretrained(tokenizer_name) # Check we have the same number of added_tokens for both pair and non-pair inputs. self.assertEqual(tokenizer_r.num_added_tokens(False), tokenizer_p.num_added_tokens(False)) self.assertEqual(tokenizer_r.num_added_tokens(True), tokenizer_p.num_added_tokens(True)) # Check we have the correct max_length for both pair and non-pair inputs. self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence) self.assertEqual(tokenizer_r.max_len_sentences_pair, tokenizer_p.max_len_sentences_pair) # Assert the set of special tokens match. self.assertSequenceEqual( tokenizer_p.special_tokens_map.items(), tokenizer_r.special_tokens_map.items(), "TransfoXL tokenizers doesn't have the same set of special_tokens", ) # Assure tokenization overlap between python and rust impl. self.assert_tokenization_python_rust_almost_equals(tokenizer_p, tokenizer_r, 0.0) # Ensure add_tokens and add_special_tokens return the correct vocab size self.assert_add_tokens(tokenizer_r) # Check for offsets mapping self.assert_offsets_mapping(tokenizer_r) # Check for dynamic encoding sequence handling in batch_encode_plus self.assertRaises(ValueError, self.assert_batch_encode_dynamic_overflowing, tokenizer_r) # Check alignment for build_inputs_with_special_tokens self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p)
def __init__(self):
    super(Model, self).__init__()
    # TransfoXLConfig expects n_head / n_layer (the original n_heads / n_layers
    # would raise a TypeError)
    self.config = TransfoXLConfig(
        vocab_size_or_config_json_file=len(vocab) + 267735,
        n_head=8,
        n_layer=9)
    self.model = TransfoXLModel(self.config)
    self.tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
    self.out_layer = torch.nn.Linear(self.model.d_model, 2)
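# Hedged sketch (not in the source): a forward() such a wrapper might pair with
# the layers above, with shapes following TransfoXLModel's outputs.
#
# def forward(self, input_ids):
#     hidden = self.model(input_ids)[0]         # last hidden state, (bsz, seq_len, d_model)
#     return self.out_layer(hidden[:, -1, :])   # 2-way logits from the final position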
def __init__(self, device='cpu'):
    tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
    model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
    model = model.to(device)
    self.tokenizer = tokenizer
    self.model = model.eval()
    self.device = device
    self.NUM_CLASSES = 267735  # wt103 vocabulary size
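# Hedged, self-contained example (not from the source) of querying the LM head
# the wrapper above holds. TransfoXL's adaptive softmax head yields
# log-probabilities over the 267735-token wt103 vocabulary.
import torch
from transformers import TransfoXLTokenizer, TransfoXLLMHeadModel

tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103').eval()

ids = tokenizer.encode('The quick brown fox jumps over the')
with torch.no_grad():
    log_probs = model(torch.tensor([ids]))[0]   # (1, seq_len, 267735)
best = int(log_probs[0, -1].argmax())           # greedy next-token prediction
print(tokenizer.convert_ids_to_tokens([best]))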
def get_IEMOCAP_loaders_transfo_xl(dataset_name='IEMOCAP', batch_size=32, num_workers=0,
                                   pin_memory=False, args=None):
    tokenizer = TransfoXLTokenizer.from_pretrained(args.home_dir + args.bert_tokenizer_dir)
    print('building vocab..')
    speaker_vocab, label_vocab, person_vec = load_vocab(dataset_name)
    train_data, dev_data, test_data = read_datas(dataset_name, batch_size)
    print('building datasets..')
    trainsets = [IEMOCAPDataset_transfo_xl(d, speaker_vocab, label_vocab, args, tokenizer) for d in train_data]
    devsets = [IEMOCAPDataset_transfo_xl(d, speaker_vocab, label_vocab, args, tokenizer) for d in dev_data]
    testsets = [IEMOCAPDataset_transfo_xl(d, speaker_vocab, label_vocab, args, tokenizer) for d in test_data]
    return trainsets, devsets, testsets, speaker_vocab, label_vocab, person_vec
def add_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in transformers for use with pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but this doesn't do
    anything special, so we can just use the standard indexers.
    """
    do_lower_case = "uncased" in tokenizer_name
    log.info('In add_transformers_vocab')
    log.info(tokenizer_name)
    if tokenizer_name.startswith("bert-") or 'rubert' in tokenizer_name or '/bert-' in tokenizer_name:
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):  # or 'roberta' in tokenizer_name:
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("albert-"):
        tokenizer = AlbertTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2") or 'gpt' in tokenizer_name:
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-roberta"):
        tokenizer = XLMRobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)

    if (tokenizer_name.startswith("openai-gpt")
            or tokenizer_name.startswith("gpt2")
            or tokenizer_name.startswith("transfo-xl-")):  # was "transo-xl-", which never matched
        tokenizer.add_special_tokens({
            "bos_token": "<start>",
            "sep_token": "<delim>",
            "cls_token": "<extract>"
        })
    # TODO: this is another place that can be simplified by the
    # "model-before-preprocess" reorganization; we can pass the tokenizer
    # created in the model here, see issue <TBD>
    vocab_size = len(tokenizer)  # do not use tokenizer.vocab_size, it does not include newly added tokens
    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added transformers vocab (%s): %d tokens", tokenizer_name, len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(word, input_module_tokenizer_name(tokenizer_name))
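# Hedged usage sketch (assumes the AllenNLP-style Vocabulary this helper
# targets, i.e. an object exposing add_token_to_namespace):
#
# from allennlp.data import Vocabulary
# vocab = Vocabulary()
# add_transformers_vocab(vocab, "transfo-xl-wt103")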
def run_TFXL_RSA(stim_file, layer, header=False, filter_file=None):
    EXP = data.Stim(stim_file, header, filter_file, VOCAB_FILE)

    # Get tokenizer
    tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
    # Load model
    model = TransfoXLModel.from_pretrained('transfo-xl-wt103',
                                           output_hidden_states=True)  # , force_download=True
    # Clear any accumulated gradients; inference below runs without updates
    model.zero_grad()

    for x in range(len(EXP.SENTS)):
        sentences = list(EXP.SENTS[x])
        target = sentences[0]
        sentence = sentences[1]

        # GET BASELINE
        target_encoded = tokenizer.encode(target, add_special_tokens=True)
        target_input_ids = torch.tensor(target_encoded).unsqueeze(0)

        # Get model outputs
        output = model(target_input_ids)
        predictions, mems, hidden_states = output
        hidden_states = hidden_states[1:]
        baseline = hidden_states[layer][0][-1].data.cpu().squeeze()

        # GET SIMs
        sims = get_TFXL_sims(sentence, layer, baseline, tokenizer, model)
        values = get_dummy_values(sentence)
        EXP.load_IT('tfxl', x, values, False, sims)
    return EXP
def setup_transfo_xl(model_name):
    def _fix_tokenizer_encoding(tokenizer):
        import collections
        # The pretrained vocab was decoded with the wrong encoding; re-decode
        # every symbol from latin1 to utf-8 unless the fix was already applied.
        if '–' not in tokenizer.sym2idx:
            tokenizer.idx2sym = [sym.encode('latin1').decode('utf-8')
                                 for sym in tokenizer.idx2sym]
            tokenizer.sym2idx = collections.OrderedDict(
                (sym.encode('latin1').decode('utf-8'), idx)
                for sym, idx in tokenizer.sym2idx.items())
        else:
            logger.info("No need to fix tokenizer encoding")
        return tokenizer

    model = TransfoXLLMHeadModel.from_pretrained(model_name)
    tokenizer = TransfoXLTokenizer.from_pretrained(model_name)
    tokenizer = _fix_tokenizer_encoding(tokenizer)

    def encode(lines):
        # TODO: tokenize is removing the empty lines and add_eos is not being added.
        # TODO2: tokenize in transformers xl does not handle multiple lines correctly (removes <eos>)
        return tokenizer.convert_tokens_to_ids(
            [tok for l in lines for tok in tokenizer._tokenize(l.strip(), add_eos=True)])
    tokenizer.encode = encode
    return model, tokenizer
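# Hedged usage sketch for the patched encode() above (commented out because it
# downloads the full wt103 checkpoint):
#
# model, tokenizer = setup_transfo_xl('transfo-xl-wt103')
# ids = tokenizer.encode(['First sentence .', 'Second sentence .'])
# # returns one flat id list, with <eos> appended after each input line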
XLM = ModelInfo(
    XLMWithLMHeadModel.from_pretrained('xlm-mlm-xnli15-1024', return_dict=True),
    XLMTokenizer.from_pretrained('xlm-mlm-xnli15-1024'),
    "_", vocab, "XLM")

T5 = ModelInfo(
    T5ForConditionalGeneration.from_pretrained("t5-base", return_dict=True),
    T5Tokenizer.from_pretrained("t5-base"),
    "_", vocab, "T5")

Albert = ModelInfo(
    AlbertForMaskedLM.from_pretrained('albert-base-v2', return_dict=True),
    AlbertTokenizer.from_pretrained('albert-base-v2'),
    "_", vocab, "Albert")

TXL = ModelInfo(
    TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103'),
    TransfoXLTokenizer.from_pretrained('transfo-xl-wt103'),
    "_", vocab, "TXL")

if __name__ == "__main__":
    sentences = [sample_sentences("sentences4lara.txt") for i in range(11)]
    sent_dict = dict(zip([str(x) for x in range(1, 11)], sentences))
    sentence = sent_dict[sys.argv[2]]
    batch_size = 100
    convergence_criterion = int(sys.argv[4])
    model_list = [GPT2, Roberta, Albert, XLM, T5]
    max_length = 8
    top_k = 25
def main():
    print('start of main')
    parser = argparse.ArgumentParser(
        description='''This script computes probabilities for a masked token
        with words from the words file, and stores the result in csv format
        to the output file''')
    parser.add_argument("-s", type=str, required=True, dest="sent_type",
                        help='class name: "sv_agreement" or "anaphora"')
    parser.add_argument("-t", type=str, required=True, dest="template",
                        help='template name (see templates.txt)')
    parser.add_argument("-g", type=int, required=False, default=None, dest="gpu_num",
                        help='which gpu to run this on')
    parser.add_argument("-m", type=str, required=False, default='transfo-xl-wt103',
                        dest="model_path_or_name",
                        help='path to the model or name of the model')
    args = parser.parse_args()

    if args.sent_type not in ['sv_agreement', 'anaphora']:
        parser.error("invalid sent_type argument for -s")

    print('creating results path')
    use_wug = args.model_path_or_name != 'transfo-xl-wt103'
    number = None
    if use_wug:
        model_type = args.model_path_or_name.split('/')
        if model_type[-1] == '':
            model_type = model_type[:-1]
        number = model_type[-3].lower()
        model_path = '/'.join(model_type[-3:])
        results_path = FINE_TUNE_RESULTS_PATH[:-7] % model_path
        if not os.path.isdir(results_path):
            print('creating directory %s' % results_path)
            os.mkdir(results_path)
        results_path = FINE_TUNE_RESULTS_PATH[:-4] % (model_path, args.sent_type)
        if not os.path.isdir(results_path):
            print('creating directory %s' % results_path)
            os.mkdir(results_path)
        results_path = FINE_TUNE_RESULTS_PATH % (model_path, args.sent_type, args.template)
    else:
        results_path = RESULTS_PATH[:-4] % args.sent_type
        if not os.path.isdir(results_path):
            print('creating directory %s' % results_path)
            os.mkdir(results_path)
        results_path = RESULTS_PATH % (args.sent_type, args.template)
    results_filename = RESULTS_FILENAME % args.template
    outfilename = os.path.join(str(ABS_PATH), results_path, results_filename)
    if not os.path.isdir(results_path):
        print('creating directory %s' % results_path)
        os.mkdir(results_path)

    print('getting consts')
    sent_types = csp_consts.SENT_TYPES[args.sent_type]
    batch_sizes = csp_consts.BATCH_SIZES[args.sent_type]
    try:
        template_name = sent_types[args.template]
        batch_size_dict = batch_sizes[args.template]
    except KeyError:
        parser.error("Incompatible template for the given sentence type")
        sys.exit()

    print('loading model at', datetime.now())
    txl_tokenizer = TransfoXLTokenizer.from_pretrained(MODEL_NAME)
    txl_tokenizer.add_special_tokens({
        'bos_token': BOS_TOKEN,
        'pad_token': PAD_TOKEN
    })
    txl_model = TransfoXLLMHeadModel.from_pretrained(MODEL_NAME)
    txl_model.eval()

    if args.gpu_num is not None:
        device = torch.device(
            'cuda:' + str(args.gpu_num) if torch.cuda.is_available() else 'cpu')
        print('running on GPU: %d' % args.gpu_num)
    else:
        device = torch.device('cpu')
    txl_model.to(device)

    PADDING_TEXT_TXL_TOKENIZED = txl_tokenizer.encode(PADDING_TEXT, add_eos=True)
    PADDING_TEXT_TENSOR = torch.tensor(PADDING_TEXT_TXL_TOKENIZED,
                                       dtype=torch.long, device=device).unsqueeze(0)
    global PADDING_MEMS
    _, PADDING_MEMS = txl_model(PADDING_TEXT_TENSOR)

    batch_size = batch_size_dict['pairs']
    num_sents = batch_size_dict['sents']
    if use_wug:
        batch_size *= 2
        num_sents //= 2

    print('starting all computations at', datetime.now())
    eval_from_file(txl_model, txl_tokenizer, template_name, outfilename,
                   batch_size, num_sents, device=device, use_wug=use_wug, number=number)
    print('completed all computations at', datetime.now())
def train(config):
    train_data = open(config.TRAIN_FNAME).readlines()
    val_data = open(config.VAL_FNAME).readlines()
    test_data = open(config.TEST_FNAME).readlines()

    tokenizer = TransfoXLTokenizer.from_pretrained(config.TOKENIZER_FNAME)
    tokenize = functools.partial(tokenizer.encode, add_space_before_punct_symbol=True)
    train_data = list(map(tokenize, train_data))
    val_data = list(map(tokenize, val_data))
    test_data = list(map(tokenize, test_data))

    train_dataloader = utils.DataGenerator(
        train_data,
        batch_size=config.BATCH_SIZE,
        max_len=config.MAX_SEQ_LEN,
        n_seg_splits=config.N_SEG_SPLITS,
        max_seg_len=config.MAX_SEG_LEN,
    )
    val_dataloader = utils.DataGenerator(
        val_data,
        batch_size=config.BATCH_SIZE,
        max_len=config.MAX_SEQ_LEN,
        n_seg_splits=config.N_SEG_SPLITS,
        max_seg_len=config.MAX_SEG_LEN,
    )
    test_dataloader = utils.DataGenerator(
        test_data,
        batch_size=config.BATCH_SIZE,
        max_len=config.MAX_SEQ_LEN,
        n_seg_splits=config.N_SEG_SPLITS,
        max_seg_len=config.MAX_SEG_LEN,
    )

    model = transformer_model.create_model(
        max_len=config.MAX_SEQ_LEN,
        lstm_dim=config.LSTM_DIM,
        hidden_dim=config.HIDDEN_DIM,
        dropout_rate=config.DROPOUT_RATE,
        train_embeddings=config.TRAIN_EMBED,
    )
    model.summary()
    model = utils.compile_model(model)
    model = utils.load_weights(model, config.MODEL_WEIGHTS_PATH)
    callbacks = utils.load_callbacks(**config.CALLBACK_PARAMS)

    history = model.fit(
        train_dataloader,
        validation_data=val_dataloader,
        steps_per_epoch=config.EPOCH_LEN,
        validation_steps=config.VAL_LEN,
        epochs=config.N_EPOCHS,
        callbacks=[callbacks],
    )
    hist_df = pd.DataFrame(history.history)
    hist_df.to_json(f"{config.MODEL_LOGS_PATH}/history.json")

    test_eval_results = utils.eval_model(model, test_dataloader, thresh=0.5, steps=None)
    eval_results_path = f"{config.MODEL_LOGS_PATH}/eval_results.json"
    json.dump(test_eval_results, open(eval_results_path, "w+"))
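# Small, runnable illustration (not from the source) of the
# add_space_before_punct_symbol flag used in train(): TransfoXL's word-level
# wt103 vocab expects punctuation as separate tokens, so the flag splits it off
# before whitespace tokenization.
from transformers import TransfoXLTokenizer

tok = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
print(tok.tokenize('Hello, world!'))
# without the flag, 'Hello,' and 'world!' stay glued and fall back to <unk> ids
print(tok.tokenize('Hello, world!', add_space_before_punct_symbol=True))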
def test_transformer_xl_embeddings():
    transfo_model: str = "transfo-xl-wt103"

    tokenizer = TransfoXLTokenizer.from_pretrained(transfo_model)
    model = TransfoXLModel.from_pretrained(
        pretrained_model_name_or_path=transfo_model, output_hidden_states=True)
    model.to(flair.device)
    model.eval()

    s: str = "Berlin and Munich have a lot of puppeteer to see ."

    with torch.no_grad():
        tokens = tokenizer.tokenize(s + "<eos>")
        print(tokens)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)

        hidden_states = model(tokens_tensor)[-1]
        first_layer = hidden_states[1][0]

    assert len(first_layer) == len(tokens)

    #    0        1      2         3       4    5      6     7            8     9      10    11
    #
    # 'Berlin', 'and', 'Munich', 'have', 'a', 'lot', 'of', 'puppeteer', 'to', 'see', '.', '<eos>'
    #    |        |      |         |      |    |      |     |            |     |      |
    #  Berlin    and   Munich     have    a   lot    of   puppeteer     to    see     .
    #
    #    0        1      2         3      4    5      6     7            8     9      10

    def embed_sentence(sentence: str, layers: str = "1",
                       use_scalar_mix: bool = False) -> Sentence:
        embeddings = TransformerXLEmbeddings(
            pretrained_model_name_or_path=transfo_model,
            layers=layers,
            use_scalar_mix=use_scalar_mix,
        )
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)
        return flair_sentence

    sentence = embed_sentence(sentence=s)

    first_token_embedding_ref = first_layer[0].tolist()
    first_token_embedding_actual = sentence.tokens[0].embedding.tolist()

    puppeteer_embedding_ref = first_layer[7].tolist()
    puppeteer_embedding_actual = sentence.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert puppeteer_embedding_ref == puppeteer_embedding_actual

    # Check embedding dimension when using multiple layers
    sentence_mult_layers = embed_sentence(sentence="Munich", layers="1,2,3,4")
    ref_embedding_size = 4 * model.d_embed
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)
    assert ref_embedding_size == actual_embedding_size

    # Check embedding dimension when using multiple layers and scalar mix
    sentence_mult_layers_scalar_mix = embed_sentence(sentence="Berlin",
                                                     layers="1,2,3,4",
                                                     use_scalar_mix=True)
    ref_embedding_size = 1 * model.d_embed
    actual_embedding_size = len(sentence_mult_layers_scalar_mix.tokens[0].embedding)
    assert ref_embedding_size == actual_embedding_size
def main():
    parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
    parser.add_argument('--model_name', type=str, default='transfo-xl-wt103',
                        help='pretrained model name')
    parser.add_argument('--split', type=str, default='test',
                        choices=['all', 'valid', 'test'],
                        help='which split to evaluate')
    parser.add_argument('--batch_size', type=int, default=10,
                        help='batch size')
    parser.add_argument('--tgt_len', type=int, default=128,
                        help='number of tokens to predict')
    parser.add_argument('--ext_len', type=int, default=0,
                        help='length of the extended context')
    parser.add_argument('--mem_len', type=int, default=1600,
                        help='length of the retained previous heads')
    parser.add_argument('--clamp_len', type=int, default=1000,
                        help='max positional embedding index')
    parser.add_argument('--no_cuda', action='store_true',
                        help='Do not use CUDA even though CUDA is available')
    parser.add_argument('--work_dir', type=str, required=True,
                        help='path to the work_dir')
    parser.add_argument('--no_log', action='store_true',
                        help='do not log the eval result')
    parser.add_argument('--same_length', action='store_true',
                        help='set same length attention with masking')
    parser.add_argument('--server_ip', type=str, default='',
                        help='Can be used for distant debugging.')
    parser.add_argument('--server_port', type=str, default='',
                        help='Can be used for distant debugging.')
    args = parser.parse_args()
    assert args.ext_len >= 0, 'extended context length must be non-negative'

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    logger.info("device: {}".format(device))

    # Load a pre-processed dataset.
    # You can also build the corpus yourself using TransfoXLCorpus methods.
    # The pre-processing involves computing word frequencies to prepare the
    # adaptive input and softmax, and tokenizing the dataset.
    # The pre-processed corpus is a conversion (using the conversion script).
    tokenizer = TransfoXLTokenizer.from_pretrained(args.model_name)
    corpus = TransfoXLCorpus.from_pretrained(args.model_name)
    ntokens = len(corpus.vocab)

    va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len,
                                  device=device, ext_len=args.ext_len)
    te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len,
                                  device=device, ext_len=args.ext_len)

    # Load a pre-trained model
    model = TransfoXLLMHeadModel.from_pretrained(args.model_name)
    model = model.to(device)

    logger.info('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(
        args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len))

    model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
    if args.clamp_len > 0:
        model.clamp_len = args.clamp_len
    if args.same_length:
        model.same_length = True

    ###############################################################################
    # Evaluation code
    ###############################################################################
    def evaluate(eval_iter):
        # Turn on evaluation mode which disables dropout.
        model.eval()
        total_len, total_loss = 0, 0.
        start_time = time.time()
        with torch.no_grad():
            mems = None
            for idx, (data, target, seq_len) in enumerate(eval_iter):
                ret = model(data, lm_labels=target, mems=mems)
                loss, _, mems = ret
                loss = loss.mean()
                total_loss += seq_len * loss.item()
                total_len += seq_len
            total_time = time.time() - start_time
        logger.info('Time : {:.2f}s, {:.2f}ms/segment'.format(
            total_time, 1000 * total_time / (idx + 1)))
        return total_loss / total_len

    # Run on test data.
    if args.split == 'all':
        test_loss = evaluate(te_iter)
        valid_loss = evaluate(va_iter)
    elif args.split == 'valid':
        valid_loss = evaluate(va_iter)
        test_loss = None
    elif args.split == 'test':
        test_loss = evaluate(te_iter)
        valid_loss = None

    def format_log(loss, split):
        log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format(
            split, loss, math.exp(loss))
        return log_str

    log_str = ''
    if valid_loss is not None:
        log_str += format_log(valid_loss, 'valid')
    if test_loss is not None:
        log_str += format_log(test_loss, 'test')

    logger.info('=' * 100)
    logger.info(log_str)
    logger.info('=' * 100)
def test_transfoxl(self):
    for tokenizer_name in TransfoXLTokenizer.pretrained_vocab_files_map["pretrained_vocab_file"].keys():
        tokenizer_p = TransfoXLTokenizer.from_pretrained(tokenizer_name)
        tokenizer_r = TransfoXLTokenizerFast.from_pretrained(tokenizer_name)

        # Check we have the same number of added_tokens for both pair and non-pair inputs.
        self.assertEqual(tokenizer_r.num_added_tokens(False), tokenizer_p.num_added_tokens(False))
        self.assertEqual(tokenizer_r.num_added_tokens(True), tokenizer_p.num_added_tokens(True))

        # Check we have the correct max_length for both pair and non-pair inputs.
        self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence)
        self.assertEqual(tokenizer_r.max_len_sentences_pair, tokenizer_p.max_len_sentences_pair)

        # Assert the set of special tokens match.
        self.assertSequenceEqual(
            tokenizer_p.special_tokens_map.items(),
            tokenizer_r.special_tokens_map.items(),
            "TransfoXL tokenizers don't have the same set of special_tokens",
        )

        # Assure tokenization overlap between python and rust impl.
        self.assert_tokenization_python_rust_almost_equals(tokenizer_p, tokenizer_r, 0.0)

        # Ensure add_tokens and add_special_tokens return the correct vocab size
        self.assert_add_tokens(tokenizer_r)

        # Check for offsets mapping
        self.assert_offsets_mapping(tokenizer_r)

        # Check for dynamic encoding sequence handling in batch_encode_plus
        self.assertRaises(ValueError, self.assert_batch_encode_dynamic_overflowing, tokenizer_r)

        # Check alignment for build_inputs_with_special_tokens
        self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p)

        # Check for padding
        self.assertRaises(ValueError, self.assert_padding, tokenizer_r, tokenizer_p)

        # Check the number of returned files for save_vocabulary.
        # TransfoXL tokenizers come in a special format which is not compatible
        # with rust tokenizers; we ensure the error is correctly raised.
        tokenizer_r_files = tokenizer_r.save_pretrained(".")
        self.assertSequenceEqual(tokenizer_r_files,
                                 ["./vocab.json", "./special_tokens_map.json", "./added_tokens.json"])

        # Check loading a Python-tokenizer save through Rust doesn't work (and the opposite)
        self.assertRaises(ValueError, tokenizer_p.from_pretrained, *tokenizer_r_files)
        self.assertRaises(ValueError, tokenizer_r.from_pretrained, *tokenizer_p.save_pretrained("."))

        # Check loading works for Python to Python and Rust to Rust
        # Issue: https://github.com/huggingface/transformers/issues/3000
        # self.assertIsNotNone(tokenizer_p.__class__.from_pretrained('./'))
        self.assertIsNotNone(tokenizer_r.__class__.from_pretrained("./"))
    # flatten_train = [word for sublist in words_train for word in sublist]
    # flatten_dev = [word for sublist in words_dev for word in sublist]
    # flatten_test = [word for sublist in words_test for word in sublist]

    # Generate a distribution over tags, useful for control task
    # dist = find_distribution(data.DataLoader(POSDataset(train_x, train_y), batch_size=1))
    # print(len(dist))
    # ypos_train_control, ypos_dev_control, ypos_test_control = save_or_load_pos_controls(
    #     train_x, train_y, [flatten_train, flatten_dev, flatten_test], dist)

    return train_x, train_y, \
        dev_x, dev_y, \
        test_x, test_y


# Load Transformer XL
from transformers import TransfoXLTokenizer, TransfoXLModel

transfo_XL = TransfoXLModel.from_pretrained('transfo-xl-wt103')
print("I have loaded the transformer XL model")
transfo_XL_tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
print("I have loaded the transformer XL Tokenizer")

# Build TransformerXL representations
import warnings
warnings.filterwarnings(action='ignore')

train_x_transfo_XL, train_y_transfo_XL, \
    dev_x_transfo_XL, dev_y_transfo_XL, \
    test_x_transfo_XL, test_y_transfo_XL = get_transformer_reps(
        transfo_XL, transfo_XL_tokenizer, extra_transformer='TransformerXL')
def load_tokenizer(config):
    tokenizer = TransfoXLTokenizer.from_pretrained(config.TOKENIZER_FNAME)
    return tokenizer
def train(datapath, outpath, seed, batch_size, epochs, save_steps, use_gpt, use_cuda=True):
    # set up model and device (hopefully cuda)
    device = torch.device("cuda" if torch.cuda.is_available() and use_cuda else "cpu")
    if use_gpt:
        model = GPT2LMHeadModel.from_pretrained('gpt2')
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    else:
        model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), betas=(.9, .98), eps=1e-09)

    # setup rng seeds on all devices to ensure repeatable results
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    num_batches = len(os.listdir(datapath)) / batch_size
    batch_list = getBatch(datapath, batch_size, tokenizer)
    avg_losses = []
    avg_loss = 0
    model.zero_grad()
    timestamp = datetime.datetime.now().strftime('%y%m%d%H%M%S')

    for _ in trange(epochs, desc="Epochs"):
        for batch_num in tqdm(range(0, int(num_batches), batch_size), desc="Batches"):
            # setup this batch
            batch = torch.tensor(next(batch_list), dtype=torch.long, device=device)
            inputs, labels = batch, batch
            inputs = inputs.to(device)
            labels = labels.to(device)

            # feed input to model to train
            model.train()
            outputs = model(input_ids=inputs, labels=labels)
            if not use_gpt:
                # loss returned from transfoXL was broken: average the per-token
                # losses up to the first padding position. (The original code
                # then overwrote this with `loss = outputs[0]`, defeating the
                # fix; restored as an if/else.)
                first_pad = get_first_occ(inputs[0], -1)
                loss = outputs[0][0][:first_pad].mean()
            else:
                loss = outputs[0]
            avg_loss += loss.item()  # .item() avoids retaining the autograd graph

            # update parameters
            loss.backward()
            optimizer.step()
            model.zero_grad()

            if batch_num % (batch_size * save_steps) == 0:
                print('CHECKPOINT')
                checkpoint_path = f"{fixpath(outpath)}{timestamp}/e{epochs}-num{batch_num}-size{batch_size}"
                if not os.path.exists(checkpoint_path):
                    os.makedirs(checkpoint_path)
                # Take care of distributed/parallel training
                model_to_save = model.module if hasattr(model, 'module') else model
                model_to_save.save_pretrained(checkpoint_path)
                tokenizer.save_pretrained(checkpoint_path)
                avg = avg_loss / save_steps
                print(f"average loss: {avg}")
                avg_losses += [avg]
    print('finished')
    print(avg_losses)