from os.path import dirname, join

from transformers import EncoderDecoderConfig, EncoderDecoderModel


def get_from_pretrained(path):
    # Load the config that sits next to the weights, then restore the model with it.
    conf_path = join(dirname(path), "config.json")
    conf = EncoderDecoderConfig.from_pretrained(conf_path)
    model = EncoderDecoderModel.from_pretrained(path, config=conf)
    return model
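# A minimal usage sketch for get_from_pretrained above; the checkpoint path is
# hypothetical and stands in for any directory written by save_pretrained(),
# where pytorch_model.bin and config.json sit side by side.
model = get_from_pretrained("./checkpoints/bert2bert/pytorch_model.bin")
print(type(model.encoder), type(model.decoder))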
show_answer(tokenizer, train_input_encodings, train_sentbounds, train_sentlabels, 0)
show_answer(tokenizer, train_input_encodings, train_sentbounds, train_sentlabels, -1)
print("Length of Train Set: {}".format(len(train_contents)))
print("Done Dataset Processing")

"""The dataset is now ready for training"""
train_dataset = GenerationDataset(train_input_encodings, train_sentbounds, train_sentlabels)
train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)

if args.modelckpt != "":
    print("Load from modelckpt: {}".format(args.modelckpt))
    model_encdec = EncoderDecoderModel.from_pretrained(args.modelckpt)
else:
    print("Build from pretrained model: {}".format(modelbase))
    model_encdec = EncoderDecoderModel.from_encoder_decoder_pretrained(
        modelbase, modelbase)  # bert2bert, chinese variant

# Create sentence scoring model
for param in model_encdec.parameters():  # freeze everything
    param.requires_grad = False
model_encoder = model_encdec.encoder
model_sent_score = torch.nn.Sequential(
    torch.nn.Linear(768 * 2, 2),
    torch.nn.LogSoftmax(dim=1))  # Pooler output + Sentence encoding via Mean-over-Position

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model_encoder.to(device)
model_encoder.eval()
def test_real_bert_model_from_pretrained(self):
    # Warm-starting from two checkpoint names requires from_encoder_decoder_pretrained;
    # from_pretrained only accepts a single saved EncoderDecoderModel directory.
    model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        "bert-base-uncased", "bert-base-uncased")
    self.assertIsNotNone(model)
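# The two loading paths above are easy to confuse, so a minimal sketch of the
# difference (using a deliberately small hub checkpoint to keep it cheap):
# from_encoder_decoder_pretrained warm-starts encoder and decoder from two
# standalone checkpoints and adds freshly initialized cross-attention, while
# from_pretrained restores one previously saved EncoderDecoderModel directory.
import tempfile

from transformers import EncoderDecoderModel

warm_started = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "prajjwal1/bert-tiny", "prajjwal1/bert-tiny")
with tempfile.TemporaryDirectory() as tmp_dir:
    warm_started.save_pretrained(tmp_dir)
    restored = EncoderDecoderModel.from_pretrained(tmp_dir)  # single directory, single argument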
"<bos>", "<eos>", "<persona>", "<speaker1>", "<speaker2>", "<pad>" ] ATTR_TO_SPECIAL_TOKEN = { 'bos_token': '<bos>', 'eos_token': '<eos>', 'pad_token': '<pad>', 'additional_special_tokens': ['<speaker1>', '<speaker2>', '<persona>'] } tokenizer = BertTokenizer.from_pretrained("prajjwal1/bert-tiny") tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN) encoder_decoder_config = EncoderDecoderConfig.from_pretrained( './models/checkpoint-1200') model = EncoderDecoderModel.from_pretrained('./models/checkpoint-1200', config=encoder_decoder_config) model.get_encoder().resize_token_embeddings(len(tokenizer)) model.get_decoder().resize_token_embeddings(len(tokenizer)) print(type(model.get_encoder()), type(model.get_decoder())) # model = SimpleEncoderDecoder(tokenizer) # model = load() # model.to('cpu') # create ids of encoded input vectors input_ids = tokenizer("I want to buy a car", return_tensors="pt").input_ids # create BOS token decoder_input_ids = tokenizer("<bos>", add_special_tokens=False, return_tensors="pt").input_ids
def encoder_decoder_example():
    import torch
    from transformers import EncoderDecoderConfig, EncoderDecoderModel
    from transformers import BertConfig, GPT2Config
    from transformers import BertTokenizer, GPT2Tokenizer

    pretrained_model_name = 'bert-base-uncased'
    #pretrained_model_name = 'gpt2'

    if 'bert' in pretrained_model_name:
        # Initialize a BERT bert-base-uncased style configuration.
        config_encoder, config_decoder = BertConfig(), BertConfig()
    elif 'gpt2' in pretrained_model_name:
        config_encoder, config_decoder = GPT2Config(), GPT2Config()
    else:
        print('Invalid model, {}.'.format(pretrained_model_name))
        return

    config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)

    if 'bert' in pretrained_model_name:
        # Initialize a Bert2Bert model from the bert-base-uncased style configurations.
        model = EncoderDecoderModel(config=config)
        #model = EncoderDecoderModel.from_encoder_decoder_pretrained(pretrained_model_name, pretrained_model_name)  # Initialize Bert2Bert from pre-trained checkpoints.
        tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
    elif 'gpt2' in pretrained_model_name:
        model = EncoderDecoderModel(config=config)
        tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name)

    #print('Configuration of the encoder & decoder:\n{}.\n{}.'.format(model.config.encoder, model.config.decoder))
    #print('Encoder type = {}, decoder type = {}.'.format(type(model.encoder), type(model.decoder)))

    if False:
        # Access the model configuration.
        config_encoder = model.config.encoder
        config_decoder = model.config.decoder

        # Set decoder config to causal LM.
        config_decoder.is_decoder = True
        config_decoder.add_cross_attention = True

    #--------------------
    input_ids = torch.tensor(tokenizer.encode('Hello, my dog is cute', add_special_tokens=True)).unsqueeze(0)  # Batch size 1.

    if False:
        # Forward.
        outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)

        # Train.
        outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)
        loss, logits = outputs.loss, outputs.logits

        # Save the model, including its configuration.
        model.save_pretrained('my-model')

        #--------------------
        # Load model and config from pretrained folder.
        encoder_decoder_config = EncoderDecoderConfig.from_pretrained('my-model')
        model = EncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)

    #--------------------
    # Generate.
    # REF [site] >>
    #   https://huggingface.co/transformers/internal/generation_utils.html
    #   https://huggingface.co/blog/how-to-generate
    generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.pad_token_id)
    #generated = model.generate(input_ids, max_length=50, num_beams=5, no_repeat_ngram_size=2, num_return_sequences=5, do_sample=True, top_k=0, temperature=0.7, early_stopping=True, decoder_start_token_id=model.config.decoder.pad_token_id)
    print('Generated = {}.'.format(tokenizer.decode(generated[0], skip_special_tokens=True)))
def generate_predictions(args):
    model_dir = os.path.join(args.model_root_dir, args.run_id, args.translation_model_name)
    print(f"model dir: {model_dir}")

    val_data_path = os.path.join(args.data_out_dir, args.val_dataset_name)
    print(
        f"using model from {get_last_checkpoint(model_dir)} and test data from {val_data_path} to generate predictions"
    )

    dataset_properties = json.load(
        open(os.path.join(model_dir, "dataset_properties.json")))
    special_tokens = dataset_properties["special_tokens"]
    target_vocab = dataset_properties["target_vocab"]

    source_tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    source_tokenizer.add_special_tokens(
        {"additional_special_tokens": special_tokens})
    target_tokenizer = PreTrainedArsenalTokenizer(target_vocab=target_vocab)

    bert2arsenal = EncoderDecoderModel.from_pretrained(
        get_last_checkpoint(model_dir))
    val_data = datasets.load_from_disk(val_data_path)

    runid, _, checkpoint = get_last_checkpoint(model_dir).split("/")[-3:]
    outfile = open(
        os.path.join(
            Path(model_dir).parent, f"predictions_{runid}_{checkpoint}.txt"),
        "w")

    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    bert2arsenal.to(torch_device)

    batch_size = args.batch_size
    num_batches = int(val_data.num_rows / batch_size)
    type_forcing_vocab = target_tokenizer.id2vocab if args.type_forcing else None

    for i in tqdm(range(num_batches)):
        batch_range = range(i * batch_size, (i + 1) * batch_size)
        batch = val_data.select(list(batch_range))
        batch_ids = torch.tensor(batch["input_ids"], device=torch_device)
        batch_masks = torch.tensor(batch["attention_mask"], device=torch_device)

        # Take this little detour with the args for generate() so that we can decide
        # whether to add the argument for the type-forcing vocab: with an unpatched
        # transformers version, passing anything about type forcing (even if disabled)
        # would cause errors about unrecognized arguments.
        generate_args = {
            "input_ids": batch_ids,
            "attention_mask": batch_masks,
            "decoder_start_token_id": target_tokenizer.cls_token_id,
            "num_beams": args.num_beams,
            "num_return_sequences": args.num_outputs,
            "no_repeat_ngram_size": 0
        }
        if args.type_forcing:
            generate_args["type_forcing_vocab"] = type_forcing_vocab
        outputs = bert2arsenal.generate(**generate_args)

        # Batch instances and return sequences per instance are stacked along a single dimension.
        for j in range(batch_size):
            input_seq = [t for t in batch["input_ids"][j] if t != 0]
            true_seq = [t for t in batch['labels'][j] if t != -100]
            outfile.write(f"{input_seq}\t{true_seq}")
            for k in range(j * args.num_outputs, (j + 1) * args.num_outputs):
                pred_seq = [t for t in outputs[k].tolist() if t != 0]
                outfile.write(f"\t{pred_seq}")
            outfile.write("\n")
            outfile.flush()
    outfile.close()
#!/usr/bin/env python3
from transformers import EncoderDecoderModel, BertTokenizer

# Warm-start a bert2bert model from two BERT checkpoints; this needs
# from_encoder_decoder_pretrained, not from_pretrained.
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    'bert-base-uncased', 'bert-base-uncased')
tok = BertTokenizer.from_pretrained('bert-base-uncased')

input_ids = tok.encode('Hi it is me.', return_tensors='pt')
output = model.generate(input_ids, bos_token_id=tok.pad_token_id)
print(tok.decode(output[0], skip_special_tokens=True))

import ipdb
ipdb.set_trace()
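# The script above seeds generation via the pad token; for a BERT decoder it is
# more common to start from [CLS] and stop at [SEP]. A hedged variant using the
# same model and tokenizer objects (the warm-started model is untuned, so the
# generated text is not meaningful either way):
output = model.generate(input_ids,
                        decoder_start_token_id=tok.cls_token_id,
                        eos_token_id=tok.sep_token_id,
                        max_length=20)
print(tok.decode(output[0], skip_special_tokens=True))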
print("Show some examples: ") show_answer(tokenizer, test_input_encodings, 0) # show_answer(tokenizer, test_input_encodings, 100) # show_answer(tokenizer, test_input_encodings, 2000) show_answer(tokenizer, test_input_encodings, -1) print("Length of Infer Set: {}".format(len(test_contents))) print("Done Dataset Processing") test_dataset = GenerationDataset(test_input_encodings) test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False) """Multiple Instance Inference""" device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") model = EncoderDecoderModel.from_pretrained(args.modelckpt, output_attentions=True, output_hidden_states=True) model.to(device) model.eval() with torch.no_grad(): fd = open(args.outfile, "w", encoding="utf-8") start = time.time() for step, batch_in in enumerate(test_loader): input_ids = batch_in["input_ids"].to(device) attention_mask = batch_in["attention_mask"].to(device) outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, decoder_start_token_id=tokenizer.cls_token_id, eos_token_id=tokenizer.sep_token_id, num_beams=5, num_return_sequences=1, min_length=3, max_length=15) output_strs = tokenizer.batch_decode(outputs, skip_special_tokens=False)
import os
import pickle

import yaml

from general_utils import *
from transformers import RobertaTokenizer, EncoderDecoderModel, AutoTokenizer

with open('./config.yaml') as f:
    configs = yaml.load(f, Loader=yaml.SafeLoader)

# Get the checkpoints from GCP
os.makedirs(configs['output_dir'] + '/pretrained/', exist_ok=True)
os.system('gsutil -m cp -r "{}/*" "{}"'.format(
    configs['gcp_pretrained_path'], configs['output_dir'] + '/pretrained/'))

test_data = get_data_batch(path='./data/test_tokenized/*', test=True)

model = EncoderDecoderModel.from_pretrained(configs['output_dir'] + '/pretrained/')
model.to("cuda")

# The snippet never instantiates `tokenizer`; the matching training script
# uses the phobert tokenizer, so that is assumed here.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

batch_size = configs['batch_size'] * 2  # change to 64 for full evaluation


# map data correctly
def generate_summary(batch):
    # Tokenizer will automatically set [BOS] <text> [EOS]
    inputs = tokenizer(batch["original"],
                       padding="max_length",
                       truncation=True,
                       max_length=256,
                       return_tensors="pt")
    input_ids = inputs.input_ids.to("cuda")
    attention_mask = inputs.attention_mask.to("cuda")
def test_encoder_decoder_save_load_from_encoder_decoder_from_pt(self):
    config = self.get_encoder_decoder_config_small()

    # create two random BERT models for bert2bert & initialize weights (+cross_attention weights)
    encoder_pt = BertModel(config.encoder).to(torch_device).eval()
    decoder_pt = BertLMHeadModel(config.decoder).to(torch_device).eval()

    encoder_decoder_pt = EncoderDecoderModel(
        encoder=encoder_pt, decoder=decoder_pt).to(torch_device).eval()

    input_ids = ids_tensor([13, 5], encoder_pt.config.vocab_size)
    decoder_input_ids = ids_tensor([13, 1], decoder_pt.config.vocab_size)

    pt_input_ids = torch.tensor(input_ids.numpy(),
                                device=torch_device,
                                dtype=torch.long)
    pt_decoder_input_ids = torch.tensor(decoder_input_ids.numpy(),
                                        device=torch_device,
                                        dtype=torch.long)

    logits_pt = encoder_decoder_pt(
        input_ids=pt_input_ids,
        decoder_input_ids=pt_decoder_input_ids).logits

    # PyTorch => TensorFlow
    with tempfile.TemporaryDirectory() as tmp_dirname_1, tempfile.TemporaryDirectory() as tmp_dirname_2:
        encoder_decoder_pt.encoder.save_pretrained(tmp_dirname_1)
        encoder_decoder_pt.decoder.save_pretrained(tmp_dirname_2)
        encoder_decoder_tf = TFEncoderDecoderModel.from_encoder_decoder_pretrained(
            tmp_dirname_1,
            tmp_dirname_2,
            encoder_from_pt=True,
            decoder_from_pt=True)

    logits_tf = encoder_decoder_tf(
        input_ids=input_ids, decoder_input_ids=decoder_input_ids).logits

    max_diff = np.max(
        np.abs(logits_pt.detach().cpu().numpy() - logits_tf.numpy()))
    self.assertAlmostEqual(max_diff, 0.0, places=3)

    # Make sure `from_pretrained` following `save_pretrained` works and gives the same result
    with tempfile.TemporaryDirectory() as tmp_dirname:
        encoder_decoder_tf.save_pretrained(tmp_dirname)
        encoder_decoder_tf = TFEncoderDecoderModel.from_pretrained(tmp_dirname)

    logits_tf_2 = encoder_decoder_tf(
        input_ids=input_ids, decoder_input_ids=decoder_input_ids).logits

    max_diff = np.max(np.abs(logits_tf_2.numpy() - logits_tf.numpy()))
    self.assertAlmostEqual(max_diff, 0.0, places=3)

    # TensorFlow => PyTorch
    with tempfile.TemporaryDirectory() as tmp_dirname:
        encoder_decoder_tf.save_pretrained(tmp_dirname)
        encoder_decoder_pt = EncoderDecoderModel.from_pretrained(
            tmp_dirname, from_tf=True)

    # run the reloaded PyTorch model so the TF=>PT round-trip is actually exercised
    logits_pt_2 = encoder_decoder_pt(
        input_ids=pt_input_ids,
        decoder_input_ids=pt_decoder_input_ids).logits

    max_diff = np.max(
        np.abs(logits_pt_2.detach().cpu().numpy() - logits_tf.numpy()))
    self.assertAlmostEqual(max_diff, 0.0, places=3)
def main(args):
    print(args)
    check_args(args)

    if USE_GPU:
        float_dtype = torch.cuda.FloatTensor
        long_dtype = torch.cuda.LongTensor
    else:
        float_dtype = torch.FloatTensor
        long_dtype = torch.LongTensor

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased-itokens")
    # add_tokens(tokenizer)
    vocab, train_loader, val_loader = build_loaders(args, tokenizer)
    model_kwargs = {}
    encoder_decoder_config = EncoderDecoderConfig.from_pretrained(
        "bert-base-uncased-itokens")
    model = EncoderDecoderModel.from_pretrained("bert-base-uncased-itokens",
                                                config=encoder_decoder_config)
    # modify_network(model, tokenizer)
    # model, model_kwargs = build_model(args, vocab)
    # model.type(float_dtype)
    model.cuda()
    print(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    obj_discriminator, d_obj_kwargs = build_obj_discriminator(args, vocab)
    img_discriminator, d_img_kwargs = build_img_discriminator(args, vocab)
    gan_g_loss, gan_d_loss = get_gan_losses(args.gan_loss_type)

    if obj_discriminator is not None:
        obj_discriminator.type(float_dtype)
        obj_discriminator.train()
        print(obj_discriminator)
        optimizer_d_obj = torch.optim.Adam(obj_discriminator.parameters(),
                                           lr=args.learning_rate)
    if img_discriminator is not None:
        img_discriminator.type(float_dtype)
        img_discriminator.train()
        print(img_discriminator)
        optimizer_d_img = torch.optim.Adam(img_discriminator.parameters(),
                                           lr=args.learning_rate)

    restore_path = None
    if args.restore_from_checkpoint:
        restore_path = '%s_with_model.pt' % args.checkpoint_name
        restore_path = os.path.join(args.output_dir, restore_path)
    if restore_path is not None and os.path.isfile(restore_path):
        print('Restoring from checkpoint:')
        print(restore_path)
        checkpoint = torch.load(restore_path)
        model.load_state_dict(checkpoint['model_state'])
        optimizer.load_state_dict(checkpoint['optim_state'])

        if obj_discriminator is not None:
            obj_discriminator.load_state_dict(checkpoint['d_obj_state'])
            optimizer_d_obj.load_state_dict(checkpoint['d_obj_optim_state'])

        if img_discriminator is not None:
            img_discriminator.load_state_dict(checkpoint['d_img_state'])
            optimizer_d_img.load_state_dict(checkpoint['d_img_optim_state'])

        t = checkpoint['counters']['t']
        if 0 <= args.eval_mode_after <= t:
            model.eval()
        else:
            model.train()
        epoch = checkpoint['counters']['epoch']
    else:
        t, epoch = 0, 0
        checkpoint = {
            'args': args.__dict__,
            'vocab': vocab,
            'model_kwargs': model_kwargs,
            'd_obj_kwargs': d_obj_kwargs,
            'd_img_kwargs': d_img_kwargs,
            'losses_ts': [],
            'losses': defaultdict(list),
            'd_losses': defaultdict(list),
            'checkpoint_ts': [],
            'train_batch_data': [],
            'train_samples': [],
            'train_iou': [],
            'val_batch_data': [],
            'val_samples': [],
            'val_losses': defaultdict(list),
            'val_iou': [],
            'norm_d': [],
            'norm_g': [],
            'counters': {
                't': None,
                'epoch': None,
            },
            'model_state': None,
            'model_best_state': None,
            'optim_state': None,
            'd_obj_state': None,
            'd_obj_best_state': None,
            'd_obj_optim_state': None,
            'd_img_state': None,
            'd_img_best_state': None,
            'd_img_optim_state': None,
            'best_t': [],
        }

    while True:
        if t >= args.num_iterations:
            break
        epoch += 1
        print('Starting epoch %d' % epoch)

        for batch in train_loader:
            print(batch)
            exit()
            if t == args.eval_mode_after:
                print('switching to eval mode')
                model.eval()
                optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
            t += 1

            if USE_GPU:
                for k in batch.keys():
                    batch[k] = batch[k].cuda().long()

            masks = None

            with timeit('forward', args.timing):
                output = model(**batch)
            # with timeit('loss', args.timing):
            #     # Skip the pixel loss if using GT boxes
            #     skip_pixel_loss = False
            #     total_loss, losses = calculate_model_losses(
            #         args, skip_pixel_loss, model, imgs, imgs_pred)
            #     if img_discriminator is not None:
            #         scores_fake = img_discriminator(imgs_pred)
            #         weight = args.discriminator_loss_weight * args.d_img_weight
            #         total_loss = add_loss(total_loss, gan_g_loss(scores_fake), losses,
            #                               'g_gan_img_loss', weight)

            losses = {}
            total_loss = output["loss"]
            losses['total_loss'] = total_loss.item()

            if not math.isfinite(losses['total_loss']):
                print('WARNING: Got loss = NaN, not backpropping')
                continue

            optimizer.zero_grad()
            with timeit('backward', args.timing):
                total_loss.backward()
            optimizer.step()

            total_loss_d = None
            ac_loss_real = None
            ac_loss_fake = None
            d_losses = {}

            # if img_discriminator is not None:
            #     d_img_losses = LossManager()
            #     imgs_fake = imgs_pred.detach()
            #     scores_fake = img_discriminator(imgs_fake)
            #     scores_real = img_discriminator(imgs)
            #     d_img_gan_loss = gan_d_loss(scores_real, scores_fake)
            #     d_img_losses.add_loss(d_img_gan_loss, 'd_img_gan_loss')
            #     optimizer_d_img.zero_grad()
            #     d_img_losses.total_loss.backward()
            #     optimizer_d_img.step()

            if t % args.print_every == 0:
                print('t = %d / %d' % (t, args.num_iterations))
                for name, val in losses.items():
                    print(' G [%s]: %.4f' % (name, val))
                    checkpoint['losses'][name].append(val)
                checkpoint['losses_ts'].append(t)
                # if img_discriminator is not None:
                #     for name, val in d_img_losses.items():
                #         print(' D_img [%s]: %.4f' % (name, val))
                #         checkpoint['d_losses'][name].append(val)

            if t % args.checkpoint_every == 0:
                print('checking on train')
                train_results = check_model(args, t, train_loader, model)
                t_losses = train_results[0]

                print('checking on val')
                val_results = check_model(args, t, val_loader, model)
                val_losses = val_results[0]

                for k, v in val_losses.items():
                    checkpoint['val_losses'][k].append(v)

                checkpoint['model_state'] = model.state_dict()

                if obj_discriminator is not None:
                    checkpoint['d_obj_state'] = obj_discriminator.state_dict()
                    checkpoint['d_obj_optim_state'] = optimizer_d_obj.state_dict()

                if img_discriminator is not None:
                    checkpoint['d_img_state'] = img_discriminator.state_dict()
                    checkpoint['d_img_optim_state'] = optimizer_d_img.state_dict()

                checkpoint['optim_state'] = optimizer.state_dict()
                checkpoint['counters']['t'] = t
                checkpoint['counters']['epoch'] = epoch
                checkpoint_path = os.path.join(
                    args.output_dir, '%s_with_model.pt' % args.checkpoint_name)
                print('Saving checkpoint to ', checkpoint_path)
                torch.save(checkpoint, checkpoint_path)

                # Save another checkpoint without any model or optim state
                checkpoint_path = os.path.join(
                    args.output_dir, '%s_no_model.pt' % args.checkpoint_name)
                key_blacklist = [
                    'model_state', 'optim_state', 'model_best_state',
                    'd_obj_state', 'd_obj_optim_state', 'd_obj_best_state',
                    'd_img_state', 'd_img_optim_state', 'd_img_best_state'
                ]
                small_checkpoint = {}
                for k, v in checkpoint.items():
                    if k not in key_blacklist:
                        small_checkpoint[k] = v
                torch.save(small_checkpoint, checkpoint_path)
with open('./config.yaml') as f:
    configs = yaml.load(f, Loader=yaml.SafeLoader)

train_data_batch = get_data_batch(path='./data/train_tokenized/*',
                                  batch_size=configs['batch_size'])
val_data_batch = get_data_batch(path='./data/val_tokenized/*',
                                batch_size=configs['batch_size'])

if configs['load_pretrained']:
    os.makedirs(configs['output_dir'] + '/pretrained/', exist_ok=True)
    os.system('gsutil -m cp -r "{}/*" "{}"'.format(
        configs['gcp_pretrained_path'], configs['output_dir'] + '/pretrained/'))
    try:
        roberta_shared = EncoderDecoderModel.from_pretrained(
            configs['output_dir'] + '/pretrained/', tie_encoder_decoder=True)
    except Exception:
        print('Warning: no pretrained model found at the provided link. '
              'Initializing new model weights.')
        roberta_shared = EncoderDecoderModel.from_encoder_decoder_pretrained(
            "vinai/phobert-base", "vinai/phobert-base", tie_encoder_decoder=True)
else:
    roberta_shared = EncoderDecoderModel.from_encoder_decoder_pretrained(
        "vinai/phobert-base", "vinai/phobert-base", tie_encoder_decoder=True)

# set special tokens
roberta_shared.config.decoder_start_token_id = tokenizer.bos_token_id
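# tie_encoder_decoder=True above shares one set of weights between encoder and
# decoder, which roughly halves the parameter count. A quick sanity-check
# sketch against an untied warm start of the same checkpoints:
untied = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "vinai/phobert-base", "vinai/phobert-base")
assert sum(p.numel() for p in roberta_shared.parameters()) < \
    sum(p.numel() for p in untied.parameters())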
from transformers import BertTokenizer, EncoderDecoderModel
import os
import streamlit as st

st.header('Rangkuman Cerpen')  # "Short Story Summary"
st.text('powered by BERT')

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

tokenizer = BertTokenizer.from_pretrained(
    "cahya/bert2bert-indonesian-summarization")
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token
model = EncoderDecoderModel.from_pretrained(
    "cahya/bert2bert-indonesian-summarization")

# "Enter the short story you want to summarize (max 512 tokens)"
ARTICLE_TO_SUMMARIZE = st.text_area(
    "Masukkan cerpen yang ingin diringkas (max 512 token)")

# generate summary
input_ids = tokenizer.encode(ARTICLE_TO_SUMMARIZE, return_tensors='pt')
summary_ids = model.generate(input_ids,
                             min_length=20,
                             max_length=80,
                             num_beams=10,
                             repetition_penalty=2.5,
                             length_penalty=1.0,
                             early_stopping=True,
                             no_repeat_ngram_size=2,
                             use_cache=True)
# the snippet is truncated at the generate() call; decoding and display of the
# summary are an assumed completion
st.write(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
    # # decode the output
    # top_k_top_p_pred_decode_str = tokenizer.batch_decode(top_k_top_p, skip_special_tokens=True)
    # batch["top_k_top_p_pred_decode_str"] = top_k_top_p_pred_decode_str
    # print("top_k_top_p_pred_decode_str: ", top_k_top_p_pred_decode_str)

    # label str for rouge
    label_str = [
        " ".join(map(str, label_id)) for label_id in labels.input_ids
    ]
    batch["label_id_str"] = label_str
    label_decode_str = tokenizer.batch_decode(labels.input_ids,
                                              skip_special_tokens=True)
    print("label_decode_str: ", label_decode_str)
    return batch


tokenizer = BertTokenizer.from_pretrained(DEFAULT_MODEL_NAME)
model = EncoderDecoderModel.from_pretrained("ckpt/checkpoint-2800")
# model.to("cuda")

lcsts = LCSTS(args.training_path,
              args.val_path,
              args.test_path,
              output_path=args.preprocess_output_path)
test_dataset = load_dataset('csv', data_files=[lcsts.test_merged_csv])['train']

pred_str_keys = [
    "greedy_pred_str", "beam_output_pred_str", "beam_output_ngram_pred_str",
    "top_k_only_ngram_pred_str", "top_p_only_ngram_pred_str",
    "top_k_top_p_ngram_pred_str"
]
results = test_dataset.map(generate_summary,
                           batched=True)  # snippet truncated here; batched mapping assumed
import torch
from transformers import BertTokenizer, EncoderDecoderModel

input_str = '1999 chevillon nuit saints georges villages france'
encoder_max_length = 128

device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
ed_model = EncoderDecoderModel.from_pretrained('./checkpoint-500')
ed_model.to(device)

inputs = tokenizer(input_str,
                   padding='max_length',
                   truncation=True,
                   max_length=encoder_max_length,
                   return_tensors='pt')
input_ids = inputs.input_ids.to(device)
attention_mask = inputs.attention_mask.to(device)
outputs = ed_model.generate(input_ids, attention_mask=attention_mask)
output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

print(f'NAME\n{input_str}')
print()
print(f'DESCRIPTION\n{output_str}')
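# generate() above relies on the defaults stored in the checkpoint's config; a
# hedged variant with explicit search settings (values are illustrative only):
outputs = ed_model.generate(input_ids,
                            attention_mask=attention_mask,
                            num_beams=4,
                            max_length=64,
                            early_stopping=True)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])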
import transformers
from transformers import EncoderDecoderModel, AutoTokenizer
from tokenizers import Tokenizer
import torch
import sys
from datasets import load_metric

metric = load_metric('sacrebleu')

chk_dir = sys.argv[1]
chk_num = sys.argv[2]
num_beams = int(sys.argv[3])
code_tok = False if sys.argv[4] == 'false' else True
print_bool = False if sys.argv[5] == 'false' else True

model = EncoderDecoderModel.from_pretrained('./{}/checkpoint-{}/'.format(
    chk_dir, chk_num))
code_tokenizer = Tokenizer.from_file(
    'code_tokenizer.json') if code_tok else AutoTokenizer.from_pretrained(
        'bert-base-uncased')
text_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
#print(text_tokenizer.convert_tokens_to_ids())

pad_token_id = 1 if code_tok else 0
bos_token_id = 2 if code_tok else 101
eos_token_id = 3 if code_tok else 102

f = open('tok-eval.tsv', 'r')
for i, line in enumerate(f):
    if i == 0:
        continue
    if i > 1000:
        break
def __init__(self) -> None:
    self.lists = {}

    # M-BERT
    from transformers import BertTokenizerFast, BertForMaskedLM
    self.bert_multilingual_tokenizer = BertTokenizerFast.from_pretrained(
        'bert-base-multilingual-cased')
    self.bert_multilingual_model = BertForMaskedLM.from_pretrained(
        'bert-base-multilingual-cased').eval()
    self.lists["M-BERT"] = {
        "Tokenizer": self.bert_multilingual_tokenizer,
        "Model": self.bert_multilingual_model
    }
    print("====================================")
    print("[BERT] Google Multilingual BERT loaded")
    print("====================================")

    # KR-BERT
    from transformers import BertTokenizerFast, BertForMaskedLM
    self.krbert_tokenizer = BertTokenizerFast.from_pretrained(
        'snunlp/KR-Medium')
    self.krbert_model = BertForMaskedLM.from_pretrained(
        'snunlp/KR-Medium').eval()
    self.lists["KR-Medium"] = {
        "Tokenizer": self.krbert_tokenizer,
        "Model": self.krbert_model
    }
    print("====================================")
    print("[BERT] KR-BERT loaded")
    print("====================================")

    # BERT
    from transformers import BertTokenizerFast, BertForMaskedLM
    self.bert_kor_tokenizer = BertTokenizerFast.from_pretrained(
        'kykim/bert-kor-base')
    self.bert_kor_model = BertForMaskedLM.from_pretrained(
        'kykim/bert-kor-base').eval()
    self.lists["bert-kor-base"] = {
        "Tokenizer": self.bert_kor_tokenizer,
        "Model": self.bert_kor_model
    }
    print("====================================")
    print("[BERT] BERT-kor-base loaded")
    print("====================================")

    # ALBERT
    from transformers import AlbertForMaskedLM
    self.albert_tokenizer = BertTokenizerFast.from_pretrained(
        'kykim/albert-kor-base')
    self.albert_model = AlbertForMaskedLM.from_pretrained(
        'kykim/albert-kor-base').eval()
    self.lists["albert-kor-base"] = {
        "Tokenizer": self.albert_tokenizer,
        "Model": self.albert_model
    }
    print("====================================")
    print("[BERT] ALBERT-kor-base loaded")
    print("====================================")

    # XLM-Roberta
    from transformers import XLMRobertaTokenizerFast, XLMRobertaForMaskedLM
    self.xlmroberta_tokenizer = XLMRobertaTokenizerFast.from_pretrained(
        'xlm-roberta-base')
    self.xlmroberta_model = XLMRobertaForMaskedLM.from_pretrained(
        'xlm-roberta-base').eval()
    self.lists["xlm-roberta-base"] = {
        "Tokenizer": self.xlmroberta_tokenizer,
        "Model": self.xlmroberta_model
    }
    print("====================================")
    print("[BERT] XLM-Roberta-kor loaded")
    print("====================================")

    # bertshared-kor-base (BERT encoder-decoder for seq2seq)
    from transformers import BertTokenizerFast, EncoderDecoderModel
    self.tokenizer_bertshared = BertTokenizerFast.from_pretrained(
        "kykim/bertshared-kor-base")
    self.bertshared_model = EncoderDecoderModel.from_pretrained(
        "kykim/bertshared-kor-base")
    self.lists["bertshared-kor-base"] = {
        "Tokenizer": self.tokenizer_bertshared,
        "Model": self.bertshared_model
    }
    print("====================================")
    print("[Seq2seq + BERT] bertshared-kor-base loaded")
    print("====================================")

    # gpt3-kor-small_based_on_gpt2
    from transformers import BertTokenizerFast, GPT2LMHeadModel
    self.tokenizer_gpt3 = BertTokenizerFast.from_pretrained(
        "kykim/gpt3-kor-small_based_on_gpt2")
    self.model_gpt3 = GPT2LMHeadModel.from_pretrained(
        "kykim/gpt3-kor-small_based_on_gpt2")
    self.lists["gpt3-kor-small_based_on_gpt2"] = {
        "Tokenizer": self.tokenizer_gpt3,
        "Model": self.model_gpt3
    }
    print("====================================")
    print("[GPT3] gpt3-small-based-on-gpt2 loaded")
    print("====================================")

    # electra-base-kor
    from transformers import ElectraTokenizerFast, ElectraModel
    self.tokenizer_electra = ElectraTokenizerFast.from_pretrained(
        "kykim/electra-kor-base")
    self.electra_model = ElectraModel.from_pretrained(
        "kykim/electra-kor-base")
    self.lists["electra-kor-base"] = {
        "Tokenizer": self.tokenizer_electra,
        "Model": self.electra_model
    }
    print("====================================")
    print("[ELECTRA] electra-kor-base loaded")
    print("====================================")

    from transformers import ElectraTokenizerFast, ElectraForQuestionAnswering
    self.electra_tokenizer_QA = ElectraTokenizerFast.from_pretrained(
        "monologg/koelectra-base-v3-finetuned-korquad")
    self.electra_model_QA = ElectraForQuestionAnswering.from_pretrained(
        "monologg/koelectra-base-v3-finetuned-korquad")
    self.lists["electra-kor-QA"] = {
        "Tokenizer": self.electra_tokenizer_QA,
        "Model": self.electra_model_QA
    }
    print("====================================")
    print("[ELECTRA] koelectra-base-v3-finetuned-korquad loaded")
    print("====================================")
ria_records.extend([
    r for r in tqdm.tqdm(
        ria_reader_with_date_approx(
            '/home/aobuhtijarov/datasets/ria/ria.shuffled.val.json'))
])
ria_records.extend([
    r for r in tqdm.tqdm(
        ria_reader_with_date_approx(
            '/home/aobuhtijarov/datasets/ria/ria.shuffled.test.json'))
])

lenta_records = [
    r for r in lenta_records
    if r['date'][:4] in ['2010', '2011', '2012', '2013', '2014']
]

model = EncoderDecoderModel.from_pretrained(clust_model)
tokenizer = BertTokenizer.from_pretrained(tokenizer_path,
                                          do_lower_case=False,
                                          do_basic_tokenize=False)
setattr(tokenizer, 'max_tokens_text', 250)
model.cuda()

text_to_vector_func = get_text_to_vector_func('bert-FirstCLS', model, tokenizer)

lenta_embeds = get_embeds_for_records(lenta_records, text_to_vector_func)
ria_embeds = get_embeds_for_records(ria_records, text_to_vector_func)


def f(start, total, n_jobs):
    print(start, total, n_jobs)
def __init__(self, model_name, device):
    self.device = device
    self.tokenizer = BertTokenizerFast.from_pretrained(model_name)
    self.model = EncoderDecoderModel.from_pretrained(model_name)
    # .device is an attribute, not a method; move the model with .to()
    self.model.to(device)
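# A usage sketch for the wrapper above; the class name Seq2SeqWrapper and the
# checkpoint are hypothetical stand-ins for whatever this file defines.
wrapper = Seq2SeqWrapper("kykim/bertshared-kor-base", "cpu")
ids = wrapper.tokenizer("example input", return_tensors="pt").input_ids
summary_ids = wrapper.model.generate(
    ids, decoder_start_token_id=wrapper.tokenizer.cls_token_id)
print(wrapper.tokenizer.decode(summary_ids[0], skip_special_tokens=True))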
NL2CST_PORT = get_env('NL2CST_PORT', 8080)
NUM_BEAMS = int(get_env("NUM_BEAMS"))
NUM_OUTPUTS = int(get_env("NUM_OUTPUTS"))
TYPE_FORCING = int(get_env("TYPE_FORCING"))
BATCH_SIZE = int(get_env("BATCH_SIZE"))
CLEAN_INPUT = int(get_env("CLEAN_INPUT"))

app = Flask(__name__)

dataset_properties = json.load(
    open(os.path.join(MODEL_ROOT, "dataset_properties.json")))
target_vocab = dataset_properties["target_vocab"]
special_tokens = dataset_properties["special_tokens"]
max_input_length = dataset_properties["encoder_max_len"]

bert2arsenal = EncoderDecoderModel.from_pretrained(
    get_last_checkpoint(MODEL_ROOT))

tokenizer_path = os.path.join(MODEL_ROOT, "source_tokenizer")
# Try to use the saved source tokenizer from file to prevent any downloads.
# Our older trained models didn't save the source tokenizer to disk, so use
# the download method as a fallback to remain compatible with older models.
if os.path.exists(tokenizer_path):
    source_tokenizer = BertTokenizerFast.from_pretrained(tokenizer_path)
else:
    print("no existing source tokenizer found, downloading...")
    source_tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    source_tokenizer.add_special_tokens(
        {"additional_special_tokens": special_tokens})

target_tokenizer = PreTrainedArsenalTokenizer(target_vocab=target_vocab)
type_forcing_vocab = target_tokenizer.id2vocab if TYPE_FORCING else None
def generate_summaries_or_translations(
    examples: List[str],
    out_file: str,
    model_name: str,
    batch_size: int = 8,
    device: str = DEFAULT_DEVICE,
    fp16=False,
    task="summarization",
    prefix=None,
    **generate_kwargs,
) -> Dict:
    """Save model.generate results to <out_file>, and return how long it took."""
    fout = Path(out_file).open("w", encoding="utf-8")
    model_name = str(model_name)
    if "encoder" in model_name and "decoder" in model_name:
        model = EncoderDecoderModel.from_pretrained(model_name).to(device)
    else:
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
    if fp16:
        model = model.half()
    if "encoder" in model_name and "decoder" in model_name:
        tokenizer = AutoTokenizer.from_pretrained(model_name, config=model.config)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    logger.info(f"Inferred tokenizer type: {tokenizer.__class__}")  # if this is wrong, check config.model_type.

    decoder_start_token_id = None  # default to config
    if isinstance(model.config, EncoderDecoderConfig):
        decoder_start_token_id = model.config.decoder.pad_token_id

    start_time = time.time()
    # update config with task specific params
    use_task_specific_params(model, task)
    if prefix is None:
        # prefix = prefix or getattr(model.config, "prefix", "") or ""
        prefix = ""
    for examples_chunk in tqdm(list(chunks(examples, batch_size))):
        examples_chunk = [prefix + text for text in examples_chunk]
        batch = tokenizer(examples_chunk,
                          return_tensors="pt",
                          truncation=True,
                          padding="longest").to(device)
        summaries = model.generate(
            input_ids=batch.input_ids,
            attention_mask=batch.attention_mask,
            decoder_start_token_id=decoder_start_token_id,
            **generate_kwargs,
        )
        dec = tokenizer.batch_decode(summaries,
                                     skip_special_tokens=True,
                                     clean_up_tokenization_spaces=False)
        for hypothesis in dec:
            fout.write(hypothesis + "\n")
            fout.flush()
    fout.close()
    runtime = int(time.time() - start_time)  # seconds
    n_obs = len(examples)
    return dict(n_obs=n_obs,
                runtime=runtime,
                seconds_per_sample=round(runtime / n_obs, 4))
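# A hedged invocation sketch for generate_summaries_or_translations; the input
# file and checkpoint path are hypothetical, and extra keyword arguments flow
# straight through to model.generate(). A model_name containing both "encoder"
# and "decoder" routes loading through the EncoderDecoderModel branch above.
examples = [line.rstrip("\n") for line in open("test.source", encoding="utf-8")]
stats = generate_summaries_or_translations(
    examples,
    out_file="test_generations.txt",
    model_name="./models/bert-encoder-decoder-cnn",
    batch_size=4,
    num_beams=4,
    max_length=62,
)
print(stats)  # {'n_obs': ..., 'runtime': ..., 'seconds_per_sample': ...}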
    for arg in vars(args):
        print("{}: {}".format(arg, getattr(args, arg)))
    return args


if __name__ == "__main__":
    print("Start Cross Attention Distribution Visualization", flush=True)
    args = args_parse()

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
    tokenizer.add_special_tokens({"additional_special_tokens": ["[unused1]"]})
    model = EncoderDecoderModel.from_pretrained(args.modelname,
                                                output_attentions=True)
    model.encoder.config.output_attentions = True
    model.decoder.config.output_attentions = True
    model.to(device)
    model.eval()

    train_contents, train_querys = get_dataset(args.contentfile, args.queryfile)
    print("Get Zeroth Sample", flush=True)
    print("Content: {}".format(train_contents[0]), flush=True)
    print("Question (Query): {}".format(train_querys[0]), flush=True)

    data_iterator = JustInTime_InOrder_Iterator(train_contents,
                                                train_querys,
                                                batch_size=args.batch_size)
from transformers import EncoderDecoderModel, BertTokenizer
import torch
from tqdm import tqdm

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = EncoderDecoderModel.from_pretrained("checkpoint-100000")

with open("src_train.txt", 'r') as f, open("result.txt", 'w') as f2:
    for line in tqdm(f):
        input_ids = torch.tensor(tokenizer.encode(line)).unsqueeze(0)
        generated = model.generate(
            input_ids, decoder_start_token_id=model.config.decoder.pad_token_id)
        f2.write(str(tokenizer.decode(generated[0])) + "\n")
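# A hedged batched alternative to the line-by-line loop above: pad a small
# batch once and decode every hypothesis in one generate() call, which is
# usually much faster on GPU (the two sample lines are illustrative).
lines = ["First source sentence.", "Second source sentence."]
batch = tokenizer(lines, padding=True, return_tensors="pt")
generated = model.generate(
    batch.input_ids,
    attention_mask=batch.attention_mask,
    decoder_start_token_id=model.config.decoder.pad_token_id)
for seq in tokenizer.batch_decode(generated, skip_special_tokens=True):
    print(seq)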
def train_gen_title(run_name: str,
                    config_file: str,
                    train_file: str,
                    train_fraq: float,
                    output_model_path: str,
                    from_pretrained: str = None,
                    checkpoint: str = None):
    logging.set_verbosity_info()
    config = json.loads(jsonnet_evaluate_file(config_file))
    init_wandb(run_name, config)

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path,
                                              do_lower_case=False,
                                              do_basic_tokenize=False)

    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]

    full_dataset = LentaRiaDataset(train_file, tokenizer, max_tokens_text,
                                   max_tokens_title)

    print("Initializing model...")
    if from_pretrained:
        model = EncoderDecoderModel.from_pretrained(from_pretrained)
    else:
        enc_model_path = config["enc_model_path"]
        dec_model_path = config["dec_model_path"]
        model = EncoderDecoderModel.from_encoder_decoder_pretrained(
            enc_model_path, dec_model_path)

    train_size = int(train_fraq * len(full_dataset))
    train_dataset, val_dataset = \
        torch.utils.data.random_split(full_dataset,
                                      [train_size, len(full_dataset) - train_size])

    wandb.summary.update({
        'Train dataset size': len(train_dataset),
        'Val dataset size': len(val_dataset),
    })

    print("Training model...")
    batch_size = config["batch_size"]
    eval_steps = config["eval_steps"]
    save_steps = config["save_steps"]
    logging_steps = config["logging_steps"]
    enc_lr = config["enc_lr"]
    dec_lr = config["dec_lr"]
    warmup_steps = config["num_warmup_steps"]
    max_steps = config["max_steps"]
    gradient_accumulation_steps = config["gradient_accumulation_steps"]

    opt = get_separate_lr_optimizer(model, enc_lr, dec_lr, warmup_steps, max_steps)

    training_args = TrainingArguments(
        output_dir=output_model_path,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        evaluation_strategy='steps',
        do_train=True,
        do_eval=True,
        overwrite_output_dir=False,
        logging_steps=logging_steps,
        save_steps=save_steps,
        eval_steps=eval_steps,
        save_total_limit=1,
        max_steps=max_steps,
        report_to='wandb',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        optimizers=opt,
    )

    trainer.train(checkpoint)
    model.save_pretrained(output_model_path)
from transformers import BertTokenizerFast
from transformers import EncoderDecoderModel
from datasets import load_metric, load_from_disk

bert2bert = EncoderDecoderModel.from_pretrained("./checkpoint-20").to("cuda")
#bert2bert = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert_cnn_daily_mail")
bert2bert.half()

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

cnndm = load_from_disk("dataset/gigaword")
test_data = cnndm['test']

rouge = load_metric("rouge")


def generate_summary(batch):
    # cut off inputs at max length 32
    inputs = tokenizer(batch["document"],
                       padding="max_length",
                       truncation=True,
                       max_length=32,
                       return_tensors="pt")
    input_ids = inputs.input_ids.to("cuda")
    attention_mask = inputs.attention_mask.to("cuda")

    outputs = bert2bert.generate(input_ids, attention_mask=attention_mask)
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    batch["pred_summary"] = output_str
    return batch
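# The usual continuation of the evaluation script above (a sketch; batch size
# is illustrative): map generate_summary over the test split in batches, then
# score the predictions against the gigaword references with ROUGE.
results = test_data.map(generate_summary,
                        batched=True,
                        batch_size=16,
                        remove_columns=["document"])
rouge_output = rouge.compute(predictions=results["pred_summary"],
                             references=results["summary"],
                             rouge_types=["rouge2"])["rouge2"].mid
print(rouge_output)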
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device:", device)

model_created = False
if args.checkpoint is not None:
    model_created = True
    if args.bart:
        config = BartConfig.from_json_file(args.checkpoint + "/config.json")
        model = BartForConditionalGeneration.from_pretrained(
            args.checkpoint + "/pytorch_model.bin", config=config)
    else:
        config = EncoderDecoderConfig.from_json_file(args.checkpoint + "/config.json")
        model = EncoderDecoderModel.from_pretrained(args.checkpoint + "/pytorch_model.bin",
                                                    config=config)

if args.language == 'fr':
    if args.bart:
        model_name = "WikinewsSum/bart-large-multi-fr-wiki-news"
        #config = BartConfig.from_pretrained(model_name)
        tokenizer = BartTokenizer.from_pretrained(model_name)
        if not model_created:
            model = BartForConditionalGeneration.from_pretrained(model_name)
            model_created = True
    else:
        model_name = "camembert-base"
        #config = CamembertConfig.from_pretrained(model_name)
        tokenizer = CamembertTokenizer.from_pretrained(model_name)
def get_encoderdecoder_model(self):
    return EncoderDecoderModel.from_pretrained(
        "patrickvonplaten/bert2bert-cnn_dailymail-fp16")
def train_style_gen_title(
    run_name: str,
    config_file: str,
    train_file: str,
    dataset_type: str,
    output_model_path: str,
    from_pretrained: str = None,
    checkpoint: str = None
):
    logging.set_verbosity_info()
    config = json.loads(jsonnet_evaluate_file(config_file))
    init_wandb(run_name, config)

    agency_list = config['agency_list']
    print('Agency list:', agency_list)

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path,
                                              do_lower_case=False,
                                              do_basic_tokenize=False)

    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]

    print("Initializing model...")
    if from_pretrained:
        model = EncoderDecoderModel.from_pretrained(from_pretrained)
    else:
        enc_model_path = config["enc_model_path"]
        dec_model_path = config["dec_model_path"]
        model = EncoderDecoderModel.from_encoder_decoder_pretrained(enc_model_path, dec_model_path)

    print("Fetching data...")
    if dataset_type == 'tg':
        all_records = [r for r in tqdm.tqdm(tg_reader(train_file))]
    elif dataset_type == 'lenta-ria':
        lenta_records = [r for r in tqdm.tqdm(lenta_reader(os.path.join(train_file, 'lenta/lenta-ru-news.train.csv')))]
        lenta_records.extend(
            [r for r in tqdm.tqdm(lenta_reader(os.path.join(train_file, 'lenta/lenta-ru-news.val.csv')))]
        )

        ria_records = [r for r in tqdm.tqdm(ria_reader(os.path.join(train_file, 'ria/ria.shuffled.train.json')))]
        ria_records.extend(
            [r for r in tqdm.tqdm(ria_reader(os.path.join(train_file, 'ria/ria.shuffled.val.json')))]
        )

        random.shuffle(ria_records)

        all_records = [r for r in lenta_records if r['date'][:4] in ['2010', '2011', '2012', '2013', '2014']] + \
            ria_records[:220000]

        random.shuffle(all_records)

    print("Building datasets...")
    agency_to_special_token_id = {a: tokenizer.vocab[f'[unused{i+1}]'] for i, a in enumerate(agency_list)}

    full_dataset = AgencyTitleDatasetGeneration(
        all_records,
        tokenizer,
        filter_agencies=list(agency_to_special_token_id.keys()),
        agency_to_special_token_id=agency_to_special_token_id,
        max_tokens_text=max_tokens_text,
        max_tokens_title=max_tokens_title
    )

    train_size = int(0.93 * len(full_dataset))
    train_dataset, val_dataset = torch.utils.data.random_split(
        full_dataset, [train_size, len(full_dataset) - train_size])

    print(f"Train dataset length = {len(train_dataset)}\nVal dataset length = {len(val_dataset)}")

    wandb.summary.update({
        'Train dataset size': len(train_dataset),
        'Test dataset size': len(val_dataset),
    })

    print("Training model...")
    batch_size = config["batch_size"]
    eval_steps = config["eval_steps"]
    save_steps = config["save_steps"]
    logging_steps = config["logging_steps"]
    enc_lr = config["enc_lr"]
    dec_lr = config["dec_lr"]
    warmup_steps = config["num_warmup_steps"]
    max_steps = config["max_steps"]
    gradient_accumulation_steps = config["gradient_accumulation_steps"]

    opt = get_separate_lr_optimizer(model, enc_lr, dec_lr, warmup_steps, max_steps)

    training_args = TrainingArguments(
        output_dir=output_model_path,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        evaluation_strategy='steps',
        do_train=True,
        do_eval=True,
        overwrite_output_dir=False,
        logging_steps=logging_steps,
        save_steps=save_steps,
        eval_steps=eval_steps,
        save_total_limit=2,
        max_steps=max_steps,
        report_to='wandb',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        optimizers=opt,
    )

    trainer.train(checkpoint)
    model.save_pretrained(output_model_path)
def create_and_check_encoder_decoder_shared_weights(
        self, config, input_ids, attention_mask, encoder_hidden_states,
        decoder_config, decoder_input_ids, decoder_attention_mask, labels,
        **kwargs):
    torch.manual_seed(0)
    encoder_model, decoder_model = self.get_encoder_decoder_model(
        config, decoder_config)
    model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
    model.to(torch_device)
    model.eval()

    # load_state_dict copies weights but does not tie them
    decoder_state_dict = model.decoder._modules[
        model.decoder.base_model_prefix].state_dict()
    model.encoder.load_state_dict(decoder_state_dict, strict=False)

    torch.manual_seed(0)
    tied_encoder_model, tied_decoder_model = self.get_encoder_decoder_model(
        config, decoder_config)
    config = EncoderDecoderConfig.from_encoder_decoder_configs(
        tied_encoder_model.config,
        tied_decoder_model.config,
        tie_encoder_decoder=True)
    tied_model = EncoderDecoderModel(encoder=tied_encoder_model,
                                     decoder=tied_decoder_model,
                                     config=config)
    tied_model.to(torch_device)
    tied_model.eval()

    model_result = model(
        input_ids=input_ids,
        decoder_input_ids=decoder_input_ids,
        attention_mask=attention_mask,
        decoder_attention_mask=decoder_attention_mask,
    )

    tied_model_result = tied_model(
        input_ids=input_ids,
        decoder_input_ids=decoder_input_ids,
        attention_mask=attention_mask,
        decoder_attention_mask=decoder_attention_mask,
    )

    # check that the tied model has fewer parameters
    self.assertLess(sum(p.numel() for p in tied_model.parameters()),
                    sum(p.numel() for p in model.parameters()))
    random_slice_idx = ids_tensor((1, ), model_result[0].shape[-1]).item()

    # check that outputs are equal
    self.assertTrue(
        torch.allclose(model_result[0][0, :, random_slice_idx],
                       tied_model_result[0][0, :, random_slice_idx],
                       atol=1e-4))

    # check that outputs after saving and loading are equal
    with tempfile.TemporaryDirectory() as tmpdirname:
        tied_model.save_pretrained(tmpdirname)
        tied_model = EncoderDecoderModel.from_pretrained(tmpdirname)
        tied_model.to(torch_device)
        tied_model.eval()

        # check that the reloaded tied model still has fewer parameters
        self.assertLess(sum(p.numel() for p in tied_model.parameters()),
                        sum(p.numel() for p in model.parameters()))
        random_slice_idx = ids_tensor((1, ), model_result[0].shape[-1]).item()

        tied_model_result = tied_model(
            input_ids=input_ids,
            decoder_input_ids=decoder_input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
        )

        # check that outputs are equal
        self.assertTrue(
            torch.allclose(model_result[0][0, :, random_slice_idx],
                           tied_model_result[0][0, :, random_slice_idx],
                           atol=1e-4))
def evaluate_style_gen_title(
    existing_run_name: str,
    existing_run_id: str,
    config_file: str,
    gen_model_file: str,
    discr_model_file: str,
    test_file: str,
    test_sample_rate: float,
):
    logging.set_verbosity_info()
    init_wandb(existing_run_name, None, existing_run_id)

    config = json.loads(jsonnet_evaluate_file(config_file))

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path,
                                              do_lower_case=False,
                                              do_basic_tokenize=False)

    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]
    setattr(tokenizer, 'max_tokens_text', max_tokens_text)

    batch_size = config["batch_size"]

    print("Loading model...")
    model = EncoderDecoderModel.from_pretrained(gen_model_file)
    model.eval()
    model.cuda()

    agency_list = config['agency_list']
    discriminator = AutoModelForSequenceClassification.from_pretrained(
        discr_model_file, num_labels=len(agency_list)).cuda()

    print("Fetching TG data...")
    test_records = [
        r for r in tqdm.tqdm(tg_reader(test_file))
        if random.random() <= test_sample_rate
    ]

    print("Building datasets...")
    agency_to_special_token_id = {
        a: tokenizer.vocab[f'[unused{i+1}]'] for i, a in enumerate(agency_list)
    }
    agency_to_target = {a: i for i, a in enumerate(sorted(agency_list))}

    test_dataset = AgencyTitleDatasetGeneration(
        test_records,
        tokenizer,
        filter_agencies=list(agency_to_special_token_id.keys()),
        agency_to_special_token_id=agency_to_special_token_id,
        max_tokens_text=max_tokens_text,
        max_tokens_title=max_tokens_title
    )
    print('Dataset size:', len(test_dataset))

    y_pred = []
    y_true = []

    for i in tqdm.trange(0, len(test_dataset), batch_size):
        data = test_dataset[i]
        for k in tuple(data.keys()):
            if k not in ('input_ids', 'attention_mask'):
                del data[k]
            else:
                data[k] = data[k].unsqueeze(0)

        for j in range(i + 1, min(i + batch_size, len(test_dataset))):
            for k in data.keys():
                data[k] = torch.cat((data[k], test_dataset[j][k].unsqueeze(0)), dim=0)

        y_true.extend([
            agency_to_target[test_dataset.get_strings(j)['agency']]
            for j in range(i, min(i + batch_size, len(test_dataset)))
        ])

        data['input_ids'] = data['input_ids'].cuda()
        data['attention_mask'] = data['attention_mask'].cuda()

        output_ids = model.generate(
            **data,
            decoder_start_token_id=model.config.decoder.pad_token_id,
            min_length=7,
            max_length=20,
            num_beams=6
        )

        preds = [
            tokenizer.decode(first_sent(x, tokenizer.sep_token_id),
                             skip_special_tokens=True) for x in output_ids
        ]

        for title in preds:
            inp = tokenizer(title,
                            add_special_tokens=True,
                            max_length=max_tokens_title,
                            padding='max_length',
                            truncation=True)
            logits = discriminator(
                input_ids=torch.LongTensor(inp['input_ids']).cuda().unsqueeze(0),
                attention_mask=torch.LongTensor(inp['attention_mask']).cuda().unsqueeze(0)
            )[0]
            y_pred.append(torch.argmax(logits).item())

    wandb.summary.update({
        'D-Style': classification_report(y_true, y_pred, output_dict=True)
    })