def loadAbstractSummarizer():
    """Load (and locally cache) the abstractive summarization model.

    The model name is read from settings; a BART or LED seq2seq model is
    chosen based on the name. On first use the model is downloaded from the
    hub and saved under ./models/<name>/ so later runs load from disk.

    Returns:
        (model, tokenizer) pair.
    """
    from transformers import BartTokenizer, BartForConditionalGeneration
    from transformers import LEDForConditionalGeneration, LEDTokenizer

    model_name = get_item("abstract_summarizer_model_name")
    # Fixed typo: was MODEL_DIRECOTRY.
    model_directory = f'./models/{model_name}/'
    use_bart = "bart" in model_name

    # Pick the architecture once instead of duplicating the load logic in
    # four branches. LED needs return_dict_in_generate for generate() users.
    if use_bart:
        model_cls, tokenizer_cls, extra_kwargs = (
            BartForConditionalGeneration, BartTokenizer, {})
    else:
        model_cls, tokenizer_cls, extra_kwargs = (
            LEDForConditionalGeneration, LEDTokenizer,
            {"return_dict_in_generate": True})

    if os.path.exists(model_directory):
        # Load from the local cache created on a previous run.
        model = model_cls.from_pretrained(model_directory, **extra_kwargs)
        tokenizer = tokenizer_cls.from_pretrained(model_directory)
    else:
        # First run: download from the hub, then cache locally.
        model = model_cls.from_pretrained(model_name, **extra_kwargs)
        tokenizer = tokenizer_cls.from_pretrained(model_name)
        model.save_pretrained(model_directory)
        tokenizer.save_pretrained(model_directory)
    return model, tokenizer
def get_model_tokenizer(model_name):
    """Return (model, tokenizer) for a summarization model, on GPU if available.

    Args:
        model_name: hub id or local path; "pegasus"/"bart" substrings select
            the architecture, anything else falls back to Auto* classes.

    Returns:
        (model, tokenizer) with the model moved to the detected device.
    """
    import torch
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if "pegasus" in model_name:  # its a pegasus model
        from transformers import PegasusForConditionalGeneration, PegasusTokenizer
        tokenizer = PegasusTokenizer.from_pretrained(model_name)
        model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
            torch_device)
    elif "bart-large" in model_name or "bart-custom-large" in model_name:
        # Stock and custom BART checkpoints use the same classes; the two
        # original branches were byte-identical and are merged here.
        from transformers import BartTokenizer, BartForConditionalGeneration
        tokenizer = BartTokenizer.from_pretrained(model_name)
        model = BartForConditionalGeneration.from_pretrained(model_name).to(
            torch_device)
    else:  # T5 or distilbart
        # NOTE: AutoModelWithLMHead is deprecated upstream but kept for
        # compatibility with the transformers version this file targets.
        from transformers import AutoTokenizer, AutoModelWithLMHead
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelWithLMHead.from_pretrained(model_name).to(
            torch_device)
    return model, tokenizer
def __init__(self,
             device=None,
             checkpoint=None,
             state_dict_key='model',
             pretrained="facebook/bart-large-cnn",
             hg_transformers=True):
    """Build a BART summarizer from huggingface, torchhub, or a checkpoint.

    Args:
        device: torch device; auto-detected when None.
        checkpoint: path to a .pt checkpoint; a name containing "semsim"
            triggers a one-time download of the SemSim weights.
        state_dict_key: key of the model state dict inside the checkpoint.
        pretrained: base model identifier.
        hg_transformers: use huggingface transformers; must be True when
            loading from a checkpoint.

    Raises:
        Exception: if a checkpoint is given with hg_transformers=False.
    """
    if not hg_transformers and checkpoint:
        raise Exception(
            "hg_transformers must be set to True in order to load from checkpoint"
        )
    if not device:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # huggingface uses dashes and fairseq/torchhub uses dots (periods)
    if pretrained:
        if hg_transformers:
            pretrained = pretrained.replace(".", "-")
        else:
            # only use the part after the "/"
            pretrained = pretrained.split("/")[-1].replace("-", ".")
    if checkpoint is not None and "semsim" in checkpoint:
        cache_dir = appdirs.user_cache_dir("DocSum", "HHousen")
        output_file_path = os.path.join(cache_dir, "bart_semsim.pt")
        if not os.path.isfile(output_file_path):
            # exist_ok avoids the exists()/makedirs() race of the original.
            os.makedirs(cache_dir, exist_ok=True)
            gdown.download(
                "https://drive.google.com/uc?id=1CNgK6ZkaqUD239h_6GkLmfUOGgryc2v9",
                output_file_path)
        checkpoint = output_file_path
    if checkpoint:
        # map_location lets CUDA-saved checkpoints load on CPU-only machines.
        loaded_checkpoint = torch.load(checkpoint, map_location=device)
        model_state_dict = loaded_checkpoint[state_dict_key]
        bart = BartForConditionalGeneration.from_pretrained(
            pretrained, state_dict=model_state_dict)
        tokenizer = BartTokenizer.from_pretrained(
            pretrained, state_dict=model_state_dict)
        self.tokenizer = tokenizer
    else:
        if hg_transformers:
            bart = BartForConditionalGeneration.from_pretrained(pretrained)
            tokenizer = BartTokenizer.from_pretrained(pretrained)
            self.tokenizer = tokenizer
        else:
            bart = torch.hub.load('pytorch/fairseq', pretrained)
            bart.to(device)
            bart.eval()
            bart.half()
    self.logger = logging.getLogger(__name__)
    self.hg_transformers = hg_transformers
    self.bart = bart
def launch_bart():
    """Fine-tune facebook/bart-large-cnn, then evaluate on the test set.

    Reads hyperparameters and file paths from the module-level ``args``;
    saves the trained model under ``args.save`` and reloads it for testing.
    NOTE(review): the model is saved by launch_training to ``bart_trained.pt``
    but reloaded with ``from_pretrained(args.save)`` — confirm launch_training
    also writes a from_pretrained-compatible directory.
    """
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
    config = BartConfig.from_pretrained('facebook/bart-large-cnn')
    # NOTE(review): num_labels is unusual for a seq2seq generation model —
    # presumably carried over from a classification setup; verify it is used.
    model = BartForConditionalGeneration.from_pretrained(
        'facebook/bart-large-cnn', num_labels=len(tags_vals))
    model_path = args.save + 'bart_trained.pt'
    ## ---------12 . Optimizer -> weight regularization is a solution to reduce the overfitting of a deep learning
    """ Last keras optimization 2020 (rates from 0.01 seem to be best hyperparamater )for weight regularization for weights layers from keras.layers import LSTM from keras.regularizers import l2 model.add(LSTM(32, kernel_regularizer=l2(0.01), recurrent_regularizer=l2(0.01), bias_regularizer=l2(0.01))) Note : BERT not include beta an gamma parametres for optimization """
    FULL_FINETUNING = True
    if FULL_FINETUNING:
        # Apply weight decay to all parameters except biases and LayerNorm
        # gamma/beta terms.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate': 0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay_rate': 0.0
        }]
    else:
        # Only tune the classifier head.
        param_optimizer = list(model.classifier.named_parameters())
        optimizer_grouped_parameters = [{
            "params": [p for n, p in param_optimizer]
        }]
    optimizer = Adam(optimizer_grouped_parameters, lr=args.lr)
    launch_training(training_path=args.training_data,
                    training_epochs=4,
                    valid_path=args.validate_data,
                    training_batch_size=1,
                    model=model,
                    model_path=model_path,
                    tokenizer=tokenizer,
                    optimizer=optimizer)
    print(model_path)
    model = BartForConditionalGeneration.from_pretrained(args.save)
    launch_test_without_label(test_path=args.test_data,
                              model=model,
                              tokenizer=tokenizer)
def load_hf_model(config, pretrained=False, path=None):
    """Build a BART conditional-generation model from the given config.

    Args:
        config: BartConfig used for both pretrained and from-scratch models.
        pretrained: when True, start from facebook/bart-large-cnn weights.
        path: optional path to a fine-tuned state dict (loaded onto CUDA).

    Returns:
        BartForConditionalGeneration instance.
    """
    if pretrained:
        if path:
            model = BartForConditionalGeneration.from_pretrained(
                "facebook/bart-large-cnn",
                state_dict=torch.load(path, map_location=torch.device('cuda')),
                config=config)
        else:
            model = BartForConditionalGeneration.from_pretrained(
                "facebook/bart-large-cnn", config=config)
    else:
        # Bug fix: the constructor requires a config argument; calling it
        # with no arguments raises TypeError.
        model = BartForConditionalGeneration(config)
    return model
def test_diverse_beam_search(self):
    """Diverse beam search (4 beams in 4 groups) yields two distinct summaries."""
    article = """Justin Timberlake and Jessica Biel, welcome to parenthood. The celebrity couple announced the arrival of their son, Silas Randall Timberlake, in statements to People. "Silas was the middle name of Timberlake's maternal grandfather Bill Bomar, who died in 2012, while Randall is the musician's own middle name, as well as his father's first," People reports. The couple announced the pregnancy in January, with an Instagram post. It is the first baby for both."""
    tok = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
    summarizer = BartForConditionalGeneration.from_pretrained(
        "facebook/bart-large-cnn").to(torch_device)
    encoded = tok(article, return_tensors="pt").input_ids.to(torch_device)
    # One beam per group so the diversity penalty acts between groups.
    beam_output = summarizer.generate(
        encoded,
        num_beams=4,
        num_return_sequences=2,
        num_beam_groups=4,
        diversity_penalty=2.0,
    )
    decoded = tok.batch_decode(beam_output, skip_special_tokens=True)
    self.assertListEqual(
        decoded,
        [
            "The couple announced the birth of their son, Silas Randall Timberlake, in a statement. Silas was the middle name of Timberlake's maternal grandfather Bill Bomar. Randall is the musician's own middle name, as well as his father's first. It is the first baby for both of them.",
            "Justin Timberlake and Jessica Biel have a son. The baby is named Silas Randall Timberlake. It is the first child for both. The couple announced the pregnancy in January. The name Silas is the middle name of Timberlake's maternal grandfather. It's also his own middle name.",
        ],
    )
def test_xsum_summarization_same_as_fairseq(self):
    """XSum BART must reproduce the fairseq reference summary exactly."""
    xsum_model = BartForConditionalGeneration.from_pretrained(
        "facebook/bart-large-xsum").to(torch_device)
    self.assertFalse(xsum_model.config.is_valid_mbart())
    tokenizer = self.default_tokenizer
    EXPECTED_SUMMARY = "California's largest power company has begun shutting off electricity to thousands of customers in the state."
    batch = tokenizer.batch_encode_plus(
        [PGE_ARTICLE],
        max_length=1024,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    ).to(torch_device)
    # Decoding starts from EOS, matching fairseq's convention.
    generated = xsum_model.generate(
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
        num_beams=2,
        max_length=62,
        min_length=11,
        length_penalty=1.0,
        no_repeat_ngram_size=3,
        early_stopping=True,
        decoder_start_token_id=xsum_model.config.eos_token_id,
    )
    summaries = tokenizer.batch_decode(generated, skip_special_tokens=True)
    self.assertEqual(EXPECTED_SUMMARY, summaries[0])
def generate_summaries(lns, out_file, batch_size=8, device=DEFAULT_DEVICE):
    """Summarize documents with BART-large-CNN, writing one summary per line.

    Args:
        lns: iterable of source documents.
        out_file: path the summaries are written to.
        batch_size: number of documents summarized per forward pass.
        device: device the model and tensors run on.
    """
    model = BartForConditionalGeneration.from_pretrained(
        "bart-large-cnn",
        output_past=True,
    ).to(device)
    tokenizer = BartTokenizer.from_pretrained("bart-large")
    # Context manager guarantees the file is closed (and buffers flushed)
    # even if generation raises part-way through — the original leaked the
    # handle on error.
    with Path(out_file).open("w") as fout:
        for batch in tqdm(list(chunks(lns, batch_size))):
            dct = tokenizer.batch_encode_plus(batch,
                                              max_length=1024,
                                              return_tensors="pt",
                                              pad_to_max_length=True)
            summaries = model.generate(
                input_ids=dct["input_ids"].to(device),
                attention_mask=dct["attention_mask"].to(device),
                num_beams=4,
                length_penalty=2.0,
                max_length=142,  # +2 from original because we start at step=1 and stop before max_length
                min_length=56,  # +1 from original because we start at step=1
                no_repeat_ngram_size=3,
                early_stopping=True,
                do_sample=False,
            )
            dec = [
                tokenizer.decode(g,
                                 skip_special_tokens=True,
                                 clean_up_tokenization_spaces=False)
                for g in summaries
            ]
            for hypothesis in dec:
                fout.write(hypothesis + "\n")
            fout.flush()
def model(self):
    """Lazily load, cache, and return the mBART en-ro model."""
    if self._model is None:
        # Use the namespaced hub id ("facebook/..."); the bare
        # "mbart-large-en-ro" shortcut does not resolve on the model hub
        # (the sibling property elsewhere in this file already uses the
        # namespaced form).
        model = BartForConditionalGeneration.from_pretrained(
            "facebook/mbart-large-en-ro")
        self._model = model.to(torch_device)
    return self._model
def __init__(self, hparams, user_tokens=['<newline>', '<bullet>', '<sep>']):
    """Build tokenizer, config, and BART model from ``hparams``.

    NOTE(review): ``user_tokens`` is never used in this constructor — the
    listed special tokens are not added to the tokenizer. Confirm whether a
    ``tokenizer.add_tokens(user_tokens)`` call was intended. (The mutable
    default is harmless here since it is never mutated, but is a known
    Python pitfall.)
    """
    super(BartSystem, self).__init__()
    self.hparams = hparams
    self.hparams.model_type = self.hparams.model_type.lower()
    # Tokenizer may come from a dedicated name or fall back to the model path.
    tokenizer = BartTokenizer.from_pretrained(
        self.hparams.tokenizer_name
        if self.hparams.tokenizer_name else self.hparams.model_name_or_path,
        do_lower_case=self.hparams.do_lower_case,
        cache_dir=self.hparams.cache_dir if self.hparams.cache_dir else None,
    )
    # output_past is enabled only for test/generation runs; vocab_size is
    # synced to the tokenizer length.
    config = AutoConfig.from_pretrained(
        self.hparams.config_name
        if self.hparams.config_name else self.hparams.model_name_or_path,
        cache_dir=self.hparams.cache_dir if self.hparams.cache_dir else None,
        output_past=self.hparams.do_test,
        vocab_size=len(tokenizer))
    model = BartForConditionalGeneration.from_pretrained(
        self.hparams.model_name_or_path,
        from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
        config=config,
        cache_dir=self.hparams.cache_dir if self.hparams.cache_dir else None,
    )
    self.config, self.tokenizer, self.model = config, tokenizer, model
    self.loss = []  # for keeping track of average loss
    self.metrics = {}
    # Reverse map: token id -> token string.
    self.vocab = {v: k for k, v in self.tokenizer.get_vocab().items()}
def load_hf_model(config, pretrained=True, path=None):
    """Build a BART model from config and move it to the configured device.

    Args:
        config: BartConfig used for both pretrained and from-scratch models.
        pretrained: when True, start from facebook/bart-large-cnn weights.
        path: optional path to a fine-tuned state dict, loaded onto
            ``settings.DEVICE``.

    Returns:
        BartForConditionalGeneration on ``settings.DEVICE``.
    """
    if pretrained:
        if path:
            # Namespaced hub id; the bare "bart-large-cnn" shortcut does not
            # resolve on the model hub.
            model = BartForConditionalGeneration.from_pretrained(
                "facebook/bart-large-cnn",
                state_dict=torch.load(path,
                                      map_location=torch.device(
                                          settings.DEVICE)),
                config=config)
        else:
            model = BartForConditionalGeneration.from_pretrained(
                "facebook/bart-large-cnn", config=config)
    else:
        # Bug fix: the constructor requires a config argument; calling it
        # with no arguments raises TypeError.
        model = BartForConditionalGeneration(config)
    return model.to(settings.DEVICE)
def bart_summarize(input_file):
    """Summarize each page of a PDF with BART, appending to summarized_bart.txt.

    Args:
        input_file: path to the PDF to summarize.
    """
    model = BartForConditionalGeneration.from_pretrained(
        'facebook/bart-large-cnn')
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
    num_count = get_num_pages(input_file)
    # Context manager closes the file even if summarization raises — the
    # original leaked the handle on error. for-range replaces the manual
    # while-counter loop.
    with open('summarized_bart.txt', 'a+') as f:
        for page in range(num_count):
            text = pdf_to_text(input_file, page)
            inputs = tokenizer([text], max_length=1024, return_tensors='pt')
            # Generate Summary
            # NOTE(review): max_length=5 limits each summary to ~5 tokens —
            # confirm this is intentional.
            summary_ids = model.generate(inputs['input_ids'],
                                         num_beams=4,
                                         max_length=5,
                                         early_stopping=True)
            summarized_text = [
                tokenizer.decode(g,
                                 skip_special_tokens=True,
                                 clean_up_tokenization_spaces=False)
                for g in summary_ids
            ]
            print(summarized_text)
            str1 = ''.join(summarized_text)
            print(str1)
            f.write(str1)
def __init__(self, sum_method):
    """Set up the requested summarization backend and its dispatch entry."""
    super().__init__()
    self.summarization_methods = {
        'simple': self.simple_summarizer,
        'GPT2': self.gpt2_summarizer,
        'xlnet': self.xlnet_summarizer,
        'bart': self.bart_summarizer,
        't5': self.t5_summarizer
    }
    # Not using dictionary to avoid creating models that are not going to be used
    if sum_method == 'GPT2':
        self.model = TransformerSummarizer(
            transformer_type="GPT2", transformer_model_key="gpt2-medium")
    elif sum_method == 'xlnet':
        self.model = TransformerSummarizer(
            transformer_type="XLNet", transformer_model_key="xlnet-base-cased")
    elif sum_method == 'bart':
        self.model = BartForConditionalGeneration.from_pretrained(
            'facebook/bart-large-cnn')
    elif sum_method == 't5':
        self.model = T5ForConditionalGeneration.from_pretrained('t5-small')
    self.sum_method_name = sum_method
    self.summarization_method = self.summarization_methods[sum_method]
def main():
    """Benchmark lightseq vs huggingface BART over batch sizes and lengths."""
    print("initializing bart tokenizer...")
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
    print("creating lightseq model...")
    ls_model = lsi.Transformer("lightseq_bart_base.pb", 128)
    print("creating huggingface model...")
    hf_model = BartForConditionalGeneration.from_pretrained(
        "facebook/bart-base")

    # GPU warm up
    warmup_batch = [" ".join(["I"] * 10)] * 8
    warmup_ids = tokenizer(warmup_batch, return_tensors="pt",
                           padding=True)["input_ids"]
    _, _ = ls_bart(ls_model, warmup_ids)
    _, _ = hf_bart(hf_model, warmup_ids)

    for bsz in [1, 2, 4, 8, 16, 32, 64, 128]:
        total_ls, total_hf = 0.0, 0.0
        for seq_len in [1, 2, 4, 8, 16, 32]:
            batch = [" ".join(["I"] * seq_len)] * bsz
            batch_ids = tokenizer(batch, return_tensors="pt",
                                  padding=True)["input_ids"]
            _, ls_time = ls_bart(ls_model, batch_ids)
            _, hf_time = hf_bart(hf_model, batch_ids)
            total_ls += ls_time
            total_hf += hf_time
        # Relative slowdown of huggingface vs lightseq for this batch size.
        print(f"{bsz}: {total_hf/total_ls-1}")
def __init__(self, model: str = None):
    """Load a summarization model: "t5" (default, local weights),
    "google/pegasus-newsroom", or "facebook/bart-large-cnn".

    Raises:
        Exception: if ``model`` names an unsupported model.
    """
    log.info(model)
    torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    log.info(torch_device)
    if model is None:
        model = "t5"
    self.modelName = model
    # path to all the files that will be used for inference
    self.path = f"./app/api/{model}/"
    self.model_path = self.path + "pytorch_model.bin"
    self.config_path = self.path + "config.json"
    # Selecting the correct model based on the passed model input. Default t5
    if model == "t5":
        # T5 loads weights from the local directory, not the hub.
        self.config = T5Config.from_json_file(self.config_path)
        self.model = T5ForConditionalGeneration(self.config)
        self.tokenizer = T5Tokenizer.from_pretrained(self.path)
        self.model.eval()
        self.model.load_state_dict(
            torch.load(self.model_path, map_location=torch_device))
    elif model == "google/pegasus-newsroom":
        # NOTE(review): config is read from the local path but the model and
        # tokenizer come from the hub — confirm the local config is needed.
        self.config = PegasusConfig.from_json_file(self.config_path)
        # self.model = PegasusForConditionalGeneration(self.config)
        # self.tokenizer = PegasusTokenizer.from_pretrained(self.path)
        self.model = PegasusForConditionalGeneration.from_pretrained(model).to(
            torch_device)
        self.tokenizer = PegasusTokenizer.from_pretrained(model)
    elif model == "facebook/bart-large-cnn":
        # Same hybrid pattern as pegasus: local config, hub weights.
        self.config = BartConfig.from_json_file(self.config_path)
        # self.model = PegasusForConditionalGeneration(self.config)
        # self.tokenizer = PegasusTokenizer.from_pretrained(self.path)
        self.model = BartForConditionalGeneration.from_pretrained(model).to(
            torch_device)
        self.tokenizer = BartTokenizer.from_pretrained(model)
    else:
        raise Exception("This model is not supported")
    # Holds the text to be summarized; set by callers after construction.
    self.text = str()
def __init__(
    self,
    is_eval=False,
):
    """Wrap a pretrained BART model with loss, optimizer, and a save dir."""
    super().__init__()
    self.model = BartForConditionalGeneration.from_pretrained('bart-large')
    if is_eval:
        self.model = self.model.eval()
    # Padding positions are excluded from the loss.
    self.criterion = nn.CrossEntropyLoss(ignore_index=config.PAD_idx)
    # Optimizer family is chosen by project config.
    optimizer_cls = torch.optim.SGD if config.use_sgd else torch.optim.Adam
    self.optimizer = optimizer_cls(self.parameters(), lr=config.lr)
    if config.USE_CUDA:
        self.model = self.model.cuda()
    self.model_dir = config.save_path
    if not os.path.exists(self.model_dir):
        os.makedirs(self.model_dir)
    self.best_path = ""
def main():
    """Summarize every document in a JSONL file and write results as JSONL."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file")
    parser.add_argument("--output_file")
    parser.add_argument(
        "--decoder",
        choices=['greedy', 'beam_search', 'random', 'top_k', 'nucleus'])
    args = parser.parse_args()

    model_name = 'sshleifer/distilbart-xsum-1-1'
    model = BartForConditionalGeneration.from_pretrained(model_name).eval()
    tokenizer = BartTokenizer.from_pretrained(model_name)

    # Iterate through input file documents, generating summaries
    results = []
    for record in tqdm.tqdm(jsonlines.open(args.input_file)):
        summary, summary_score = generate_summary(model=model,
                                                  tokenizer=tokenizer,
                                                  document=record['document'],
                                                  decoder=args.decoder)
        results.append({
            'id': record['id'],
            'generated_summary': summary,
            'generated_summary_score': summary_score
        })

    # Write out the generated summaries to file
    with open(args.output_file, 'w', encoding='utf-8') as f:
        for entry in results:
            f.write(json.dumps(entry, ensure_ascii=False) + '\n')
def __init__(
    self,
    # NOTE(review): default checkpoint path is a hard-coded absolute user
    # path — callers on other machines must always pass chkpt_path.
    chkpt_path="/Users/byronwallace/code/RoboSum/weights/pl_title_/pl_title_2048.ckpt"
):
    """Build a BART-large-CNN summarizer extended to 2048 positions and load
    fine-tuned weights from a PyTorch Lightning checkpoint."""
    self.model = BartForConditionalGeneration.from_pretrained(
        'facebook/bart-large-cnn')
    self.config = BartConfig.from_pretrained('facebook/bart-large-cnn')
    self.tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
    # increase position embeddings from 1024 to 2048
    self.add_position_embeddings()
    # now add special tokens (for title and abstract demarcation)
    # as a general note: we'll assume "abstract" is either the
    # actual abstract of extracted text from the same (i.e., punchlines)
    self.add_special_tokens()
    # now load the checkpoint
    print("loading checkpoint", chkpt_path)
    checkpoint = torch.load(chkpt_path, map_location="cpu")
    print("done")
    # Lightning prefixes every key with the module attribute name
    # (e.g. "model."); strip the first dotted component so the keys match
    # the bare BartForConditionalGeneration state dict.
    cnew = {}
    for key, value in checkpoint['state_dict'].items():
        cnew[".".join(key.split('.')[1:])] = value
    self.model.load_state_dict(cnew)
def __init__(
    self,
    model_name_or_path,
    tokenizer_name,
    model_cache_dir,
    input_max_length,
    target_max_length,
    summary_column_name,
    document_column_name,
    wandb_project,
    wandb_run_name,
    **kwargs,
):
    """Initialize the base summarizer and load the BART tokenizer + model."""
    super().__init__(
        input_max_length,
        target_max_length,
        summary_column_name,
        document_column_name,
        wandb_project,
        wandb_run_name,
    )
    # The tokenizer may come from its own name; otherwise reuse the model path.
    tokenizer_source = tokenizer_name if tokenizer_name else model_name_or_path
    self.tokenizer = BartTokenizer.from_pretrained(tokenizer_source,
                                                   cache_dir=model_cache_dir)
    self.model = BartForConditionalGeneration.from_pretrained(
        model_name_or_path, cache_dir=model_cache_dir)
def train(cli_args: argparse.Namespace) -> None:
    """Fine-tune bart-base on the prepared dataset and save the trained model."""
    model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
    model.to(device)

    trainer_args = TrainingArguments(
        output_dir="./models/bart-coder",
        num_train_epochs=cli_args.epochs,
        per_device_train_batch_size=128,
        per_device_eval_batch_size=128,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        fp16=True,
        remove_unused_columns=True,
        dataloader_num_workers=4,
    )
    print("training on:", trainer_args.device)

    bart_trainer = Trainer(
        model=model,
        args=trainer_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"],
    )
    bart_trainer.train()
    bart_trainer.save_model()
    print(bart_trainer.evaluate())
def main():
    """Compare lightseq and huggingface BART mask filling, optionally on user input."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--user_input', action="store_true")
    args = parser.parse_args()

    print("initializing bart tokenizer...")
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
    print("creating lightseq model...")
    ls_model = lightseq.Transformer("lightseq_bart_base.pb", 128)
    print("creating huggingface model...")
    hf_model = BartForConditionalGeneration.from_pretrained(
        "facebook/bart-base")

    default_sentences = [
        "I love that girl, but <mask> does not <mask> me.",
        "She is so <mask> that I can not help glance at <mask>.",
        "Nothing's gonna <mask> my love for you.",
        "Drop everything now. Meet me in the pouring <mask>. Kiss me on the sidewalk."
    ]
    while True:
        if args.user_input:
            sentences = [input("input the masked sentence:\n")]
        else:
            sentences = default_sentences
        print("tokenizing the sentences...")
        encoded = tokenizer(sentences, return_tensors="pt", padding=True)
        token_ids = encoded["input_ids"]
        ls_generate(ls_model, tokenizer, token_ids)
        hf_generate(hf_model, tokenizer, token_ids)
        # In batch mode, run the fixed sentences exactly once.
        if not args.user_input:
            break
def model(self):
    """Only load the model if needed."""
    # NOTE(review): unlike the caching variant elsewhere in this codebase
    # (which stores the result on self._model), this builds the model on
    # every access unless a caching decorator (e.g. @cached_property) is
    # applied outside this view — confirm.
    model = BartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro").to(torch_device)
    if "cuda" in torch_device:
        # Half precision on GPU to reduce memory use.
        model = model.half()
    return model
def setUpClass(cls):
    """Load the CNN summarization model once and prepare two test articles.

    NOTE(review): the bare "bart-large-cnn" id only resolves on older
    transformers versions; newer versions require "facebook/bart-large-cnn".
    """
    # summarization
    # generate yes beam search
    # Note for BART summarization in transformers repo, beam search performs much better
    # than no beam search, but even their beam search with num_beams=1 is better, implying that something
    # is broken in the _generate_no_beam_search function
    # see ``examples/summarization/bart/evaluate_cnn.py`` for a longer example
    cls.model = BartForConditionalGeneration.from_pretrained('bart-large-cnn')
    cls.tokenizer = BartTokenizer.from_pretrained('bart-large-cnn')
    cls.decoding_hyperparams = {'max_length': 40, 'num_beams': 3}
    cls.test_news_article_1 = 'New Zealand says it has stopped community transmission of Covid-19, ' \
        'effectively eliminating the virus. With new cases in single figures for several days - one on Sunday ' \
        '- Prime Minister Jacinda Ardern said the virus was "currently" eliminated. But officials have warned ' \
        'against complacency, saying it does not mean a total end to new coronavirus cases. ' \
        'The news comes hours before New Zealand is set to move out of its toughest level of social restrictions. ' \
        'From Tuesday, some non-essential business, healthcare and education activity will be able to resume. ' \
        'Most people will still be required to remain at home at all times and avoid all social interactions.'
    cls.test_news_article_2 = \
        'But officials have warned against complacency, saying it does not mean a total end to new HIV cases. ' \
        'Most people will still be required to remain at home at all times and avoid all social interactions.' \
        'Germany says it has stopped community transmission of HIV, ' \
        'effectively eliminating the virus. With new cases in single figures for several days - one on Sunday ' \
        '- Prime Minister Angela Merkle said the virus was "currently" eliminated. ' \
        'From Tuesday, some non-essential business, healthcare and education activity will be able to resume. ' \
        'The news comes hours before Germany is set to move out of its toughest level of social restrictions. '
def __init__(self,
             datamodule,
             learning_rate=3e-5,
             batch_size=8,
             optimizer='adam',
             dataset='',
             pre_trained=''):
    """Set up a T5 or BART generator for knowledge-graph question generation."""
    super(KGQGTuner, self).__init__()
    if pre_trained == 't5':
        self.model = T5ForConditionalGeneration.from_pretrained('t5-base')
    elif pre_trained == 'bart':
        self.model = BartForConditionalGeneration.from_pretrained(
            'facebook/bart-base')
    else:
        raise Exception(
            f'Unknown pre-trained model {pre_trained}, choose t5 or bart.')
    # resize embedding to account for additional special tokens
    self.tokenizer = datamodule.tokenizer
    self.model.resize_token_embeddings(len(self.tokenizer))
    self.learning_rate = learning_rate
    # add batch size to init to enable automatic batch size scaling.
    self.batch_size = datamodule.batch_size
    #self.dataset = dataset
    self.optimizer = optimizer
    # testing
    self.bleu_metric = bleu_score
    self.save_hyperparameters('learning_rate', 'batch_size', 'optimizer',
                              'dataset', 'pre_trained')
def generate_summaries(examples: list,
                       out_file: str,
                       model_name: str,
                       batch_size: int = 8,
                       device: str = DEFAULT_DEVICE):
    """Summarize documents with BART-large-CNN, writing "input ||| summary" lines.

    Args:
        examples: list of source documents.
        out_file: output path; one "input ||| hypothesis" pair per line.
        model_name: currently unused — the model is hard-coded to
            facebook/bart-large-cnn (NOTE(review): confirm whether this
            parameter was meant to select the checkpoint).
        batch_size: documents per generation batch.
        device: device for generation.
    """
    model = BartForConditionalGeneration.from_pretrained(
        'facebook/bart-large-cnn', output_past=True).to(device)
    # Bug fix: the tokenizer id now matches the model checkpoint and uses the
    # namespaced hub id (the bare "bart-large-cnn" shortcut does not resolve).
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
    max_length = 140
    min_length = 55
    # Context manager guarantees the output file is closed even on error —
    # the original leaked the handle.
    with Path(out_file).open("w") as fout:
        for batch in tqdm(list(chunks(examples, batch_size))):
            dct = tokenizer.batch_encode_plus(batch,
                                              max_length=64,
                                              return_tensors="pt",
                                              pad_to_max_length=True)
            print(dct["input_ids"][0])
            print(dct["attention_mask"][0])
            summaries = model.generate(
                input_ids=dct["input_ids"].to(device),
                attention_mask=dct["attention_mask"].to(device),
                num_beams=4,
                length_penalty=10.0,
                repetition_penalty=5.0,
                max_length=20,  # +2 from original because we start at step=1 and stop before max_length
                #min_length=min_length + 1,  # +1 from original because we start at step=1
                no_repeat_ngram_size=3,
                early_stopping=True,
                # decoder_start_token_id=model.config.eos_token_id,
            )
            dec = [
                tokenizer.decode(g,
                                 skip_special_tokens=True,
                                 clean_up_tokenization_spaces=False)
                for g in summaries
            ]
            in_ids = dct["input_ids"].to(device)
            in_dec = [
                tokenizer.decode(id,
                                 skip_special_tokens=True,
                                 clean_up_tokenization_spaces=False)
                for id in in_ids
            ]
            for input, hypothesis in zip(in_dec, dec):
                fout.write(input + ' ||| ' + hypothesis + "\n")
            fout.flush()
def __init__(self, init='bart.large', shared_training='decoder'):
    """Create the dialog model plus an SSL model sharing encoder or decoder layers."""
    self._model = BARTMultiGPUWrapper(model_name=init)
    # SSL model
    ssl_model = BartForConditionalGeneration.from_pretrained(
        pretrained_model_name_or_path=SSL_MODEL_NAME)
    self._ssl_model = ssl_model.cuda()

    # Tie either the encoder or the decoder between the two tasks.
    if shared_training == 'encoder':
        share_bart_encoder_layers(self._model, self._ssl_model)
        print(
            'Dialog generation task and SSL task are using the same BART encoder.'
        )
    else:
        share_bart_decoder_layers(self._model, self._ssl_model)
        print(
            'Dialog generation task and SSL task are using the same BART decoder.'
        )

    # Training state is populated later by the training loop.
    self._optimizer = None
    self._lr_scheduler = None
    self._global_step = 0
    self._dataset = {}
    self._log_dir = None
    self._eval_steps = None
    self._log_file = None
    self._best_dev_loss = None
def test_xsum_summarization_same_as_fairseq(self):
    """XSum BART must reproduce the fairseq reference summary for the PG&E article."""
    xsum_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-xsum").to(torch_device)
    self.assertFalse(xsum_model.config.is_valid_mbart())
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
    PGE_ARTICLE = """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."""
    EXPECTED_SUMMARY = "California's largest power company has begun shutting off power to tens of thousands of homes and businesses in the state."
    batch = tokenizer.batch_encode_plus(
        [PGE_ARTICLE],
        max_length=1024,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    ).to(torch_device)
    # Decoding starts from EOS, matching fairseq's convention.
    generated = xsum_model.generate(
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
        num_beams=2,
        max_length=62,
        min_length=11,
        length_penalty=1.0,
        no_repeat_ngram_size=3,
        early_stopping=True,
        decoder_start_token_id=xsum_model.config.eos_token_id,
    )
    summaries = tokenizer.batch_decode(generated, skip_special_tokens=True)
    self.assertEqual(EXPECTED_SUMMARY, summaries[0])
def load_model_tokenizer(self, pretrained):
    """
    Load transformer model and tokenizer for given pre-trained name

    :param pretrained: pre-trained name
    :return: model, tokenizer — both None when the method/name pair is
        unknown or the name is not in the method's allowed list
    """
    model, tokenizer = None, None
    # Flattened: each method only loads when the name is in its allow-list;
    # any other combination falls through and returns (None, None).
    if self.method == "T5" and pretrained in T5_PRETRAINED_MODELS:
        model = T5ForConditionalGeneration.from_pretrained(pretrained)
        tokenizer = T5Tokenizer.from_pretrained(pretrained)
    elif self.method == "BART" and pretrained in BART_PRETRAINED_MODELS:
        model = BartForConditionalGeneration.from_pretrained(pretrained)
        tokenizer = BartTokenizer.from_pretrained(pretrained)
    elif self.method == "GPT-2" and pretrained in GPT2_PRETRAINED_MODELS:
        model = GPT2LMHeadModel.from_pretrained(pretrained)
        model.config.max_length = self.max_length
        tokenizer = GPT2Tokenizer.from_pretrained(pretrained)
    elif self.method == "XLM" and pretrained in XLM_PRETRAINED_MODELS:
        model = XLMWithLMHeadModel.from_pretrained(pretrained)
        model.config.max_length = self.max_length
        tokenizer = XLMTokenizer.from_pretrained(pretrained)
    return model, tokenizer
def pre_init(self, hparams):
    """Build a distilled BART student from a teacher checkpoint.

    Shrinks the teacher to the requested encoder/decoder depths, copies the
    selected teacher layers into the student, saves the student under
    <output_dir>/student, and points hparams.model_name_or_path at it.

    Returns:
        (student, student_cfg, teacher)
    """
    self.output_dir = Path(hparams.output_dir)
    self.output_dir.mkdir(exist_ok=True)
    # Teacher is frozen in eval mode; it is only read from.
    teacher = BartForConditionalGeneration.from_pretrained(
        hparams.teacher).eval()
    student_updates = {
        "decoder_layers": hparams.student_decoder_layers,
        "encoder_layers": hparams.student_encoder_layers,
    }
    # -1 means "inherit the teacher's length penalty".
    if hparams.length_penalty != -1:
        student_updates["length_penalty"] = hparams.length_penalty
    # Choose which teacher layers map onto the smaller student stacks.
    d_layers_to_copy = get_layers_to_copy(student_updates["decoder_layers"],
                                          teacher.config.decoder_layers)
    e_layers_to_copy: List = get_layers_to_copy(
        student_updates["encoder_layers"], teacher.config.encoder_layers)
    # Recorded on hparams so the training loop knows the layer mapping.
    hparams.d_layer_to_copy = d_layers_to_copy
    hparams.e_layer_to_copy = e_layers_to_copy
    kw = teacher.config.to_diff_dict()
    kw.update(student_updates)
    # Copy weights
    student_cfg = BartConfig(**kw)
    student = BartForConditionalGeneration(student_cfg)
    student, _ = init_student(student, teacher)
    save_dir = self.output_dir.joinpath("student")
    self.copy_to_student(d_layers_to_copy, e_layers_to_copy, hparams, student,
                         teacher)
    student.save_pretrained(save_dir)
    # Downstream code loads the student from this path.
    hparams.model_name_or_path = str(save_dir)
    return student, student_cfg, teacher
def get_summary(text, model, tokenizer, torch_device):
    """Get a BART-large-CNN summary of ``text``.

    Args:
        text: the document to summarize.
        model: NOTE(review): ignored — this function loads its own BART
            model; confirm whether the caller's model was meant to be used.
            Kept for backward compatibility.
        tokenizer: NOTE(review): ignored, same as ``model``.
        torch_device: device the summarization model runs on.

    Returns:
        The decoded summary string.
    """
    tokenizer_summarize = BartTokenizer.from_pretrained("bart-large-cnn")
    # from_pretrained(...).to(device) already places the model; the original
    # called .to(torch_device) a second time redundantly.
    model_summarize = BartForConditionalGeneration.from_pretrained(
        "bart-large-cnn").to(torch_device)
    # Set the model in evaluation mode to deactivate the DropOut modules
    model_summarize.eval()
    answers_input_ids = tokenizer_summarize.batch_encode_plus(
        [text], return_tensors="pt", max_length=1024)["input_ids"]
    answers_input_ids = answers_input_ids.to(torch_device)
    # NOTE(review): max_length=5 limits the summary to ~5 tokens — confirm.
    summary_ids = model_summarize.generate(answers_input_ids,
                                           num_beams=4,
                                           max_length=5,
                                           early_stopping=True)
    return tokenizer_summarize.decode(
        summary_ids.squeeze(),
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )