def build_model_and_check_forward_pass(self, **kwargs):
    tester = T5ModelTester(self, **kwargs)
    config, *inputs = tester.prepare_config_and_inputs()
    (
        input_ids,
        decoder_input_ids,
        attention_mask,
        decoder_attention_mask,
        lm_labels,
    ) = inputs
    model = T5ForConditionalGeneration(config=config).to(torch_device).eval()
    outputs = model(
        input_ids=input_ids,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        labels=lm_labels,
    )
    # outputs = model(*inputs)
    assert len(outputs) == 4
    assert outputs["logits"].size() == (
        tester.batch_size,
        tester.decoder_seq_length,
        tester.vocab_size,
    )
    assert outputs["loss"].size() == ()
    return model

def forward():
    params = request.get_json()
    sentence = params["sentence"]
    decoding_params = params["decoding_params"]

    global input_sentence
    input_sentence = sentence

    tokenizer_name = decoding_params["tokenizer"]
    model = T5ForConditionalGeneration.from_pretrained("Vamsi/T5_Paraphrase_Paws")
    tokenizer = select_tokenizer(tokenizer_name)

    model_output = run_model(sentence, decoding_params, tokenizer, model)

    paraphrases = []
    temp = []
    temp = preprocess_output(model_output, tokenizer, temp, sentence, decoding_params, model)

    global output_cache
    output_cache = temp

    for i, line in enumerate(temp):
        paraphrases.append(f"{i + 1}. {line}")

    return {"data": paraphrases}

def _load_model(self, experiment_path: str, save_as_pretrained: bool) -> None:
    """Loads trained model weights and saves them as a Hugging Face pretrained model if specified."""
    logging.info("Loading model...")
    model_config = {
        "early_stopping": self.train_config["early_stopping"],
        "max_length": self.train_config["max_output_length"],
        "num_beams": self.train_config["beam_size"],
        "prefix": self.data_config["src_prefix"],
        "vocab_size": self.tokenizer.vocab_size,
    }
    self.model = T5ForConditionalGeneration.from_pretrained(self.model_config["model_size"])
    self.model.config.update(model_config)

    checkpoint = torch.load(os.path.join(experiment_path, "best_model.pt"))
    self.model.load_state_dict(checkpoint["model_state"])
    self.model.to(self.device)

    if save_as_pretrained:
        pretrained_path = os.path.join(experiment_path, "best_model.bin")
        self.model.save_pretrained(pretrained_path)
        logging.info(
            f"Loaded model saved as a pretrained model at: {pretrained_path}. "
            "It can now be loaded with 'model.from_pretrained(path)'."
        )

def __init__(self, hparams: dict) -> None:
    self.__max_input_length = hparams["max_input_length"]
    self.__max_target_length = hparams["max_target_length"]
    self.__model_dir = hparams["model_dir"]
    self.__temperature = hparams.get("temperature", 1.0)
    self.__num_beams = hparams.get("num_beams", 10)
    self.__diversity_penalty = hparams.get("diversity_penalty", 1.0)
    self.__num_beam_groups = hparams.get("num_beam_groups", 10)
    self.__num_return_sequences = hparams.get("num_return_sequences", 10)
    self.__repetition_penalty = hparams.get("repetition_penalty", 1.5)

    self.__tokenizer = T5Tokenizer.from_pretrained(self.__model_dir, is_fast=True)
    self.__model = T5ForConditionalGeneration.from_pretrained(self.__model_dir)

    # Use GPU mode when a GPU is available
    self.__use_gpu = torch.cuda.is_available()
    if self.__use_gpu:
        self.__model.cuda()

    # Put the model in inference mode
    self.__model.eval()

def paraphraser(text):
    model = T5ForConditionalGeneration.from_pretrained("ramsrigouthamg/t5_paraphraser")
    tokenizer = T5Tokenizer.from_pretrained("t5-small")

    sentence = text
    # sentence = "What are the ingredients required to bake a perfect cake?"
    # sentence = "What is the best possible approach to learn aeronautical engineering?"
    # sentence = "Do apples taste better than oranges in general?"

    text = "paraphrase: " + sentence + " </s>"

    max_len = 128
    encoding = tokenizer.encode_plus(
        text, max_length=max_len, padding="max_length", truncation=True, return_tensors="pt"
    )
    input_ids, attention_masks = encoding["input_ids"], encoding["attention_mask"]

    # Sample three candidate paraphrases with top-k / top-p (nucleus) sampling.
    beam_outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_masks,
        do_sample=True,
        max_length=128,
        top_k=120,
        top_p=0.98,
        early_stopping=True,
        num_return_sequences=3,
    )

    final_outputs = []
    for beam_output in beam_outputs:
        sent = tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        if sent.lower() != sentence.lower() and sent not in final_outputs:
            final_outputs.append(sent)

    return final_outputs

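# A minimal usage sketch for paraphraser() above; the sample sentence is an
# illustrative assumption, and results vary run to run because generation samples.
if __name__ == "__main__":
    candidates = paraphraser("How can I improve my writing skills?")
    for i, candidate in enumerate(candidates, start=1):
        print(f"{i}. {candidate}")
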
def __init__(self, hparams):
    super(T5FineTuner, self).__init__()
    self.hparams = hparams
    # self.config = T5Config(hparams.model_name_or_path, dropout_rate=0.2)
    self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
    # self.model.dropout_rate = 0.2
    self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)

    if self.hparams.freeze_embeds:
        self.freeze_embeds()
    if self.hparams.freeze_encoder:
        self.freeze_params(self.model.get_encoder())
        assert_all_frozen(self.model.get_encoder())

    self.step_count = 0
    self.output_dir = Path(self.hparams.output_dir)

    n_observations_per_split = {
        "train": self.hparams.n_train,
        "validation": self.hparams.n_val,
        "test": self.hparams.n_test,
    }
    self.n_obs = {k: v if v >= 0 else None for k, v in n_observations_per_split.items()}

    self.em_score_list = []
    self.subset_score_list = []

def perform_generation():
    """Load a fine-tuned model and generate."""
    # import data provider (e.g. dtr, rel, or events)
    data = importlib.import_module(args.data_reader)

    # load pretrained T5 tokenizer
    tokenizer = T5Tokenizer.from_pretrained(args.model_name)

    # load the saved model
    model = T5ForConditionalGeneration.from_pretrained(args.model_dir)

    val_dataset = data.Data(
        xmi_dir=args.xmi_dir,
        tokenizer=tokenizer,
        max_input_length=args.max_input_length,
        max_output_length=args.max_output_length,
        partition="dev",
        n_files=args.n_files,
    )
    val_data_loader = DataLoader(val_dataset, shuffle=False, batch_size=args.gener_batch_size)

    # generate output from the saved model
    f1 = generate(model, val_data_loader, tokenizer)
    print("macro f1:", f1)

def generate_summaries(lns, output_file_path, model_size, batch_size, device):
    output_file = Path(output_file_path).open("w")

    model = T5ForConditionalGeneration.from_pretrained(model_size)
    model.to(device)

    tokenizer = T5Tokenizer.from_pretrained(model_size)

    # update config with summarization-specific params
    task_specific_params = model.config.task_specific_params
    if task_specific_params is not None:
        model.config.update(task_specific_params.get("summarization", {}))

    for batch in tqdm(list(chunks(lns, batch_size))):
        batch = [model.config.prefix + text for text in batch]
        dct = tokenizer.batch_encode_plus(batch, max_length=512, return_tensors="pt", pad_to_max_length=True)
        input_ids = dct["input_ids"].to(device)
        attention_mask = dct["attention_mask"].to(device)

        summaries = model.generate(input_ids=input_ids, attention_mask=attention_mask)
        dec = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summaries]

        for hypothesis in dec:
            output_file.write(hypothesis + "\n")
            output_file.flush()

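# A minimal usage sketch for generate_summaries(); the input/output file names
# and model size below are placeholder assumptions, not values from the original.
if __name__ == "__main__":
    import torch
    from pathlib import Path

    lns = [line.rstrip("\n") for line in Path("articles.txt").open(encoding="utf-8")]
    device = "cuda" if torch.cuda.is_available() else "cpu"
    generate_summaries(lns, "summaries.txt", model_size="t5-base", batch_size=8, device=device)
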
def __init__(self, hparams):
    super(T5FineTuner, self).__init__()
    self.hparams = hparams
    self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
    self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)

def main(args):
    dataset = load_dataset("cnn_dailymail", "3.0.0")
    train_dataset = dataset["train"].select(range(args.max_train_samples))
    tokenizer = AutoTokenizer.from_pretrained("t5-small")
    examples = preprocess(train_dataset, tokenizer)

    t5 = T5ForConditionalGeneration.from_pretrained("t5-small")
    model = Model(t5)
    model = model.cuda()
    model = torch.nn.DataParallel(model)
    # model.to(args.device)
    model.eval()

    sentence_ranks = {}
    for inputs, label_att_mask, ids in tqdm(examples):
        for key in inputs:
            inputs[key] = inputs[key].to(args.device)
        # sentence_probs = get_probs(model, inputs, label_att_mask)
        sentence_probs = model(label_att_mask, **inputs)
        _, idxs = sentence_probs.sort()
        sentence_ranks[ids] = idxs.tolist()

    json.dump(sentence_ranks, open("sentence_prob_ranks2.json", "w"))

def __init__(self, model: str = None):
    log.info(model)
    torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    log.info(torch_device)

    if model is None:
        model = "t5"
    self.modelName = model

    # paths to all the files that will be used for inference
    self.path = f"./app/api/{model}/"
    self.model_path = self.path + "pytorch_model.bin"
    self.config_path = self.path + "config.json"

    # Select the correct model based on the passed model input. Default: t5
    if model == "t5":
        self.config = T5Config.from_json_file(self.config_path)
        self.model = T5ForConditionalGeneration(self.config)
        self.tokenizer = T5Tokenizer.from_pretrained(self.path)
        self.model.load_state_dict(torch.load(self.model_path, map_location=torch_device))
        self.model.eval()
    elif model == "google/pegasus-newsroom":
        self.config = PegasusConfig.from_json_file(self.config_path)
        # self.model = PegasusForConditionalGeneration(self.config)
        # self.tokenizer = PegasusTokenizer.from_pretrained(self.path)
        self.model = PegasusForConditionalGeneration.from_pretrained(model).to(torch_device)
        self.tokenizer = PegasusTokenizer.from_pretrained(model)
    elif model == "facebook/bart-large-cnn":
        self.config = BartConfig.from_json_file(self.config_path)
        # self.model = PegasusForConditionalGeneration(self.config)
        # self.tokenizer = PegasusTokenizer.from_pretrained(self.path)
        self.model = BartForConditionalGeneration.from_pretrained(model).to(torch_device)
        self.tokenizer = BartTokenizer.from_pretrained(model)
    else:
        raise Exception("This model is not supported")

    self.text = str()

def prepare_model(self, hparams: Namespace):
    # print(hparams)
    config = AutoConfig.from_pretrained(hparams.model_type)
    config.output_hidden_states = True

    print('*********************************** LOADING MODEL ****************************************')
    print('The model {} has {} encoder and {} decoder layers'.format(
        hparams.model_type, config.num_layers, config.num_decoder_layers))

    model = T5ForConditionalGeneration.from_pretrained(hparams.model_type, config=config)

    # Set model to eval and freeze all parameters
    model.eval()
    for param in model.parameters():
        param.requires_grad = False

    print('*************************************** END **********************************************')
    return model, config

def run_text_summarization(data, file_path) -> pd.DataFrame:
    model = T5ForConditionalGeneration.from_pretrained(var.SUMMARIZATION_MODEL)
    tokenizer = T5Tokenizer.from_pretrained(var.SUMMARIZATION_MODEL)

    data_source = data.copy()
    data_source['summary'] = data_source['text'].progress_apply(
        summarize_article, tokenizer=tokenizer, model=model)
    data_source.to_csv('{}-Processed-Summarized.csv'.format(file_path), index=False)
    return data_source

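# summarize_article() is referenced above but not defined in this snippet. The
# sketch below is a hypothetical implementation, assuming a plain text-in /
# summary-out helper compatible with the progress_apply call above.
def summarize_article(text, tokenizer, model, max_input_length=512, max_summary_length=150):
    # Prefix the article with T5's summarization task prefix and encode it.
    inputs = tokenizer("summarize: " + text, return_tensors="pt",
                       max_length=max_input_length, truncation=True)
    # Beam-decode a short summary.
    summary_ids = model.generate(inputs["input_ids"], num_beams=4,
                                 min_length=10, max_length=max_summary_length)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
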
def test_translation_en_to_ro(self):
    model = T5ForConditionalGeneration.from_pretrained("t5-base").to(torch_device)
    tok = T5Tokenizer.from_pretrained("t5-base")

    task_specific_config = getattr(model.config, "task_specific_params", {})
    translation_config = task_specific_config.get("translation_en_to_ro", {})
    model.config.update(translation_config)

    original_input = "Taco Bell said it plans to add 2,000 locations in the US by 2022."
    expected_translation = "Taco Bell a declarat că intenţionează să adauge 2 000 de locaţii în SUA până în 2022."

    input_ids = tok.encode(model.config.prefix + original_input, return_tensors="pt")
    output = model.generate(
        input_ids=input_ids,
        num_beams=4,
        length_penalty=2.0,
        max_length=50,
        no_repeat_ngram_size=3,
        do_sample=False,
        early_stopping=True,
    )
    translation = tok.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
    self.assertEqual(translation, expected_translation)

def construct_t5(options: DocumentRankingEvaluationOptions) -> Reranker:
    device = torch.device(options.device)
    model = T5ForConditionalGeneration.from_pretrained(
        options.model, from_tf=options.from_tf).to(device).eval()
    tokenizer = AutoTokenizer.from_pretrained(options.model_type)
    tokenizer = T5BatchTokenizer(tokenizer, options.batch_size)
    return T5Reranker(model, tokenizer)

def test_translation_en_to_de(self):
    model = T5ForConditionalGeneration.from_pretrained("t5-base").to(torch_device)
    tok = T5Tokenizer.from_pretrained("t5-base")

    task_specific_config = getattr(model.config, "task_specific_params", {})
    translation_config = task_specific_config.get("translation_en_to_de", {})
    model.config.update(translation_config)

    original_input = '"Luigi often said to me that he never wanted the brothers to end up in court", she wrote.'
    expected_translation = (
        '"Luigi sagte mir oft, dass er nie wollte, dass die Brüder am Gericht sitzen", schrieb sie.'
    )

    input_ids = tok.encode(model.config.prefix + original_input, return_tensors="pt")
    output = model.generate(
        input_ids=input_ids,
        num_beams=4,
        length_penalty=2.0,
        max_length=50,
        no_repeat_ngram_size=3,
        do_sample=False,
        early_stopping=True,
    )
    translation = tok.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
    self.assertEqual(translation, expected_translation)

def test_translation_en_to_fr(self):
    model = T5ForConditionalGeneration.from_pretrained("t5-base").to(torch_device)
    tok = T5Tokenizer.from_pretrained("t5-base")

    task_specific_config = getattr(model.config, "task_specific_params", {})
    translation_config = task_specific_config.get("translation_en_to_fr", {})
    model.config.update(translation_config)

    original_input = 'This image section from an infrared recording by the Spitzer telescope shows a "family portrait" of countless generations of stars: the oldest stars are seen as blue dots, while more difficult to identify are the pink-coloured "new-borns" in the star delivery room.'
    expected_translation = "Cette section d'images provenant de l'enregistrement infrarouge effectué par le télescope Spitzer montre un « portrait familial » de générations innombrables de étoiles : les plus anciennes sont observées sous forme de pointes bleues, alors que les « nouveau-nés » de couleur rose dans la salle des accouchements doivent être plus difficiles "

    input_ids = tok.encode(model.config.prefix + original_input, return_tensors="pt")
    output = model.generate(
        input_ids=input_ids,
        num_beams=4,
        length_penalty=2.0,
        max_length=100,
        no_repeat_ngram_size=3,
        do_sample=False,
        early_stopping=True,
    )
    translation = tok.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
    self.assertEqual(translation, expected_translation)

def create_t5_and_check_t5_generate_with_past_key_value_states(
    self,
    config,
    input_ids,
    decoder_input_ids,
    attention_mask,
    decoder_attention_mask,
    lm_labels,
):
    model = T5ForConditionalGeneration(config=config)
    model.to(torch_device)
    model.eval()

    torch.manual_seed(0)
    output_without_past_cache = model.generate(
        input_ids[:1], num_beams=2, max_length=5, do_sample=True, use_cache=False
    )
    torch.manual_seed(0)
    output_with_past_cache = model.generate(input_ids[:1], num_beams=2, max_length=5, do_sample=True)
    self.parent.assertTrue(torch.all(output_with_past_cache == output_without_past_cache))

def create_and_check_t5_with_lm_head(
    self,
    config,
    input_ids,
    decoder_input_ids,
    attention_mask,
    decoder_attention_mask,
    lm_labels,
):
    model = T5ForConditionalGeneration(config=config)
    model.to(torch_device)
    model.eval()
    outputs = model(
        input_ids=input_ids,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        lm_labels=lm_labels,
    )
    loss, prediction_scores, _, _ = outputs
    self.parent.assertEqual(len(outputs), 4)
    result = {
        "loss": loss,
        "prediction_scores": prediction_scores,
    }
    self.parent.assertListEqual(
        list(result["prediction_scores"].size()),
        [self.batch_size, self.decoder_seq_length, self.vocab_size],
    )
    self.check_loss_output(result)

def t5_zeroshot():
    model = T5ForConditionalGeneration.from_pretrained("t5-small")
    dataset = load_dataset("cnn_dailymail", "3.0.0")
    val_dataset = dataset["validation"]
    inputs = val_dataset["article"]
    targets = val_dataset["highlights"]

    device = torch.device("cuda:1")
    model = model.to(device)
    tokenizer = AutoTokenizer.from_pretrained("t5-small")

    predictions = []
    for article in tqdm(inputs):
        # Truncate long articles to the 512-token input budget.
        inp = tokenizer(
            "summarize: " + article,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=512,
        ).to(device)
        summary_ids = model.generate(
            inp.input_ids,
            num_beams=1,
            no_repeat_ngram_size=3,
            min_length=10,
            max_length=128,
            length_penalty=2.0,
        )
        output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        predictions.append(output)

    print(compute_rouge_score(predictions, targets))

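# compute_rouge_score() is referenced above but not defined here. A hypothetical
# implementation using the rouge_score package might look like this:
def compute_rouge_score(predictions, targets):
    from rouge_score import rouge_scorer

    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    scores = [scorer.score(t, p) for t, p in zip(targets, predictions)]
    # Average the F-measure of each ROUGE variant over the dataset.
    return {key: sum(s[key].fmeasure for s in scores) / len(scores)
            for key in ["rouge1", "rouge2", "rougeL"]}
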
def __init__(self, model_dir, **kwargs):
    default_device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    device = kwargs.get('device', default_device)
    self.device = torch.device(device)

    # Loading produces a logger warning that we can ignore, so temporarily raise the level.
    xfm_logger = logging.getLogger('transformers.modeling_utils')
    original_level = xfm_logger.getEffectiveLevel()
    xfm_logger.setLevel(logging.ERROR)
    self.model = T5ForConditionalGeneration.from_pretrained(model_dir).to(self.device)
    xfm_logger.setLevel(original_level)
    # End logger warning suppression

    self.max_graph_len = self.model.config.task_specific_params['translation_amr_to_text']['max_in_len']
    self.max_sent_len = self.model.config.task_specific_params['translation_amr_to_text']['max_out_len']

    tokenizer_name = kwargs.get('tokenizer_name', 't5-base')  # name or path
    self.tokenizer = T5Tokenizer.from_pretrained(tokenizer_name)
    self.seq_ends = set([self.tokenizer.eos_token_id, self.tokenizer.pad_token_id])

    self.batch_size = kwargs.get('batch_size', 32)
    self.num_beams = kwargs.get('num_beams', 1)  # 1 => greedy
    self.num_ret_seq = kwargs.get('num_ret_seq', 1)
    if self.num_ret_seq > self.num_beams:
        logger.warning('Need at least as many beams as returned sequences - increasing beam count')
        self.num_beams = self.num_ret_seq

def __init__(self, hparams):
    super().__init__()
    self.hparams = hparams
    self.use_radam = getattr(self.hparams, "use_radam", False)
    self.cnnt5_only = getattr(self.hparams, "cnnt5_only", False)
    self.hparams.tgt_seq_len = getattr(self.hparams, "tgt_seq_len", self.hparams.seq_len)

    if not self.cnnt5_only:
        if not self.hparams.t5_only:
            print("Initializing LayoutLM...")
            self.encoder = LayoutLMModel.from_pretrained(self.hparams.layoutlm_str)
            if self.hparams.freeze_layoutlm:
                for param in tqdm(self.encoder.parameters(), desc="Freezing LayoutLM...", leave=True):
                    param.requires_grad = False

        print("Initializing T5...")
        self.t5 = T5ForConditionalGeneration.from_pretrained(self.hparams.t5_str)

        self.use_llm_emb = getattr(self.hparams, "llm_emb", False)
        if self.use_llm_emb:
            print("Initializing LayoutLM embeddings")
            self.llm_emb = LayoutLMEmbeddings(
                LayoutLMModel.from_pretrained(self.hparams.layoutlm_str).config)

    if not self.hparams.no_image:
        print("Using images, CNNT5 small initialized.")
        self.cnnt5 = CNNT5({
            "t5": "t5-small",
            "pre_train": False,
            "initial_ckpt": "models/wikipedia_pre_train_continue-epoch=1-val_exact_match=0.58-val_f1=0.98.ckpt",
            "seq_len": self.hparams.seq_len,
            "tgt_seq_len": self.hparams.tgt_seq_len,
        })
        if self.cnnt5_only:
            print("Fine-tuning CNNT5.")
        else:
            for param in tqdm(self.cnnt5.parameters(),
                              desc="Freezing CNNT5 as an image embedding extractor...",
                              leave=True):
                param.requires_grad = False
            self.adapt_cnnt5_features = nn.Linear(512, 768)

    if self.hparams.t5_only:
        self.tokenizer = T5Tokenizer.from_pretrained(self.hparams.t5_str)
    elif self.cnnt5_only:
        self.tokenizer = self.cnnt5.tokenizer
    else:
        self.tokenizer = LayoutLMTokenizer.from_pretrained(self.hparams.layoutlm_str)
    self.detokenizer = T5Tokenizer.from_pretrained(self.hparams.t5_str)

def get_model(tokenizer_len=None):
    if args.mode == 'train' or args.mode == 'test_without_train':
        model = T5ForConditionalGeneration.from_pretrained(args.t5_model, cache_dir=args.cache_dir)
        if tokenizer_len is not None:
            model.resize_token_embeddings(tokenizer_len)
    elif args.mode == 'test' or args.mode == 'continue_train':
        model = T5ForConditionalGeneration(T5Config.from_json_file(output_config_file))
        model.load_state_dict(torch.load(output_model_file))
    else:
        raise NotImplementedError(f'No such mode called {args.mode}, error raised from get_model.')

    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
    return model.to(device)

def model_fn(model_dir):
    logger.info('reading model.')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("================ objects in model_dir ===================")
    print(os.listdir(model_dir))
    model = T5ForConditionalGeneration.from_pretrained(model_dir)
    print("================ model loaded ===========================")
    return model.to(device)

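# model_fn() above follows the SageMaker PyTorch inference contract. A minimal
# companion predict_fn() sketch is given below; the tokenizer name and
# generation settings are assumptions, not part of the original handler.
def predict_fn(input_data, model):
    import torch
    from transformers import T5Tokenizer

    tokenizer = T5Tokenizer.from_pretrained("t5-base")  # assumed tokenizer
    device = next(model.parameters()).device
    inputs = tokenizer(input_data, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        output_ids = model.generate(inputs["input_ids"], max_length=128)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
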
def get_t5_model(name='t5-small'):
    """Instantiates the model and collation function for T5."""
    model = T5ForConditionalGeneration.from_pretrained(name, num_labels=1)
    tokenizer = T5Tokenizer.from_pretrained(name)
    collate_fn = SummarizationCollateFn(inputs_col='Body', outputs_col='Title', tokenizer=tokenizer)
    return model, collate_fn

def __init__(self):
    super().__init__()
    PRETRAINED_MODEL_NAME = 't5-small'
    from transformers import T5Tokenizer, T5ForConditionalGeneration
    self.t5_tokenizer = T5Tokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
    self.t5_model = T5ForConditionalGeneration.from_pretrained(PRETRAINED_MODEL_NAME, return_dict=True)

def prepare_model(self, condition_generation=False):
    if condition_generation:
        self.model = T5ForConditionalGeneration.from_pretrained('t5-base')
    else:
        t5_model = T5Model.from_pretrained('t5-base')
        self.model = GenerationModel(t5_model)
    self.load_checkpoint()
    self.model = self.model.cuda()

def load_model(self):
    model = T5ForConditionalGeneration.from_pretrained(
        os.path.join(settings.BASE_DIR, 'paraphrase_utils', 'model'))
    tokenizer = T5Tokenizer.from_pretrained(
        os.path.join(settings.BASE_DIR, 'paraphrase_utils', 'tokenizer'))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    return model, tokenizer, device

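# A minimal sketch of how the (model, tokenizer, device) triple returned by
# load_model() might be consumed; the "paraphrase:" prefix and the generation
# settings here are assumptions for illustration only.
def paraphrase_sentence(self, sentence):
    model, tokenizer, device = self.load_model()
    inputs = tokenizer("paraphrase: " + sentence, return_tensors="pt", truncation=True).to(device)
    output_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=128)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
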
def get_model(
        pretrained_model_name_or_path: str = 'castorini/duot5-base-msmarco',
        *args,
        device: str = None,
        **kwargs) -> T5ForConditionalGeneration:
    device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
    device = torch.device(device)
    return T5ForConditionalGeneration.from_pretrained(
        pretrained_model_name_or_path, *args, **kwargs).to(device).eval()

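# A minimal usage sketch for get_model(); duoT5 compares two documents against
# a query, and the tokenizer choice and prompt template below are assumptions
# for illustration, not taken from the original code.
if __name__ == '__main__':
    from transformers import T5Tokenizer

    model = get_model()  # downloads castorini/duot5-base-msmarco by default
    tokenizer = T5Tokenizer.from_pretrained('t5-base')
    text = ('Query: what is T5? Document0: T5 is a text-to-text transformer. '
            'Document1: Bananas are yellow. Relevant:')
    inputs = tokenizer(text, return_tensors='pt').to(next(model.parameters()).device)
    output_ids = model.generate(inputs['input_ids'], max_length=2)
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
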
def __init__(self):
    self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
    model = T5ForConditionalGeneration.from_pretrained('Parth/boolean')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # model.eval()
    self.device = device
    self.model = model
    self.set_seed(42)

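# set_seed() is invoked above but not shown in this snippet. A common
# implementation, given here as an assumption, seeds every relevant RNG:
def set_seed(self, seed):
    import random
    import numpy as np
    import torch

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
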