def test_t5():
    """Smoke-test the full T5Model round trip: train, evaluate, predict,
    then reload the saved model from ``outputs`` and repeat eval/predict.

    Fix: the original built an ``eval_data`` list and DataFrame and then
    immediately discarded them with ``eval_df = train_df.copy()`` — that dead
    code is removed; behavior is unchanged.
    """
    train_data = [
        ["convert", "one", "1"],
        ["convert", "two", "2"],
    ]
    train_df = pd.DataFrame(train_data, columns=["prefix", "input_text", "target_text"])

    # Evaluate on a copy of the training data: this tiny smoke test only
    # checks that the pipeline runs end to end, not generalization.
    eval_df = train_df.copy()

    model_args = {
        "reprocess_input_data": True,
        "overwrite_output_dir": True,
        "max_seq_length": 10,
        "train_batch_size": 2,
        "num_train_epochs": 2,
        "save_model_every_epoch": False,
        "max_length": 20,
        "num_beams": 1,
    }

    # Create T5 Model
    model = T5Model("t5", "t5-base", args=model_args, use_cuda=False)

    # Train T5 Model on new task
    model.train_model(train_df)

    # Evaluate T5 Model on new task
    model.eval_model(eval_df)

    # Predict with trained T5 model
    model.predict(["convert: four", "convert: five"])

    # Load test: reload the trained weights from the default output dir
    model = T5Model("t5", "outputs", args=model_args, use_cuda=False)

    # Evaluate the reloaded model on the same task
    model.eval_model(eval_df)

    # Predict with the reloaded model
    model.predict(["convert: four", "convert: five"])
def create_model(model_class, model_type, model_name, num_labels, weight, args, use_cuda, cuda_device, **kwargs):
    """Factory: instantiate the simpletransformers model named by ``model_class``.

    Raises
    ------
    ValueError
        If ``model_class`` is not one of the supported class names.
    """
    if model_class == "ClassificationModel":
        return ClassificationModel(
            model_type, model_name, num_labels, weight, args, use_cuda, cuda_device, **kwargs
        )
    if model_class == "MultiLabelClassificationModel":
        return MultiLabelClassificationModel(
            model_type, model_name, num_labels, weight, args, use_cuda, cuda_device, **kwargs
        )
    if model_class == "QuestionAnsweringModel":
        return QuestionAnsweringModel(
            model_type, model_name, args, use_cuda, cuda_device, **kwargs
        )
    if model_class == "NERModel":
        return NERModel(
            model_type, model_name, args=args, use_cuda=use_cuda, cuda_device=cuda_device, **kwargs
        )
    if model_class == "T5Model":
        # NOTE: the caller-supplied ``args`` is deliberately ignored for T5;
        # a fresh T5Args with single-process decoding is used instead.
        t5_args = T5Args()
        t5_args.use_multiprocessed_decoding = False
        return T5Model(
            model_type, model_name, args=t5_args, use_cuda=use_cuda, cuda_device=cuda_device, **kwargs
        )
    raise ValueError(
        "{} is either invalid or not yet implemented.".format(model_class))
def __init__(self,
             model_path: str = config.DEFAULT_MODEL_PATH,
             model_architecture: str = config.MODEL_ARCHITECTURE,
             use_cuda: bool = config.GPU):
    '''
    Constructs all the necessary attributes for the MT5_Translator object.

    Parameters
    ----------
    model_path : str
        path to the mt5_translator model
    model_architecture : str
        model architecture (mt5, t5 ...)
    use_cuda : bool
        whether to use CUDA or not (if available)
    '''
    logging.info("Loading model...")
    self.model_path = model_path
    self.use_cuda = use_cuda
    # CUDA is enabled only when both requested AND actually available.
    # Fix: replaced the `True if ... else False` anti-idiom with the boolean
    # expression itself (identical value, since both operands are bools).
    # NOTE(review): despite its name, `device` holds a bool, not a torch
    # device object — it is fed to T5Model's `use_cuda` flag.
    self.device = torch.cuda.is_available() and self.use_cuda
    self.model_args = T5Args()
    self.model_args.max_length = 512
    self.model_args.length_penalty = 1
    self.model_args.num_beams = 10
    self.model = T5Model("mt5", self.model_path,
                         args=self.model_args, use_cuda=self.device)
    logging.info(f"Use CUDA: {self.device}")
    logging.info(f"Num GPUs Available: {torch.cuda.device_count()}")
    # Fix: was an f-string with no placeholders.
    logging.info("Model loaded")
def paraphrase(text, cuda=False):
    """For each paragraph in ``text``, generate paraphrase candidates with a
    fine-tuned T5 model and keep the candidate with the largest Damerau
    distance from the original (i.e. the most diversified one).

    Returns the list of chosen paraphrases, one per input paragraph.
    """
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.ERROR)

    model = T5Model(model_type="t5", model_name="outputs", use_cuda=cuda)

    output = []
    for paragraph in text:
        # The model was trained with the "paraphrase:" task prefix.
        predict_to = ["paraphrase: " + paragraph]
        preds = model.predict(predict_to)

        print("---------------------------------------------------------")
        print("Predictions >>>")

        # Score every candidate by its Damerau distance from the original.
        scored = []
        for candidate in preds[0]:
            scored.append([candidate, damerau.distance(paragraph, candidate)])
            print(candidate)
            print(damerau.distance(paragraph, candidate))

        print("---------------------------------------------------------")

        # Keep the most diversified candidate.
        best_pred = max(scored, key=lambda pair: pair[1])[0]
        output.append(best_pred)
        print(best_pred)

    # Report the paraphrased text and the overall diversification score.
    print(*output, sep="\n")
    print("Diversified by: ", damerau.distance("".join(text), "".join(output)))
    return output
def get_model():
    """Build and return an untrained t5-small T5Model configured for GPU
    training with periodic in-training evaluation every 500 steps."""
    training_config = {
        "reprocess_input_data": True,
        "overwrite_output_dir": True,
        "max_seq_length": 200,
        "train_batch_size": 16,
        "num_train_epochs": 10,
        "evaluate_during_training": True,
        "evaluate_during_training_steps": 500,
    }
    return T5Model(model_name="t5-small", model_type='t5',
                   args=training_config, use_cuda=True)
def recommend(abstract: str):
    """Predict candidate titles for ``abstract`` with a fine-tuned T5
    summarization checkpoint (CPU, sampled top-k/top-p decoding, three
    candidates per input). Returns the raw prediction list."""
    from simpletransformers.t5 import T5Model

    decoding_args = {
        "reprocess_input_data": True,
        "overwrite_output_dir": True,
        "max_seq_length": 256,
        "eval_batch_size": 128,
        "num_train_epochs": 1,
        "save_eval_checkpoints": False,
        "use_multiprocessing": False,
        "num_beams": None,
        "do_sample": True,
        "max_length": 50,
        "top_k": 50,
        "top_p": 0.95,
        "num_return_sequences": 3,
    }
    model = T5Model("t5", "./checkpoint_15000_1", args=decoding_args, use_cuda=False)

    # The checkpoint was trained with the "summarize:" task prefix.
    return model.predict(["summarize: " + abstract])
# Beam-search generation settings (sampling left disabled).
model_args = T5Args()
model_args.max_length = 50
model_args.length_penalty = 2.0
model_args.repetition_penalty = 2.0
model_args.num_beams = 5
model_args.early_stopping = True
#model_args.do_sample = True
#model_args.top_p = 0.3
model_args.num_return_sequences = 5

#model = T5Model("mt5", "persiannlp/mt5-base-parsinlu-opus-translation_fa_en", args=model_args)
model_name = "outputs_t5_small_full_2020/"
# Relation prefix to evaluate (ATOMIC-style naming — TODO confirm).
task = "xWant"
model = T5Model("t5", model_name, args=model_args)
print("predicting")
#print(model.predict(["xReact: Ali buys a book",
#                     "xReact: Ali fell on his knees"]))

# +
df = pd.read_csv("data/eval.tsv", sep="\t").astype(str)

# Prepare the data for testing
#df = df.groupby('input_text')['target_text'].apply(list)


def my_eval(df, prefix):
    # Collapse duplicate (prefix, input_text) rows so each input maps to the
    # list of ALL its reference targets.
    df = df.groupby(['prefix', 'input_text'],
                    as_index=False).agg({'target_text': lambda x: list(x)})
    truth_values = df.loc[df["prefix"] == prefix]["target_text"].tolist()
    input_values = df.loc[df["prefix"] == prefix]["input_text"].tolist()
    # Re-attach the task prefix the model expects on every input.
    input_values = [prefix + ": " + str(input_text) for input_text in input_values]
    # NOTE(review): the function body appears truncated here (no return /
    # prediction call is visible in this chunk).
from simpletransformers.t5 import T5Model

# Sampled decoding (top-k/top-p), three generated questions per input.
generation_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "max_seq_length": 128,
    "eval_batch_size": 16,
    "num_train_epochs": 1,
    "save_eval_checkpoints": False,
    "use_multiprocessing": False,
    # "silent": True,
    "num_beams": None,
    "do_sample": True,
    "max_length": 50,
    "top_k": 50,
    "top_p": 0.95,
    "num_return_sequences": 3,
}

# NOTE(review): only one positional argument is passed here; current
# simpletransformers expects (model_type, model_name) — confirm the installed
# version accepts this call form.
model = T5Model("outputs/best_model", args=generation_args)

# Product description to generate a question about; the checkpoint was
# trained with the "ask_question:" task prefix.
query = "ask_question: " + """ ANTIQUE CAST METAL 3 GLOBE CANDLABRA JADITE LAMP.
Stunning antique lamp with three candle style globes.
Cast metal base with jadite green glass insert.
Has been rewired with new braided cord.
In excellent condition with only one chip (as pictured) on the edge of the glass insert.
E9 69 on underside of metal base.
Missing finial.
New low wattage globes.
"""

preds = model.predict([query])
print(preds)
from simpletransformers.t5 import T5Model
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--checkpoints", "-ckpts",
                    help="checkpoints for t5 base", type=str, default='./best_model')
parser.add_argument("--abstract", "-abs",
                    help="abstract to generate title", type=str)
args = parser.parse_args()

# Sampled decoding: three candidate titles per abstract.
prediction_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "max_seq_length": 256,
    "eval_batch_size": 128,
    "num_train_epochs": 1,
    "save_eval_checkpoints": False,
    "use_multiprocessing": False,
    "num_beams": None,
    "do_sample": True,
    "max_length": 50,
    "top_k": 50,
    "top_p": 0.95,
    "num_return_sequences": 3,
}

# --abstract names a FILE containing the abstract text.
with open(args.abstract) as f:
    data = f.read()

model = T5Model("t5", args.checkpoints, args=prediction_args, use_cuda=False)

# The checkpoint was trained with the "summarize:" task prefix.
predicted_title = model.predict(["summarize: " + data])
print(predicted_title)
# Question-generation data: prefix / input_text / target_text columns.
train_df = pd.read_csv("data/train_df_3.tsv", sep="\t").astype(str)
eval_df = pd.read_csv("data/eval_df_3.tsv", sep="\t").astype(str)

model_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "max_seq_length": 128,
    "train_batch_size": 1,
    "num_train_epochs": 1,
    "save_eval_checkpoints": True,
    "save_steps": -1,
    "use_multiprocessing": False,
    # "silent": True,
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 1500,
    "evaluate_during_training_verbose": True,
    "fp16": False,
    "wandb_project": "Question Generation with T5",
}

model = T5Model("mt5", "google/mt5-small", use_cuda=False, args=model_args)

# Release any cached GPU memory before training begins.
if hasattr(torch.cuda, 'empty_cache'):
    torch.cuda.empty_cache()

try:
    model.train_model(train_df, eval_data=eval_df)
except Exception:
    # Fix: the original bare `except:` silently swallowed EVERY failure
    # (including KeyboardInterrupt) and only emptied the CUDA cache. Now we
    # still release GPU memory, but re-raise so the error is visible.
    if hasattr(torch.cuda, 'empty_cache'):
        torch.cuda.empty_cache()
    raise
"overwrite_output_dir": True, "max_seq_length": 256, "eval_batch_size": 128, "num_train_epochs": 1, "save_eval_checkpoints": False, "use_multiprocessing": False, "num_beams": None, "do_sample": True, "max_length": 50, "top_k": 50, "top_p": 0.95, "num_return_sequences": 3, } model = T5Model("t5", "/content/drive/My Drive/outputs/best_model", args=model_args) abstr = [ "summarize: " + """Transfer learning, where a model is first pre-trained on a data-rich task before being finetuned on a downstream task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness of transfer learning has given rise to a diversity of approaches, methodology, and practice. In this paper, we explore the landscape of transfer learning techniques for NLP by introducing a unified framework that converts all text-based language problems into a text-to-text format. Our systematic study compares pre-training objectives, architectures, unlabeled data sets, transfer approaches, and other factors on dozens of language understanding tasks. By combining the insights from our exploration with scale and our new Colossal Clean Crawled Corpus, we achieve state-of-the-art results on many benchmarks covering summarization, question answering, text classification, and more. To facilitate future work on transfer learning for NLP, we release our data set, pre-trained models, and code.""" ]
# Sampled decoding (top-k/top-p), three candidates per input.
model_args = {
    "overwrite_output_dir": True,
    "max_seq_length": 196,
    "eval_batch_size": 32,
    "use_multiprocessing": False,
    "num_beams": None,
    "do_sample": True,
    "max_length": 50,
    "top_k": 50,
    "top_p": 0.95,
    "num_return_sequences": 3,
}

# Load the trained model.
# NOTE(review): only one positional argument is passed; current
# simpletransformers expects (model_type, model_name) — confirm version.
model = T5Model("outputs", args=model_args)

# Load the evaluation data.
df = pd.read_csv("data/train.tsv", sep="\t").astype(str)

# Prepare the inputs: re-attach each row's task prefix, e.g. "prefix: text".
to_predict = [
    f"{prefix}: {input_text}"
    for prefix, input_text in zip(df["prefix"].tolist(), df["input_text"].tolist())
]

# Reference targets and per-row task names, aligned with to_predict.
truth = df["target_text"].tolist()
tasks = df["prefix"].tolist()

# Get the model predictions.
preds = model.predict(to_predict)
import pandas as pd
from simpletransformers.t5 import T5Model

# Question-generation data: prefix / input_text / target_text columns.
train_df = pd.read_csv("data/train_df.tsv", sep="\t").astype(str)
eval_df = pd.read_csv("data/eval_df.tsv", sep="\t").astype(str)

# Train for one epoch, evaluating every 15k steps; keep eval checkpoints.
model_args = dict(
    reprocess_input_data=True,
    overwrite_output_dir=True,
    max_seq_length=16,
    train_batch_size=8,
    num_train_epochs=1,
    save_eval_checkpoints=True,
    save_steps=-1,
    use_multiprocessing=False,
    evaluate_during_training=True,
    evaluate_during_training_steps=15000,
    evaluate_during_training_verbose=True,
    fp16=False,
    wandb_project="Question Generation with T5",
)

# NOTE(review): a single positional argument is passed to T5Model; current
# simpletransformers expects (model_type, model_name) — confirm the installed
# version supports this call form.
model = T5Model("t5-small", args=model_args)

model.train_model(train_df, eval_data=eval_df)
from simpletransformers.t5 import T5Model, T5Args

# Translation direction used to filter the eval set.
source = "english"
target = "spanish"

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Beam-search decoding configuration for translation.
model_args = T5Args()
model_args.max_length = 512
model_args.min_length = 0
model_args.length_penalty = 1
model_args.num_beams = 10

model = T5Model("mt5", "outputs", args=model_args)

eval_df = pd.read_csv("data/eng-spa/eval.tsv", sep="\t").astype(str)

# Reference translations for the selected direction (nested-list form,
# as expected by corpus-level BLEU scorers — TODO confirm downstream use).
target_truth = [
    eval_df.loc[eval_df["prefix"] == f"translate {source} to {target}"]
    ["target_text"].tolist()
]
# Source-language sentences to be translated.
to_target = eval_df.loc[
    eval_df["prefix"] == f"translate {source} to {target}"]["input_text"].tolist()

# print(to_target[:2])

# NOTE(review): this list is truncated in the visible source — its closing
# bracket lies outside this chunk.
sentences = [
    "Me gusta tocar muchos instrumentos. Adoro la música",
    "I like to play many instruments. I love music"
import pandas as pd
from simpletransformers.t5 import T5Model

# Mixed-task data: prefix / input_text / target_text columns.
train_df = pd.read_csv("data/train.tsv", sep="\t").astype(str)
eval_df = pd.read_csv("data/eval.tsv", sep="\t").astype(str)

# One epoch, in-training evaluation every 15k steps, keep only the final model.
model_args = dict(
    max_seq_length=196,
    train_batch_size=16,
    eval_batch_size=64,
    num_train_epochs=1,
    evaluate_during_training=True,
    evaluate_during_training_steps=15000,
    evaluate_during_training_verbose=True,
    use_multiprocessing=False,
    fp16=False,
    save_steps=-1,
    save_eval_checkpoints=False,
    save_model_every_epoch=False,
    reprocess_input_data=True,
    overwrite_output_dir=True,
    wandb_project="T5 mixed tasks - Binary, Multi-Label, Regression",
)

# NOTE(review): a single positional argument is passed to T5Model; current
# simpletransformers expects (model_type, model_name) — confirm version.
model = T5Model("t5-base", args=model_args)

model.train_model(train_df, eval_data=eval_df)
import logging

import sacrebleu
import pandas as pd
from simpletransformers.t5 import T5Model, T5Args

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Beam-search decoding configuration for long translation outputs.
model_args = T5Args()
model_args.max_length = 512
model_args.length_penalty = 1
model_args.num_beams = 10

model = T5Model("mt5", "outputs_base", args=model_args)

eval_df = pd.read_csv("data/eval.tsv", sep="\t").astype(str)

# english -> sinhala direction: references (nested-list form) and inputs.
en_to_si = eval_df["prefix"] == "translate english to sinhala"
sinhala_truth = [eval_df.loc[en_to_si]["target_text"].tolist()]
to_sinhala = eval_df.loc[en_to_si]["input_text"].tolist()

# sinhala -> english direction: references (nested-list form) and inputs.
si_to_en = eval_df["prefix"] == "translate sinhala to english"
english_truth = [eval_df.loc[si_to_en]["target_text"].tolist()]
to_english = eval_df.loc[si_to_en]["input_text"].tolist()
import pandas as pd
from simpletransformers.t5 import T5Model

# Question-generation data: prefix / input_text / target_text columns.
train_df = pd.read_csv("data/train_df.tsv", sep="\t").astype(str)
eval_df = pd.read_csv("data/eval_df.tsv", sep="\t").astype(str)

# One epoch, evaluating every 15k steps; keep eval checkpoints.
model_args = dict(
    reprocess_input_data=True,
    overwrite_output_dir=True,
    max_seq_length=128,
    train_batch_size=8,
    num_train_epochs=1,
    save_eval_checkpoints=True,
    save_steps=-1,
    use_multiprocessing=False,
    # silent=True,
    evaluate_during_training=True,
    evaluate_during_training_steps=15000,
    evaluate_during_training_verbose=True,
    fp16=False,
    wandb_project="Question Generation with T5",
)

# NOTE(review): a single positional argument is passed to T5Model; current
# simpletransformers expects (model_type, model_name) — confirm version.
model = T5Model("t5-large", args=model_args)

model.train_model(train_df, eval_data=eval_df)
"reprocess_input_data": True, "overwrite_output_dir": True, "max_seq_length": 128, "eval_batch_size": 128, "num_train_epochs": 1, "save_eval_checkpoints": False, "use_multiprocessing": False, "num_beams": None, "do_sample": True, "max_length": 50, "top_k": 50, "top_p": 0.95, "num_return_sequences": 3, } model = T5Model("test_outputs_large/best_model", args=model_args) df = pd.read_csv("data/eval_df.tsv", sep="\t").astype(str) preds = model.predict([ "ask_question: " + description for description in df["input_text"].tolist() ]) questions = df["target_text"].tolist() with open("test_outputs_large/generated_questions.txt", "w") as f: for i, desc in enumerate(df["input_text"].tolist()): pprint(desc) pprint(preds[i]) print()
def main():
    """CLI entry point: fine-tune, evaluate and/or predict with a seq2seq
    (BART) or T5 model via simpletransformers, driven by command-line flags.

    Expects ``{data_dir}/{train,valid,test}.{source,target}`` files, loaded
    by the module-level ``read_data_source_target`` helper.
    """
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)

    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the source and target files for the task.",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type, choose from [seq2seq, T5]",
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pretrained model or model identifier from huggingface.co/models",
    )
    # Other parameters
    parser.add_argument("--do_train", action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true",
                        help="Whether to run eval on the valid set.")
    parser.add_argument("--do_predict", action="store_true",
                        help="Whether to run prediction on the test set.")
    parser.add_argument("--init_model_weights", action="store_true",
                        help="Whether to initialize the model weights")
    parser.add_argument("--overwrite_output_dir", action="store_true",
                        help="Whether to overwrite on the existing output dir")
    parser.add_argument("--use_multiprocessed_decoding", action="store_true",
                        help="Whether to use multiprocess when decoding")
    parser.add_argument(
        "--save_model_every_epoch",
        action="store_true",
        help="Whether to save model every epoch during training")
    parser.add_argument(
        "--predict_during_training",
        action="store_true",
        help="Whether to predict after each checkpoint-saving during training")
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Whether to evaluate after each checkpoint-saving during training"
    )
    parser.add_argument(
        "--output_dir",
        default='output_dir/',
        type=str,
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--save_step",
        default=0,
        type=int,
        help="Save checkpoint every X updates steps.",
    )
    parser.add_argument(
        "--train_batch_size",
        default=16,
        type=int,
        help="Size of each train batch",
    )
    parser.add_argument(
        "--eval_batch_size",
        default=16,
        type=int,
        help="Size of each eval/predict batch",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        default=1,
        type=int,
        help="gradient accumulation steps",
    )
    parser.add_argument(
        "--learning_rate",
        default=4e-5,
        type=float,
        help="learning rate",
    )
    parser.add_argument(
        "--num_train_epochs",
        default=100,
        type=int,
        help="Number of train epochs",
    )
    parser.add_argument(
        "--max_seq_length",
        default=None,
        type=int,
        help="Max input seq length",
    )
    parser.add_argument(
        "--max_length",
        default=None,
        type=int,
        help="Max output seq length",
    )
    parser.add_argument(
        "--prediction_dir",
        default=None,
        type=str,
        help="The output directory where the predictions results will be written.",
    )
    parser.add_argument(
        "--prediction_suffix",
        default=None,
        type=str,
        help=" The supplementary suffix of prediction results name.",
    )
    parser.add_argument(
        "--mask_ratio",
        default=0.0,
        type=float,
        help="the proportion of masked words in the source",
    )
    parser.add_argument(
        "--mask_length",
        default="span-poisson",
        type=str,
        choices=['subword', 'word', 'span-poisson'],
        help="when masking words, the length of mask segments",
    )
    parser.add_argument(
        '--replace_length',
        default=-1,
        type=int,
        help='when masking N tokens, replace with 0, 1, or N tokens (use -1 for N)')
    parser.add_argument(
        '--poisson_lambda',
        default=3.0,
        type=float,
        help='randomly shuffle sentences for this proportion of inputs')
    parser.add_argument(
        '--dataloader_num_workers',
        default=0,
        type=int,
        help='the number of cpus used in collecting data in dataloader, '
        'note that if it is large than cpu number, the program may be stuck')
    parser.add_argument(
        '--evaluation_metric',
        default='qa',
        type=str,
        help='if pretrain passages, use \'passage\', else use \'qa\'')
    args = parser.parse_args()

    # Refuse to train into an existing, non-empty output dir unless the user
    # explicitly allows overwriting.
    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir) and
            args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Load only the data splits the requested actions need.
    if args.do_train == True:
        train_df = read_data_source_target(args.data_dir + "train.source",
                                           args.data_dir + "train.target")
    else:
        train_df = None

    if args.do_eval == True or args.evaluate_during_training == True:
        eval_df = read_data_source_target(args.data_dir + "valid.source",
                                          args.data_dir + "valid.target")
    else:
        eval_df = None

    if args.do_predict == True or args.predict_during_training == True:
        test_df = read_data_source_target(args.data_dir + "test.source",
                                          args.data_dir + "test.target")
    else:
        test_df = None

    # Translate CLI flags into the simpletransformers args dict.
    model_args = {
        "reprocess_input_data": True,
        "overwrite_output_dir": args.overwrite_output_dir,
        "init_model_weights": args.init_model_weights,
        "max_seq_length": args.max_seq_length,
        "train_batch_size": args.train_batch_size,
        "eval_batch_size": args.eval_batch_size,
        "gradient_accumulation_steps": args.gradient_accumulation_steps,
        "learning_rate": args.learning_rate,
        "num_train_epochs": args.num_train_epochs,
        "save_eval_checkpoints": False,
        "save_model_every_epoch": args.save_model_every_epoch,
        "save_steps": args.save_step,
        "evaluate_during_training": args.evaluate_during_training,
        "evaluate_generated_text": True,
        "evaluate_during_training_verbose": True,
        "predict_during_training": args.predict_during_training,
        "use_multiprocessing": False,
        "output_dir": args.output_dir,
        "max_length": args.max_length,
        "manual_seed": 4,
        "mask_ratio": args.mask_ratio,
        "mask_length": args.mask_length,
        "replace_length": args.replace_length,
        "poisson_lambda": args.poisson_lambda,
        "fp16": False,
        "truncation": True,
        "dataloader_num_workers": args.dataloader_num_workers,
        "use_multiprocessed_decoding": args.use_multiprocessed_decoding,
        "evaluation_metric": args.evaluation_metric
    }

    # Initialize model
    if args.model_type == 'seq2seq':
        model = Seq2SeqModel(
            encoder_decoder_type="bart",
            encoder_decoder_name=args.model_name_or_path,
            args=model_args,
        )
    elif args.model_type == 't5':
        model = T5Model(
            model_name=args.model_name_or_path,
            args=model_args,
        )
    else:
        raise ValueError("The {} model is not supported now".format(
            args.model_type))

    # Train the model
    if args.do_train == True:
        model.train_model(train_data=train_df,
                          eval_data=eval_df,
                          test_data=test_df,
                          output_dir=args.output_dir)

    # Evaluate the model
    if args.do_eval == True:
        results = model.eval_model(eval_data=eval_df)
        print(results)

    # Use the model for prediction
    if args.do_predict == True:
        print(
            model.predict(pred_data=test_df,
                          output_dir=args.prediction_dir,
                          suffix=args.prediction_suffix))
# English-Spanish parallel data; the task prefix is cleared because this
# model is trained on a single task.
train_df = pd.read_csv("data/eng-spa/train.tsv", sep="\t").astype(str)
eval_df = pd.read_csv("data/eng-spa/eval.tsv", sep="\t").astype(str)
for frame in (train_df, eval_df):
    frame["prefix"] = ""

model_args = T5Args()
# Sequence / batch sizing.
model_args.max_seq_length = 80
model_args.train_batch_size = 10
model_args.eval_batch_size = 10
model_args.num_train_epochs = 1
# Evaluate every 30k steps during training.
model_args.evaluate_during_training = True
model_args.evaluate_during_training_steps = 30000
model_args.use_multiprocessing = False
model_args.fp16 = False
# Keep only the final model; always rebuild features from scratch.
model_args.save_steps = -1
model_args.save_eval_checkpoints = False
model_args.no_cache = True
model_args.reprocess_input_data = True
model_args.overwrite_output_dir = True
model_args.preprocess_inputs = False
model_args.num_return_sequences = 1
model_args.wandb_project = "MT5 English-Spanish Translation"

model = T5Model("mt5", "google/mt5-base", args=model_args)

# Train the model
model.train_model(train_df, eval_data=eval_df)

# Optional: Evaluate the model. We'll test it properly anyway.
results = model.eval_model(eval_df, verbose=True)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Every row uses the same summarization task prefix.
for frame in (train_df, eval_df):
    frame['prefix'] = "summarize"

model_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "max_seq_length": 512,
    "train_batch_size": 16,
    "num_train_epochs": 4,
}

# Create T5 Model (GPU training).
model = T5Model("t5", "t5-small", args=model_args, use_cuda=True)

# Train T5 Model on the summarization task.
model.train_model(train_df)

# Evaluate T5 Model on the summarization task.
results = model.eval_model(eval_df)

# Predict with trained T5 model
#print(model.predict(["convert: four"]))

# In[15]:

# Display the evaluation results (notebook cell output).
results

# ## And We're Done !