def main(): """Fine-tune on summarization data""" # need this to save a fine-tuned model if os.path.isdir(args.model_dir): shutil.rmtree(args.model_dir) os.mkdir(args.model_dir) # import data provider (e.g. dtr, rel, or events) data = importlib.import_module(args.data_reader) # load pretrained T5 tokenizer tokenizer = T5Tokenizer.from_pretrained(args.model_name) # load a pretrained T5 model model = T5ForConditionalGeneration.from_pretrained(args.model_name) train_dataset = data.Data(xmi_dir=args.xmi_dir, tokenizer=tokenizer, max_input_length=args.max_input_length, max_output_length=args.max_output_length, partition='train', n_files=args.n_files, xml_ref_dir=None, xml_out_dir=None) val_dataset = data.Data(xmi_dir=args.xmi_dir, tokenizer=tokenizer, max_input_length=args.max_input_length, max_output_length=args.max_output_length, partition='dev', n_files=args.n_files, xml_ref_dir=None, xml_out_dir=None) training_args = Seq2SeqTrainingArguments( output_dir='./Results', num_train_epochs=args.n_epochs, per_device_train_batch_size=args.batch_size, per_device_eval_batch_size=args.batch_size, learning_rate=args.learning_rate, warmup_steps=100, weight_decay=0.01, logging_dir='./Logs') trainer = Seq2SeqTrainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset) trainer.train() trainer.save_model(args.model_dir) trainer.evaluate()
def train_evaluate(model, collate_fn, train_dataset, val_dataset, **kwargs):
    train_args = Seq2SeqTrainingArguments(**{**default_training_args, **kwargs})
    trainer = Seq2SeqTrainer(
        model=model,
        args=train_args,
        data_collator=collate_fn,
        train_dataset=train_dataset,
        eval_dataset=val_dataset)
    trainer.train()
    results = trainer.evaluate()
    return results

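# `default_training_args` is not defined in this snippet; it is assumed to be a module-level
# dict of shared defaults that the per-call **kwargs override. A minimal sketch (the field names
# are standard Seq2SeqTrainingArguments parameters, the values are placeholders):
default_training_args = dict(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
)

# Example call: overrides are merged on top of the defaults, e.g. a run with a custom learning rate:
# results = train_evaluate(model, collate_fn, train_dataset, val_dataset, learning_rate=1e-4)
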
def instantiate_trainer(config):
    verbosity = config.get("verbosity", logging.INFO)
    t_logging.set_verbosity(verbosity)
    logger.setLevel(verbosity)

    # debug (see torch.autograd.detect_anomaly)
    set_detect_anomaly(bool(config.get("debug", False)))

    # model
    model_args = dict(name="ViT-B/32", jit=False, training=True, Class="CLIPDecoder")
    model_args.update(config.get("model", {}))
    model_args["Class"] = getattr(clip.model, model_args["Class"])
    logger.info(f"loading model from pre-trained CLIP {model_args}...")
    model, image_preprocess = load(**model_args)

    # data
    train_dataset, eval_dataset = get_datasets(
        image_preprocess=image_preprocess, **config.get("dataset", {}))

    # training
    criterion_args = config.get("criterion", {})
    # get criterion class (e.g. nn.NLLLoss) by name
    CriterionClass = getattr(nn, criterion_args.pop("Class", "NLLLoss"))
    criterion = CriterionClass(**criterion_args)
    learner_args = config.get("learner", {})
    LearnerClass = getattr(sys.modules[__name__], learner_args.pop("Class", "LanguageModel"))
    learner = LearnerClass(model, criterion)
    training_args = Seq2SeqTrainingArguments(**config.get("training", {}))
    trainer = CLIPTrainer(
        model=learner,
        args=training_args,
        data_collator=collate_batch,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics)

    # training callbacks
    for callback in config.get("callbacks", []):
        CallbackClass = getattr(trainer_callback, callback.pop("Class"))
        trainer.add_callback(CallbackClass(**callback))

    return trainer, training_args, config

def generate(self, dataset):
    huggingface_model = self.convert_to_huggingface()
    huggingface_model.config.decoder_start_token_id = self.tokenizer.cls_token_id
    huggingface_model.config.eos_token_id = self.tokenizer.sep_token_id
    huggingface_model.config.pad_token_id = self.tokenizer.pad_token_id
    huggingface_model.config.vocab_size = huggingface_model.config.encoder.vocab_size
    huggingface_model.config.add_cross_attention = True
    huggingface_model.config.no_repeat_ngram_size = 3
    huggingface_model.config.early_stopping = True
    huggingface_model.config.length_penalty = 2.0
    huggingface_model.config.num_beams = 4

    util_args = Seq2SeqTrainingArguments(predict_with_generate=True, output_dir='./tmp')
    util = Seq2SeqTrainer(
        args=util_args,
        model=huggingface_model,
        compute_metrics=lambda pred: compute_metrics(pred, self.tokenizer),
        eval_dataset=dataset,
        tokenizer=self.tokenizer,
    )
    return util.predict(dataset)

batch_size = 1  # change to 16 for full training

# initialize Bert2Bert from pre-trained checkpoints
bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained(
    'bert-base-multilingual-cased', 'bert-base-multilingual-cased')
bert2bert.config.decoder_start_token_id = tokenizer.bos_token_id
bert2bert.config.eos_token_id = tokenizer.eos_token_id
bert2bert.config.pad_token_id = tokenizer.pad_token_id
bert2bert.config.vocab_size = bert2bert.config.decoder.vocab_size

training_args = Seq2SeqTrainingArguments(
    output_dir="./",
    overwrite_output_dir=True,
    do_train=True,
    per_device_train_batch_size=batch_size,
    num_train_epochs=1,
    logging_steps=500,   # set to 1000 for full training
    save_steps=10000,    # set to 500 for full training
    warmup_steps=1000,   # set to 2000 for full training
    fp16=True)

trainer = Seq2SeqTrainer(
    model=bert2bert,
    args=training_args,
    train_dataset=train_data,
    # eval_dataset=valid_data,
    # compute_metrics=metric,
)
trainer.train()

def main():
    parser = argparse.ArgumentParser()

    # Input and output configs
    parser.add_argument("--data_folder", default=None, type=str, required=True,
                        help="the folder to save the processed data")
    parser.add_argument("--last_utterance_only", default=False, required=False, action="store_true",
                        help="Train with the whole context or the last utterance only")
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        handlers=[logging.StreamHandler()]
    )

    tasks = ['mantis', 'msdialog', 'ubuntu_dstc8']

    # Downloading Conversation Response Ranking data
    for task in tasks:
        if not os.path.isdir(args.data_folder + task):
            logging.info("Starting downloader for task {}".format(task))
            dataDownloader = downloader.DataDownloader(task, args.data_folder)
            dataDownloader.download_and_preprocess()

    all_df = []
    for task in tasks:
        train = pd.read_csv(args.data_folder + task + "/train.tsv", sep="\t")
        train['task'] = task
        replace = train.shape[0] < 80000
        train = train.sample(80000, replace=replace)
        all_df.append(train)
    all_df = pd.concat(all_df)

    def preprocess_response(r):
        # some tokens that only appear in MSDialog are removed here
        r = r.replace("<<<AGENT>>>:", "")
        r = r.replace("PERSON_PLACEHOLDER", "")
        r = r.replace("AGENT", "")
        return r

    def preprocess_context(r):
        # removes beginning of context and keeps only the last utterance
        if 'msdialog' in r['task']:
            context = r['context'].split("[TURN_SEP]")[-1].split("[UTTERANCE_SEP]")[0].strip()
        else:
            context = r['context'].split("[TURN_SEP]")[-1].split("[UTTERANCE_SEP]")[-2].strip()
        return context

    all_df["response"] = all_df.apply(lambda r, f=preprocess_response: f(r['response']), axis=1)
    if args.last_utterance_only:
        all_df["context"] = all_df.apply(lambda r, f=preprocess_context: f(r), axis=1)

    dataset = Dataset.from_pandas(all_df)
    # all_df["len_context"] = all_df.apply(lambda r: len(r['context'].split(" ")), axis=1)
    # all_df["len_response"] = all_df.apply(lambda r: len(r['response'].split(" ")), axis=1)

    model_checkpoint = "t5-base"  # ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    max_input_length = 100
    if args.last_utterance_only:
        max_target_length = 100
    else:
        max_target_length = 400
    col_from = "response"
    col_to = "context"

    def preprocess_function(examples):
        inputs = [preprocess_response(doc) for doc in examples[col_from]]
        model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

        # Setup the tokenizer for targets
        with tokenizer.as_target_tokenizer():
            labels = tokenizer([t for t in examples[col_to]], max_length=max_target_length, truncation=True)

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_datasets = dataset.map(preprocess_function, batched=True)

    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

    batch_size = 5
    train_args = Seq2SeqTrainingArguments(
        "response2context_lu_{}".format(args.last_utterance_only),
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=2,
        predict_with_generate=True,
        seed=42
    )
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
    trainer = Seq2SeqTrainer(
        model,
        train_args,
        train_dataset=tokenized_datasets,
        data_collator=data_collator,
        tokenizer=tokenizer
    )

    print("Fine-tuning T5.")
    trainer.train()

    if args.last_utterance_only:
        model.save_pretrained("{}/{}_response2context_last_utt_only".format(args.data_folder, model_checkpoint))
    else:
        model.save_pretrained("{}/{}_response2context".format(args.data_folder, model_checkpoint))

def main():
    args = parse_args()

    # this requires having preprocessed framenet
    # using `frame.cli:preprocess-framenet`
    paths = data_paths(args.data)
    dataset = load_dataset('json', data_files=paths)
    metric = load_metric("rouge")

    train_test = dataset["train"].train_test_split(test_size=0.1)
    test_valid = train_test["test"].train_test_split(test_size=0.5)
    datasets = DatasetDict({
        "train": train_test["train"],
        "test": test_valid["test"],
        "valid": test_valid["train"]
    })

    tokenizer = AutoTokenizer.from_pretrained(args.model)

    # the family of t5 models expects input sentences to be prefixed with `"summarize: "`
    if args.model in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
        prefix = "summarize: "
    else:
        prefix = ""

    # HuggingFace loves to use closures; I would prefer this
    # be refactored into the library, but going this route for simplicity
    def preprocess_function(examples):
        """Tokenize the data for Seq2Seq

        Maps over all the examples in the dataset to tokenize both the input
        framenet sentences and the target frame definitions.

        Args:
            examples: samples in the dataset
        """
        inputs = [prefix + sent for sent in examples["sentence"]]
        model_inputs = tokenizer(inputs, max_length=args.max_input_length, truncation=True)
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(examples["frame_definition"], max_length=args.max_target_length, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    # again, with the closures - requires an instance of the tokenizer
    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        # Replace -100 in the labels as we can't decode them.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Rouge expects a newline after each sentence
        decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
        decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

        result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
        # Extract a few results
        result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

        # Add mean generated length
        prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
        result["gen_len"] = np.mean(prediction_lens)

        return {k: round(v, 4) for k, v in result.items()}

    tokenized_datasets = datasets.map(preprocess_function, batched=True)

    model = AutoModelForSeq2SeqLM.from_pretrained(args.model)
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    training_args = Seq2SeqTrainingArguments(
        "./results/summarization",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=args.epochs,
        predict_with_generate=True,
        fp16=True,
    )

    trainer = Seq2SeqTrainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["valid"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics)

    trainer.train()

def test_finetune_bert2bert(self): """ Currently fails with: ImportError: To be able to use this metric, you need to install the following dependencies['absl', 'nltk', 'rouge_score'] """ bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained( "prajjwal1/bert-tiny", "prajjwal1/bert-tiny") tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") bert2bert.config.vocab_size = bert2bert.config.encoder.vocab_size bert2bert.config.eos_token_id = tokenizer.sep_token_id bert2bert.config.decoder_start_token_id = tokenizer.cls_token_id bert2bert.config.max_length = 128 train_dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]") val_dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split="validation[:1%]") train_dataset = train_dataset.select(range(32)) val_dataset = val_dataset.select(range(16)) rouge = datasets.load_metric("rouge") batch_size = 4 def _map_to_encoder_decoder_inputs(batch): # Tokenizer will automatically set [BOS] <text> [EOS] inputs = tokenizer(batch["article"], padding="max_length", truncation=True, max_length=512) outputs = tokenizer(batch["highlights"], padding="max_length", truncation=True, max_length=128) batch["input_ids"] = inputs.input_ids batch["attention_mask"] = inputs.attention_mask batch["decoder_input_ids"] = outputs.input_ids batch["labels"] = outputs.input_ids.copy() batch["labels"] = [[ -100 if token == tokenizer.pad_token_id else token for token in labels ] for labels in batch["labels"]] batch["decoder_attention_mask"] = outputs.attention_mask assert all([len(x) == 512 for x in inputs.input_ids]) assert all([len(x) == 128 for x in outputs.input_ids]) return batch def _compute_metrics(pred): labels_ids = pred.label_ids pred_ids = pred.predictions # all unnecessary tokens are removed pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True) label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True) rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid return { "rouge2_precision": round(rouge_output.precision, 4), "rouge2_recall": round(rouge_output.recall, 4), "rouge2_fmeasure": round(rouge_output.fmeasure, 4), } # map train dataset train_dataset = train_dataset.map( _map_to_encoder_decoder_inputs, batched=True, batch_size=batch_size, remove_columns=["article", "highlights"], ) train_dataset.set_format( type="torch", columns=[ "input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels" ], ) # same for validation dataset val_dataset = val_dataset.map( _map_to_encoder_decoder_inputs, batched=True, batch_size=batch_size, remove_columns=["article", "highlights"], ) val_dataset.set_format( type="torch", columns=[ "input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels" ], ) output_dir = self.get_auto_remove_tmp_dir() training_args = Seq2SeqTrainingArguments( output_dir=output_dir, per_device_train_batch_size=batch_size, per_device_eval_batch_size=batch_size, predict_with_generate=True, evaluation_strategy="steps", do_train=True, do_eval=True, warmup_steps=0, eval_steps=2, logging_steps=2, ) # instantiate trainer trainer = Seq2SeqTrainer( model=bert2bert, args=training_args, compute_metrics=_compute_metrics, train_dataset=train_dataset, eval_dataset=val_dataset, tokenizer=tokenizer, ) # start training trainer.train()
# eval_steps=500,                   # 4 or set to 8000 for full training
# warmup_steps=500,                 # 1 or set to 2000 for full training
# max_steps=2500,                   # 16 or comment for full training
# overwrite_output_dir=True,
# save_total_limit=3,
# fp16=torch.cuda.is_available(),
# )

training_args = Seq2SeqTrainingArguments(
    output_dir='./',
    evaluation_strategy='steps',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    logging_steps=50,    # 2 or set to 1000 for full training
    save_steps=50,       # 16 or set to 500 for full training
    eval_steps=50,       # 4 or set to 8000 for full training
    warmup_steps=50,     # 1 or set to 2000 for full training
    max_steps=850,       # 16 or comment for full training
    overwrite_output_dir=True,
    save_total_limit=3,
    fp16=torch.cuda.is_available(),
)

# instantiate trainer
trainer = Seq2SeqTrainer(
    model=ed_model,
    tokenizer=input_tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,

def train_translationmodel(args):
    dataset_properties = json.load(open(os.path.join(args.data_dir, "dataset_properties.json")))
    special_tokens = dataset_properties["special_tokens"]
    target_vocab = dataset_properties["target_vocab"]

    target_model = os.path.join(args.model_root_dir, args.run_id, args.target_model_name)
    output_dir = os.path.join(args.model_root_dir, args.run_id, args.translation_model_name)
    logging_dir = os.path.join(output_dir, "logs")

    if args.resume == False:
        checkpoint = None
        os.mkdir(output_dir)
        # copy info about the dataset b/c we'll need it when running the dockerized model
        # (among others, it contains the target vocab)
        copyfile(os.path.join(args.data_dir, "dataset_properties.json"),
                 os.path.join(output_dir, "dataset_properties.json"))
    else:
        checkpoint = get_last_checkpoint(output_dir)
        print(f"trying to resume training from {checkpoint} in {output_dir}")

    # use mixed precision training on CUDA devices, otherwise disable it so that the code can run on CPUs
    fp16 = True if torch.cuda.is_available() else False

    bert2arsenal = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", target_model)
    source_tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    source_tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})

    # save it for later use s.t. we don't have to download anything for the runtime
    source_tokenizer.save_pretrained(os.path.join(output_dir, "source_tokenizer"))

    # only needed to get the id of the EOS token in the target language
    target_tokenizer = PreTrainedArsenalTokenizer(target_vocab=target_vocab)

    # Due to the additional special tokens, encoder token embeddings need to be resized.
    # The target model has been created specifically for the "Effigy Arsenal Language", so it already has the correct dims.
    bert2arsenal.encoder.resize_token_embeddings(len(source_tokenizer))

    bert2arsenal.config.decoder_start_token_id = source_tokenizer.cls_token_id
    bert2arsenal.config.eos_token_id = target_tokenizer.sep_token_id

    # not sure whether these settings are relevant? (At least they shouldn't be harmful)
    bert2arsenal.config.encoder.eos_token_id = source_tokenizer.sep_token_id
    bert2arsenal.config.decoder.eos_token_id = target_tokenizer.sep_token_id

    bert2arsenal.config.pad_token_id = source_tokenizer.pad_token_id
    bert2arsenal.config.vocab_size = bert2arsenal.encoder.vocab_size
    bert2arsenal.config.encoder.vocab_size = bert2arsenal.encoder.vocab_size

    # the model has min/max length settings in three places: for the main model (EncoderDecoder) and for both encoder
    # and decoder as submodels. Settings in the latter two parts seem to be completely irrelevant (unless one would
    # try to use the trained encoder or decoder parts from the translation model in isolation).
    bert2arsenal.config.max_length = dataset_properties["decoder_max_len"]
    bert2arsenal.config.min_length = dataset_properties["decoder_min_len"]

    # Don't prevent any n-gram repetitions! This would have a significant negative influence on
    # the translations (especially for longer sentences), because the correct CSTs may contain n-gram repetitions
    bert2arsenal.config.no_repeat_ngram_size = 0

    bert2arsenal.config.early_stopping = True
    bert2arsenal.config.length_penalty = 2.0
    bert2arsenal.config.num_beams = 4
    # bert2arsenal.config.add_cross_attention
    # bert2arsenal.config.num_return_sequences = 5  # this can be used to set the number of return sequences

    print(f"model config:\n{bert2arsenal.config}")

    training_args = Seq2SeqTrainingArguments(
        predict_with_generate=True,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        fp16=fp16,
        output_dir=output_dir,
        logging_dir=logging_dir,
        logging_steps=args.logging_steps,
        save_steps=args.save_steps,
        save_total_limit=args.save_total_limit,
        warmup_steps=args.warmup_steps,    # number of warmup steps for learning rate scheduler
        weight_decay=args.weight_decay,    # strength of weight decay
        num_train_epochs=args.translation_epochs,
    )

    bert2arsenal.config.to_json_file(os.path.join(output_dir, "model_config.json"))
    with open(os.path.join(output_dir, "training_args.json"), "w") as f:
        f.write(str(training_args.to_json_string()))

    train_data = datasets.Dataset.load_from_disk(os.path.join(args.data_dir, args.train_dataset_name))
    train_data.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
    )

    trainer = Seq2SeqTrainer(
        model=bert2arsenal,
        args=training_args,
        train_dataset=train_data,
        tokenizer=source_tokenizer
    )
    print(f"start training at {datetime.now().strftime('%b%d_%H-%M-%S')}")
    trainer.train(resume_from_checkpoint=checkpoint)
    trainer.save_model()

def main(args):
    df = pd.read_csv(args.input_fname, encoding='utf-8')[[args.source_lang, args.target_lang]]
    logging.info(f'Loaded {df.shape}')

    # convert to dictionary
    j = {'translation': []}
    for i in df.itertuples():
        j['translation'] += [{args.source_lang: i[1], args.target_lang: i[2]}]
    train_dataset = Dataset.from_dict(j)
    raw_datasets = train_dataset.train_test_split(test_size=args.valid_pct, seed=args.seed)
    logging.info(f'Datasets created {raw_datasets}')

    tokenizer = MarianTokenizer.from_pretrained(args.output_dir)
    logging.info(f'Tokenizer loaded from {args.output_dir}')

    # tokenize datasets
    tokenized_datasets = raw_datasets.map(
        partial(preprocess_function,
                tokenizer=tokenizer,
                max_input_length=args.max_input_length,
                max_target_length=args.max_target_length,
                source_lang=args.source_lang,
                target_lang=args.target_lang),
        batched=True,
    )
    logging.info(f'Tokenized datasets: {tokenized_datasets}')

    # filter out examples with too few tokens
    tokenized_datasets = tokenized_datasets.filter(
        lambda example: len(example['translation']['zh']) > 2)
    tokenized_datasets = tokenized_datasets.filter(
        lambda example: len(example['translation']['th']) > 2)
    logging.info(
        f'Tokenized datasets when filtered out less than 2 tokens per sequence: {tokenized_datasets}'
    )

    config = MarianConfig.from_pretrained(args.output_dir)
    model = MarianMTModel(config)
    logging.info(f'Loaded model from {args.output_dir}')

    training_args = Seq2SeqTrainingArguments(
        args.output_dir,
        evaluation_strategy="epoch",
        load_best_model_at_end=True,
        learning_rate=args.learning_rate,
        warmup_ratio=args.warmup_ratio,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        weight_decay=args.weight_decay,
        save_total_limit=args.save_total_limit,
        num_train_epochs=args.num_train_epochs,
        predict_with_generate=True,
        fp16=args.fp16,
        seed=args.seed,
    )
    logging.info(f'Training config {training_args}')

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
    trainer = Seq2SeqTrainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=partial(compute_metrics,
                                tokenizer=tokenizer,
                                metric=metric,
                                metric_tokenize=args.metric_tokenize),
    )
    logging.info(f'Trainer created')

    trainer.train()
    model.save_pretrained(f"{args.output_dir}_best")
    tokenizer.save_pretrained(f"{args.output_dir}_best")
    logging.info(f'Best model saved')

    # quick sanity check: translate two Chinese source sentences on CPU
    model.cpu()
    src_text = ['我爱你', '国王有很多心事。我明白']
    translated = model.generate(
        **tokenizer(src_text, return_tensors="pt", padding=True))
    print([tokenizer.decode(t, skip_special_tokens=True) for t in translated])

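# `preprocess_function` (bound with functools.partial above) is not shown in this excerpt.
# A minimal sketch matching that signature, following the standard HuggingFace translation recipe;
# the original implementation may differ:
def preprocess_function(examples, tokenizer, max_input_length, max_target_length,
                        source_lang, target_lang):
    inputs = [ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # tokenize the targets in target-tokenizer mode
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
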
def main(): """Fine-tune on summarization data""" # need this to save a fine-tuned model if os.path.isdir(args.model_dir): shutil.rmtree(args.model_dir) os.mkdir(args.model_dir) # import data provider (e.g. dtr, rel, or events) data = importlib.import_module(args.data_reader) # load pretrained T5 tokenizer tokenizer = T5Tokenizer.from_pretrained(args.model_name) # load a pretrained T5 model model = T5ForConditionalGeneration.from_pretrained(args.model_name) train_dataset = data.Data(xml_dir=args.xml_train_dir, text_dir=args.text_train_dir, out_dir=args.xml_out_dir, xml_regex=args.xml_regex, tokenizer=tokenizer, max_input_length=args.max_input_length, max_output_length=args.max_output_length) test_dataset = data.Data(xml_dir=args.xml_test_dir, text_dir=args.text_test_dir, out_dir=args.xml_out_dir, xml_regex=args.xml_regex, tokenizer=tokenizer, max_input_length=args.max_input_length, max_output_length=args.max_output_length) training_args = Seq2SeqTrainingArguments( output_dir='./Results', num_train_epochs=args.n_epochs, per_device_train_batch_size=args.batch_size, per_device_eval_batch_size=args.batch_size, learning_rate=args.learning_rate, warmup_steps=100, weight_decay=0.01, logging_dir='./Logs', disable_tqdm=True, predict_with_generate=True, load_best_model_at_end=True) trainer = Seq2SeqTrainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=test_dataset) trainer.train() trainer.save_model(args.model_dir) print('done training...') results = trainer.predict(test_dataset=test_dataset, max_length=args.max_output_length, num_beams=1) predictions = tokenizer.batch_decode(results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True) for prediction in predictions: print(prediction)
# print(text_tokenizer.convert_ids_to_tokens(inp))
# input_ids = torch.tensor(inp).unsqueeze(0)  # Batch size 1
# outp = code_tokenizer.encode('i += 2 ;')
# print(outp.tokens)
# decoder_input_ids = torch.tensor(outp.ids).unsqueeze(0)
# print(input_ids, input_ids.shape)
# print(decoder_input_ids, decoder_input_ids.shape)

training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    fp16=True,
    output_dir="./checkpoints/",
    logging_steps=4000,
    save_steps=1000,
    eval_steps=4000,
    warmup_steps=100,
    save_total_limit=5,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    # compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=eval_data,
)

trainer.train(resume_from_checkpoint='./checkpoints-new/checkpoint-3000')

"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1 } args = Seq2SeqTrainingArguments( output_dir="output", evaluation_strategy="steps", weight_decay=0.01, logging_dir='./logs/', logging_steps=100, learning_rate=5e-05, warmup_steps=200, eval_steps=500, per_device_train_batch_size=8, per_device_eval_batch_size=8, num_train_epochs=3, save_steps=3000, seed=0, load_best_model_at_end=True, predict_with_generate=True, ) trainer = Seq2SeqTrainer( model=model, tokenizer=tokenizer, args=args, train_dataset=train_dataset, eval_dataset=val_dataset, compute_metrics=compute_metrics,
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_model", default='t5-base', type=str, required=False,
                        help="Base model to fine tune.")
    parser.add_argument("--triples_path", default=None, type=str, required=True,
                        help="Triples.tsv path")
    parser.add_argument("--output_model_path", default=None, type=str, required=True,
                        help="Path for trained model and checkpoints.")
    parser.add_argument("--save_every_n_steps", default=0, type=int, required=False,
                        help="Save every N steps. (recommended 10000)")
    parser.add_argument("--logging_steps", default=100, type=int, required=False,
                        help="Logging steps parameter.")
    parser.add_argument("--per_device_train_batch_size", default=8, type=int, required=False,
                        help="Per device batch size parameter.")
    parser.add_argument("--gradient_accumulation_steps", default=16, type=int, required=False,
                        help="Gradient accumulation parameter.")
    parser.add_argument("--learning_rate", default=3e-4, type=float, required=False,
                        help="Learning rate parameter.")
    parser.add_argument("--epochs", default=10, type=int, required=False,
                        help="Number of epochs to train")

    device = torch.device('cuda')
    torch.manual_seed(123)
    args = parser.parse_args()

    model = AutoModelForSeq2SeqLM.from_pretrained(args.base_model)
    tokenizer = AutoTokenizer.from_pretrained('t5-base')

    train_samples = []
    with open(args.triples_path, 'r', encoding="utf-8") as fIn:
        for num, line in enumerate(fIn):
            if num > 6.4e5 * args.epochs:
                break
            query, positive, negative = line.split("\t")
            train_samples.append((query, positive, 'true'))
            train_samples.append((query, negative, 'false'))

    def smart_batching_collate_text_only(batch):
        texts = [example['text'] for example in batch]
        tokenized = tokenizer(texts, padding=True, truncation='longest_first',
                              return_tensors='pt', max_length=512)
        tokenized['labels'] = tokenizer(
            [example['labels'] for example in batch], return_tensors='pt')['input_ids']
        for name in tokenized:
            tokenized[name] = tokenized[name].to(device)
        return tokenized

    dataset_train = MonoT5Dataset(train_samples)

    if args.save_every_n_steps:
        steps = args.save_every_n_steps
        strategy = 'steps'
    else:
        steps = 1
        strategy = 'epoch'

    train_args = Seq2SeqTrainingArguments(
        output_dir=args.output_model_path,
        do_train=True,
        save_strategy=strategy,
        save_steps=steps,
        logging_steps=args.logging_steps,
        per_device_train_batch_size=args.per_device_train_batch_size,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        learning_rate=args.learning_rate,
        weight_decay=5e-5,
        num_train_epochs=1,
        warmup_steps=1000,
        adafactor=True,
        seed=1,
        disable_tqdm=False,
        load_best_model_at_end=False,
        predict_with_generate=True,
        dataloader_pin_memory=False,
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=train_args,
        train_dataset=dataset_train,
        tokenizer=tokenizer,
        data_collator=smart_batching_collate_text_only,
    )

    trainer.train()

    trainer.save_model(args.output_model_path)
    trainer.save_state()

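# `MonoT5Dataset` is not defined in this excerpt. The collator above expects each example to expose
# 'text' and 'labels' keys, so a minimal sketch could wrap the (query, document, label) triples in
# the usual monoT5 prompt; the exact prompt template used by the original code is an assumption here.
from torch.utils.data import Dataset as TorchDataset

class MonoT5Dataset(TorchDataset):
    def __init__(self, samples):
        self.samples = samples  # list of (query, document, 'true'/'false') tuples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        query, document, label = self.samples[idx]
        return {
            "text": f"Query: {query} Document: {document} Relevant:",
            "labels": label,
        }
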
def train(args):
    logger.info("Loading tokenizer...\n")
    global tokenizer
    global model_name
    model_name = args.model_name
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    logger.info("Loading pretrained model\n")
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    logger.info("Pretrained model loaded\n")

    logger.info("Fetching and tokenizing data for training")
    train_dataset = load_and_tokenize_dataset(
        args.train_data_dir,
        "train",
        args.text_column,
        args.target_column,
        args.max_source,
        args.max_target,
    )
    logger.info("Tokenized data for training loaded")

    eval_dataset = load_and_tokenize_dataset(
        args.train_data_dir,
        "validation",
        args.text_column,
        args.target_column,
        args.max_source,
        args.max_target,
    )
    test_dataset = load_and_tokenize_dataset(
        args.train_data_dir,
        "test",
        args.text_column,
        args.target_column,
        args.max_source,
        args.max_target,
    )

    logger.info("Defining training arguments\n")
    training_args = Seq2SeqTrainingArguments(
        output_dir=args.model_dir,
        num_train_epochs=args.epoch,
        per_device_train_batch_size=args.train_batch_size,
        per_device_eval_batch_size=args.eval_batch_size,
        learning_rate=args.lr,
        warmup_steps=args.warmup_steps,
        weight_decay=args.weight_decay,
        logging_dir=args.log_dir,
        logging_strategy=args.logging_strategy,
        load_best_model_at_end=True,
        adafactor=True,
        do_train=True,
        do_eval=True,
        do_predict=True,
        save_total_limit=3,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        predict_with_generate=True,
        metric_for_best_model="eval_loss",
        seed=7,
    )

    logger.info("Defining seq2seq Trainer")
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    logger.info("Starting Training")
    trainer.train()
    logger.info("Model trained successfully")
    trainer.save_model()
    logger.info("Model saved successfully")

    # Evaluation
    logger.info("*** Evaluate on test set ***")
    logger.info(trainer.predict(test_dataset))

    logger.info("Removing unused checkpoints to save space in container")
    os.system(f"rm -rf {args.model_dir}/checkpoint-*/")

model_inputs["labels"] = labels["input_ids"] return model_inputs tokenized_datasets = raw_datasets.map(preprocess_function, batched=True) # model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint) model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint) batch_size = 16 args = Seq2SeqTrainingArguments( "test-summarization", evaluation_strategy="epoch", learning_rate=2e-5, per_device_train_batch_size=batch_size, per_device_eval_batch_size=batch_size, weight_decay=0.01, save_total_limit=3, num_train_epochs=3, predict_with_generate=True, fp16=True, ) data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) def compute_metrics(eval_pred): predictions, labels = eval_pred decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True) # Replace -100 in the labels as we can't decode them. labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
# print(f'fuse 12: {(fuse_12_weight == fused_model.model.encoder.layers[-1].fuse_layer.weight.data.detach()).all()}')

# Freeze M2M layers before 12th encoder layer
modules = [fused_model.model.shared, *fused_model.model.encoder.layers[:11]]
for module in modules:
    for param in module.parameters():
        param.requires_grad = False

# Train
batch_size = args.batch_size
trainer_args = Seq2SeqTrainingArguments(
    args.checkpoint_path,
    evaluation_strategy="steps",
    learning_rate=args.learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=args.weight_decay,
    save_total_limit=3,
    num_train_epochs=args.num_train_epochs,
    predict_with_generate=True,
    fp16=True,
)
data_collator = DataCollatorForSeq2Seq(m2m_tokenizer, model=fused_model)
trainer = Seq2SeqTrainer(
    fused_model,
    trainer_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    tokenizer=m2m_tokenizer,
)

def test_finetune_bert2bert(self):
    bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained(
        "prajjwal1/bert-tiny", "prajjwal1/bert-tiny")
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    bert2bert.config.vocab_size = bert2bert.config.encoder.vocab_size
    bert2bert.config.eos_token_id = tokenizer.sep_token_id
    bert2bert.config.decoder_start_token_id = tokenizer.cls_token_id
    bert2bert.config.max_length = 128

    train_dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]")
    val_dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split="validation[:1%]")

    train_dataset = train_dataset.select(range(32))
    val_dataset = val_dataset.select(range(16))

    batch_size = 4

    def _map_to_encoder_decoder_inputs(batch):
        # Tokenizer will automatically set [BOS] <text> [EOS]
        inputs = tokenizer(batch["article"], padding="max_length", truncation=True, max_length=512)
        outputs = tokenizer(batch["highlights"], padding="max_length", truncation=True, max_length=128)
        batch["input_ids"] = inputs.input_ids
        batch["attention_mask"] = inputs.attention_mask
        batch["decoder_input_ids"] = outputs.input_ids
        batch["labels"] = outputs.input_ids.copy()
        batch["labels"] = [
            [-100 if token == tokenizer.pad_token_id else token for token in labels]
            for labels in batch["labels"]
        ]
        batch["decoder_attention_mask"] = outputs.attention_mask

        assert all([len(x) == 512 for x in inputs.input_ids])
        assert all([len(x) == 128 for x in outputs.input_ids])

        return batch

    def _compute_metrics(pred):
        labels_ids = pred.label_ids
        pred_ids = pred.predictions

        # all unnecessary tokens are removed
        pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

        accuracy = sum([int(pred_str[i] == label_str[i]) for i in range(len(pred_str))]) / len(pred_str)

        return {"accuracy": accuracy}

    # map train dataset
    train_dataset = train_dataset.map(
        _map_to_encoder_decoder_inputs,
        batched=True,
        batch_size=batch_size,
        remove_columns=["article", "highlights"],
    )
    train_dataset.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
    )

    # same for validation dataset
    val_dataset = val_dataset.map(
        _map_to_encoder_decoder_inputs,
        batched=True,
        batch_size=batch_size,
        remove_columns=["article", "highlights"],
    )
    val_dataset.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
    )

    output_dir = self.get_auto_remove_tmp_dir()

    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        predict_with_generate=True,
        evaluation_strategy="steps",
        do_train=True,
        do_eval=True,
        warmup_steps=0,
        eval_steps=2,
        logging_steps=2,
    )

    # instantiate trainer
    trainer = Seq2SeqTrainer(
        model=bert2bert,
        args=training_args,
        compute_metrics=_compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
    )

    # start training
    trainer.train()

                   for i, pred in enumerate(predictions)]
references = [{'id': str(i), 'reference': ref.strip().lower()} \
              for i, ref in enumerate(references)]'''

model = MT5ForConditionalGeneration.from_pretrained('mt5small')

'''device = torch.device("cpu")
model.to(device)
print(next(model.parameters()).device)'''

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    warmup_steps=WARMUP_STEPS,
    gradient_accumulation_steps=8,
    # weight_decay=WEIGHT_DECAY,
    logging_dir='./logs/',
    evaluation_strategy="epoch",
    logging_steps=LOGGING_STEPS,
    learning_rate=LEARNING_RATE,
    predict_with_generate=True,
)

# freeze the input and output embeddings so they are not updated during fine-tuning
model.get_output_embeddings().weight.requires_grad = False
model.get_input_embeddings().weight.requires_grad = False

trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset['train'],