def train(): """Trains a BERT ethicality classifer.""" args = transformers.TrainingArguments( "saved_models", evaluation_strategy="epoch", learning_rate=config['learning_rate'], per_device_train_batch_size=config['batch_size'], per_device_eval_batch_size=config['batch_size'], num_train_epochs=config['num_epochs'], weight_decay=config['weight_decay'], load_best_model_at_end=True, metric_for_best_model="f1") train, val, test = get_train_val_test_datasets() trainer = transformers.Trainer(model=get_model(), args=args, train_dataset=train, eval_dataset=val, compute_metrics=metrics) # Train the model. trainer.train() # Display model eval statistics. print(trainer.evaluate()) # Test dataset metrics. trainer.predict(test).metrics
def _get_train_args(self, nepochs: int, eval_every: int, batch_size: int,
                    save_every: int) -> transformers.TrainingArguments:
    training_arguments = transformers.TrainingArguments(
        output_dir=self.checkpoints_dir,
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        evaluation_strategy="steps",
        eval_steps=eval_every,
        save_steps=save_every,
        num_train_epochs=nepochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        save_total_limit=3
    )
    return training_arguments
metrics[f"{average}_f1"] = f1 metrics["accuracy"] = sklearn.metrics.accuracy_score(labels, predictions) return metrics _dir = pathlib.Path().resolve() / uuid.uuid4().hex _dir.mkdir() _dir = str(_dir) args = transformers.TrainingArguments( output_dir=f"{_dir}/output", num_train_epochs=32, per_device_train_batch_size=4, per_device_eval_batch_size=8, logging_dir=f"{_dir}/logging", logging_steps=256, dataloader_num_workers=64, evaluation_strategy="steps", eval_steps=256, save_steps=256, fp16=True, fp16_opt_level="O3", learning_rate=5e-4, run_name=_dir, ) model = transformers.AlbertForSequenceClassification.from_pretrained( "albert-large-v2", num_labels=2) tokenizer = transformers.AlbertTokenizerFast.from_pretrained("albert-large-v2") data_collator = transformers.DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=32) trainer = transformers.Trainer( args=args,
import argparse

import transformers

parser = argparse.ArgumentParser()
parser.add_argument('--vocab', type=str)
parser.add_argument('--model', type=str)
parser.add_argument('--data', type=str)
args = parser.parse_args()

tokenizer = transformers.BertTokenizer(vocab_file=args.vocab,
                                       do_lower_case=False,
                                       do_basic_tokenize=True)
model = transformers.BertForMaskedLM.from_pretrained(args.model)
dataset = transformers.LineByLineTextDataset(tokenizer=tokenizer,
                                             file_path=args.data,
                                             block_size=128)
data_collator = transformers.DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
train_args = transformers.TrainingArguments(
    per_device_eval_batch_size=16,
    output_dir=f"/tmp/echau18/{args.model}")
trainer = transformers.Trainer(model=model,
                               eval_dataset=dataset,
                               data_collator=data_collator,
                               prediction_loss_only=True,
                               args=train_args)
eval_output = trainer.evaluate()
print(eval_output)
def main():
    parser = ArgumentParser()
    parser.add_argument('--corpus_dir', required=True)
    parser.add_argument('--text_column')
    parser.add_argument('--model_name')
    parser.add_argument('--max_seq_length', type=int)
    parser.add_argument('--num_epochs', type=int)
    parser.add_argument('--learning_rate', type=float, default=1e-5)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--output_dir')
    args = parser.parse_args()

    torch.manual_seed(42)
    logging.basicConfig(level=logging.INFO)

    corpus_dir = args.corpus_dir
    text_column_name = args.text_column
    model_name = args.model_name
    max_seq_length = args.max_seq_length
    num_epochs = args.num_epochs
    learning_rate = args.learning_rate
    batch_size = args.batch_size
    output_dir = args.output_dir

    train_df_1 = pd.read_csv(os.path.join(
        corpus_dir, "train_{}.csv".format("sentiments_cloudvision")),
        encoding="utf-8")
    print("Train_1", train_df_1.shape)
    train_df_1.dropna(subset=[text_column_name], inplace=True)
    print("Train_1", train_df_1.shape)

    val_df_1 = pd.read_csv(os.path.join(
        corpus_dir, "val_{}.csv".format("sentiments_cloudvision")),
        encoding="utf-8")
    print("Val_1", val_df_1.shape)
    val_df_1.dropna(subset=[text_column_name], inplace=True)
    print("Val_1", val_df_1.shape)

    train_df_2 = pd.read_csv(os.path.join(
        corpus_dir, "train_{}.csv".format("topics_cloudvision")),
        encoding="utf-8")
    print("Train_2", train_df_2.shape)
    train_df_2.dropna(subset=[text_column_name], inplace=True)
    print("Train_2", train_df_2.shape)

    val_df_2 = pd.read_csv(os.path.join(
        corpus_dir, "val_{}.csv".format("topics_cloudvision")),
        encoding="utf-8")
    print("Val_2", val_df_2.shape)
    val_df_2.dropna(subset=[text_column_name], inplace=True)
    print("Val_2", val_df_2.shape)

    train_dfs = {"task_1": train_df_1, "task_2": train_df_2}
    val_dfs = {"task_1": val_df_1, "task_2": val_df_2}

    dataset_dict_1, id_to_class_1 = load_dataset(train_df_1, val_df_1,
                                                 text_column_name)
    dataset_dict_2, id_to_class_2 = load_dataset(train_df_2, val_df_2,
                                                 text_column_name)

    classes_list_1 = []
    for i in range(len(id_to_class_1.keys())):
        class_label = id_to_class_1[i]
        classes_list_1.append(class_label)

    classes_list_2 = []
    for i in range(len(id_to_class_2.keys())):
        class_label = id_to_class_2[i]
        classes_list_2.append(class_label)

    dataset_dict = {"task_1": dataset_dict_1, "task_2": dataset_dict_2}
    id_to_class_dicts = {"task_1": id_to_class_1, "task_2": id_to_class_2}
    id_to_class = {"task_1": classes_list_1, "task_2": classes_list_2}

    multitask_model = MultitaskModel.create(
        model_name=model_name,
        model_type_dict={
            "task_1": transformers.AutoModelForSequenceClassification,
            "task_2": transformers.AutoModelForSequenceClassification,
        },
        model_config_dict={
            "task_1": transformers.AutoConfig.from_pretrained(
                model_name,
                num_labels=len(id_to_class_dicts["task_1"].keys())),
            "task_2": transformers.AutoConfig.from_pretrained(
                model_name,
                num_labels=len(id_to_class_dicts["task_2"].keys())),
        },
    )

    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    feature_fn = convert_features_function(tokenizer, max_seq_length)
    convert_func_dict = {
        "task_1": feature_fn,
        "task_2": feature_fn,
    }
    columns_dict = {
        "task_1": ['input_ids', 'attention_mask', 'labels'],
        "task_2": ['input_ids', 'attention_mask', 'labels'],
    }
    features_dict = data_to_features(dataset_dict, convert_func_dict,
                                     columns_dict)

    train_dataset = {
        task_name: dataset["train"]
        for task_name, dataset in features_dict.items()
    }
    val_dataset_dict = {
        task_name: dataset["validation"]
        for task_name, dataset in features_dict.items()
    }

    trainer = MultitaskTrainer(
        model=multitask_model,
        args=transformers.TrainingArguments(
            output_dir="./models/multitask_model",
            overwrite_output_dir=True,
            learning_rate=learning_rate,
            do_train=True,
            num_train_epochs=num_epochs,
            # Adjust batch size if this doesn't fit on the Colab GPU
            per_device_train_batch_size=batch_size,
            save_steps=3000,
        ),
        # compute_metrics=classification_metrics,
        data_collator=NLPDataCollator(),
        train_dataset=train_dataset,
        eval_dataset=val_dataset_dict)

    trainer.train()

    validation_results = evaluate_classification(trainer, features_dict,
                                                 dataset_dict)
    for task_name, results_dict in validation_results.items():
        for metric_name, value in results_dict.items():
            print(f"Validation quality: After training, task: {task_name},"
                  f" {metric_name} : {value}")

    training_results = evaluate_classification(trainer, features_dict,
                                               dataset_dict,
                                               collection="train")
    for task_name, results_dict in training_results.items():
        for metric_name, value in results_dict.items():
            print(f"Training quality: After training, task: {task_name},"
                  f" {metric_name} : {value}")

    validation_predictions = get_predictions(trainer, features_dict,
                                             id_to_class,
                                             collection="validation")
    train_predictions = get_predictions(trainer, features_dict, id_to_class,
                                        collection="train")
    # print("Pred train", train_predictions.shape)
    # print("Pred val", validation_predictions.shape)

    # train_embeddings = get_last_layer_embedding(multitask_model, trainer,
    #                                             features_dict,
    #                                             collection="train")
    # validation_embeddings = get_last_layer_embedding(multitask_model, trainer,
    #                                                  features_dict,
    #                                                  collection="validation")
    train_embeddings = get_embeddings(
        multitask_model,
        features_dict,
        collection="train",
    )
    validation_embeddings = get_embeddings(
        multitask_model,
        features_dict,
        collection="validation",
    )
    # print("Embe train", train_embeddings.shape)
    # print("Embe val", validation_embeddings.shape)

    for task_name in ["task_1", "task_2"]:
        train_df = train_dfs[task_name]
        prediction_df = train_predictions[task_name]
        cls_emb_df = train_embeddings[task_name]["cls"]
        mean_emb_df = train_embeddings[task_name]["mean"]
        train_df = pd.concat(
            [train_df, prediction_df, cls_emb_df, mean_emb_df],
            axis=1,
        )
        output_path = os.path.join(output_dir, task_name, "train.csv")
        d = os.path.dirname(output_path)
        if not os.path.exists(d):
            os.makedirs(d)
        prediction_df.to_csv(os.path.join(output_dir, task_name,
                                          "tr_prediction.csv"),
                             encoding="utf-8", index=False)
        cls_emb_df.to_csv(os.path.join(output_dir, task_name,
                                       "tr_cls_emb.csv"),
                          encoding="utf-8", index=False)
        mean_emb_df.to_csv(os.path.join(output_dir, task_name,
                                        "tr_mean_emb.csv"),
                           encoding="utf-8", index=False)
        train_df.to_csv(output_path, encoding="utf-8", index=False)

        val_df = val_dfs[task_name]
        prediction_df = validation_predictions[task_name]
        cls_emb_df = validation_embeddings[task_name]["cls"]
        mean_emb_df = validation_embeddings[task_name]["mean"]
        val_df = pd.concat([val_df, prediction_df, cls_emb_df, mean_emb_df],
                           axis=1)
        output_path = os.path.join(output_dir, task_name, "val.csv")
        d = os.path.dirname(output_path)
        if not os.path.exists(d):
            os.makedirs(d)
        prediction_df.to_csv(os.path.join(output_dir, task_name,
                                          "val_prediction.csv"),
                             encoding="utf-8", index=False)
        cls_emb_df.to_csv(os.path.join(output_dir, task_name,
                                       "val_cls_emb.csv"),
                          encoding="utf-8", index=False)
        mean_emb_df.to_csv(os.path.join(output_dir, task_name,
                                        "val_mean_emb.csv"),
                           encoding="utf-8", index=False)
        val_df.to_csv(output_path, encoding="utf-8", index=False)
from utils import get_timestamp

logging.basicConfig(level=logging.ERROR)
torch.manual_seed(42)

trainer = MultitaskTrainer(
    model=multitask_model,
    args=transformers.TrainingArguments(
        output_dir="./models/multitask_model",
        overwrite_output_dir=True,
        learning_rate=2e-5,
        do_train=True,
        do_eval=True,
        # evaluation_strategy="steps",
        num_train_epochs=epochs,
        fp16=True,
        # Adjust batch size if this doesn't fit on the Colab GPU
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        save_steps=3000,
        # eval_steps=50,
        load_best_model_at_end=True,
    ),
    data_collator=NLPDataCollator(tokenizer=tokenizer),
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    callbacks=[]
)

# train the model
trainer.train()
    )
    return data_loader


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir')

    default_training_args = vars(
        transformers.TrainingArguments(
            output_dir="./models/rubert_cased_nplus1",
            overwrite_output_dir=True,
            do_train=True,
            do_eval=True,
            per_device_train_batch_size=32,
            per_device_eval_batch_size=128,
            num_train_epochs=5,
            learning_rate=2e-5,
            logging_steps=500,
            logging_first_step=True,
            save_steps=1000,
            evaluate_during_training=True,
        ))
    for k, v in default_training_args.items():
        parser.add_argument('--' + k, default=v, type=type(v))

    args = parser.parse_args()
    training_args_dict = {
        k: v
        for k, v in vars(args).items() if k in default_training_args
    }
    data_dir = args.data_dir
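    # How training_args_dict is turned back into a TrainingArguments instance
    # is not shown in the snippet above. A minimal sketch under that
    # assumption; private attributes such as "_n_gpu" that vars() can expose
    # are filtered out before re-constructing the arguments:
    training_args = transformers.TrainingArguments(
        **{k: v for k, v in training_args_dict.items()
           if not k.startswith("_")})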
from data import features_dict

## UPDATE THIS
multitask_model.load_state_dict(torch.load("src/models/{}/pytorch_model.bin"))

trainer = MultitaskTrainer(
    model=multitask_model,
    args=transformers.TrainingArguments(
        learning_rate=learning_rate,
        output_dir="/tmp",
        do_train=False,
        do_eval=True,
        # evaluation_strategy="steps",
        num_train_epochs=epochs,
        fp16=True,
        # Adjust batch size if this doesn't fit on the Colab GPU
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        save_steps=3000,
        # eval_steps=50,
        load_best_model_at_end=True,
    ),
    data_collator=NLPDataCollator(tokenizer=tokenizer),
    callbacks=[])

tests_dict = {}
for task_name in ["document", "paragraph", "sentence"]:
    test_dataloader = DataLoaderWithTaskname(
        task_name,
        trainer.get_eval_dataloader(features_dict[task_name]["test"]))
    pad_token_id=t_tokenizer.pad_token_id,
    bos_token_id=t_tokenizer.bos_token_id,
    eos_token_id=t_tokenizer.eos_token_id,
    sep_token_id=t_tokenizer.sep_token_id)

# Create the ALBERT language model
albert_model = AutoModelForMaskedLM.from_config(albert_config)
# albert_model = AlbertForMaskedLM.from_pretrained("/home/hedan/tools/Github/NLP_Based_Transformer/model/checkpoint-5000")
# albert_model.resize_token_embeddings(len(t_tokenizer))

# Configure the training arguments
train_args = transformers.TrainingArguments(output_dir="./model",
                                            do_train=True,
                                            logging_steps=50,
                                            learning_rate=0.001,
                                            num_train_epochs=30,
                                            save_steps=1000,
                                            per_device_train_batch_size=32,
                                            lr_scheduler_type="polynomial",
                                            dataloader_num_workers=4)
# t = t_DataCollator(h["input_ids"])
# x = albert_model(torch.tensor(h["input_ids"]))

# Train
trainer = Trainer(model=albert_model,
                  args=train_args,
                  train_dataset=t_dataset,
                  tokenizer=t_tokenizer,
                  data_collator=t_DataCollator)
trainer.train()
    n_layer=3,
    n_head=3
)
model = transformers.GPT2LMHeadModel(config=config)

print("Training Model...")
writer = SummaryWriter()
training_args = transformers.TrainingArguments(
    output_dir="models/gpt2/",
    do_train=True,
    do_eval=True,
    evaluate_during_training=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    logging_first_step=True,
    save_steps=2000,
    save_total_limit=2,
)
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_set,
    eval_dataset=valid_set,
    prediction_loss_only=True,
    tb_writer=writer
)
import torch
import transformers

from cfg import config
from data import get_train_val_test_datasets
from models import get_model
from utils import metrics

args = transformers.TrainingArguments(
    "saved_models",
    evaluation_strategy="epoch",
    learning_rate=config['learning_rate'],
    per_device_train_batch_size=config['batch_size'],
    per_device_eval_batch_size=config['batch_size'],
    num_train_epochs=config['num_epochs'],
    weight_decay=config['weight_decay'],
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)

train, val, test = get_train_val_test_datasets()
trainer = transformers.Trainer(model=get_model(),
                               args=args,
                               train_dataset=train,
                               eval_dataset=val,
                               compute_metrics=metrics)

# Train the model.
trainer.train()

# Display model eval statistics.
print(trainer.evaluate())

# Test dataset metrics.
print(trainer.predict(test).metrics)
def main():
    args = get_args()

    dataset_dict = {
        "stsb": nlp.load_dataset('glue', name="stsb"),
        "rte": nlp.load_dataset('glue', name="rte"),
        "commonsense_qa": nlp.load_dataset('commonsense_qa'),
    }
    for task_name, dataset in dataset_dict.items():
        print(task_name)
        print(dataset_dict[task_name]["train"][0])
        print()

    multitask_model = MultitaskModel.create(
        model_name=model_name,
        model_type_dict={
            "stsb": transformers.AutoModelForSequenceClassification,
            "rte": transformers.AutoModelForSequenceClassification,
            "commonsense_qa": transformers.AutoModelForMultipleChoice,
        },
        model_config_dict={
            "stsb": transformers.AutoConfig.from_pretrained(model_name,
                                                            num_labels=1),
            "rte": transformers.AutoConfig.from_pretrained(model_name,
                                                           num_labels=2),
            "commonsense_qa": transformers.AutoConfig.from_pretrained(
                model_name),
        })

    if model_name.startswith("roberta-"):
        print(multitask_model.encoder.embeddings.word_embeddings
              .weight.data_ptr())
        print(multitask_model.taskmodels_dict["stsb"].roberta.embeddings
              .word_embeddings.weight.data_ptr())
        print(multitask_model.taskmodels_dict["rte"].roberta.embeddings
              .word_embeddings.weight.data_ptr())
        print(multitask_model.taskmodels_dict["commonsense_qa"].roberta
              .embeddings.word_embeddings.weight.data_ptr())

    convert_func_dict = {
        "stsb": convert_to_stsb_features,
        "rte": convert_to_rte_features,
        "commonsense_qa": convert_to_commonsense_qa_features,
    }
    columns_dict = {
        "stsb": ['input_ids', 'attention_mask', 'labels'],
        "rte": ['input_ids', 'attention_mask', 'labels'],
        "commonsense_qa": ['input_ids', 'attention_mask', 'labels'],
    }

    features_dict = {}
    for task_name, dataset in dataset_dict.items():
        features_dict[task_name] = {}
        for phase, phase_dataset in dataset.items():
            features_dict[task_name][phase] = phase_dataset.map(
                convert_func_dict[task_name],
                batched=True,
                load_from_cache_file=False,
            )
            print(task_name, phase, len(phase_dataset),
                  len(features_dict[task_name][phase]))
            features_dict[task_name][phase].set_format(
                type="torch",
                columns=columns_dict[task_name],
            )
            print(task_name, phase, len(phase_dataset),
                  len(features_dict[task_name][phase]))

    train_dataset = {
        task_name: dataset["train"]
        for task_name, dataset in features_dict.items()
    }
    trainer = MultitaskTrainer(
        model=multitask_model,
        args=transformers.TrainingArguments(
            output_dir=args.job_dir,
            overwrite_output_dir=True,
            learning_rate=1e-5,
            do_train=True,
            num_train_epochs=3,
            per_device_train_batch_size=args.batch_size,
            save_steps=3000,
        ),
        data_collator=NLPDataCollator(),
        train_dataset=train_dataset,
    )
    trainer.train()

    preds_dict = {}
    for task_name in ["rte", "stsb", "commonsense_qa"]:
        eval_dataloader = DataLoaderWithTaskname(
            task_name,
            trainer.get_eval_dataloader(
                eval_dataset=features_dict[task_name]["validation"]))
        print(eval_dataloader.data_loader.collate_fn)
        preds_dict[task_name] = trainer._prediction_loop(
            eval_dataloader,
            description=f"Validation: {task_name}",
        )

    # Evaluate RTE
    nlp.load_metric('glue', name="rte").compute(
        np.argmax(preds_dict["rte"].predictions, axis=1),
        preds_dict["rte"].label_ids,
    )
    # Evaluate STS-B
    nlp.load_metric('glue', name="stsb").compute(
        preds_dict["stsb"].predictions.flatten(),
        preds_dict["stsb"].label_ids,
    )
    # Evaluate Commonsense QA
    np.mean(
        np.argmax(preds_dict["commonsense_qa"].predictions, axis=1) ==
        preds_dict["commonsense_qa"].label_ids)
transformers.logging.set_verbosity_debug()

if __name__ == '__main__':
    # Load data
    dlnd_train_dset, dlnd_valid_dset, dlnd_test_dset = \
        DlndData().return_datasets()

    # Load model
    model = create_model()

    # Training
    training_args = transformers.TrainingArguments(
        evaluation_strategy='epoch',
        load_best_model_at_end=True,
        logging_dir='training_logs',
        logging_first_step=True,
        logging_steps=10,
        num_train_epochs=10,
        output_dir='training_results',
        per_device_eval_batch_size=BATCH_SIZE,
        per_device_train_batch_size=BATCH_SIZE,
        weight_decay=0.01,
        metric_for_best_model='accuracy',
        disable_tqdm=True,
    )
    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=dlnd_train_dset,
        eval_dataset=dlnd_valid_dset,
        callbacks=[LogCallback],
    )
    trainer.train()
def run_training(args, train_data):
    ## Checkpoint Loading ########################################################
    if args.load:
        if '2700' in args.load:
            model = transformers.GPTNeoForCausalLM.from_pretrained(args.load)
        else:
            model = transformers.GPT2LMHeadModel.from_pretrained(args.load)
        print(f"Loaded model from {args.load}")
    else:
        if "EleutherAI" in args.arch:
            model = transformers.GPTNeoForCausalLM.from_pretrained(args.arch)
        else:
            model = transformers.GPT2LMHeadModel.from_pretrained(args.arch)

    if args.resume:
        raise NotImplementedError
        model = transformers.GPT2LMHeadModel.from_pretrained(args.resume)
        print(f"Loaded model from {args.resume}")
        start_epoch = 0
        start_iteration = int(args.resume.split("-")[-1])
        print("start_iteration = ", start_iteration)
    else:
        start_iteration = 0

    ## Dataloading ########################################################
    train_data.start_iteration = start_iteration

    ## Start Loop ########################################################
    print(f"Starting main loop")

    training_args = transformers.TrainingArguments(
        output_dir=args.save_dir,
        overwrite_output_dir=False,
        do_train=True,
        do_eval=False,
        do_predict=True,
        evaluation_strategy='no',
        eval_steps=0,
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size_per_replica,
        gradient_accumulation_steps=args.grad_acc_steps,
        learning_rate=args.lr,
        weight_decay=0.05,
        # warmup_steps=args.lr_warmup_steps,
        # max_grad_norm=100000.0,
        logging_dir=args.save_dir,
        logging_first_step=True,
        logging_steps=args.log_freq,
        save_steps=args.save_freq,
        save_total_limit=2,
        dataloader_drop_last=True,
        dataloader_num_workers=3,
        local_rank=args.local_rank,
        deepspeed=args.deepspeed,
        fp16=args.fp16,
    )

    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
    )
    trainer.remove_callback(transformers.integrations.TensorBoardCallback)
    trainer.add_callback(CustomTensorBoardCallback())

    trainer.train()

    if args.local_rank == 0:
        model.save_pretrained(os.path.join(args.save_dir, "final_checkpoint"))
def train(args):
    logging.basicConfig(level=logging.INFO)
    tokenizer = transformers.AlbertTokenizer.from_pretrained(
        'albert-base-v2', cache_dir=cache_dir)
    albert_for_math_config = transformers.AlbertConfig(
        hidden_size=768,
        num_attention_heads=12,
        intermediate_size=3072,
    )

    if args['--load']:
        model = transformers.AlbertForMaskedLM.from_pretrained(
            args['--load-from'])
        training_args = transformers.TrainingArguments(
            output_dir=args['--save-to'],
            overwrite_output_dir=True,
            num_train_epochs=int(args['--max-epoch']),
            per_gpu_train_batch_size=int(args['--batch-size']),
            per_gpu_eval_batch_size=int(args['--batch-size']),
            logging_steps=int(args['--log-every']),
            save_steps=int(args['--save-every']),
            save_total_limit=10,
            learning_rate=float(args['--lr']),
            seed=int(args['--seed']),
        )
    else:
        model = transformers.AlbertForMaskedLM(albert_for_math_config)
        training_args = transformers.TrainingArguments(
            output_dir=args['--save-to'],
            num_train_epochs=int(args['--max-epoch']),
            per_gpu_train_batch_size=int(args['--batch-size']),
            per_gpu_eval_batch_size=int(args['--batch-size']),
            logging_steps=int(args['--log-every']),
            save_steps=int(args['--save-every']),
            save_total_limit=10,
            learning_rate=float(args['--lr']),
            seed=int(args['--seed']),
        )

    # Load datasets
    print('Loading Data...')
    train_data = torch.load(
        './data/train_data_train-easy_algebra__linear_1d.pt')
    dev_data = torch.load('./data/dev_data_train-easy_algebra__linear_1d.pt')
    print('Finished loading data')

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    model.to(device)

    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        data_collator=AnswerMaskDataCollator(tokenizer),
        train_dataset=train_data,
        eval_dataset=dev_data,
        prediction_loss_only=True,
    )

    if args['--load']:
        trainer.train(model_path=args['--load-from'])
    else:
        trainer.train()
pd.read_json(f"{args.data_dir}/val.jsonl", lines=True, orient="records"), test_size=0.5, ) tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base") train_data = boolq.BoolQDataset(train_df, tokenizer) val_data = boolq.BoolQDataset(val_df, tokenizer) test_data = boolq.BoolQDataset(test_df, tokenizer) ## TODO: Initialize a transformers.TrainingArguments object here for use in ## training and tuning the model. Consult the assignment handout for some ## sample hyperparameter values. training_arg = transformers.TrainingArguments(num_train_epochs=8, learning_rate=5e-5, output_dir='scratch/adv312/', evaluation_strategy="epoch", per_device_train_batch_size=8) ## TODO: Initialize a transformers.Trainer object and run a Bayesian ## hyperparameter search for at least 5 trials (but not too many) on the ## learning rate. Hint: use the model_init() and ## compute_metrics() methods from finetuning_utils.py as arguments to ## Trainer(). trainer = transformers.Trainer( model_init=finetuning_utils.model_init, args=training_arg, compute_metrics=finetuning_utils.compute_metrics, train_dataset=train_data)
def run_training(args, train_data):
    if not args.save_steps:
        # Save every epoch
        if not args.tpu_num_cores:
            save_steps = len(train_data)
            save_steps = int(save_steps / torch.cuda.device_count())
            save_steps = int(save_steps / args.grad_acc_steps)
            save_steps = int(save_steps / args.batch_size_per_replica)
        else:
            save_steps = len(train_data)
            save_steps = int(save_steps / 8)  # 8 TPU cores is constant for now.
            save_steps = int(save_steps / args.grad_acc_steps)
            save_steps = int(save_steps / args.batch_size_per_replica)
    else:
        save_steps = args.save_steps

    print("Save Steps = ", save_steps)

    ## Checkpoint Loading ########################################################
    if args.load:
        model = transformers.GPT2LMHeadModel.from_pretrained(args.load)
        print(f"Loaded model from {args.load}")
    else:
        model = transformers.GPT2LMHeadModel.from_pretrained(args.arch)

    start_epoch = 0
    start_iteration = 0

    ## Dataloading ########################################################
    train_data.start_iteration = start_iteration

    ## Start Loop ########################################################
    print(f"Setting up Trainer")

    training_args = transformers.TrainingArguments(
        output_dir=args.save_dir,
        overwrite_output_dir=False,
        do_train=True,
        do_eval=False,
        do_predict=True,
        evaluation_strategy='no',
        eval_steps=0,
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size_per_replica,
        gradient_accumulation_steps=args.grad_acc_steps,
        learning_rate=args.lr,
        weight_decay=args.weight_decay,
        warmup_steps=args.lr_warmup_steps,
        max_grad_norm=100000.0,  # Essentially disable gradient clipping
        logging_dir=args.save_dir,
        logging_first_step=True,
        logging_steps=args.log_freq,
        save_steps=save_steps,
        save_total_limit=10,  # Only save the last epoch
        dataloader_drop_last=True,
        dataloader_num_workers=args.dataloader_num_workers,
        local_rank=args.local_rank,
        tpu_num_cores=args.tpu_num_cores,
    )

    trainer = GPT2Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
    )
    trainer.remove_callback(transformers.integrations.TensorBoardCallback)
    trainer.add_callback(CustomTensorBoardCallback())

    print(f"STARTING TRAINING. save_steps={save_steps}")
    trainer.train()

    trainer.save_model(os.path.join(args.save_dir, "final_checkpoint"))
    print("Finished")
def train_bert(corpus_path, hebrew_model=False):
    """
    Bert model training
    :param corpus_path: Corpus to train Bert on
    :param hebrew_model: Model in Hebrew or not
    :return: The name of the new trained model
    """
    language = 'hebrew' if hebrew_model else 'english'
    df = pd.read_csv(corpus_path)
    corpus_name = get_corpus_name(corpus_path)

    print("Preprocess...")
    if hebrew_model:
        model_name, vocab, raw_text_file = preprocess_hebrew(df, corpus_name)
    else:
        model_name, vocab, raw_text_file = preprocess_english(df, corpus_name)

    print("Cuda availability :", torch.cuda.is_available())

    print("Getting tokenizer...")
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        conf.bert_model[language], use_fast=True)
    model = transformers.AutoModelForMaskedLM.from_pretrained(
        conf.bert_model[language]).to('cuda')
    tokenizer.add_tokens(vocab)
    model.resize_token_embeddings(len(tokenizer))

    if os.path.exists(conf.MODELS_PATH + conf.TYPE_MODEL_PATH['bert'] + model_name):
        shutil.rmtree(conf.MODELS_PATH + conf.TYPE_MODEL_PATH['bert'] + model_name)
    os.mkdir(conf.MODELS_PATH + conf.TYPE_MODEL_PATH['bert'] + model_name)
    tokenizer.save_pretrained(conf.MODELS_PATH + conf.TYPE_MODEL_PATH['bert'] + model_name)

    print("Tokenizing...")
    dataset = transformers.LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=raw_text_file,
        block_size=128,
    )
    data_collator = transformers.DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )
    training_args = transformers.TrainingArguments(
        output_dir=conf.MODELS_PATH + conf.TYPE_MODEL_PATH['bert'] + model_name,
        overwrite_output_dir=True,
        num_train_epochs=20,
        per_device_train_batch_size=16,
        save_steps=300,
        logging_steps=100,
        save_total_limit=3,
    )
    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset
    )

    print("Begin training...")
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    trainer.train()
    trainer.save_model(conf.MODELS_PATH + conf.TYPE_MODEL_PATH['bert'] + model_name)
    print('The model has been saved under : ',
          conf.MODELS_PATH + conf.TYPE_MODEL_PATH['bert'] + model_name)

    return conf.MODELS_PATH + conf.TYPE_MODEL_PATH['bert'] + model_name
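# A possible way to sanity-check the masked-LM saved by train_bert above,
# using the fill-mask pipeline. The corpus path and the masked sentence are
# illustrative assumptions; the saved_path mirrors the function's return value:
from transformers import pipeline

saved_path = train_bert("corpus.csv", hebrew_model=False)  # hypothetical corpus
fill_mask = pipeline("fill-mask", model=saved_path, tokenizer=saved_path)
print(fill_mask("The weather today is [MASK]."))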
def testModel(self,
              train_val_split_iterator: typing.Iterator = [
                  sklearn.model_selection.train_test_split
              ],
              **kwargs):
    logger.info("Starting testing of RobertaModel")
    num_epochs = kwargs['epochs']
    batch_size = kwargs['batch_size']

    for i, train_test_split in enumerate(train_val_split_iterator):
        logger.debug(
            f'{i}-th enumeration of train_val split iterator under cross validation'
        )
        self.model = self.createModel()
        # optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
        # loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

        if callable(getattr(self.model, 'compile', None)):  # if tf model
            train_dataset, val_dataset = self.pipeLine.getEncodedDataset(
                train_test_split, batch_size=batch_size)
            # self.model.compile(optimizer=optimizer, loss=loss, metrics=self._registeredMetrics)
            # self.model.fit(train_dataset, epochs=num_epochs)
            training_args = transformers.TFTrainingArguments(
                output_dir=f'./results/{self._modelName}',  # output directory
                num_train_epochs=num_epochs,  # total number of training epochs
                per_device_train_batch_size=batch_size,  # batch size per device during training
                per_device_eval_batch_size=batch_size,  # batch size for evaluation
                warmup_steps=kwargs['warmup_steps'],  # number of warmup steps for learning rate scheduler
                weight_decay=kwargs['weight_decay'],  # strength of weight decay
                logging_dir='./logs',  # directory for storing logs
            )
            trainer = transformers.TFTrainer(
                model=self.model,  # the instantiated 🤗 Transformers model to be trained
                args=training_args,  # training arguments, defined above
                train_dataset=train_dataset,  # tensorflow_datasets training dataset
                eval_dataset=val_dataset,  # tensorflow_datasets evaluation dataset
                compute_metrics=get_compute_metrics(
                    self._registeredMetrics)  # metrics to compute while training
            )
        else:  # if pytorch model
            train_dataset, val_dataset = self.pipeLine.getEncodedDataset(
                train_test_split,
                batch_size=batch_size,
                tfOrPyTorch=torchOrTFEnum.TORCH)
            training_args = transformers.TrainingArguments(
                output_dir=f'./results/{self._modelName}',  # output directory
                num_train_epochs=num_epochs,  # total number of training epochs
                per_device_train_batch_size=batch_size,  # batch size per device during training
                per_device_eval_batch_size=batch_size,  # batch size for evaluation
                warmup_steps=kwargs['warmup_steps'],  # number of warmup steps for learning rate scheduler
                weight_decay=kwargs['weight_decay'],  # strength of weight decay
                logging_dir='./logs',  # directory for storing logs
                logging_steps=10,
            )
            trainer = transformers.Trainer(
                model=self.model,  # the instantiated 🤗 Transformers model to be trained
                args=training_args,  # training arguments, defined above
                train_dataset=train_dataset,  # training dataset
                eval_dataset=val_dataset,  # evaluation dataset
                compute_metrics=get_compute_metrics(
                    self._registeredMetrics)  # metrics to compute while training
            )

        trainer.train()
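# A hypothetical call to testModel above. The RobertaModel wrapper class and
# its no-argument constructor are assumptions inferred from the log message;
# the keyword arguments mirror the keys the method reads from kwargs:
model_wrapper = RobertaModel()  # assumed wrapper exposing testModel
model_wrapper.testModel(epochs=3, batch_size=16, warmup_steps=500,
                        weight_decay=0.01)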