def main(path, valid_in_cat_path, valid_out_of_cat_path):
    """Fine-tune RoBERTa on one data fold, evaluate it and label the Steam data.

    Parameters
    ----------
    path :
        Forwarded to ``load_fold_data`` together with the fold number.
    valid_in_cat_path, valid_out_of_cat_path :
        Not used by this function; kept so existing callers keep working.

    Side effects
    ------------
    Writes the Steam frame, with an added ``prediction`` column, to
    ``../../reports/steam-prediction.csv``.
    """
    steam_df = load_steam_data()
    fold = 1  # only this single fold is trained
    print("starting training, using fold " + str(fold))
    train, test = load_fold_data(path, fold)

    # Train the model using roberta model
    args_dict = {
        'output_dir': '../../models/roberta-base-bs8-e6-fold' + str(fold),
        'use_cached_eval_features': False,
        'reprocess_input_data': True,
        'train_batch_size': 8,
        'num_train_epochs': 6,
        'fp16': False,
        'overwrite_output_dir': True,
    }
    model = ClassificationModel('roberta', 'roberta-base', num_labels=2,
                                args=args_dict)
    model.train_model(train)
    print("done training model fold " + str(fold))

    result, model_outputs, wrong_predictions = model.eval_model(
        test, acc=accuracy_score, f1=f1_score)
    acc = result['acc']
    f1 = result['f1']
    print(f"acc: {acc} , f1: {f1}")

    # Make predictions with the model
    save_path = '../../reports/steam-prediction.csv'
    print("predicting...")
    predictions, raw_outputs = model.predict(steam_df["sentence"].tolist())
    steam_df['prediction'] = predictions
    steam_df.to_csv(save_path, index=False)
    # Announce the save only after the file has actually been written.
    print(f"predicting finished - saved to {save_path}")
def fake_classify(train_set, eval_set, test_set, seed):
    """Train multilingual BERT on ``train_set`` and score it on ``test_set``.

    ``eval_set`` is accepted but not used here. ``seed`` is passed to the
    model as ``manual_seed``.

    Returns the ``(result, model_outputs, wrong_predictions)`` triple obtained
    from ``eval_model``.
    """
    model_args = {
        'max_seq_length': 512,
        'num_train_epochs': 3,
        'overwrite_output_dir': True,
        'manual_seed': seed,
    }
    classifier = ClassificationModel('bert',
                                     'bert-base-multilingual-uncased',
                                     args=model_args,
                                     use_cuda=True)
    print(classifier.args)

    # Fit on the training split.
    classifier.train_model(train_set)

    # Score on the test split with F1 and accuracy as extra metrics.
    scores, outputs, misclassified = classifier.eval_model(
        test_set,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score,
    )
    return scores, outputs, misclassified
def train(self, train_data: object, eval_data: object) -> object:
    """
    Build a ClassificationModel from this object's settings and fit it.

    Parameters
    ----------
    train_data : object
        Training split; forwarded to ``train_model`` as ``train_df``.
    eval_data : object
        Validation split; forwarded to ``train_model`` as ``eval_df``.

    Returns
    -------
    object
        The trained ClassificationModel.
    """
    # NOTE(review): ``self.model_name`` is passed first and ``self.model_type``
    # second, and ``num_labels`` is ``len(self.labels) - 1`` — confirm both
    # match what ClassificationModel expects.
    classifier = ClassificationModel(self.model_name,
                                     self.model_type,
                                     args=self.model_args,
                                     use_cuda=self.cuda,
                                     num_labels=len(self.labels) - 1)

    # Accuracy is registered as an additional metric named "accuracy".
    classifier.train_model(train_df=train_data,
                           eval_df=eval_data,
                           accuracy=accuracy_score)
    return classifier
def train(train_df, max_sub_len, output_dir):
    """Fine-tune a 4-label DistilBERT classifier on ``train_df``.

    The run directory is ``output_dir`` immediately followed by the current
    timestamp (first 19 characters), the model type, ``max_sub_len`` and the
    learning rate, joined by underscores. ``max_sub_len`` is also used as
    ``max_seq_length``.

    Returns the trained ClassificationModel.
    """
    architecture = 'distilbert'
    learning_rate = 2e-5
    stamp = str(datetime.datetime.now())[:19]
    run_dir = output_dir + '_'.join(
        [stamp, architecture, str(max_sub_len), str(learning_rate)])
    print(f"model is saved at: {run_dir}")

    training_config = dict(
        output_dir=run_dir,
        reprocess_input_data=True,
        overwrite_output_dir=True,
        num_train_epochs=2,
        train_batch_size=32,
        eval_batch_size=32,
        learning_rate=learning_rate,
        max_seq_length=max_sub_len,
    )

    # INFO for this script, but only warnings from the transformers logger.
    logging.basicConfig(level=logging.INFO)
    logging.getLogger("transformers").setLevel(logging.WARNING)

    classifier = ClassificationModel(architecture,
                                     'distilbert-base-cased',
                                     num_labels=4,
                                     args=training_config)
    torch.cuda.empty_cache()
    classifier.train_model(train_df)
    return classifier
def train_model(train_df, num_labels):
    """Train the model described by the module-level ``MODELNAME``.

    ``MODELNAME`` is split on ``";"`` into model type and model name. The
    output directory is ``models/<TAG>-<type>-<name>`` with ``/`` in the name
    replaced by ``-``. If ``OVERWRITE`` is ``False`` and that directory
    already exists, the process exits with status 0.

    Returns ``(model, run)``: the trained ClassificationModel and the wandb
    run created for it.
    """
    model_type, model_name = MODELNAME.split(";")
    safe_name = model_name.replace("/", "-")
    model_output = f'models/{TAG}-{model_type}-{safe_name}'

    if OVERWRITE is False and os.path.exists(model_output):
        logging.info(f"Skipping training of {model_name}")
        sys.exit(0)

    logging.info(f"Starting training of {model_name}")

    project = model_output.split("/")[-1]
    run = wandb.init(project=project, reinit=True)

    # "large" checkpoints get a smaller batch plus gradient accumulation.
    is_large = "large" in model_name
    batch_size = 8 if is_large else 32

    model_args = {
        'output_dir': model_output,
        'overwrite_output_dir': OVERWRITE,
        'best_model_dir': f'{model_output}/best',
        'evaluate_during_training': False,
        'manual_seed': 42,
        'num_train_epochs': 4,
        'train_batch_size': batch_size,
        'eval_batch_size': batch_size,
        # Gradients and weights are updated once every N steps.
        'gradient_accumulation_steps': 2 if is_large else 1,
        'max_seq_length': 256,
        'sliding_window': False,
        'wandb_project': project,
        "silent": False,
        "fp16": False,
        "n_gpu": 1,
    }
    model = ClassificationModel(model_type,
                                model_name,
                                num_labels=num_labels,
                                args=model_args)

    model.train_model(train_df)
    return model, run
class TransformerModel:
    """
    This class provides the Machine Learning model and classifies tenders
    based on previous training data.

    The model is loaded lazily from ``./outputs/`` the first time it is
    needed; if loading fails a fresh ``bert-base-german-cased`` model is
    created instead.
    """

    def __init__(self):
        self.model = None

    def load_model(self):
        """Populate ``self.model`` unless a model is already set."""
        if self.model:
            return
        from simpletransformers.classification import ClassificationModel
        try:
            self.model = ClassificationModel('bert',
                                             './outputs/',
                                             use_cuda=False,
                                             args=args)
        except Exception as ex:
            # Deliberate best-effort: fall back to an untrained base model.
            logger.error(
                f"could not load model from /outputs due to {str(ex)}, creating new model"
            )
            self.create_new_model()

    def __convert_to_input(self, tenders):
        """Return the "DE" title of every tender, in input order."""
        return [tender.get_title("DE") for tender in tenders]

    def classify(self, tenders):
        """Return the tenders whose predicted label equals 1."""
        self.load_model()
        titles = self.__convert_to_input(tenders)
        predictions, raw_output = self.model.predict(titles)
        return [t for t, p in zip(tenders, predictions) if p == 1]

    def train(self, labelled_tenders):
        """Train on ``(tender, label)`` pairs and log a confusion matrix.

        10% of the pairs are held out (fixed ``random_state=42``); the model
        is trained on the rest and the hold-out counts are logged.
        """
        self.load_model()

        tenders = self.__convert_to_input([i for i, j in labelled_tenders])
        labels = [j for i, j in labelled_tenders]

        tenders_train, tenders_test, labels_train, labels_test = train_test_split(
            tenders, labels, test_size=0.1, random_state=42)

        data_input = pd.DataFrame(zip(tenders_train, labels_train))
        self.model.train_model(data_input)

        labels_pred, raw_output = self.model.predict(tenders_test)

        # Pin both classes so the matrix is always 2x2; without ``labels`` a
        # hold-out split containing a single class yields a 1x1 matrix and
        # the four-way unpack below raises ValueError.
        tn, fp, fn, tp = confusion_matrix(labels_test,
                                          labels_pred,
                                          labels=[0, 1]).ravel()
        logger.info(f"tn: {tn} fp: {fp}")
        logger.info(f"fn: {fn} tp:{tp}")

    def create_new_model(self):
        """Replace ``self.model`` with a fresh ``bert-base-german-cased`` model."""
        from simpletransformers.classification import ClassificationModel
        self.model = ClassificationModel('bert',
                                         'bert-base-german-cased',
                                         use_cuda=False,
                                         args=args)
def transformer(train_df, eval_df, datafile):
    """Train Dutch BERT on ``train_df`` and print evaluation reports.

    The model is evaluated on ``eval_df``; its raw outputs are turned into
    0/1 predictions and compared with ``eval_df["labels"]``. ``datafile`` is
    not used by this function.
    """
    classifier = ClassificationModel("bert",
                                     "bert-base-dutch-cased",
                                     use_cuda=False,
                                     num_labels=2)
    classifier.train_model(train_df)

    result, model_outputs, wrong_predictions = classifier.eval_model(eval_df)
    print(model_outputs)

    # Class 0 only when its score is strictly greater; ties resolve to 1.
    predlist = [
        0 if scores[0] > scores[1] else 1
        for scores in model_outputs.tolist()
    ]

    labels = eval_df["labels"].tolist()
    print(labels)
    print(predlist)
    for report in (classification_report, confusion_matrix, accuracy_score):
        print(report(labels, predlist))
def run_trainers(bucket_dir, train_args=None):
    """Train a RoBERTa model for the first unfinished entry of ``bucket_dir``.

    Finished runs are tracked, one name per line, in ``completed_irl.txt``.
    A run's name is the directory entry with its first five characters
    removed. For the first entry whose name is not yet recorded, the model is
    trained on ``<bucket_dir>/<entry>/data_all.tsv``, the name is appended to
    the tracking file and the process exits. When nothing is left to train,
    a timestamp is written to ``done.runs``.

    Parameters
    ----------
    bucket_dir :
        Directory whose entries each hold a ``data_all.tsv`` file.
    train_args : dict, optional
        Base model arguments. A copy is extended with ``output_dir``,
        ``cache_dir`` and ``wandb_kwargs``; the caller's dict is left
        untouched. ``None`` means "no extra arguments".
    """
    os.makedirs('irl_models', exist_ok=True)

    # Append mode creates the tracking file when it is missing and leaves an
    # existing one untouched, so a single read path serves both cases.
    with open('completed_irl.txt', 'a'):
        pass
    with open("completed_irl.txt", 'r') as f:
        done = [d.replace('\n', '') for d in f.readlines()]

    # The original subscripted ``train_args`` directly, which fails for the
    # default ``None`` and mutates the caller's dict.
    base_args = dict(train_args) if train_args is not None else {}

    for train_file in os.listdir(bucket_dir):
        run_name = train_file[5:]
        print(run_name)
        print(done)
        if run_name in done:
            continue

        train_df = pd.read_csv(bucket_dir + '/' + train_file + '/data_all.tsv',
                               sep='\t')
        run_args = dict(base_args)
        run_args['output_dir'] = f'irl_models/{run_name}/'
        run_args['cache_dir'] = f'cache_{run_name}/'
        run_args['wandb_kwargs'] = {'name': run_name}

        model = ClassificationModel('roberta', 'roberta-base', args=run_args)
        print(train_df.head())
        model.train_model(train_df)

        with open("completed_irl.txt", 'a') as f:
            f.write(f"{run_name}\n")
        # One model per process: stop after the first completed run
        # (presumably an outer loop restarts the script — confirm).
        exit()

    with open("done.runs", 'w') as f:
        f.write(f"Done at {datetime.datetime.now()}")
def objective(args):
    """Hyper-parameter search objective: train ALBERT and return ``1 - f1``.

    Parameters
    ----------
    args : sequence
        Raw values; the first three are cast to ``int`` and the rest to
        ``float`` before being zipped with the argument names below.

    Returns
    -------
    float
        ``1 - result['f1']`` on success, ``1.`` if training or evaluation
        raised an exception.
    """
    pbar.update(1)
    try:
        # cast np values to python and convert list to dict
        args = list(map(int, args[:3])) + list(map(float, args[3:]))
        # NOTE(review): 'learning_rate' is listed twice (the later value
        # overwrites the earlier one) and the third, int-cast slot is
        # 'weight_decay' — confirm the intended key order.
        args = dict(
            zip([
                'train_batch_size', 'gradient_accumulation_steps',
                'weight_decay', 'learning_rate', 'learning_rate',
                'adam_epsilon', 'warmup_ratio', 'max_grad_norm'
            ], args))
        args['overwrite_output_dir'] = True
        args['eval_batch_size'] = args['train_batch_size']

        model = ClassificationModel('albert', 'albert-base-v1', num_labels=5)

        # train model, find reverse f1, drop the model reference afterwards
        model.train_model(train, args=args)
        result, *_ = model.eval_model(test,
                                      f1=f1_multiclass,
                                      acc=accuracy_score)
        del model
        return 1. - result['f1']
    except Exception as exc:
        # A failed trial counts as the worst score, but unlike a bare
        # ``except:`` this lets KeyboardInterrupt/SystemExit propagate and
        # shows why the trial was skipped.
        print('skip', repr(exc))
        return 1.
def main():
    """Train DistilRoBERTa on the sentiment data and write test predictions.

    Reads ``sentiment_train.csv`` and ``sentiment_test.csv``, holds out 10% of
    the training rows for validation, prints validation and test metrics and
    saves the test frame with a ``prediction`` column to ``q3_ans.csv``.
    """
    train_frame = pd.read_csv("sentiment_train.csv")
    df_test = pd.read_csv("sentiment_test.csv")

    seed = 42

    # 90/10 train/validation split of sentences and their polarity labels.
    X_train, X_val, y_train, y_val = train_test_split(
        train_frame['Sentence'],
        train_frame['Polarity'],
        test_size=0.10,
        random_state=seed,
    )
    train_dataset = pd.concat([X_train, y_train], axis=1)
    val_dataset = pd.concat([X_val, y_val], axis=1)

    # See all available models:
    # https://huggingface.co/transformers/pretrained_models.html
    model_args = dict(
        reprocess_input_data=True,
        max_seq_length=300,
        num_train_epochs=1,
        fp16=False,
        train_batch_size=4,
        overwrite_output_dir=True,
    )
    classifier = ClassificationModel('roberta',
                                     'distilroberta-base',
                                     num_labels=2,
                                     use_cuda=True,
                                     cuda_device=0,
                                     args=model_args)
    classifier.train_model(train_dataset)

    # The extra metric is registered under the name "acc" but computes F1.
    result, model_outputs, wrong_predictions = classifier.eval_model(
        val_dataset, acc=f1_score)
    pred_val = np.argmax(model_outputs, axis=1).tolist()

    print("Results on evaluation:")
    print("----------------------")
    micro_f1 = f1_score(y_val, pred_val, average='micro') * 100
    print(f"F1 Score = {micro_f1:.6f}\n")
    print(classification_report(y_val, pred_val))
    print(confusion_matrix(y_val, pred_val))

    # Test-set predictions, then F1 and accuracy against the ground truth.
    pred_test, _ = classifier.predict(df_test['Sentence'])
    print(f1_score(df_test.Polarity, pred_test))
    print(accuracy_score(df_test.Polarity, pred_test))

    # Input, ground truth and prediction go into one CSV.
    df_test['prediction'] = pred_test
    df_test.to_csv('q3_ans.csv', index=False)
def fake_classify(train_set, eval_set, test_set, seed):
    """Fit multilingual BERT for three epochs and evaluate on ``test_set``.

    ``seed`` becomes the model's ``manual_seed``; ``eval_set`` is not used.
    Nothing is saved explicitly here — the original author noted that the
    library already writes the model to its output directory.

    Returns ``(result, model_outputs, wrong_predictions)`` from ``eval_model``.
    """
    settings = dict(num_train_epochs=3,
                    overwrite_output_dir=True,
                    manual_seed=seed)
    bert = ClassificationModel('bert',
                               'bert-base-multilingual-uncased',
                               args=settings,
                               use_cuda=True)
    print(bert.args)

    bert.train_model(train_set)

    result, model_outputs, wrong_predictions = bert.eval_model(
        test_set,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score,
    )
    return result, model_outputs, wrong_predictions
def main():
    """Train RoBERTa on the breast-cancer text data and print its F1 score.

    Loads ``Breast Cancer(Raw_data_2_Classes).csv``, splits it 80/20 with a
    fixed seed, trains on the ``Text``/``Class`` columns (exposed to the
    library as ``text``/``labels``) and prints the F1 score computed from the
    ``tp``/``fp``/``fn`` counts of the evaluation result.
    """
    f_path = 'Breast Cancer(Raw_data_2_Classes).csv'
    data = loadDataAsDataFrame(f_path)
    X = data
    y = data['Class'].tolist()
    training_set_size = int(0.8 * len(X))

    # Split once; the original ran this identical call twice.
    training_rows, test_rows, _, _ = train_test_split(
        X, y, train_size=training_set_size, random_state=42069)

    model_args = {'overwrite_output_dir': True}
    model = ClassificationModel('roberta',
                                'roberta-base',
                                use_cuda=False,
                                args=model_args)

    # Put the data into the format simpletransformers can process. ``assign``
    # returns new frames, so no columns are written onto slices of ``data``.
    training_rows = training_rows.assign(text=training_rows['Text'],
                                         labels=training_rows['Class'])
    test_rows = test_rows.assign(text=test_rows['Text'],
                                 labels=test_rows['Class'])

    model.train_model(training_rows)
    result, model_outputs, wrong_predictions = model.eval_model(test_rows)

    print("f1 score")
    tp, fp, fn = result['tp'], result['fp'], result['fn']
    # Guard every denominator: with no predicted or no actual positives the
    # score is reported as 0.0 instead of dividing by zero.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    if precision + recall:
        f1score = 2 * precision * recall / (precision + recall)
    else:
        f1score = 0.0
    print(f1score)
def _read_labelled_lines(path, label):
    """Return ``(stripped_line, label)`` for every line of the file at ``path``."""
    # The context manager closes the handle; the original left it open.
    with open(path, 'r') as handle:
        return [(line.strip(), label) for line in handle.readlines()]


def train(human_file, gen_file, our_gen_file, output_dir):
    """Train a RoBERTa classifier separating human text from generated text.

    Parameters
    ----------
    human_file :
        Text file, one sample per line, labelled 1.
    gen_file, our_gen_file :
        Text files, one sample per line, labelled 0.
    output_dir :
        Passed to the model as ``output_dir``.
    """
    data = []
    data += _read_labelled_lines(human_file, 1)
    data += _read_labelled_lines(gen_file, 0)
    data += _read_labelled_lines(our_gen_file, 0)
    all_df = pd.DataFrame(data)

    train_args = {
        'overwrite_output_dir': True,
        'num_train_epochs': 10,
        'process_count': 10,
        'train_batch_size': 10,
        'eval_batch_size': 20,
        'max_seq_length': 300,
        'reprocess_input_data': True,
        'learning_rate': 1e-5,
        "evaluate_during_training": True,
        "use_early_stopping": True,
        'early_stopping_patience': 3,
        "early_stopping_metric": "eval_loss",
        "early_stopping_metric_minimize": True,
        "no_cache": True,
        'output_dir': output_dir
    }
    model = ClassificationModel('roberta', "roberta-base", args=train_args)

    # NOTE(review): ``evaluate_during_training`` is enabled but no ``eval_df``
    # is passed below; the library documentation indicates an eval set is
    # expected in that mode — confirm against the installed version.
    model.train_model(all_df)
    print("finish the training")
def train_model(args, output_dir, cache_dir):
    """
    Fit a SimpleTransformers classifier on the 'train' corpus and return it.

    :param args: Parsed arguments; ``args.arch`` selects the architecture and
        ``args.num_epochs`` the number of training epochs.
    :param output_dir: Directory handed to the model as ``output_dir``.
    :param cache_dir: Directory handed to the model as ``cache_dir``.
    :return: The trained ClassificationModel.
    """
    print('=> Training model...')

    settings = dict(
        num_train_epochs=args.num_epochs,
        train_batch_size=32,
        eval_batch_size=32,
        output_dir=output_dir,
        cache_dir=cache_dir,
    )

    # Resolve the pretrained checkpoint for the requested architecture.
    checkpoint = get_transformer_model(args.arch)
    classifier = ClassificationModel(args.arch,
                                     checkpoint,
                                     use_cuda=True,
                                     args=settings)

    corpus = load_corpus('train')
    classifier.train_model(corpus)
    return classifier
def train():
    """Run one wandb-tracked fine-tuning run for multi-class classification.

    Hyper-parameters ``epochs`` and ``learning_rate`` are read from
    ``wandb.config``. Training errors are logged rather than raised, and the
    wandb run is always joined at the end.

    NOTE(review): ``self`` and ``startTime`` are not defined in this function;
    presumably they come from an enclosing scope — confirm.
    """
    # NOTE(review): the project name is passed positionally — confirm it maps
    # to the intended ``wandb.init`` parameter.
    wandb.init(WAND_PROJECT_NAME)
    modelArgs = {
        "max_seq_length": self.maxSeqLength,
        "output_dir": self.modelOutputDir,
        "overwrite_output_dir": True,
        "best_model_dir": self.bestModelOutputDir,
        "wandb_project": WAND_PROJECT_NAME,
        # The library's key is ``num_train_epochs``; the previous
        # "num_training_epochs" key is not a documented option, so the swept
        # epoch count was not applied.
        "num_train_epochs": wandb.config.epochs,
        "learning_rate": wandb.config.learning_rate,
        "do_lower_case": True,
        "cache_dir": self.modelCacheDir,
        "encoding": "utf-8",
        "train_batch_size": 5,
        "eval_batch_size": 5,
        "evaluate_during_training_steps": 50,
        "evaluate_during_training_verbose": True,
        "logging_steps": 5,
        "sliding_window": True,
        "reprocess_input_data": True,
        "evaluate_during_training": True,
        "use_multiprocessing": True,
        "labels_list": SECTOR_LABELS
    }
    model = ClassificationModel(
        self.modelType,
        self.modelNameOrPath,
        args=modelArgs,
        sweep_config=wandb.config,
        use_cuda=torch.cuda.is_available(),
        num_labels=len(SECTOR_LABELS),
    )

    # Training and evaluation
    try:
        log.info(f"Started training/finetuning BERT on multi-class classification task..")
        model.train_model(
            train_df=self.trainDataset,
            eval_df=self.evalDataset,
            show_running_loss=True,
            output_dir=self.modelOutputDir,
            mcc=sklearn.metrics.matthews_corrcoef,
            acc=sklearn.metrics.balanced_accuracy_score,
        )
        log.info(f"Finished finetuning and evaluating our fine-tuned model on multi-class classification task. Check the folder '{self.modelOutputDir}' for finetuned weights.")
        log.info(f"It took {round((time.time() - startTime) / 3600, 1)} hours to finetune and evaluate our fine-tuned model on multi-class classification task.")
    except Exception:
        # Ordinary failures are logged and swallowed as before, but
        # KeyboardInterrupt/SystemExit are no longer caught.
        exc_type, exc_value, exc_traceback = sys.exc_info()
        err = f"Error occurred while training and evaluating the finetuned model on multi-class classification task. Error is: {exc_type}; {exc_value}."
        log.error(err)
    finally:
        # Close the wandb run on every exit path.
        wandb.join()
def test_binary_classification(model_type, model_name):
    """Smoke test: train and evaluate a binary classifier on two sentences.

    Train and evaluation data are two-column DataFrames: the first column
    holds the text (str) and the second the label (int).
    """
    classes = (1, 0)
    train_df = pd.DataFrame(
        [[f"Example sentence belonging to class {c}", c] for c in classes])
    eval_df = pd.DataFrame(
        [[f"Example eval sentence belonging to class {c}", c] for c in classes])

    model_args = {"reprocess_input_data": True, "overwrite_output_dir": True}
    classifier = ClassificationModel(model_type,
                                     model_name,
                                     use_cuda=False,
                                     args=model_args)

    classifier.train_model(train_df)
    result, model_outputs, wrong_predictions = classifier.eval_model(eval_df)
def train_stance_clf(data_dir, output_dir, **kwargs):
    """Train a 4-label RoBERTa stance classifier on headline/body pairs.

    Parameters
    ----------
    data_dir :
        Directory containing ``combined_stances_train.csv`` and
        ``combined_bodies_train.csv``; both paths are handed to ``fnc``.
    output_dir :
        Passed to the model as ``output_dir``.
    **kwargs :
        Accepted but not used by this function.
    """
    headlines, bodies, labels = fnc(
        os.path.join(data_dir, 'combined_stances_train.csv'),
        os.path.join(data_dir, 'combined_bodies_train.csv'))

    # Sentence-pair input is read from columns ``text_a``, ``text_b`` and
    # ``labels``; the previous ``label`` column name does not match the
    # documented format.
    df = pd.DataFrame(list(zip(headlines, bodies, labels)),
                      columns=['text_a', 'text_b', 'labels'])

    # Only the training part is used below; the validation part is unused.
    train_df, val_df = train_test_split(df, random_state=123)

    train_args = {
        'learning_rate': 3e-5,
        'num_train_epochs': 5,
        'reprocess_input_data': True,
        'overwrite_output_dir': False,
        'process_count': 10,
        'train_batch_size': 4,
        'eval_batch_size': 20,
        'max_seq_length': 300,
        "fp16": False,
        'output_dir': output_dir
    }
    model = ClassificationModel('roberta',
                                "roberta-base",
                                num_labels=4,
                                use_cuda=True,
                                cuda_device=0,
                                args=train_args)
    model.train_model(train_df)
def cross_pseudo_labeling(train, pseudo_test, test, params, n_folds,
                          model_name, model_type, lb_hack):
    """Cross-validated training with pseudo-labelled test rows added per fold.

    For every stratified fold the model is trained on the fold's training
    rows plus ``pseudo_test``, evaluated on the fold's validation rows, and
    used to predict ``test["description"]``. Raw test outputs are averaged
    over the folds.

    Parameters
    ----------
    train :
        Frame with ``text`` and ``label`` columns.
    pseudo_test :
        Rows concatenated to every fold's training data.
    test :
        Frame with a ``description`` column.
    params :
        Model arguments passed to ``ClassificationModel``.
    n_folds : int
        Number of stratified folds.
    model_name, model_type :
        Forwarded to ``ClassificationModel``.
    lb_hack :
        Second argument of ``hack``, which post-processes the averaged
        test outputs.

    Returns
    -------
    tuple
        ``(test_pred, f1_score, oof_pred)``: a frame whose first column is the
        post-processed prediction followed by the averaged raw outputs, the
        mean fold F1, and a frame with out-of-fold predictions followed by
        their raw outputs.
    """
    num_labels = 4  # matches ``num_labels`` of the model built below

    splits = list(
        StratifiedKFold(n_splits=n_folds, shuffle=True,
                        random_state=1234).split(train["text"],
                                                 train["label"]))

    # One column per class. The original sized these by ``n_folds``, which
    # only matched the raw outputs when ``n_folds`` happened to equal 4.
    y_pred = np.zeros((test.shape[0], num_labels))
    oof = np.zeros(train.shape[0])
    oof_raw = np.zeros((train.shape[0], num_labels))

    # Class weights: total rows divided by the per-class counts.
    weight = len(train) / train["label"].value_counts().sort_index().values

    mean_f1 = 0
    for train_idx, valid_idx in splits:
        X_train = pd.concat([train.iloc[train_idx], pseudo_test])
        X_valid = train.iloc[valid_idx]

        model = ClassificationModel(model_type=model_type,
                                    model_name=model_name,
                                    num_labels=num_labels,
                                    args=params,
                                    use_cuda=True,
                                    weight=weight.tolist())
        model.train_model(X_train)

        result, model_outputs, wrong_predictions = model.eval_model(
            X_valid, f1=metric_f1)
        print(result)
        mean_f1 += result["f1"] / n_folds

        fold_pred, raw_outputs = model.predict(test["description"].values)
        # Accumulate the fold average; plain assignment kept only the last
        # fold's outputs, scaled down by ``n_folds``.
        y_pred += raw_outputs / n_folds

        # ``.values`` is passed because an unexplained bug occurs otherwise.
        oof_pred, oof_outputs = model.predict(X_valid["text"].values)
        oof[valid_idx] = oof_pred
        oof_raw[valid_idx, :] = oof_outputs

    print(f"mean f1_score: {mean_f1}")

    raw_pred = y_pred.copy()
    y_pred = hack(y_pred, lb_hack)

    test_pred = pd.DataFrame(
        np.concatenate([y_pred.reshape(-1, 1), raw_pred], 1))
    oof_pred = pd.DataFrame(np.concatenate([oof.reshape(-1, 1), oof_raw], 1))
    return test_pred, mean_f1, oof_pred
def main(source=source,
         data_dir='data',
         checkpoint_dir="outputs/eval2/roberta_finetune_nogptneo",
         best_model_dir='outputs/eval2/best_model_roberta_finetune_nogptneo',
         n_train=240000,
         n_valid=4000,
         n_test=4000,
         n_epochs=10,
         learning_rate=4e-05,
         train_batch_size=64,
         eval_batch_size=64,
         evaluate_during_training=True,
         evaluate_during_training_steps=2000,
         reprocess_input=True,
         overwrite_output_dir=True,
         n_gpu=2):
    """Fine-tune ``roberta-large`` and evaluate it on the test split.

    The training split is loaded for ``source``; the validation and test
    splits are loaded for the module-level ``source_test``. F1, accuracy and
    ``eer`` are registered as extra metrics for both training-time and final
    evaluation.
    """
    # (data source, split name, row limit) for each of the three frames.
    train_df = data_loading.load_split(data_dir, source, 'train', n=n_train)
    valid_df = data_loading.load_split(data_dir, source_test, 'valid',
                                       n=n_valid)
    test_df = data_loading.load_split(data_dir, source_test, 'test', n=n_test)

    model_args = ClassificationArgs(
        num_train_epochs=n_epochs,
        evaluate_during_training=evaluate_during_training,
        evaluate_during_training_steps=evaluate_during_training_steps,
        best_model_dir=best_model_dir,
        manual_seed=0,
        train_batch_size=train_batch_size,
        eval_batch_size=eval_batch_size,
        overwrite_output_dir=overwrite_output_dir,
        n_gpu=n_gpu,
        output_dir=checkpoint_dir,
        reprocess_input_data=reprocess_input,
        learning_rate=learning_rate,
    )
    classifier = ClassificationModel("roberta",
                                     model_name="roberta-large",
                                     args=model_args,
                                     use_cuda=True)

    # The same metric set is used while training and for the final scoring.
    metrics = dict(f1=sklearn.metrics.f1_score,
                   acc=sklearn.metrics.accuracy_score,
                   eer=eer)

    classifier.train_model(train_df, eval_df=valid_df, **metrics)
    result, model_outputs, wrong_predictions = classifier.eval_model(
        test_df, **metrics)
class TransformerModel(TenderClassClassifier):
    """
    This class provides the Machine Learning model and classifies tenders
    based on previous training data.

    ``self.model`` starts as ``None``; call ``load`` or ``create_new_model``
    before ``classify`` or ``train``.
    """

    def __init__(self):
        self.model = None

    def load(self, name):
        """Load the saved model from ``./outputs/``; ``name`` is not used."""
        self.model = ClassificationModel('bert',
                                         './outputs/',
                                         use_cuda=cuda_available,
                                         args=args)

    def save(self, name):
        """No-op; nothing is saved by this method."""
        pass

    def __convert_to_input(self, tenders):
        """Return the "DE" title of every tender, in input order."""
        return [tender.get_title("DE") for tender in tenders]

    def classify(self, tenders):
        """Return the tenders whose predicted label equals 1."""
        titles = self.__convert_to_input(tenders)
        predictions, raw_output = self.model.predict(titles)
        return [t for t, p in zip(tenders, predictions) if p == 1]

    def train(self, labelled_tenders):
        """Train on ``(tender, label)`` pairs and log hold-out metrics.

        10% of the pairs are held out (fixed ``random_state=42``). The
        training duration in seconds is printed; confusion-matrix counts and
        accuracy on the hold-out part are logged.
        """
        tenders = self.__convert_to_input([i for i, j in labelled_tenders])
        labels = [j for i, j in labelled_tenders]

        tenders_train, tenders_test, labels_train, labels_test = train_test_split(
            tenders, labels, test_size=0.1, random_state=42)

        data_input = pd.DataFrame(zip(tenders_train, labels_train))

        start = time.time()
        self.model.train_model(data_input)
        end = time.time()
        print(end - start)

        labels_pred, raw_output = self.model.predict(tenders_test)

        # Pin both classes so the matrix is always 2x2; without ``labels`` a
        # hold-out split containing a single class yields a 1x1 matrix and
        # the four-way unpack below raises ValueError.
        tn, fp, fn, tp = confusion_matrix(labels_test,
                                          labels_pred,
                                          labels=[0, 1]).ravel()
        logger.info(f"tn: {tn} fp: {fp}")
        logger.info(f"fn: {fn} tp:{tp}")
        logger.info(
            f"Accuracy Score: {accuracy_score(labels_test, labels_pred)}")

    def create_new_model(self):
        """Replace ``self.model`` with a fresh ``bert-base-german-cased`` model."""
        from simpletransformers.classification import ClassificationModel
        self.model = ClassificationModel('bert',
                                         'bert-base-german-cased',
                                         use_cuda=cuda_available,
                                         args=args)
def finalmodel(outfolder):
    # type: (str) -> None
    """
    Train the final RoBERTa model with the arguments from buildbertargs().

    Training uses the rows of ``helpers.refdf`` whose clause text passes
    ``helpers.goodsize``; evaluation is reported both on that training data
    and on the unfiltered data.

    :param outfolder: the folder to write the model to
    :type outfolder: str
    """

    def as_bert_frame(frame):
        # Expose clause text and classification under the column names
        # 'text' and 'labels', keeping the frame's index.
        return pd.DataFrame(
            {
                'text': frame['Clause Text'],
                'labels': frame['Classification']
            },
            index=frame.index)

    # Unfiltered copy, used for the "full data" evaluation.
    rawdata = helpers.refdf.copy(deep=True)
    print('Raw data: ' + str(rawdata.shape))
    rawdata.set_index('Clause ID', inplace=True)

    # Second copy, restricted to clauses of acceptable size, for training.
    sourcedata = helpers.refdf.copy(deep=True)
    print('Raw data: ' + str(sourcedata.shape))
    size_ok = sourcedata['Clause Text'].map(helpers.goodsize)
    sourcedata = sourcedata[size_ok]
    print('Sized data: ' + str(sourcedata.shape))
    sourcedata.set_index('Clause ID', inplace=True)

    traindata = as_bert_frame(sourcedata)
    evaldata = as_bert_frame(rawdata)
    print('Data for BERT: ' + str(traindata.shape))

    accargs = buildbertargs()
    accargs.output_dir = outfolder
    accmodel = ClassificationModel('roberta',
                                   'roberta-base',
                                   args=accargs,
                                   weight=[2, 1])
    accmodel.train_model(traindata)

    separator = '---------------'
    print(separator)
    print('Training Data Eval:')
    result, model_outputs, wrong_predictions = accmodel.eval_model(traindata)
    print(result)

    print(separator)
    print('Full Data Eval:')
    result, model_outputs, wrong_predictions = accmodel.eval_model(evaldata)
    # Example from an earlier run: {'mcc': 0.9062028924099057, 'tp': 4835,
    # 'tn': 1368, 'fp': 74, 'fn': 140, 'eval_loss': 0.18330956540325125}
    print(result)
def train():
    """Run one wandb-tracked fine-tuning run for the regression task.

    Hyper-parameters ``epochs`` and ``learning_rate`` are read from
    ``wandb.config``. Training errors are logged rather than raised, and the
    wandb run is always joined at the end.

    NOTE(review): ``self`` and ``startTime`` are not defined in this function;
    presumably they come from an enclosing scope — confirm.
    """
    # NOTE(review): the project name is passed positionally — confirm it maps
    # to the intended ``wandb.init`` parameter.
    wandb.init(WAND_PROJECT_NAME)
    modelArgs = {
        "max_seq_length": self.maxSeqLength,
        "output_dir": self.modelOutputDir,
        "overwrite_output_dir": True,
        "best_model_dir": self.bestModelOutputDir,
        "wandb_project": WAND_PROJECT_NAME,
        # The library's key is ``num_train_epochs``; the previous
        # "num_training_epochs" key is not a documented option, so the swept
        # epoch count was not applied.
        "num_train_epochs": wandb.config.epochs,
        "learning_rate": wandb.config.learning_rate,
        "do_lower_case": True,
        "cache_dir": self.modelCacheDir,
        "encoding": "utf-8",
        "train_batch_size": 5,
        "eval_batch_size": 5,
        "evaluate_during_training_steps": 50,
        "evaluate_during_training_verbose": True,
        "logging_steps": 5,
        "sliding_window": True,
        "reprocess_input_data": True,
        "evaluate_during_training": True,
        "use_multiprocessing": False,
        "regression": True
    }
    model = ClassificationModel(self.modelType,
                                self.modelNameOrPath,
                                args=modelArgs,
                                sweep_config=wandb.config,
                                use_cuda=torch.cuda.is_available(),
                                num_labels=1)

    # Training
    try:
        log.info(
            f"Started finetuning BERT on sentiment analysis/regression task.."
        )
        model.train_model(
            train_df=self.trainDataFrame,
            eval_df=self.evalDataFrame,
            show_running_loss=True,
            output_dir=self.modelOutputDir,
            mse=sklearn.metrics.mean_squared_error,
            r2Score=sklearn.metrics.r2_score,
        )
        log.info(
            f"Finished training and evaluation of our finetuned model on sentiment analysis/regression task. Check the folder '{self.modelOutputDir}' for finetuned weights."
        )
        log.info(
            f"It took {round((time.time() - startTime) / 3600, 1)} hours to train/finetune BERT model on sentiment analysis/regression task."
        )
    except Exception:
        # Ordinary failures are logged and swallowed as before, but
        # KeyboardInterrupt/SystemExit are no longer caught.
        exc_type, exc_value, exc_traceback = sys.exc_info()
        err = f"Error occurred while training finetuned model on sentiment analysis/regression task. Error is: {str(exc_type)}; {str(exc_value)}."
        log.error(err)
    finally:
        # Close the wandb run on every exit path.
        wandb.join()
def run_trainer(self):
    """Train and evaluate a classifier configured from this object's fields.

    The checkpoint name is ``<self.model_name>-base-uncased``. Training
    evaluates on ``self.eval_df`` with accuracy as an extra metric; the final
    evaluation result is printed.

    Returns the trained ClassificationModel.
    """
    # INFO for this code, but only warnings from the transformers logger.
    logging.basicConfig(level=logging.INFO)
    logging.getLogger("transformers").setLevel(logging.WARNING)

    print('output dir: ' + self.output_dir)

    model_args = dict(
        max_seq_length=self.max_seq_length,
        learning_rate=4e-5,
        num_train_epochs=self.epochs,
        reprocess_input_data=True,
        overwrite_output_dir=True,
        evaluate_during_training=True,
        evaluate_during_training_steps=800,
        best_model_dir=f'{self.output_dir}/best-models',
        logging_steps=100,
        do_lower_case=True,
        train_batch_size=self.batch_size,
        use_batch_norm=False,
        tensorboard_dir=f'{self.output_dir}/runs',
        early_stopping_patience=1,
        save_only_best=True,
        overwrite_last_saved=True,
        save_steps=0,
        wandb_project='gallery',
    )

    classifier = ClassificationModel(self.model_name,
                                     self.model_name + "-base-uncased",
                                     num_labels=self.num_labels,
                                     args=model_args,
                                     use_cuda=self.use_cuda)

    classifier.train_model(self.train_df,
                           output_dir=self.output_dir,
                           eval_df=self.eval_df,
                           acc=accuracy_score)

    eval_options = dict(eval_df=self.eval_df,
                        multi_label=False,
                        output_dir=self.output_dir,
                        verbose=True,
                        silent=False,
                        wandb_log=True)
    result, model_outputs, wrong_predictions = classifier.eval_model(
        **eval_options)
    print("result: ", result)
    return classifier
def trainer(train_df, OUTPUT_DIR, preproc, args):
    """Train a 3-label ``bert-large-cased`` model with JSON-configured args.

    The model arguments are read from the JSON file ``args.modelConf``,
    resolved relative to this script's directory, and ``output_dir`` is then
    set to ``OUTPUT_DIR``. ``preproc`` is not used by this function.

    Returns the trained ClassificationModel.
    """
    config_path = os.path.join(os.path.dirname(__file__), args.modelConf)
    with open(config_path) as config_file:
        model_param = json.load(config_file)

    model_param['output_dir'] = OUTPUT_DIR
    print(model_param)

    classifier = ClassificationModel('bert',
                                     'bert-large-cased',
                                     args=model_param,
                                     num_labels=3)
    classifier.train_model(train_df)
    return classifier
def get_bert_base(train_sequences,
                  dev_sequences,
                  train_targets,
                  dev_targets,
                  time_constraint=1,
                  num_cpu=1,
                  max_features=1000,
                  model="bert-base",
                  weights_dir="transformers_trained",
                  cuda=False):
    """Train a BERT or RoBERTa base classifier on train and dev data combined.

    Parameters
    ----------
    train_sequences, dev_sequences :
        Objects exposing ``.values.tolist()``; their contents become the
        ``text`` column.
    train_targets, dev_targets :
        Objects exposing ``.tolist()``; their contents become the ``labels``
        column. The number of distinct labels sets ``num_labels``.
    time_constraint, num_cpu, max_features :
        Not used by this function; kept for interface compatibility.
    model : str
        ``"bert-base"`` or ``"roberta-base"``.
    weights_dir : str
        Passed to the model as ``output_dir``.
    cuda : bool
        Passed to the model as ``use_cuda``.

    Returns
    -------
    The trained ClassificationModel.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """
    checkpoints = {
        "bert-base": ('bert', 'bert-base-cased'),
        "roberta-base": ('roberta', 'roberta-base'),
    }
    # Fail early with a clear message; previously an unknown name surfaced
    # later as an AttributeError on the string itself.
    if model not in checkpoints:
        raise ValueError(
            f"unsupported model {model!r}; expected one of {sorted(checkpoints)}")
    model_type, model_name = checkpoints[model]

    total_sequences_training = (train_sequences.values.tolist() +
                                dev_sequences.values.tolist())
    total_labels_training = train_targets.tolist() + dev_targets.tolist()

    train_df = pd.DataFrame()
    train_df['text'] = total_sequences_training
    train_df['labels'] = total_labels_training

    # All options are handed over at construction time instead of
    # subscripting ``model.args`` afterwards. ``max_seq_length`` replaces the
    # misspelled 'max_sequence_length' key.
    model_args = {
        'reprocess_input_data': True,
        'overwrite_output_dir': True,
        'output_hidden_states': True,
        'num_train_epochs': 1,
        'max_seq_length': 256,
        'save_eval_checkpoints': False,
        'save_model_every_epoch': False,
        'output_dir': weights_dir,
        'save_steps': 400,
    }
    classifier = ClassificationModel(model_type,
                                     model_name,
                                     num_labels=len(set(total_labels_training)),
                                     args=model_args,
                                     use_cuda=cuda)

    classifier.train_model(train_df)
    return classifier
class Classifier:
    """Collect labelled samples, train a ClassificationModel and predict.

    Samples are registered per label with ``add``; ``train`` encodes the
    labels, splits each label's samples into train/eval parts and fits the
    model; ``predict`` maps predictions back to the original labels.
    """

    def __init__(self, model_type, model_name, use_cuda=True):
        logging.basicConfig(level=logging.INFO)
        transformers_logger = logging.getLogger("transformers")
        transformers_logger.setLevel(logging.WARNING)

        self.model_type = model_type
        self.model_name = model_name
        self.use_cuda = use_cuda
        self.dat = {}  # label -> samples registered for that label
        self.rerun = False

    def add(self, X, Y):
        """Register samples ``X`` under label ``Y`` (replacing earlier ones)."""
        self.dat[Y] = X

    def train(self, split=0.7, num_epochs=10):
        """Train on the registered samples.

        For every label the first ``round(len(samples) * split)`` samples go
        to the training set and the rest to the evaluation set.
        """
        self.le = preprocessing.LabelEncoder()
        print(list(self.dat.keys()))
        self.le.fit(list(self.dat.keys()))

        train_data = []
        eval_data = []
        for k, v in self.dat.items():
            len_train = int(round(len(v) * split))
            # Encode the label once per class instead of once per sample.
            label_id = self.le.transform([k])[0]
            train_data.extend([[i, label_id] for i in v[:len_train]])
            eval_data.extend([[i, label_id] for i in v[len_train:]])
        print(train_data, eval_data)

        train_df = pd.DataFrame(train_data)
        eval_df = pd.DataFrame(eval_data)

        train_args = {
            'overwrite_output_dir': True,
            'num_train_epochs': num_epochs,
        }
        self.model = ClassificationModel(self.model_type,
                                         self.model_name,
                                         num_labels=len(list(self.dat.keys())),
                                         use_cuda=self.use_cuda,
                                         cuda_device=0,
                                         args=train_args)

        # Train the model
        self.model.train_model(train_df, eval_df=eval_df)

        # Evaluate the model
        result, model_outputs, wrong_predictions = self.model.eval_model(
            eval_df, acc=sklearn.metrics.accuracy_score)

    def predict(self, x):
        """Return the original labels predicted for the inputs ``x``."""
        predictions, raw_outputs = self.model.predict(x)
        return self.le.inverse_transform(predictions)
def train(train_dataset, valid_dataset):
    """Continue training the RoBERTa model stored in ``models/best_model/``.

    Parameters
    ----------
    train_dataset :
        Training data passed to ``train_model``.
    valid_dataset :
        Evaluation data passed to ``train_model`` as ``eval_df``.
    """
    logging.debug("Training is going to start")

    # Hand the model a private copy of the configured parameters. The copy
    # was already built here but the shared global dict was passed instead.
    args = {}
    args.update(global_args["model_params"])

    model = ClassificationModel(
        "roberta",
        "models/best_model/",
        use_cuda=False,
        args=args,
    )
    model.train_model(train_dataset, eval_df=valid_dataset)
    logging.debug("Training is done")
def main():
    """Train RoBERTa to predict a movie's budget category from its script.

    Budgets from ``./data/IMSDB/final_movie_budgets.csv`` are split into two
    equal-frequency categories (0 and 1). Each script listed in the
    ``Filename`` column is read, tokenized with ``tokenize_script`` and used
    as model input; 20% of the scripts are held out for evaluation, whose
    result is printed.
    """
    script_info = pd.read_csv('./data/IMSDB/final_movie_budgets.csv', sep=',')

    # Budgets are stored with thousands separators, e.g. "1,000,000".
    script_info['Budget'] = [
        int(bud.replace(',', '')) for bud in script_info['Budget']
    ]

    # Two quantile bins, i.e. a split at the median budget.
    script_info['Bud_Cat'] = pd.qcut(script_info['Budget'], 2, labels=[0, 1])

    # Read every script as a single line of text.
    scripts = []
    for file in script_info['Filename']:
        with open(file, 'r') as txt:
            scripts.append(txt.read().replace('\n', ''))

    X_train, X_test, y_train, y_test = train_test_split(scripts,
                                                        script_info['Bud_Cat'],
                                                        test_size=0.2,
                                                        random_state=0)

    def to_frame(raw_scripts, labels):
        # Tokenize each script and pair it with its label.
        docs = [
            ' '.join(tokenize_script(script, stop_words=True))
            for script in raw_scripts
        ]
        frame = pd.DataFrame([list(x) for x in zip(docs, labels)])
        frame.columns = ["text", "labels"]
        return frame

    train_df = to_frame(X_train, y_train)
    test_df = to_frame(X_test, y_test)

    # The epoch count is a training argument; the previous ``n_epochs=3``
    # keyword on ClassificationModel is not one of its parameters.
    model_args = ClassificationArgs(sliding_window=True,
                                    overwrite_output_dir=True,
                                    num_train_epochs=3)
    model = ClassificationModel("roberta",
                                "roberta-base",
                                args=model_args,
                                use_cuda=True)

    # Train the model
    model.train_model(train_df)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(test_df)
    print(result)
def run(model_name=("distilbert", "distilbert-base-uncased")):
    """Train and evaluate a classifier on the dataset from ``get_dataset``.

    ``model_name`` is a ``(model_type, checkpoint)`` pair. Training output
    goes to ``./saved_states/category3/<model_type>`` and evaluation output
    to ``./results/category3/<model_type>``; both directories are created if
    missing.
    """
    # TODO: make directories in VM
    training_data, test_data = get_dataset()
    classifier = ClassificationModel(model_name[0], model_name[1])

    output_dir_train = "./saved_states/category3/" + model_name[0]
    output_dir_eval = "./results/category3/" + model_name[0]

    # Create both output directories if they do not exist yet.
    from pathlib import Path
    for directory in (output_dir_train, output_dir_eval):
        Path(directory).mkdir(parents=True, exist_ok=True)

    classifier.train_model(training_data,
                           args={"overwrite_output_dir": True},
                           output_dir=output_dir_train)
    result, model_outputs, wrong_predictions = classifier.eval_model(
        test_data, output_dir=output_dir_eval)
def _load_labelled_csv(csv_path):
    """Read a header-less CSV into a frame with ``labels`` and ``text``.

    Columns 1 and 2 are joined with a space to form the text, backslashes in
    the text are replaced by spaces, and 1 is subtracted from every label.
    """
    frame = pd.read_csv(csv_path, header=None)
    frame["text"] = frame.iloc[:, 1] + " " + frame.iloc[:, 2]
    frame = frame.drop(frame.columns[[1, 2]], axis=1)
    frame.columns = ["labels", "text"]
    frame["text"] = frame["text"].apply(lambda x: x.replace("\\", " "))
    frame["labels"] = frame["labels"].apply(lambda x: x - 1)
    return frame


def train_model(model_type, model_name, training_size):
    """Train a 4-label classifier on ``data/train_<training_size>.csv``.

    ``data/test.csv`` is used for evaluation during training. The elapsed
    time, measured from after data loading, is printed at the end.
    """
    print('Starting run:', model_type, model_name, training_size)

    train_df = _load_labelled_csv("data/train_" + training_size + ".csv")
    eval_df = _load_labelled_csv("data/test.csv")

    started = time.time()

    train_args = dict(
        output_dir=f'model-outputs/{model_type}-{model_name}-{training_size}-outputs',
        max_seq_length=256,
        num_train_epochs=3,
        train_batch_size=16,
        eval_batch_size=32,
        gradient_accumulation_steps=1,
        learning_rate=5e-5,
        save_steps=50000,
        evaluate_during_training=True,
        evaluate_during_training_steps=1000,
        reprocess_input_data=True,
        save_model_every_epoch=False,
        overwrite_output_dir=True,
        no_cache=True,
        use_early_stopping=True,
        early_stopping_patience=3,
        manual_seed=4,
    )
    classifier = ClassificationModel(model_type,
                                     model_name,
                                     num_labels=4,
                                     args=train_args)
    classifier.train_model(train_df, eval_df=eval_df)
    print('Run finished')

    finished = time.time()
    print('Time:', finished - started)
    print('--------------------')