def cross_validation(lang):
    """Cross-validate a 3-label BERT classifier on a single language.

    The data returned by ``load_single_lang(lang)`` is split into 10
    consecutive folds. For every fold a fresh ``ClassificationModel`` is
    trained on the training part and used to predict the held-out part.
    The ``'acc'`` and ``'avg_f1'`` entries of each fold's evaluation result
    are collected, and their means are appended as one line to
    ``results_csebert.txt``.

    Args:
        lang: Language identifier forwarded to ``load_single_lang`` and
            written at the start of the results line.
    """
    print(lang)

    # Keep our own logging at INFO but quieten the transformers library.
    logging.basicConfig(level=logging.INFO)
    logging.getLogger('transformers').setLevel(logging.WARNING)

    pretrained_name = 'EMBEDDIA/crosloengual-bert'

    # Hyperparameters applied to a fresh ClassificationArgs for every fold.
    #
    # A disabled alternative previously kept next to these settings trained
    # only custom parameter groups ("classifier.weight" and "classifier.bias",
    # lr 2e-4, weight_decay 0.0 for the bias) via
    # `train_custom_parameters_only` / `custom_parameter_groups`.
    hyperparameters = {
        'logging_steps': 1000000,
        'save_eval_checkpoints': False,
        'save_steps': 1000000,
        'no_cache': True,
        'save_model_every_epoch': False,
        'num_train_epochs': 1,
        'learning_rate': 2e-4,
        'train_batch_size': 32,
        'overwrite_output_dir': True,
    }

    fold_accuracies = []
    fold_f1_scores = []

    # The data frame is expected to carry a 'text' column (str) and a
    # 'labels' column (int); both are read by name below.
    data = load_single_lang(lang)
    splitter = KFold(n_splits=10)

    for train_idx, test_idx in splitter.split(data.index):
        train_frame = data.iloc[train_idx]
        test_frame = data.iloc[test_idx]

        # Build the argument object anew for each fold so that no state is
        # carried over from the previous model.
        fold_args = ClassificationArgs()
        for option, value in hyperparameters.items():
            setattr(fold_args, option, value)

        model = ClassificationModel(
            'bert',
            pretrained_name,
            num_labels=3,
            args=fold_args,
        )
        print(model.get_named_parameters())

        print('Training ...')
        model.train_model(train_frame)

        print('Evaluating ...')
        predictions, _ = model.predict(test_frame['text'].values)
        # NOTE(review): `eval` here is called with (gold labels, predictions)
        # and indexed like a dict, so it is presumably a project-level metric
        # helper shadowing the builtin - confirm against the file's imports.
        fold_result = eval(test_frame['labels'].values, predictions)
        fold_accuracies.append(fold_result['acc'])
        fold_f1_scores.append(fold_result['avg_f1'])

        # Drop the reference before the next fold constructs a new model.
        del model

    # Append "<lang> <mean acc> <mean f1>" to the shared results file.
    with open('results_csebert.txt', 'a+') as results_file:
        results_file.write(
            "{} {} {}\n".format(
                lang,
                statistics.mean(fold_accuracies),
                statistics.mean(fold_f1_scores),
            )
        )
model_args = ClassificationArgs() model_args.num_train_epochs = wandb_config["epochs"] model_args.eval_batch_size = wandb_config["eval_batch_size"] model_args.train_batch_size = wandb_config["train_batch_size"] model_args.wandb_project = "transformer-aes" model_args.wandb_kwargs = { "name": "{}-{}".format(wandb_config["model"], wandb_config["samples"]) } model_args.learning_rate = wandb_config["lr"] model_args.model = wandb_config["model"] model_args.samples = wandb_config["samples"] # model_args.max_seq_length = wandb_config["max_seq_length"] model_args.regression = True model_args.no_save = True model_args.overwrite_output_dir = True model_args.logging_steps = 1 model_args.evaluate_during_training = True model_args.evaluate_during_training_verbose = True model_args.evaluate_during_training_steps = np.ceil( (wandb_config["samples"] / wandb_config["train_batch_size"]) * 10) model_args.use_eval_cached_features = True model = ClassificationModel(wandb_config["model"], wandb_config["save"], num_labels=1, args=model_args) model.train_model(train_df, eval_df=eval_df, mse=sklearn.metrics.mean_squared_error, mae=sklearn.metrics.mean_absolute_error,
def fit_and_evaluate(train_langs, test_lang):
    """Train a 3-label BERT classifier and evaluate it on one test language.

    Training and test frames come from
    ``load_dataset(train_langs, test_lang, train_on_test_lang=True)``. A
    ``ClassificationModel`` is trained on the training frame, used to
    predict the test frame's ``'text'`` column, and the evaluation result is
    appended as one line to ``results_csebert.txt``.

    Args:
        train_langs: Training language specification, forwarded to
            ``load_dataset`` and written to the results line.
        test_lang: Test language, forwarded to ``load_dataset`` and written
            to the results line.
    """
    print(train_langs, test_lang)

    # Keep our own logging at INFO but quieten the transformers library.
    logging.basicConfig(level=logging.INFO)
    logging.getLogger('transformers').setLevel(logging.WARNING)

    pretrained_name = 'EMBEDDIA/crosloengual-bert'

    # Both frames are expected to carry a 'text' column (str) and a 'labels'
    # column (int); the test frame's columns are read by name below.
    train_frame, test_frame = load_dataset(
        train_langs,
        test_lang,
        train_on_test_lang=True,
    )

    # Hyperparameters.
    #
    # A disabled alternative previously kept next to these settings trained
    # only custom parameter groups ("classifier.weight" and "classifier.bias",
    # lr 2e-4, weight_decay 0.0 for the bias) via
    # `train_custom_parameters_only` / `custom_parameter_groups`.
    hyperparameters = {
        'logging_steps': 1000000,
        'save_eval_checkpoints': False,
        'save_steps': 1000000,
        'no_cache': True,
        'save_model_every_epoch': False,
        'num_train_epochs': 1,
        'learning_rate': 2e-4,
        'train_batch_size': 32,
        'overwrite_output_dir': True,
    }
    run_args = ClassificationArgs()
    for option, value in hyperparameters.items():
        setattr(run_args, option, value)

    model = ClassificationModel(
        'bert',
        pretrained_name,
        num_labels=3,
        args=run_args,
    )
    print(model.get_named_parameters())

    print('Training ...')
    model.train_model(train_frame)

    print('Evaluating ...')
    predictions, _ = model.predict(test_frame['text'].values)
    # NOTE(review): `eval` here is called with (gold labels, predictions), so
    # it is presumably a project-level metric helper shadowing the builtin -
    # confirm against the file's imports.
    result = eval(test_frame['labels'].values, predictions)

    # Append "<train_langs> <test_lang> <result>" to the shared results file.
    with open('results_csebert.txt', 'a+') as results_file:
        results_file.write("{} {} {}\n".format(train_langs, test_lang, result))