import logging

import pandas as pd
from simpletransformers.classification import ClassificationArgs, ClassificationModel

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Preparing train data
train_df = load_rte_data_file("data/train.jsonl")  # project helper for the RTE jsonl format (defined elsewhere)
eval_df = pd.read_json("data/eval_df", lines=True, orient="records")
test_df = pd.read_json("data/test_df", lines=True, orient="records")

model_args = ClassificationArgs()
model_args.eval_batch_size = 8
model_args.evaluate_during_training = True
model_args.evaluate_during_training_silent = False
model_args.evaluate_during_training_steps = 1000
model_args.learning_rate = 4e-4
model_args.manual_seed = 4
model_args.max_seq_length = 256
model_args.multiprocessing_chunksize = 5000
model_args.no_cache = True
model_args.no_save = True
model_args.num_train_epochs = 10
model_args.overwrite_output_dir = True
model_args.reprocess_input_data = True
model_args.train_batch_size = 16
model_args.gradient_accumulation_steps = 2
model_args.train_custom_parameters_only = False
model_args.labels_list = ["not_entailment", "entailment"]
model_args.wandb_project = "RTE - Hyperparameter Optimization"
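# A minimal sketch of how a ClassificationArgs setup like the one above is
# typically put to use with simpletransformers. The "roberta"/"roberta-base"
# model choice here is an illustrative assumption, not taken from the snippet
# above.
model = ClassificationModel("roberta", "roberta-base", args=model_args)
model.train_model(train_df, eval_df=eval_df)
result, model_outputs, wrong_predictions = model.eval_model(test_df)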
train_df.columns = ["text", "labels"] eval_df = df.iloc[wandb_config["samples"]:, :] eval_df.columns = ["text", "labels"] model_args = ClassificationArgs() model_args.num_train_epochs = wandb_config["epochs"] model_args.eval_batch_size = wandb_config["eval_batch_size"] model_args.train_batch_size = wandb_config["train_batch_size"] model_args.wandb_project = "transformer-aes" model_args.wandb_kwargs = { "name": "{}-{}".format(wandb_config["model"], wandb_config["samples"]) } model_args.learning_rate = wandb_config["lr"] model_args.model = wandb_config["model"] model_args.samples = wandb_config["samples"] # model_args.max_seq_length = wandb_config["max_seq_length"] model_args.regression = True model_args.no_save = True model_args.overwrite_output_dir = True model_args.logging_steps = 1 model_args.evaluate_during_training = True model_args.evaluate_during_training_verbose = True model_args.evaluate_during_training_steps = np.ceil( (wandb_config["samples"] / wandb_config["train_batch_size"]) * 10) model_args.use_eval_cached_features = True model = ClassificationModel(wandb_config["model"], wandb_config["save"],
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Preparing train data
train_df = load_rte_data_file("data/train.jsonl")
eval_df = pd.read_json("data/eval_df", lines=True, orient="records")
test_df = pd.read_json("data/test_df", lines=True, orient="records")

model_args = ClassificationArgs()
model_args.eval_batch_size = 32
model_args.evaluate_during_training = True
model_args.evaluate_during_training_silent = False
model_args.evaluate_during_training_steps = -1
model_args.learning_rate = 0.00003173
model_args.manual_seed = 4
model_args.max_seq_length = 256
model_args.multiprocessing_chunksize = 5000
model_args.no_cache = True
model_args.no_save = True
model_args.num_train_epochs = 40
model_args.overwrite_output_dir = True
model_args.reprocess_input_data = True
model_args.train_batch_size = 16
model_args.gradient_accumulation_steps = 2
model_args.train_custom_parameters_only = False
model_args.labels_list = ["not_entailment", "entailment"]
model_args.output_dir = "vanilla_output"
model_args.best_model_dir = "vanilla_output/best_model"
model_args.wandb_project = "RTE - Hyperparameter Optimization"
import json

import pandas as pd
from sklearn.model_selection import train_test_split
from simpletransformers.classification import ClassificationArgs, ClassificationModel


def bert_training(model_type, model_base, train_data, early_stop,
                  early_stop_delta, overwrite, epoch, batch_size,
                  learning_rate, output):
    # Bringing in the training data
    train = []
    with open(train_data, 'r') as json_file:
        json_list = list(json_file)
    for json_str in json_list:
        train.append(json.loads(json_str))

    # Data cleaning (remove_stopwords and convert_emojis are project helpers
    # defined elsewhere)
    train_labels = [train[i]['label'] for i in range(len(train))]
    train_response = [
        remove_stopwords(convert_emojis(train[i]['response']))
        for i in range(len(train))
    ]

    # Split data into training and test sets
    labels_train, labels_test, response_train, response_test = train_test_split(
        train_labels, train_response, test_size=0.2, random_state=42)

    # Convert SARCASM/NO SARCASM labels into 1s and 0s
    labels_train_pd = (pd.DataFrame(labels_train) == 'SARCASM').astype(int)
    labels_test_pd = (pd.DataFrame(labels_test) == 'SARCASM').astype(int)
    response_train_pd = pd.DataFrame(response_train)
    response_test_pd = pd.DataFrame(response_test)

    train_bert = pd.DataFrame({
        'text': response_train_pd[0].replace(r'\n', ' ', regex=True),
        'label': labels_train_pd[0]
    })
    eval_bert = pd.DataFrame({
        'text': response_test_pd[0].replace(r'\n', ' ', regex=True),
        'label': labels_test_pd[0]
    })

    model_args = ClassificationArgs()
    model_args.use_early_stopping = early_stop
    model_args.early_stopping_delta = early_stop_delta
    model_args.overwrite_output_dir = overwrite
    model_args.num_train_epochs = epoch
    model_args.train_batch_size = batch_size
    model_args.learning_rate = learning_rate
    model_args.output_dir = output

    # Create a TransformerModel
    model = ClassificationModel(model_type,
                                model_base,
                                use_cuda=False,
                                args=model_args)

    # Train the model
    model.train_model(train_bert)

    # Evaluate the model
    model.eval_model(eval_bert)
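# Illustrative call of the function above; the path and hyperparameter values
# are placeholders, not values taken from the original code.
bert_training(model_type="bert",
              model_base="bert-base-uncased",
              train_data="data/sarcasm_train.jsonl",
              early_stop=True,
              early_stop_delta=0.01,
              overwrite=True,
              epoch=3,
              batch_size=16,
              learning_rate=4e-5,
              output="outputs/")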
# Preparing train data
# train_df = load_rte_data_file("data/train.jsonl")
train_df = pd.read_json("data/augmented_train.jsonl", lines=True, orient="records")
eval_df = pd.read_json("data/eval_df", lines=True, orient="records")
test_df = pd.read_json("data/test_df", lines=True, orient="records")

model_args = ClassificationArgs()
model_args.eval_batch_size = 32
model_args.evaluate_during_training = True
model_args.evaluate_during_training_silent = False
model_args.evaluate_during_training_steps = -1
model_args.save_eval_checkpoints = False
model_args.save_model_every_epoch = False
model_args.learning_rate = 1e-5
model_args.manual_seed = 4
model_args.max_seq_length = 256
model_args.multiprocessing_chunksize = 5000
model_args.no_cache = True
model_args.num_train_epochs = 3
model_args.overwrite_output_dir = True
model_args.reprocess_input_data = True
model_args.train_batch_size = 16
model_args.gradient_accumulation_steps = 2
model_args.labels_list = ["not_entailment", "entailment"]
model_args.output_dir = "default_output"
model_args.best_model_dir = "default_output/best_model"
model_args.wandb_project = "RTE - Hyperparameter Optimization"
model_args.wandb_kwargs = {"name": "augmented-default"}
import logging
import statistics

from sklearn.model_selection import KFold
from simpletransformers.classification import ClassificationArgs, ClassificationModel


def cross_validation(lang):
    print(lang)
    model_name = 'EMBEDDIA/crosloengual-bert'

    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger('transformers')
    transformers_logger.setLevel(logging.WARNING)

    # Train and evaluation data need to be in a Pandas DataFrame containing at
    # least two columns. If the DataFrame has a header, it should contain a
    # 'text' and a 'labels' column. If no header is present, the first column
    # should be the text (type str) and the second column the label (type int).
    accs = []
    f1s = []
    df = load_single_lang(lang)  # project helper that loads one language's data
    kf = KFold(n_splits=10)
    for train_index, test_index in kf.split(df.index):
        df_train = df.iloc[train_index]
        df_test = df.iloc[test_index]

        # hyperparameters
        model_args = ClassificationArgs()
        model_args.logging_steps = 1000000
        model_args.save_eval_checkpoints = False
        model_args.save_steps = 1000000
        model_args.no_cache = True
        model_args.save_model_every_epoch = False
        model_args.num_train_epochs = 1
        model_args.learning_rate = 2e-4
        model_args.train_batch_size = 32
        model_args.overwrite_output_dir = True
        '''
        model_args.train_custom_parameters_only = True
        model_args.custom_parameter_groups = [
            {
                "params": ["classifier.weight"],
                "lr": 2e-4,
            },
            {
                "params": ["classifier.bias"],
                "lr": 2e-4,
                "weight_decay": 0.0,
            },
        ]
        '''

        # Create a ClassificationModel
        model = ClassificationModel('bert', model_name, num_labels=3, args=model_args)
        print(model.get_named_parameters())

        # Train the model
        print('Training ...')
        model.train_model(df_train)

        # Evaluate the model
        print('Evaluating ...')
        predictions, raw_outputs = model.predict(df_test['text'].values)
        out = eval(df_test['labels'].values, predictions)  # project-specific metric helper (shadows the builtin eval)
        accs.append(out['acc'])
        f1s.append(out['avg_f1'])
        del model

    # write results to file
    with open('results_csebert.txt', 'a+') as f:
        f.write("{} {} {}\n".format(lang, statistics.mean(accs),
                                    statistics.mean(f1s)))
def fit_and_evaluate(train_langs, test_lang):
    print(train_langs, test_lang)
    model_name = 'EMBEDDIA/crosloengual-bert'

    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger('transformers')
    transformers_logger.setLevel(logging.WARNING)

    # Train and evaluation data need to be in a Pandas DataFrame containing at
    # least two columns. If the DataFrame has a header, it should contain a
    # 'text' and a 'labels' column. If no header is present, the first column
    # should be the text (type str) and the second column the label (type int).
    df_train, df_test = load_dataset(train_langs, test_lang, train_on_test_lang=True)  # project helper (defined elsewhere)

    # hyperparameters
    model_args = ClassificationArgs()
    model_args.logging_steps = 1000000
    model_args.save_eval_checkpoints = False
    model_args.save_steps = 1000000
    model_args.no_cache = True
    model_args.save_model_every_epoch = False
    model_args.num_train_epochs = 1
    model_args.learning_rate = 2e-4
    model_args.train_batch_size = 32
    model_args.overwrite_output_dir = True
    '''
    model_args.train_custom_parameters_only = True
    model_args.custom_parameter_groups = [
        {
            "params": ["classifier.weight"],
            "lr": 2e-4,
        },
        {
            "params": ["classifier.bias"],
            "lr": 2e-4,
            "weight_decay": 0.0,
        },
    ]
    '''

    # Create a ClassificationModel
    model = ClassificationModel('bert', model_name, num_labels=3, args=model_args)
    print(model.get_named_parameters())

    # Train the model
    print('Training ...')
    model.train_model(df_train)

    # Evaluate the model
    print('Evaluating ...')
    predictions, raw_outputs = model.predict(df_test['text'].values)
    out = eval(df_test['labels'].values, predictions)  # project-specific metric helper (shadows the builtin eval)

    # write results to file
    with open('results_csebert.txt', 'a+') as f:
        f.write("{} {} {}\n".format(train_langs, test_lang, out))

    del model
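# Example invocations of the two functions above (hypothetical: the language
# codes, and whether train_langs is a single code or a list, depend on the
# project helpers load_single_lang/load_dataset, which are not shown here).
if __name__ == "__main__":
    cross_validation('hr')                 # 10-fold CV on a single language
    fit_and_evaluate(['hr', 'sl'], 'en')   # train on some languages, evaluate on another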