def main(source=source,
         data_dir='data',
         checkpoint_dir="outputs/eval2/roberta_finetune_nogptneo",
         best_model_dir='outputs/eval2/best_model_roberta_finetune_nogptneo',
         n_train=240000,
         n_valid=4000,
         n_test=4000,
         n_epochs=10,
         learning_rate=4e-05,
         train_batch_size=64,
         eval_batch_size=64,
         evaluate_during_training=True,
         evaluate_during_training_steps=2000,
         reprocess_input=True,
         overwrite_output_dir=True,
         n_gpu=2):
    """Fine-tune roberta-large as a binary text classifier and evaluate it.

    Loads train/valid/test splits via data_loading.load_split, trains with
    simpletransformers, and reports f1/accuracy/eer on the test split.
    NOTE(review): `source` defaults to a module-level name of the same name,
    and `data_loading`, `source_test`, `sklearn`, and `eer` must be defined
    at module level.
    """
    # import pdb; pdb.set_trace()
    train_df = data_loading.load_split(data_dir, source, 'train', n=n_train)
    # NOTE(review): valid/test are loaded from module-level `source_test`,
    # not the `source` parameter used for training — confirm this cross-source
    # evaluation is intended.
    valid_df = data_loading.load_split(data_dir, source_test, 'valid', n=n_valid)
    test_df = data_loading.load_split(data_dir, source_test, 'test', n=n_test)
    # Optional model configuration
    model_args = ClassificationArgs(
        num_train_epochs=n_epochs,
        evaluate_during_training=evaluate_during_training,
        evaluate_during_training_steps=evaluate_during_training_steps,
        best_model_dir=best_model_dir,
        manual_seed=0,
        train_batch_size=train_batch_size,
        eval_batch_size=eval_batch_size,
        overwrite_output_dir=overwrite_output_dir,
        n_gpu=n_gpu,
        output_dir=checkpoint_dir,
        reprocess_input_data=reprocess_input,
        learning_rate=learning_rate)
    # Create a ClassificationModel
    model = ClassificationModel("roberta",
                                model_name="roberta-large",
                                args=model_args,
                                use_cuda=True)
    # Train the model, evaluating on the validation split during training.
    model.train_model(train_df,
                      eval_df=valid_df,
                      f1=sklearn.metrics.f1_score,
                      acc=sklearn.metrics.accuracy_score,
                      eer=eer)
    # Evaluate the model on the held-out test split.
    result, model_outputs, wrong_predictions = model.eval_model(
        test_df,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score,
        eer=eer)
def main(
        source='xl-1542M-k40;xl-1542M',
        data_dir='data',
        load_model_dir="outputs/checkpoint-15626-epoch-2",
        checkpoint_dir="outputs",
        n_train=250000,
        n_valid=10000,
        n_test=np.inf,
        reprocess_input=False,
):
    """Evaluate a saved RoBERTa checkpoint on pristine vs. manipulated articles.

    Articles are read from 'data/pristine' (label 0) and 'data/Manipulated'
    (label 1) and scored with f1/accuracy via simpletransformers.
    NOTE(review): data_dir and the n_* parameters are kept for interface
    compatibility but are not used by this evaluation path.
    """
    transformers_logger.info(f'source: {source}, checkpoint_dir: {checkpoint_dir}')
    pristine_articles = extract_articles('data/pristine')
    manipulated_articles = extract_articles('data/Manipulated')
    test_texts = pristine_articles + manipulated_articles
    test_labels = [0] * len(pristine_articles) + [1] * len(manipulated_articles)
    # Fixed typo in the status message: "artifles" -> "articles".
    print(f'Testing {len(test_texts)} articles, of which {len(pristine_articles)} are pristine and {len(manipulated_articles)} are manipulated')
    # Preparing test data
    test_data = {
        'text': test_texts,
        'labels': test_labels
    }
    test_df = pd.DataFrame(data=test_data)
    # Optional model configuration
    model_args = ClassificationArgs(
        output_dir=checkpoint_dir,
        num_train_epochs=2,
        evaluate_during_training=True,
        save_steps=25000,
        evaluate_during_training_steps=25000,
        manual_seed=0,
        train_batch_size=256,
        eval_batch_size=256,
        overwrite_output_dir=True,
        reprocess_input_data=reprocess_input,
        n_gpu=2,
    )
    # Create a ClassificationModel from the saved checkpoint.
    model = ClassificationModel(
        "roberta",
        load_model_dir,
        args=model_args,
        use_cuda=True
    )
    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(
        test_df,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score
    )
def main():
    """Train a RoBERTa classifier predicting a movie's budget category from its script.

    Budgets are median-split into two classes; scripts are tokenized and the
    train/test split is 80/20 with a fixed seed.
    """
    script_info = pd.read_csv('./data/IMSDB/final_movie_budgets.csv', sep=',')
    # reformatting budget: strip thousands separators and convert to int
    script_info['Budget'] = [
        int(bud.replace(',', '')) for bud in script_info['Budget']
    ]
    # creating Budget Categories by quantile (2 bins -> median split, labels 0/1)
    script_info['Bud_Cat'] = pd.qcut(script_info['Budget'], 2, labels=[0, 1])
    # get list of scripts from data folder
    scripts = []
    for file in script_info['Filename']:
        with open(file, 'r') as txt:
            scripts.append(txt.read().replace('\n', ''))
    X_train, X_test, y_train, y_test = train_test_split(scripts,
                                                        script_info['Bud_Cat'],
                                                        test_size=0.2,
                                                        random_state=0)
    docs = [
        ' '.join(tokenize_script(script, stop_words=True)) for script in X_train
    ]
    train_docs = [list(x) for x in zip(docs, y_train)]
    train_df = pd.DataFrame(train_docs)
    train_df.columns = ["text", "labels"]
    docs = [
        ' '.join(tokenize_script(script, stop_words=True)) for script in X_test
    ]
    test_docs = [list(x) for x in zip(docs, y_test)]
    test_df = pd.DataFrame(test_docs)
    test_df.columns = ["text", "labels"]
    # Fix: the epoch count belongs in ClassificationArgs (num_train_epochs);
    # ClassificationModel() has no `n_epochs` parameter, so the original call
    # would not train for 3 epochs.
    model_args = ClassificationArgs(num_train_epochs=3,
                                    sliding_window=True,
                                    overwrite_output_dir=True)
    model = ClassificationModel("roberta",
                                "roberta-base",
                                args=model_args,
                                use_cuda=True)
    # Train the model
    model.train_model(train_df)
    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(test_df)
    print(result)
def model_train(data, output_dir='models/'):
    '''
    Trains a roberta model based on input data.

    Inputs:
        data (pd.DataFrame): with columns content (text),
            ground_truth_risk (label), probability_risk.
        output_dir (str): output path or directory to save model.
    Output:
        None, model will already be saved in specified output directory.
    '''
    # extract relevant columns
    df = pd.DataFrame(data[['content', 'ground_truth_risk', 'probability_risk']])
    # if ground truth risk is NA, convert probability to ground truth risk to train.
    # Fix: .copy() — the boolean-mask slice is a view, and assigning a column
    # into it triggers pandas' SettingWithCopy problem (the derived labels are
    # not guaranteed to be written).
    df_NA = df[df['ground_truth_risk'].isnull()].copy()
    df_NA['ground_truth_risk'] = df_NA.apply(
        lambda x: to_binary(x['probability_risk']), axis=1)
    # update df: rows that already had labels + rows with derived labels
    df = df.dropna(subset=['ground_truth_risk'])
    df = pd.concat([df, df_NA], ignore_index=True)
    # format df for training
    df = pd.DataFrame(df[['content', 'ground_truth_risk']])
    # rename columns - requirement of the simpletransformers package
    df = df.rename({'content': 'text', 'ground_truth_risk': 'labels'}, axis=1)
    # processing text column (must mirror the preprocessing used at predict time)
    df['text'] = df.apply(
        lambda x: text_processing(x.text,
                                  lower=False,
                                  remove_url=True,
                                  remove_punctuation=False,
                                  remove_stopwords=False,
                                  replace_entity=True,
                                  replace_hash=True,
                                  split_alphanumeric=False,
                                  lemmatize=False,
                                  stem=False),
        axis=1)
    # initialise Model
    model_args = ClassificationArgs(num_train_epochs=2, learning_rate=5e-5,
                                    output_dir=output_dir)
    model = ClassificationModel(model_type='roberta', model_name='roberta-base',
                                args=model_args, use_cuda=False)
    # train the model
    model.train_model(df)
    return
def main(
        source=source,
        data_dir='data',
        load_model_dir="outputs/eval2/best_model_openai_finetune_1",
        checkpoint_dir="outputs/eval2/test_xl-1542M-nucleus_eval2_analytic",
        n_train=250000,
        n_valid=10000,
        n_test=np.inf,
        reprocess_input=True,
):
    """Evaluate a fine-tuned detector checkpoint on the test split of `source`.

    NOTE(review): relies on module-level `source`, `transformers_logger`,
    `data_loading`, `sklearn`, and `eer`. n_train/n_valid are unused here.
    """
    transformers_logger.info(f'source: {source}, checkpoint_dir: {checkpoint_dir}')
    test_df = data_loading.load_split(data_dir, source, 'test', n=n_test)
    # Optional model configuration
    model_args = ClassificationArgs(
        output_dir=checkpoint_dir,
        num_train_epochs=2,
        evaluate_during_training=True,
        save_steps=25000,
        evaluate_during_training_steps=25000,
        manual_seed=0,
        train_batch_size=256,
        eval_batch_size=256,
        overwrite_output_dir=True,
        reprocess_input_data=reprocess_input,
        n_gpu=2,
        no_cache=True,
    )
    # Create a ClassificationModel from the saved checkpoint
    model = ClassificationModel(
        "roberta",
        # model_name="roberta-large-openai-detector",
        load_model_dir,
        args=model_args,
        use_cuda=True
    )
    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(
        test_df,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score,
        eer=eer
    )
def main():
    """Train and evaluate a RoBERTa regression model on toxicity data.

    Cleans the raw CSVs, oversamples toxic rows, writes cleaned CSVs to disk,
    and trains with lazy loading from those files.
    """
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)

    # Preparing train data
    train_df = pd.read_csv("data/train.csv")
    train_df = train_df[["comment_text", "target"]]
    train_df = clean_text(train_df, "comment_text")
    # train_df["target"] = class_labels(train_df["target"])
    train_df.columns = ["text", "labels"]
    # train_df["labels"] = train_df["labels"].astype(int)

    # Duplicate the toxic rows (labels > 0) five times to counter imbalance.
    # Fix: pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
    # the result is identical (original frame followed by 5 copies of the
    # toxic subset).
    train_df = pd.concat([train_df] + [train_df[train_df["labels"] > 0]] * 5)

    # Preparing eval data
    eval_df = pd.read_csv("data/test_public_expanded.csv")
    eval_df = eval_df[["comment_text", "toxicity"]]
    eval_df = clean_text(eval_df, "comment_text")
    # eval_df["toxicity"] = class_labels(eval_df["toxicity"])
    eval_df.columns = ["text", "labels"]

    # Persist cleaned CSVs; training reads them lazily from disk below.
    train_df.to_csv("data/train_clean.csv", sep=",", index=False)
    eval_df.to_csv("data/eval_clean.csv", sep=",", index=False)

    # Optional model configuration
    model_args = ClassificationArgs(num_train_epochs=1,
                                    lazy_loading=True,
                                    lazy_labels_column=1,
                                    lazy_text_column=0,
                                    lazy_delimiter=',',
                                    regression=True)
    # Create a ClassificationModel
    model = ClassificationModel("roberta",
                                "roberta-base",
                                use_cuda=False,
                                args=model_args)
    # Train the model
    # pdb.set_trace()
    model.train_model("data/train_clean.csv")
    # Evaluate the model
    # pdb.set_trace()
    result, model_outputs, wrong_predictions = model.eval_model(
        "data/eval_clean.csv")
def model_predict(text):
    '''
    Takes in an array of text and returns predicted probability of risk.

    Input:
        text (arr): E.g. data[['content']]
    Output:
        pred (arr): returns label of 0 for low risk and 1 for high risk
            based on prob_risk
        prob_risk (arr): E.g. data['probability_risk'] =
            model_predict(data[['content']])
        pred_risk (arr): Risk score for each article
    '''
    # read text file to get model path; `with` guarantees the handle is closed
    with open("../automation/curr_model.txt", "r") as model_txt:
        model_path = model_txt.read()
    # loading saved model, specifying same args as model init
    # model names: path to directory containing model files
    # model naming convention : roberta_YYYY_MM
    model_args = ClassificationArgs(num_train_epochs=2, learning_rate=5e-5)
    model = ClassificationModel(model_type='roberta', model_name=model_path,
                                args=model_args, use_cuda=False)
    # Preprocess text (must mirror the preprocessing applied at training time)
    processed_text = text.apply(
        lambda x: text_processing(x,
                                  lower=False,
                                  remove_url=True,
                                  remove_punctuation=False,
                                  remove_stopwords=False,
                                  replace_entity=True,
                                  replace_hash=True,
                                  split_alphanumeric=False,
                                  lemmatize=False,
                                  stem=False))
    # Fix: predict on the preprocessed text. The original predicted on the raw
    # input and silently discarded `processed_text`, so train/predict
    # preprocessing did not match.
    pred, raw_outputs = model.predict(processed_text)
    # convert raw logits to probability of risk (positive class)
    prob = softmax(raw_outputs, axis=1)
    prob_risk = [x[1] for x in prob]
    pred_risk = [predicted_risk(x) for x in prob_risk]
    return pred, prob_risk, pred_risk
def make_baseline_estimator(config, train_data, val_data):
    """Build and fit the baseline classifier described by *config*.

    Trains on `train_data`, evaluating against `val_data`, and returns the
    fitted ClassificationModel.
    """
    arg_values = dict(
        num_train_epochs=config.num_epoch,
        output_dir=config.output_dir,
        overwrite_output_dir=True,
        max_seq_length=config.max_length,
        train_batch_size=config.train_batch_size,
        eval_batch_size=config.eval_batch_size,
    )
    # GPU is used unless the configured device is explicitly 'cpu'.
    use_gpu = config.device != 'cpu'
    estimator = ClassificationModel(
        config.model_type,
        config.model_name,
        num_labels=config.num_label,
        use_cuda=use_gpu,
        args=ClassificationArgs(**arg_values),
    )
    estimator.train_model(train_df=train_data, eval_df=val_data)
    return estimator
def train(
    arch,
    model_name,
):
    """Fine-tune `arch`/`model_name` with early stopping on MCC, then eval and demo-predict.

    NOTE(review): `train_df` and `test` are module-level globals, as are
    `accuracy_score` and the simpletransformers imports — this function only
    parameterizes the architecture and checkpoint name.
    """
    model_args = ClassificationArgs(
        num_train_epochs=5,
        output_dir="./models",
        evaluate_during_training_steps=1000,
        train_batch_size=64,
        reprocess_input_data=True,
        evaluate_during_training=True,
        eval_batch_size=32,
        save_model_every_epoch=False,
        overwrite_output_dir=True,
        learning_rate=7e-5,
        save_eval_checkpoints=False,
        best_model_dir=f"./models/{model_name}/best_model",
        use_early_stopping=True,
        early_stopping_delta=1e-2,
        early_stopping_metric="mcc",
        tensorboard_dir='./runs/',
        early_stopping_metric_minimize=False,
        wandb_project='my_roberta',
        manual_seed=69,
        early_stopping_patience=5,
    )
    model = ClassificationModel(arch, model_name, args=model_args, use_cuda=True)
    # Train, tracking a rounded-prediction accuracy metric during training.
    model.train_model(
        train_df,
        eval_df=test,
        accuracy=lambda x, y: accuracy_score(x, [round(a) for a in y]),
    )
    result, model_output, top_loss = model.eval_model(test)
    print(result)
    print(top_loss)
    # Smoke-test a single prediction.
    pred, _ = model.predict(["thanks for bearing with us"])
    print(pred)
def transformer2(model_name, train_df, eval_df, epochs, labels):
    """Train and evaluate a bert-base-cased classifier.

    Args:
        model_name: simpletransformers model *type* (e.g. 'bert') —
            NOTE(review): despite the name, this is passed as the model type.
        train_df, eval_df: DataFrames in simpletransformers format.
        epochs: number of training epochs.
        labels: number of classes.
    Returns:
        (model, result, model_outputs, wrong_predictions)
    """
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)

    model_args = ClassificationArgs()
    model_args.num_train_epochs = epochs
    # Fix: derive the label list from `labels` instead of hard-coding five
    # classes — previously labels_list was [0..4] even when num_labels != 5,
    # which is inconsistent with the parameterized class count.
    model_args.labels_list = list(range(labels))
    model_args.reprocess_input_data = True
    model_args.overwrite_output_dir = True

    model = ClassificationModel(model_name, 'bert-base-cased',
                                num_labels=labels, args=model_args)
    # You can set class weights by using the optional weight argument

    # Train the model
    model.train_model(train_df)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(eval_df)

    return model, result, model_outputs, wrong_predictions
logging.basicConfig(level=logging.INFO) transformers_logger = logging.getLogger("transformers") transformers_logger.setLevel(logging.WARNING) categories = ["area", "assignee"] for category in categories: test_df, train_df, target_names = load_dataframes(category) model_args = ClassificationArgs( output_dir=category + "_model", best_model_dir=category + "_model_best", overwrite_output_dir=True, train_batch_size=16, eval_batch_size=32, max_seq_length=256, num_train_epochs=2, save_model_every_epoch=False, save_eval_checkpoints=False, ) def f1_multiclass(labels, preds): return f1_score(labels, preds, average="micro") # Create a ClassificationModel model = ClassificationModel( "bert", "finetuned", num_labels=len(target_names), args=model_args,
def cross_validation(lang):
    """Run 10-fold cross-validation of a crosloengual-bert classifier for `lang`.

    Trains one fresh model per fold, collects accuracy and average F1 from the
    project `eval` helper, and appends the per-language means to
    results_csebert.txt.
    """
    print(lang)
    model_name = 'EMBEDDIA/crosloengual-bert'
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger('transformers')
    transformers_logger.setLevel(logging.WARNING)
    # Train and Evaluation data needs to be in a Pandas Dataframe containing at least two columns.
    # If the Dataframe has a header, it should contain a 'text' and a 'labels' column.
    # If no header is present, the Dataframe should contain at least two columns,
    # with the first column is the text with type str, and the second column in the label with type int.
    accs = []
    f1s = []
    df = load_single_lang(lang)
    kf = KFold(n_splits=10)
    for train_index, test_index in kf.split(df.index):
        df_train = df.iloc[train_index]
        df_test = df.iloc[test_index]
        # hyperparameters: rebuilt per fold so each model starts identically;
        # huge logging/save steps effectively disable mid-epoch checkpoints.
        model_args = ClassificationArgs()
        model_args.logging_steps = 1000000
        model_args.save_eval_checkpoints = False
        model_args.save_steps = 1000000
        model_args.no_cache = True
        model_args.save_model_every_epoch = False
        model_args.num_train_epochs = 1
        model_args.learning_rate = 2e-4
        model_args.train_batch_size = 32
        model_args.overwrite_output_dir = True
        '''
        model_args.train_custom_parameters_only = True
        model_args.custom_parameter_groups = [
            {
                "params": ["classifier.weight"],
                "lr": 2e-4,
            },
            {
                "params": ["classifier.bias"],
                "lr": 2e-4,
                "weight_decay": 0.0,
            },
        ]
        '''
        # Create a ClassificationModel (3-way classification)
        model = ClassificationModel('bert', model_name, num_labels=3, args=model_args)
        print(model.get_named_parameters())
        # Train the model
        print('Training ...')
        model.train_model(df_train)
        # Evaluate the model
        print('Evaluating ...')
        predictions, raw_outputs = model.predict(df_test['text'].values)
        # NOTE(review): `eval` here is a project-level metric helper that
        # shadows the builtin — it returns a dict with 'acc' and 'avg_f1'.
        out = eval(df_test['labels'].values, predictions)
        accs.append(out['acc'])
        f1s.append(out['avg_f1'])
        # Drop the model between folds to release memory.
        del model
    # write results to file (append mode: one line per language)
    with open('results_csebert.txt', 'a+') as f:
        f.write("{} {} {}\n".format(lang, statistics.mean(accs), statistics.mean(f1s)))
# Build the evaluation frame from the cleaned test split.
eval_df["text"] = xtest.cleaned
eval_df["labels"] = xtest.category
# In TEST mode, shrink both splits to SAMPLE rows for a quick smoke run.
# NOTE(review): indentation reconstructed — assumes both assignments belong
# to the `if TEST:` body; confirm train_df is not meant to be set otherwise.
if TEST:
    eval_df = eval_df[0:SAMPLE]
    train_df = eval_df[0:SAMPLE]
print("Defining model")
# Optional model configuration: no checkpointing, fp16 for speed,
# hyperparameters taken from script-level constants.
model_args = ClassificationArgs(
    num_train_epochs=EPOCH,
    no_save=True,
    overwrite_output_dir=True,
    save_eval_checkpoints=False,
    save_model_every_epoch=False,
    save_optimizer_and_scheduler=False,
    max_seq_length=LENGTH,
    fp16=True,
    train_batch_size=BATCH,
    eval_batch_size=BATCH,
)
# Create a ClassificationModel; label count is inferred from the eval frame.
model = ClassificationModel(FAMILY,
                            FAMILYMODEL,
                            num_labels=len(eval_df.labels.unique()),
                            args=model_args)
print("Model training")
# Train the model
model.train_model(train_df)
def main(source=source, data_dir='data', checkpoint_dir="outputs/eval2/openai_finetune_1", best_model_dir='outputs/eval2/best_model_openai_finetune_1', model_name="roberta-large-openai-detector", n_train=300000, n_valid=8000, n_test=10000, n_epochs=5, learning_rate=1e-06, train_batch_size=64, eval_batch_size=64, evaluate_during_training=True, evaluate_during_training_steps=400, reprocess_input=True, overwrite_output_dir=True, n_gpu=2): print( f'{source}\n{data_dir}\n{checkpoint_dir}\n{best_model_dir}\n{model_name}\n{n_train}\n{n_valid}\n{n_test}\n{n_epochs}\n{learning_rate}\n{train_batch_size}\n{eval_batch_size}\n{evaluate_during_training}\n{evaluate_during_training_steps}\n{reprocess_input}\n{overwrite_output_dir}\n{n_gpu}\n' ) # import pdb; pdb.set_trace() train_df = data_loading.load_split(data_dir, source, 'train', n=n_train) valid_df = data_loading.load_split(data_dir, source_test, 'valid', n=n_valid) test_df = data_loading.load_split(data_dir, source_test, 'test', n=n_test) # Optional model configuration model_args = ClassificationArgs( num_train_epochs=n_epochs, evaluate_during_training=evaluate_during_training, evaluate_during_training_steps=evaluate_during_training_steps, best_model_dir=best_model_dir, manual_seed=0, train_batch_size=train_batch_size, eval_batch_size=eval_batch_size, overwrite_output_dir=overwrite_output_dir, n_gpu=n_gpu, output_dir=checkpoint_dir, reprocess_input_data=reprocess_input, learning_rate=learning_rate) # Create a ClassificationModel model = ClassificationModel("roberta", model_name=model_name, args=model_args, use_cuda=True) # Train the model model.train_model(train_df, eval_df=valid_df, f1=sklearn.metrics.f1_score, acc=sklearn.metrics.accuracy_score, eer=eer) # Evaluate the model result, model_outputs, wrong_predictions = model.eval_model( test_df, f1=sklearn.metrics.f1_score, acc=sklearn.metrics.accuracy_score, eer=eer)
def main(
        source=source,
        data_dir='data',
        checkpoint_dir="outputs/" + experiment_name,
        n_train=np.inf,
        n_valid=5000,
        n_epochs=10,
        n_test=np.inf,
        reprocess_input=True,
        small=True,
):
    """Fine-tune roberta-large on a detection task, optionally augmenting
    training data with random short subsequences.

    NOTE(review): depends on module-level `source`, `source_test`,
    `experiment_name`, `data_loading`, `random`, `sklearn`, and `eer`.
    """
    train_texts, train_labels = data_loading.load_split(data_dir, source, 'train', n=n_train)
    # NOTE(review): validation is drawn from the 'test' split of `source_test`
    # — confirm that is intended.
    valid_texts, valid_labels = data_loading.load_split(data_dir, source_test, 'test', n=n_valid)
    test_texts, test_labels = data_loading.load_split(data_dir, source_test, 'test', n=n_test)
    # Strip any leading "Article:/Body:/Abstract:" prefix, then invert labels
    # (int(not label)) for all three splits.
    for i, text in enumerate(train_texts):
        for key in ['Article: ', 'Body: ', 'Abstract: ']:
            if key in text:
                train_texts[i] = text.split(key)[-1]
    train_labels = [int(not label) for label in train_labels]
    for i, text in enumerate(valid_texts):
        for key in ['Article: ', 'Body: ', 'Abstract: ']:
            if key in text:
                valid_texts[i] = text.split(key)[-1]
    valid_labels = [int(not label) for label in valid_labels]
    for i, text in enumerate(test_texts):
        for key in ['Article: ', 'Body: ', 'Abstract: ']:
            if key in text:
                test_texts[i] = text.split(key)[-1]
    test_labels = [int(not label) for label in test_labels]

    def sample_sequences(texts, labels):
        # Augment each text with one random window per length in
        # {16, 32, 64, 128, 256} tokens (when the text is long enough),
        # carrying the original label.
        small_texts = []
        small_labels = []
        for text, label in zip(texts, labels):
            toks = text.split()
            for seq_len in [16, 32, 64, 128, 256]:
                if len(toks) > seq_len:
                    start_idx = random.randrange(len(toks) - seq_len)
                    subseq = toks[start_idx:start_idx + seq_len]
                    small_texts.append(" ".join(subseq))
                    small_labels.append(label)
        # import pdb; pdb.set_trace()
        all_texts = texts + small_texts
        all_labels = labels + small_labels
        return all_texts, all_labels

    if small:
        train_texts, train_labels = sample_sequences(train_texts, train_labels)
    # Preparing train data
    train_data = {'text': train_texts, 'labels': train_labels}
    train_df = pd.DataFrame(data=train_data)
    # Preparing eval data
    valid_data = {'text': valid_texts, 'labels': valid_labels}
    valid_df = pd.DataFrame(data=valid_data)
    # Preparing test data
    test_data = {'text': test_texts, 'labels': test_labels}
    test_df = pd.DataFrame(data=test_data)
    # Optional model configuration
    model_args = ClassificationArgs(num_train_epochs=n_epochs,
                                    evaluate_during_training=True,
                                    manual_seed=0,
                                    train_batch_size=16,
                                    eval_batch_size=32,
                                    overwrite_output_dir=True,
                                    n_gpu=2,
                                    output_dir=checkpoint_dir,
                                    reprocess_input_data=reprocess_input,
                                    cache_dir="cache_dir/" + experiment_name,
                                    best_model_dir='outputs/best_model_' + experiment_name,
                                    max_seq_length=256)
    # Create a ClassificationModel
    model = ClassificationModel("roberta",
                                "roberta-large",
                                args=model_args,
                                use_cuda=True)
    # Train the model
    model.train_model(train_df,
                      eval_df=valid_df,
                      f1=sklearn.metrics.f1_score,
                      acc=sklearn.metrics.accuracy_score,
                      eer=eer)
    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(
        test_df,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score,
        eer=eer)
def bert_training(model_type, model_base, train_data, early_stop,
                  early_stop_delta, overwrite, epoch, batch_size,
                  learning_rate, output):
    """Train and evaluate a sarcasm classifier from a JSON-lines data file.

    Args:
        model_type / model_base: simpletransformers model type and checkpoint.
        train_data: path to a JSON-lines file with 'label' and 'response' keys.
        early_stop, early_stop_delta, overwrite, epoch, batch_size,
        learning_rate, output: forwarded to ClassificationArgs.
    """
    # Bringing in the training data (one JSON object per line).
    # Fix: `train` was never initialised locally, so the function appended to
    # an implicit module-level list, accumulating rows across calls.
    train = []
    with open(train_data, 'r') as json_file:
        json_list = list(json_file)
    for json_str in json_list:
        train.append(json.loads(json_str))
    # Data cleaning
    train_labels = [train[i]['label'] for i in range(len(train))]
    train_response = [
        remove_stopwords(convert_emojis(train[i]['response']))
        for i in range(len(train))
    ]
    # Split data into training and test sets
    labels_train, labels_test, response_train, response_test = train_test_split(
        train_labels, train_response, test_size=0.2, random_state=42)
    # Convert SARCASM/NO SARCASM labels into 1s and 0s
    labels_train_pd = (pd.DataFrame(labels_train) == 'SARCASM').astype(int)
    labels_test_pd = (pd.DataFrame(labels_test) == 'SARCASM').astype(int)
    response_train_pd = pd.DataFrame(response_train)
    response_test_pd = pd.DataFrame(response_test)
    # Flatten literal "\n" sequences to spaces in the response text.
    train_bert = pd.DataFrame({
        'text': response_train_pd[0].replace(r'\n', ' ', regex=True),
        'label': labels_train_pd[0]
    })
    eval_bert = pd.DataFrame({
        'text': response_test_pd[0].replace(r'\n', ' ', regex=True),
        'label': labels_test_pd[0]
    })
    model_args = ClassificationArgs()
    model_args.use_early_stopping = early_stop
    model_args.early_stopping_delta = early_stop_delta
    model_args.overwrite_output_dir = overwrite
    model_args.num_train_epochs = epoch
    model_args.train_batch_size = batch_size
    model_args.learning_rate = learning_rate
    model_args.output_dir = output
    # Create a TransformerModel
    model = ClassificationModel(model_type, model_base, use_cuda=False,
                                args=model_args)
    # Train the model
    model.train_model(train_bert)
    # Evaluate the model
    model.eval_model(eval_bert)
# NOTE(review): `df` and `balanced_train_df` are built earlier in the script.
print(df.shape)
# Hold out everything not in the balanced training set, then split the
# holdout 50/50 into eval and test.
eval_df = df.drop(balanced_train_df.index, axis=0)
eval_df, test_df = train_test_split(eval_df, test_size=0.5)
train_df = balanced_train_df
# Optional model configuration: early stopping on epoch boundaries,
# no per-epoch/step checkpoints (save_steps=-1 disables step saving).
model_args = ClassificationArgs(
    num_train_epochs=10,
    do_lower_case=True,
    overwrite_output_dir=True,
    output_dir=get_path_from_project_dir('sentence_relevance', 'trained_models'),
    best_model_dir=get_path_from_project_dir('sentence_relevance/best', 'trained_models'),
    save_model_every_epoch=False,
    save_eval_checkpoints=False,
    save_steps=-1,
    evaluate_during_training_verbose=True,
    evaluate_during_training=True,
    early_stopping_consider_epochs=True,
    use_early_stopping=True,
    early_stopping_patience=5,
    early_stopping_delta=5e-3)
# Create a ClassificationModel; label count inferred from the full frame.
model = ClassificationModel("bert",
                            "bert-base-uncased",
                            args=model_args,
                            num_labels=len(set(df['labels'])))
# Train the model
def hyperargs():
    # type: () -> {}
    """
    Builds different sets of arguments for the classifier.
    Must be the same for training and predicting.
    :return: the labeled arguments
    :rtype: {}
    """
    combos = {}
    # One configuration per (window, stride) pair; only those two fields vary.
    for win in [128, 64, 32, 256]:
        for stride in [0.7, 0.8, 0.9]:
            cfg = ClassificationArgs()
            cfg.num_train_epochs = 5
            cfg.fp16 = False
            cfg.overwrite_output_dir = True
            cfg.evaluate_during_training = False
            cfg.sliding_window = True
            cfg.max_seq_length = win
            cfg.stride = stride
            cfg.labels_list = [1, 0]
            cfg.save_eval_checkpoints = False
            cfg.save_model_every_epoch = False
            cfg.silent = True
            cfg.manual_seed = 18
            # Key encodes the sweep point, e.g. "basic5epochs128win7stride".
            key = 'basic5epochs' + str(win) + 'win' + str(int(stride * 10.0)) + 'stride'
            combos[key] = cfg
    return combos
def main(
        sources=sources,
        data_dir='data',
        load_model_dir="outputs/analytic_checkpoint_v0",
        checkpoint_dir="outputs/hackathon_eval1",
        n_train=250000,
        n_valid=10000,
        n_test=np.inf,
        reprocess_input=False,
):
    """Score each test source with the pretrained GPT-2 output detector and
    dump per-row scores to hackathon_outputs/<source>.csv.

    NOTE(review): depends on module-level `sources`, `data_loading`,
    `sklearn`, and `eer`; n_train/n_valid are unused here.
    """
    # Optional model configuration
    model_args = ClassificationArgs(
        output_dir=checkpoint_dir,
        num_train_epochs=2,
        evaluate_during_training=True,
        save_steps=25000,
        evaluate_during_training_steps=25000,
        manual_seed=0,
        train_batch_size=256,
        eval_batch_size=256,
        overwrite_output_dir=True,
        reprocess_input_data=reprocess_input,
        n_gpu=1,
        no_cache=True,
    )
    # Create a ClassificationModel (pretrained detector, not the local checkpoint)
    model = ClassificationModel(
        "roberta",
        # load_model_dir,
        model_name="roberta-large-openai-detector",
        args=model_args,
        use_cuda=True
    )
    for source in sources:
        print(source)
        test_texts, test_labels = data_loading.load_split(data_dir, source, 'test', n=n_test)
        # Strip any leading "Article:/Body:/Abstract:" prefix from each text.
        for i, text in enumerate(test_texts):
            for key in ['Article: ', 'Body: ', 'Abstract: ']:
                if key in text:
                    test_texts[i] = text.split(key)[-1]
        # Invert labels relative to the loaded split.
        test_labels = [not label for label in test_labels]
        # Preparing test data
        test_data = {
            'text': test_texts,
            'labels': test_labels
        }
        test_df = pd.DataFrame(data=test_data)
        # Evaluate the model
        result, model_outputs, wrong_predictions = model.eval_model(
            test_df,
            f1=sklearn.metrics.f1_score,
            acc=sklearn.metrics.accuracy_score,
            eer=eer
        )
        print(result)
        # Write one positive-class score per test row.
        with open('hackathon_outputs/' + source + '.csv', 'w') as outfile:
            outfile.write(f'row_id, score\n')
            for line, output in enumerate(model_outputs):
                outfile.write(f'{line}, {output[1]}\n')
"max_seq_len": 512, "model": model_types[int(sys.argv[2])], "save": model_saves[int(sys.argv[2])] } df = pd.read_csv("data.csv") train_df = df.iloc[:wandb_config["samples"], :] train_df.columns = ["text", "labels"] eval_df = df.iloc[wandb_config["samples"]:, :] eval_df.columns = ["text", "labels"] model_args = ClassificationArgs() model_args.num_train_epochs = wandb_config["epochs"] model_args.eval_batch_size = wandb_config["eval_batch_size"] model_args.train_batch_size = wandb_config["train_batch_size"] model_args.wandb_project = "transformer-aes" model_args.wandb_kwargs = { "name": "{}-{}".format(wandb_config["model"], wandb_config["samples"]) } model_args.learning_rate = wandb_config["lr"] model_args.model = wandb_config["model"] model_args.samples = wandb_config["samples"] # model_args.max_seq_length = wandb_config["max_seq_length"] model_args.regression = True model_args.no_save = True model_args.overwrite_output_dir = True model_args.logging_steps = 1
"min_iter": 6, }, } sweep_id = wandb.sweep(sweep_config, project="RTE - Hyperparameter Optimization") logging.basicConfig(level=logging.INFO) transformers_logger = logging.getLogger("transformers") transformers_logger.setLevel(logging.WARNING) # Preparing train data train_df = load_rte_data_file("data/train.jsonl") eval_df = pd.read_json("data/eval_df", lines=True, orient="records") test_df = pd.read_json("data/test_df", lines=True, orient="records") model_args = ClassificationArgs() model_args.eval_batch_size = 8 model_args.evaluate_during_training = True model_args.evaluate_during_training_silent = False model_args.evaluate_during_training_steps = 1000 model_args.learning_rate = 4e-4 model_args.manual_seed = 4 model_args.max_seq_length = 256 model_args.multiprocessing_chunksize = 5000 model_args.no_cache = True model_args.no_save = True model_args.num_train_epochs = 10 model_args.overwrite_output_dir = True model_args.reprocess_input_data = True model_args.train_batch_size = 16 model_args.gradient_accumulation_steps = 2
def main(
        source=source,
        data_dir='data',
        checkpoint_dir="outputs/eval2/openai_finetune",
        n_train=120000,
        n_valid=5000,
        n_epochs=20,
        n_test=5000,
        reprocess_input=False,
):
    """Fine-tune the pretrained GPT-2 output detector and evaluate it.

    NOTE(review): depends on module-level `source`, `source_test`,
    `data_loading`, `sklearn`, and `eer`.
    """
    # import pdb; pdb.set_trace()
    train_texts, train_labels = data_loading.load_split(data_dir, source, 'train', n=n_train)
    # NOTE(review): validation uses the 'test' split of `source_test` —
    # confirm that is intended.
    valid_texts, valid_labels = data_loading.load_split(data_dir, source_test, 'test', n=n_valid)
    test_texts, test_labels = data_loading.load_split(data_dir, source_test, 'test', n=n_test)
    # Prefix-stripping is disabled here (kept for reference); labels are
    # still inverted for all three splits.
    # for i, text in enumerate(train_texts):
    #     for key in ['Article: ', 'Body: ', 'Abstract: ']:
    #         if key in text:
    #             train_texts[i] = text.split(key)[-1]
    train_labels = [int(not label) for label in train_labels]
    # for i, text in enumerate(valid_texts):
    #     for key in ['Article: ', 'Body: ', 'Abstract: ']:
    #         if key in text:
    #             valid_texts[i] = text.split(key)[-1]
    valid_labels = [int(not label) for label in valid_labels]
    # for i, text in enumerate(test_texts):
    #     for key in ['Article: ', 'Body: ', 'Abstract: ']:
    #         if key in text:
    #             test_texts[i] = text.split(key)[-1]
    test_labels = [int(not label) for label in test_labels]
    # Preparing train data
    train_data = {'text': train_texts, 'labels': train_labels}
    train_df = pd.DataFrame(data=train_data)
    # Preparing eval data
    valid_data = {'text': valid_texts, 'labels': valid_labels}
    valid_df = pd.DataFrame(data=valid_data)
    # Preparing test data
    test_data = {'text': test_texts, 'labels': test_labels}
    test_df = pd.DataFrame(data=test_data)
    # Optional model configuration
    model_args = ClassificationArgs(
        num_train_epochs=n_epochs,
        evaluate_during_training=True,
        evaluate_during_training_steps=2000,
        best_model_dir='outputs/eval2/best_model_openai_finetune',
        manual_seed=0,
        train_batch_size=32,
        eval_batch_size=128,
        overwrite_output_dir=True,
        n_gpu=2,
        output_dir=checkpoint_dir,
        reprocess_input_data=reprocess_input,
        learning_rate=0.00001)
    # Create a ClassificationModel
    model = ClassificationModel("roberta",
                                model_name="roberta-large-openai-detector",
                                args=model_args,
                                use_cuda=True)
    # Train the model, evaluating on the validation split during training.
    model.train_model(train_df,
                      eval_df=valid_df,
                      f1=sklearn.metrics.f1_score,
                      acc=sklearn.metrics.accuracy_score,
                      eer=eer)
    # Evaluate the model on the held-out test split.
    result, model_outputs, wrong_predictions = model.eval_model(
        test_df,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score,
        eer=eer)
df = pd.read_csv(r"./train.csv", header=None, names=["text", "labels"]) sentences_train = sentence_level_data_prep(df) df.reset_index(inplace=True) df.columns = ["ind", "text", "labels"] sentences_train.merge(original_train[["ind", "labels"]], on="ind", how="inner") sentences_train[ "sentence_length"] = sentences_train.sentences_from_abstract.map( lambda x: len(x.split())) sentences_train["label_text"] = pd.Categorical(sentences_train.labels) sentences_train["labels"] = sentences_train.label_text.cat.codes model_args = ClassificationArgs( num_train_epochs=10, sliding_window=True, fp16=False, use_early_stopping=True, reprocess_input_data=True, overwrite_output_dir=True, ) # Create a ClassificationModel model = ClassificationModel("roberta", "roberta-base", num_labels=7, args=model_args) # We train 4 models by selecting sentences above sent_len. We save these model for 10 epochs. At the end, we select best model from these 40 saved epoch models by selecting the one doing the best on the validation set. # for sent_len in [0, 6, 10, 15]: print(sent_len) sentences_train_filtred = sentences_train[(
}, } sweep_id = wandb.sweep(sweep_config, project="concepticon") model_args = ClassificationArgs( num_train_epochs=3, learning_rate=4e-6, no_cache=True if SWEEP else False, no_save=True if SWEEP else False, save_eval_checkpoints=False if SWEEP else True, save_model_every_epoch=False if SWEEP else True, overwrite_output_dir=True, reprocess_input_data=True, evaluate_during_training=True, evaluate_during_training_silent=False, evaluate_during_training_steps=1000, wandb_project="concepticon", train_batch_size=15, eval_batch_size=10, use_early_stopping=True, early_stopping_delta=0.01, early_stopping_metric="f1", early_stopping_metric_minimize=False, early_stopping_patience=5, ) def train(): wandb.init() model_args.wandb_kwargs = {"id": wandb.run.id}
) logging.basicConfig(level=logging.INFO) transformers_logger = logging.getLogger("transformers") transformers_logger.setLevel(logging.WARNING) # Preparing train data train_df = load_rte_data_file("data/train.jsonl") eval_df = pd.read_json("data/eval_df.jsonl", lines=True, orient="records") test_df = pd.read_json("data/test_df.jsonl", lines=True, orient="records") sweep_result = pd.read_csv("sweep_results/deep-sweep.csv") best_params = sweep_result.to_dict() model_args = ClassificationArgs() model_args.eval_batch_size = 32 model_args.evaluate_during_training = True model_args.evaluate_during_training_silent = False model_args.evaluate_during_training_steps = 1000 model_args.learning_rate = 4e-5 model_args.manual_seed = 4 model_args.max_seq_length = 256 model_args.multiprocessing_chunksize = 5000 model_args.no_cache = True # model_args.no_save = True model_args.num_train_epochs = 10 model_args.overwrite_output_dir = True model_args.reprocess_input_data = True model_args.train_batch_size = 16 model_args.gradient_accumulation_steps = 2
from simpletransformers.classification import ClassificationModel, ClassificationArgs # Path to the model. PATH = "/projects/tir5/users/apagnoni/gpt-2-output-dataset/outputs/eval2/best_model_openai_finetune_1" # Model configuration model_args = ClassificationArgs(eval_batch_size=1, no_cache=True) # Loading the model (can take some time). model = ClassificationModel( "roberta", PATH, args=model_args, use_cuda=True, ) def analytics(text): ''' inputs text: (string) outputs (llr, evidence) ''' predictions, raw_ouputs = model.predict([text]) llr = raw_ouputs[0][1] return (llr, None)
WANDB_PROJ_COMPLETE_DATA = "model_complete_data" WANDB_PROJ_AL_BASELINE = "model_al_baseline" WANDB_PROJ_AL_EXP = "model_al_experiments" # Model args for the simpletransformer model # Add or modify parameters based on experiment BEST_MODEL_SPEC_DIR = str(BEST_MODEL_DIR).format(WANDB_PROJ_AL_EXP) MODEL_ARGS = ClassificationArgs( num_train_epochs=5, overwrite_output_dir=True, train_batch_size=16, max_seq_length=250, # modify based on the experiment wandb_project=WANDB_PROJ_AL_EXP, best_model_dir=BEST_MODEL_SPEC_DIR, cache_dir=str(CACHE_DIR), eval_batch_size=16, evaluate_during_training=True, evaluate_during_training_verbose=True, manual_seed=100, output_dir=str(OUTPUT_DIR), use_early_stopping=True, early_stopping_patience=3, reprocess_input_data=True, ) # Model name (roberta-base, roberta-base-uncased, etc) MODEL_NAME = "roberta" MODEL_TYPE = "roberta-base" # Labels for classification LABELS = {
get_ipython().system(' pip install torchvision ') # In[4]: from simpletransformers.classification import ClassificationModel, ClassificationArgs import logging import torch import torchvision # set logggin messages logging.basicConfig(level=logging.INFO) transformers_logger = logging.getLogger("transformers") transformers_logger.setLevel(logging.WARNING) # Optional model configuration model_args = ClassificationArgs(num_train_epochs=1) # In[6]: # Create a ClassificationModel model = ClassificationModel( 'bert', 'bert-base-cased', num_labels=3, args=model_args, # args={'reprocess_input_data': True}, use_cuda=False, ) # Train the model model.train_model(train_df)
# Preparing train data: tiny true/false toy set for the wandb sweep demo.
train_data = [
    ["Aragorn was the heir of Isildur", "true"],
    ["Frodo was the heir of Isildur", "false"],
]
train_df = pd.DataFrame(train_data)
train_df.columns = ["text", "labels"]

# Preparing eval data
eval_data = [
    ["Theoden was the king of Rohan", "true"],
    ["Merry was the king of Rohan", "false"],
]
eval_df = pd.DataFrame(eval_data)
eval_df.columns = ["text", "labels"]

# Shared model arguments; string labels are declared explicitly.
model_args = ClassificationArgs()
model_args.reprocess_input_data = True
model_args.overwrite_output_dir = True
model_args.evaluate_during_training = True
model_args.manual_seed = 4
model_args.use_multiprocessing = True
model_args.train_batch_size = 16
model_args.eval_batch_size = 8
model_args.labels_list = ["true", "false"]
model_args.wandb_project = "Simple Sweep"


def train():
    # Initialize a new wandb run
    # NOTE(review): function body may continue beyond this chunk in the
    # original file.
    wandb.init()
def buildbertargs():
    # type: () -> ClassificationArgs
    """
    Builds arguments for the classifier.
    Must be the same for training and predicting.
    :return: the arguments
    :rtype: ClassificationArgs
    """
    cfg = ClassificationArgs()
    # Fixed configuration shared by training and prediction.
    settings = {
        'num_train_epochs': 5,
        'fp16': False,
        'overwrite_output_dir': True,
        'evaluate_during_training': False,
        'sliding_window': True,
        'max_seq_length': 256,
        'stride': 0.9,
        'labels_list': [1, 0],
        'save_model_every_epoch': False,
        'silent': True,
        'manual_seed': 18,
    }
    for name, value in settings.items():
        setattr(cfg, name, value)
    return cfg