def get_max_seq_length():
    """Print the maximum tokenised sequence length of the training and
    validation splits, measured with the model's own tokenizer.

    Side effects only (two ``print`` lines); returns ``None``.
    """
    train_df = pd.read_csv(TRAINING_DATA_PATH, sep='\t')
    dev_df = pd.read_csv(VALIDATION_DATA_PATH, sep='\t')

    # Instantiate the model only to borrow its tokenizer, so the reported
    # lengths correspond to what the model will actually see in training.
    clf = ClassificationModel(MODEL_TYPE, MODEL_NAME, args=args,
                              use_cuda=torch.cuda.is_available())
    bert_tokenizer = clf.tokenizer

    max_train = tokenize_text(train_df["Text"], bert_tokenizer)
    max_dev = tokenize_text(dev_df["Text"], bert_tokenizer)

    print('training set max seq length: ', max_train)
    print('dev set max seq length: ', max_dev)
def run_hasoc_experiment():
    """Train a classifier on the HASOC data and evaluate it on a held-out
    20% test split.

    Reads ``HASOC_DATA_PATH`` (TSV with ``text`` / ``task_1`` columns),
    trains a ``ClassificationModel`` configured by ``hasoc_args``, writes
    per-example predictions to ``TEMP_DIRECTORY/RESULT_FILE`` and logs the
    evaluation metrics.

    Returns:
        str: ``hasoc_args['best_model_dir']``, the directory holding the
        best checkpoint (used by downstream transfer-learning runs).
    """
    if not os.path.exists(TEMP_DIRECTORY):
        os.makedirs(TEMP_DIRECTORY)

    full = pd.read_csv(HASOC_DATA_PATH, sep='\t')
    le = LabelEncoder()
    train, test = train_test_split(full, test_size=0.2, random_state=SEED)

    # Work on copies: assigning columns on train_test_split views raises
    # pandas' SettingWithCopyWarning and may silently not stick.
    train = train.copy()
    test = test.copy()

    train['label'] = le.fit_transform(train["task_1"])
    train = train[['text', 'label']]
    train['text'] = train['text'].apply(remove_names)
    train['text'] = train['text'].apply(remove_urls)

    # BUG FIX: reuse the encoder fitted on the training split (transform,
    # not fit_transform) so both splits share one label↔integer mapping.
    test['label'] = le.transform(test["task_1"])
    test = test[['text', 'label']]
    test['text'] = test['text'].apply(remove_names)
    test['text'] = test['text'].apply(remove_urls)

    # Create a ClassificationModel. You can set class weights by using the
    # optional weight argument.
    model = ClassificationModel(MODEL_TYPE, MODEL_NAME, args=hasoc_args,
                                use_cuda=torch.cuda.is_available())

    # Train the model
    logging.info("Started Training")
    if hasoc_args["evaluate_during_training"]:
        # Carve a 10% eval set out of train for periodic evaluation.
        train, eval_df = train_test_split(train, test_size=0.1,
                                          random_state=SEED)
        model.train_model(train, eval_df=eval_df,
                          f1=sklearn.metrics.f1_score,
                          accuracy=sklearn.metrics.accuracy_score)
    else:
        model.train_model(train, f1=sklearn.metrics.f1_score,
                          accuracy=sklearn.metrics.accuracy_score)
    logging.info("Finished Training")

    # Evaluate the model
    test_sentences = test['text'].tolist()
    if hasoc_args["evaluate_during_training"]:
        # Reload the best checkpoint saved during training rather than
        # evaluating with the final (possibly overfit) weights.
        model = ClassificationModel(MODEL_TYPE, hasoc_args["best_model_dir"],
                                    args=hasoc_args,
                                    use_cuda=torch.cuda.is_available())
    predictions, raw_outputs = model.predict(test_sentences)
    test['predictions'] = predictions

    (tn, fp, fn, tp), accuracy, weighted_f1, macro_f1, weighted_recall, \
        weighted_precision = evaluatation_scores(test, 'label', "predictions")

    test.to_csv(os.path.join(TEMP_DIRECTORY, RESULT_FILE), header=True,
                sep='\t', index=False, encoding='utf-8')

    logging.info("Confusion Matrix (tn, fp, fn, tp) {} {} {} {}".format(
        tn, fp, fn, tp))
    logging.info("Accuracy {}".format(accuracy))
    # BUG FIX: the original literal contained a raw newline inside the
    # quotes ("Weighted F1 \n {}"), which is a SyntaxError in Python.
    logging.info("Weighted F1 {}".format(weighted_f1))
    logging.info("Macro F1 {}".format(macro_f1))
    logging.info("Weighted Recall {}".format(weighted_recall))
    logging.info("Weighted Precision {}".format(weighted_precision))

    return hasoc_args['best_model_dir']
train['label'] = le.fit_transform(train["subtask_a"]) train['text'] = train["tweet"] train = train[['text', 'label']] train['text'] = train['text'].apply(lambda x: transformer_pipeline(x)) dev['label'] = le.fit_transform(dev["subtask_a"]) dev['text'] = dev["tweet"] dev = dev[['text', 'label']] dev['text'] = dev['text'].apply(lambda x: transformer_pipeline(x)) test['text'] = test["tweet"] test['text'] = test['text'].apply(lambda x: transformer_pipeline(x)) model = ClassificationModel( MODEL_TYPE, MODEL_NAME, args=turkish_args, use_cuda=torch.cuda.is_available( )) # You can set class weights by using the optional weight argument # Train the model logging.info("Started Training") if turkish_args["evaluate_during_training"]: train, eval_df = train_test_split(train, test_size=0.1, random_state=SEED) model.train_model(train, eval_df=eval_df) else: model.train_model(train, f1=sklearn.metrics.f1_score, accuracy=sklearn.metrics.accuracy_score)
print("Started Training") dev_sentences = dev['text'].tolist() dev_preds = np.zeros((len(dev), args["n_fold"])) dev_raw_preds = [np.zeros((len(dev), args["n_fold"])) for i in range(N_CLASSES)] test_sentences = test['text'].tolist() test_preds = np.zeros((len(test), args["n_fold"])) test_raw_preds = [np.zeros((len(test), args["n_fold"])) for i in range(N_CLASSES)] if args["evaluate_during_training"]: for i in range(args["n_fold"]): if os.path.exists(args['output_dir']) and os.path.isdir(args['output_dir']): shutil.rmtree(args['output_dir']) print("Started Fold {}".format(i)) model = ClassificationModel(MODEL_TYPE, MODEL_NAME, args=args, use_cuda=torch.cuda.is_available()) # You can set class weights by using the optional weight argument train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED * i) model.train_model(train_df, eval_df=eval_df, f1=f1, accuracy=sklearn.metrics.accuracy_score) model = ClassificationModel(MODEL_TYPE, args["best_model_dir"], args=args, use_cuda=torch.cuda.is_available()) predictions, raw_outputs = model.predict(dev_sentences) dev_preds[:, i] = predictions np_raw_output = np.array(raw_outputs) for j in range(N_CLASSES): dev_raw_preds[j][:, i] = np_raw_output[:, j] test_predictions, test_raw_outputs = model.predict(test_sentences) test_preds[:, i] = test_predictions np_test_raw_output = np.array(test_raw_outputs) for j in range(N_CLASSES):