def get_max_seq_length():
    """Report the maximum tokenized sequence length of the train/dev sets.

    Loads both TSV datasets, tokenizes their "Text" columns with the
    model's tokenizer, prints the maxima, and returns them.

    Returns:
        tuple: (train_max, dev_max) — the values computed by
        ``tokenize_text`` for each split (presumably the max token
        count; confirm against ``tokenize_text``'s definition).
    """
    train = pd.read_csv(TRAINING_DATA_PATH, sep='\t')
    dev = pd.read_csv(VALIDATION_DATA_PATH, sep='\t')

    # The model is instantiated only to obtain its tokenizer.
    model = ClassificationModel(MODEL_TYPE, MODEL_NAME, args=args,
                                use_cuda=torch.cuda.is_available())
    tokenizer = model.tokenizer
    train_max = tokenize_text(train["Text"], tokenizer)
    dev_max = tokenize_text(dev["Text"], tokenizer)

    print('training set max seq length: ', train_max)
    print('dev set max seq length: ', dev_max)
    # Fix: previously the computed maxima were printed and then discarded;
    # return them so callers can size max_seq_length programmatically.
    return train_max, dev_max
def run_hasoc_experiment():
    """Train and evaluate a HASOC task_1 classifier.

    Reads HASOC_DATA_PATH (TSV), splits it 80/20 into train/test, strips
    user names and URLs from the text, trains a ClassificationModel
    (optionally with in-training evaluation), and logs confusion matrix,
    accuracy, F1, recall and precision on the held-out split. Predictions
    are written to TEMP_DIRECTORY/RESULT_FILE as TSV.

    Returns:
        The best-model directory path (``hasoc_args['best_model_dir']``).
    """
    if not os.path.exists(TEMP_DIRECTORY):
        os.makedirs(TEMP_DIRECTORY)

    full = pd.read_csv(HASOC_DATA_PATH, sep='\t')

    le = LabelEncoder()
    train, test = train_test_split(full, test_size=0.2, random_state=SEED)
    # Work on copies: train_test_split may return views of `full`, and the
    # column assignments below would raise SettingWithCopyWarning (and can
    # silently fail to write) on a view.
    train = train.copy()
    test = test.copy()

    train['label'] = le.fit_transform(train["task_1"])
    train = train[['text', 'label']]
    train['text'] = train['text'].apply(remove_names)
    train['text'] = train['text'].apply(remove_urls)

    # Bug fix: use transform() rather than fit_transform() so test labels
    # are encoded with the mapping fitted on the training split; refitting
    # could silently produce a different label mapping if the class sets
    # ever differ between the splits.
    test['label'] = le.transform(test["task_1"])
    test = test[['text', 'label']]
    test['text'] = test['text'].apply(remove_names)
    test['text'] = test['text'].apply(remove_urls)

    # Create a ClassificationModel; class weights could be set via the
    # optional `weight` argument.
    model = ClassificationModel(MODEL_TYPE,
                                MODEL_NAME,
                                args=hasoc_args,
                                use_cuda=torch.cuda.is_available())

    logging.info("Started Training")

    if hasoc_args["evaluate_during_training"]:
        # Hold out 10% of the training data for in-training evaluation.
        train, eval_df = train_test_split(train,
                                          test_size=0.1,
                                          random_state=SEED)
        model.train_model(train,
                          eval_df=eval_df,
                          f1=sklearn.metrics.f1_score,
                          accuracy=sklearn.metrics.accuracy_score)
    else:
        model.train_model(train,
                          f1=sklearn.metrics.f1_score,
                          accuracy=sklearn.metrics.accuracy_score)

    logging.info("Finished Training")

    # Evaluate on the held-out split; when in-training evaluation was used,
    # reload the checkpoint that scored best during training.
    test_sentences = test['text'].tolist()

    if hasoc_args["evaluate_during_training"]:
        model = ClassificationModel(MODEL_TYPE,
                                    hasoc_args["best_model_dir"],
                                    args=hasoc_args,
                                    use_cuda=torch.cuda.is_available())

    predictions, raw_outputs = model.predict(test_sentences)

    test['predictions'] = predictions

    (tn, fp, fn, tp), accuracy, weighted_f1, macro_f1, weighted_recall, \
        weighted_precision = evaluatation_scores(test, 'label', "predictions")

    test.to_csv(os.path.join(TEMP_DIRECTORY, RESULT_FILE),
                header=True,
                sep='\t',
                index=False,
                encoding='utf-8')

    logging.info("Confusion Matrix (tn, fp, fn, tp) {} {} {} {}".format(
        tn, fp, fn, tp))
    logging.info("Accuracy {}".format(accuracy))
    logging.info("Weighted F1 {}".format(weighted_f1))
    logging.info("Macro F1 {}".format(macro_f1))
    logging.info("Weighted Recall {}".format(weighted_recall))
    logging.info("Weighted Precision {}".format(weighted_precision))

    return hasoc_args['best_model_dir']
# Turkish data prep: encode subtask_a as integer labels and reduce each
# split to the (text, label) pair the classifier expects; tweets are
# normalised with transformer_pipeline.
train['label'] = le.fit_transform(train["subtask_a"])
train['text'] = train["tweet"].apply(transformer_pipeline)
train = train[['text', 'label']]

dev['label'] = le.fit_transform(dev["subtask_a"])
dev['text'] = dev["tweet"].apply(transformer_pipeline)
dev = dev[['text', 'label']]

# The test split keeps its original columns; only the text is cleaned.
test['text'] = test["tweet"].apply(transformer_pipeline)

# Build the Turkish classifier; the optional `weight` argument could be
# used to set class weights.
model = ClassificationModel(MODEL_TYPE,
                            MODEL_NAME,
                            args=turkish_args,
                            use_cuda=torch.cuda.is_available())

# Train the model.
logging.info("Started Training")

if turkish_args["evaluate_during_training"]:
    # Hold out 10% of the training data for in-training evaluation.
    train, eval_df = train_test_split(train,
                                      test_size=0.1,
                                      random_state=SEED)
    model.train_model(train, eval_df=eval_df)
else:
    model.train_model(train,
                      f1=sklearn.metrics.f1_score,
                      accuracy=sklearn.metrics.accuracy_score)
print("Started Training")


def _fold_matrix(rows):
    # One row per example, one column per fold.
    return np.zeros((rows, args["n_fold"]))


# Per-fold prediction buffers: hard labels in one matrix, and one matrix
# of raw outputs per class.
dev_sentences = dev['text'].tolist()
dev_preds = _fold_matrix(len(dev))
dev_raw_preds = [_fold_matrix(len(dev)) for _ in range(N_CLASSES)]

test_sentences = test['text'].tolist()
test_preds = _fold_matrix(len(test))
test_raw_preds = [_fold_matrix(len(test)) for _ in range(N_CLASSES)]

if args["evaluate_during_training"]:
    for i in range(args["n_fold"]):
        if os.path.exists(args['output_dir']) and os.path.isdir(args['output_dir']):
            shutil.rmtree(args['output_dir'])
        print("Started Fold {}".format(i))
        model = ClassificationModel(MODEL_TYPE, MODEL_NAME, args=args,
                                    use_cuda=torch.cuda.is_available())  # You can set class weights by using the optional weight argument
        train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED * i)
        model.train_model(train_df, eval_df=eval_df, f1=f1, accuracy=sklearn.metrics.accuracy_score)
        model = ClassificationModel(MODEL_TYPE, args["best_model_dir"], args=args,
                                    use_cuda=torch.cuda.is_available())

        predictions, raw_outputs = model.predict(dev_sentences)
        dev_preds[:, i] = predictions
        np_raw_output = np.array(raw_outputs)
        for j in range(N_CLASSES):
            dev_raw_preds[j][:, i] = np_raw_output[:, j]

        test_predictions, test_raw_outputs = model.predict(test_sentences)
        test_preds[:, i] = test_predictions
        np_test_raw_output = np.array(test_raw_outputs)
        for j in range(N_CLASSES):