def _shift_line_timestamp(line: bytes, offset: int) -> bytes:
    """Return ``line`` with its ninth SOH-separated field increased by ``offset``.

    The line is decoded as UTF-8 and split on the 0x01 separator into exactly
    ten fields; the tenth keeps the remainder of the line untouched. The ninth
    field is parsed as an integer, shifted, and the line is re-encoded.

    Raises:
        ValueError: if the line has fewer than ten fields or the ninth field
            cannot be parsed as an integer.
    """
    fields = line.decode('utf-8').split("\x01", 9)
    if len(fields) != 10:
        raise ValueError(
            f"expected 10 fields separated by \\x01, found {len(fields)}")
    fields[8] = str(int(fields[8]) + offset)
    return "\x01".join(fields).encode('utf-8')


def holdout_split_train_test(input_dataset_id: str, pc_hold_out: float = 0.10, random_seed = 888):
    """Split a gzipped dataset into ``cherry_train`` and ``cherry_val`` files.

    Every input line draws a random number from a generator seeded with
    ``random_seed``:

    * draw < ``pc_hold_out``: the line is written to the validation file with
      its timestamp field moved forward by ``SECONDS_IN_A_WEEK``;
    * otherwise the line is written to the validation file (timestamp shifted
      the same way) when its running index is in the sampled set, and to the
      train file unchanged when it is not.

    The sampled set is built from the ``mapped_feature_engager_id`` feature of
    the ``holdout_new_train`` dataset: fixed numbers of engagers with exactly
    1, 2 and 3 rows are sampled and all of their row indices are selected.

    Args:
        input_dataset_id: name of the ``<id>.csv.gz`` file under the dataset path.
        pc_hold_out: probability threshold below which a line is held out.
        random_seed: seed of the per-line random generator.

    Raises:
        ValueError: if a line that must be shifted is malformed.
    """
    input_file_name = f"{RootPath.get_dataset_path()}/{input_dataset_id}.csv.gz"
    train_file_name = f"{RootPath.get_dataset_path()}/cherry_train.csv.gz"
    val_file_name = f"{RootPath.get_dataset_path()}/cherry_val.csv.gz"

    x = get_feature("mapped_feature_engager_id", "holdout_new_train")
    y = x.groupby("mapped_feature_engager_id").size()

    # (rows per engager, number of such engagers to move to validation).
    # NOTE(review): sample() is called without random_state, so this selection
    # is not controlled by random_seed.
    lines_to_val = set()
    for rows_per_engager, engagers_to_sample in ((1, 469531), (2, 550118), (3, 313919)):
        sampled_engagers = set(y[y == rows_per_engager].sample(engagers_to_sample).index)
        selected_rows = x[x['mapped_feature_engager_id'].isin(sampled_engagers)]
        lines_to_val.update(selected_rows.index.tolist())

    r = random.Random(random_seed)
    # One draw is discarded before the loop; presumably this keeps the random
    # sequence aligned with the split that produced "holdout_new_train" —
    # TODO confirm before removing.
    r.random()

    # Counts only the lines that were NOT held out, so it is presumably the row
    # index within "holdout_new_train" — verify against the producer of that
    # dataset.
    line_counter = 0

    # All three files are managed by the with-statement so the gzip streams
    # are flushed and closed even if a line fails to parse.
    with gzip.open(input_file_name, "rb") as file, \
            gzip.open(train_file_name, "wb") as train_file, \
            gzip.open(val_file_name, "wb") as test_file:
        for line in file:
            if r.random() >= pc_hold_out:
                if line_counter in lines_to_val:
                    test_file.write(_shift_line_timestamp(line, SECONDS_IN_A_WEEK))
                else:
                    train_file.write(line)
                line_counter += 1
            else:
                # line_counter is the index the next kept line will get, so a
                # match here only produces an informational message.
                if line_counter in lines_to_val:
                    print("Oh oh oopsie woopsie. this is not an error")
                test_file.write(_shift_line_timestamp(line, SECONDS_IN_A_WEEK))
def main():
    """Predict like/retweet/reply/comment scores on the test set and write them out.

    Loads a saved multi-label model, runs it on the ``new_test`` dataset and
    writes one submission file per label. Column ``i`` of the prediction
    matrix is written for the ``i``-th label of
    ``("like", "retweet", "reply", "comment")``.

    Only the two raw creator count features are used; the larger feature sets
    that used to sit here as disabled string literals have been dropped.
    """
    feature_list = [
        "raw_feature_creator_follower_count",  # 0
        "raw_feature_creator_following_count",  # 1
    ]

    # Order matches the columns of the prediction matrix.
    labels = ("like", "retweet", "reply", "comment")

    print("Running on labels : like - retweet - reply - comment")

    # NOTE(review): the output path mentions "cherry_val" although predictions
    # are computed on "new_test" — confirm this is the intended location.
    submission_filename = "Dataset/Features/cherry_val/ensembling/nn_predictions"

    chunksize = 2048

    train_dataset = "cherry_train"
    test_dataset = "new_test"

    ffnn_params = {
        'hidden_size_1': 128,
        'hidden_size_2': 64,
        'hidden_dropout_prob_1': 0.5,
        'hidden_dropout_prob_2': 0.5
    }
    rec_params = {
        'epochs': 5,
        'weight_decay': 1e-5,
        'lr': 2e-5,
        'cap_length': 128,
        'ffnn_params': ffnn_params
    }

    saved_model_path = "./saved_models/saved_model_multi_label"

    rec = MultiDistilBertRec(**rec_params)

    # The normalized frame is not used afterwards; presumably this call is
    # made so that `rec` stores the training normalization statistics before
    # predicting — TODO confirm.
    train_df = get_dataset(features=feature_list, dataset_id=train_dataset)
    train_df = train_df.head(3840000)
    train_df = rec._normalize_features(train_df, is_train=True)

    ###   PREDICTION
    test_df = get_dataset(features=feature_list, dataset_id=test_dataset)

    prediction_start_time = time.time()

    text_test_reader_df = get_feature_reader(
        feature_name="raw_feature_tweet_text_token",
        dataset_id=test_dataset,
        chunksize=chunksize)
    predictions = rec.get_prediction(
        df_test_features=test_df,
        df_test_tokens_reader=text_test_reader_df,
        pretrained_model_dict_path=saved_model_path)
    print(f"Prediction time: {time.time() - prediction_start_time} seconds")

    print(predictions)
    print(predictions.shape)

    tweets = get_feature("raw_feature_tweet_id",
                         test_dataset)["raw_feature_tweet_id"].array
    users = get_feature("raw_feature_engager_id",
                        test_dataset)["raw_feature_engager_id"].array

    # Each label gets its own prediction column (previously every file was
    # written with the "like" column).
    for column, label in enumerate(labels):
        create_submission_file(tweets, users, predictions[:, column],
                               f"{submission_filename}_{label}.csv")
# Example #3
# 0
def main():
    """Train the multi-label (like/retweet/reply/comment) DistilBERT recommender.

    Loads the first ``n_data_train`` rows of ``cherry_train`` and the first
    ``n_data_val`` rows of ``cherry_val`` (features, the four engagement
    labels and a chunked token reader for each), fits ``MultiDistilBertRec``
    and writes the returned statistics to ``stats.txt``, one entry per line.

    Only the two raw creator count features are used; the larger feature sets
    that used to sit here as disabled string literals have been dropped.
    """
    feature_list = [
        "raw_feature_creator_follower_count",  # 0
        "raw_feature_creator_following_count",  # 1
    ]

    # Label columns, in the order they are concatenated.
    label_features = [
        f"tweet_feature_engagement_is_{label}"
        for label in ("like", "retweet", "reply", "comment")
    ]

    chunksize = 192
    n_data_train = chunksize * 20000
    n_data_val = chunksize * 10000

    train_dataset = "cherry_train"
    val_dataset = "cherry_val"

    print("Running on labels : like - retweet - reply - comment")

    print(f"n_data_train: {n_data_train}")
    print(f"n_data_val: {n_data_val}")

    print(f"train_dataset: {train_dataset}")
    print(f"val_dataset: {val_dataset}")

    # --- training split ---
    feature_train_df = get_dataset(features=feature_list, dataset_id=train_dataset)
    feature_train_df = feature_train_df.head(n_data_train)

    label_train_df = pd.concat(
        [get_feature(feature_name=name, dataset_id=train_dataset)
         for name in label_features],
        axis=1)
    label_train_df = label_train_df.head(n_data_train)

    text_train_reader_df = get_feature_reader(feature_name="raw_feature_tweet_text_token",
                                              dataset_id=train_dataset,
                                              chunksize=chunksize)

    # --- validation split ---
    feature_val_df = get_dataset(features=feature_list, dataset_id=val_dataset)
    feature_val_df = feature_val_df.head(n_data_val)

    label_val_df = pd.concat(
        [get_feature(feature_name=name, dataset_id=val_dataset)
         for name in label_features],
        axis=1)
    label_val_df = label_val_df.head(n_data_val)

    text_val_reader_df = get_feature_reader(feature_name="raw_feature_tweet_text_token",
                                            dataset_id=val_dataset,
                                            chunksize=chunksize)

    ffnn_params = {'hidden_size_1': 128, 'hidden_size_2': 64, 'hidden_dropout_prob_1': 0.5, 'hidden_dropout_prob_2': 0.5}
    rec_params = {'epochs': 5, 'weight_decay': 1e-5, 'lr': 2e-5, 'cap_length': 128, 'ffnn_params': ffnn_params}

    print(f"bert_params: {rec_params}")

    rec = MultiDistilBertRec(**rec_params)

    ###   TRAINING
    # The original call was missing the comma after save_filename, which made
    # the whole module fail to parse.
    stats = rec.fit(
        df_train_features=feature_train_df,
        df_train_tokens_reader=text_train_reader_df,
        df_train_label=label_train_df,
        df_val_features=feature_val_df,
        df_val_tokens_reader=text_val_reader_df,
        df_val_label=label_val_df,
        save_filename="multi_label",
        cat_feature_set=set(),
    )

    print("STATS: \n")
    print(stats)
    with open('stats.txt', 'w+') as f:
        for s in stats:
            f.write(str(s) + '\n')
# Example #4
# 0
def main(label_1, label_2, test_dataset, model_id):
    """Run a saved dual-label DistilBERT model on ``test_dataset``.

    The model stored at ``./saved_models/saved_model_{label_1}_{label_2}_{model_id}``
    is used to score the test set. Column 0 of the predictions is written as
    the ``label_1`` submission and column 1 as the ``label_2`` submission,
    both under ``Dataset/Features/{test_dataset}/ensembling``.

    Args:
        label_1: label name associated with prediction column 0.
        label_2: label name associated with prediction column 1.
        test_dataset: id of the dataset to predict on.
        model_id: 1 selects the first block of training rows, 2 the second
            block; any other value leaves the training frame unsliced.
    """
    features = [
        "raw_feature_creator_follower_count",
        "raw_feature_creator_following_count",
        "raw_feature_engager_follower_count",
        "raw_feature_engager_following_count",
        "raw_feature_creator_is_verified",
        "raw_feature_engager_is_verified",
        "raw_feature_engagement_creator_follows_engager",
        "tweet_feature_number_of_photo",
        "tweet_feature_number_of_video",
        "tweet_feature_number_of_gif",
        "tweet_feature_number_of_media",
        "tweet_feature_is_retweet",
        "tweet_feature_is_quote",
        "tweet_feature_is_top_level",
        "tweet_feature_number_of_hashtags",
        "tweet_feature_creation_timestamp_hour",
        "tweet_feature_creation_timestamp_week_day",
        "tweet_feature_token_length",
        "tweet_feature_token_length_unique",
        "tweet_feature_text_topic_word_count_adult_content",
        "tweet_feature_text_topic_word_count_kpop",
        "tweet_feature_text_topic_word_count_covid",
        "tweet_feature_text_topic_word_count_sport",
        "number_of_engagements_with_language_like",
        "number_of_engagements_with_language_retweet",
        "number_of_engagements_with_language_reply",
        "number_of_engagements_with_language_comment",
        "number_of_engagements_with_language_negative",
        "number_of_engagements_with_language_positive",
        "number_of_engagements_ratio_like",
        "number_of_engagements_ratio_retweet",
        "number_of_engagements_ratio_reply",
        "number_of_engagements_ratio_comment",
        "number_of_engagements_ratio_negative",
        "number_of_engagements_ratio_positive",
        "number_of_engagements_between_creator_and_engager_like",
        "number_of_engagements_between_creator_and_engager_retweet",
        "number_of_engagements_between_creator_and_engager_reply",
        "number_of_engagements_between_creator_and_engager_comment",
        "number_of_engagements_between_creator_and_engager_negative",
        "number_of_engagements_between_creator_and_engager_positive",
        "number_of_engagements_like",
        "number_of_engagements_retweet",
        "number_of_engagements_reply",
        "number_of_engagements_comment",
        "number_of_engagements_negative",
        "number_of_engagements_positive",
        "tweet_feature_creation_timestamp_hour_shifted",
        "tweet_feature_creation_timestamp_day_phase",
        "tweet_feature_creation_timestamp_day_phase_shifted",
        "engager_feature_number_of_previous_like_engagement_ratio",
        "engager_feature_number_of_previous_reply_engagement_ratio",
        "engager_feature_number_of_previous_retweet_engagement_ratio",
        "engager_feature_number_of_previous_comment_engagement_ratio",
        "engager_feature_number_of_previous_positive_engagement_ratio",
        "engager_feature_number_of_previous_negative_engagement_ratio",
        "adjacency_between_creator_and_engager_retweet",
        "adjacency_between_creator_and_engager_reply",
        "adjacency_between_creator_and_engager_comment",
        "adjacency_between_creator_and_engager_like",
        "adjacency_between_creator_and_engager_positive",
        "adjacency_between_creator_and_engager_negative",
        "graph_two_steps_adjacency_positive",
        "graph_two_steps_adjacency_negative",
        "graph_two_steps_adjacency_like",
        "graph_two_steps_adjacency_reply",
        "graph_two_steps_adjacency_retweet",
        "graph_two_steps_adjacency_comment",
        "graph_two_steps_positive",
        "graph_two_steps_negative",
        "graph_two_steps_like",
        "graph_two_steps_reply",
        "graph_two_steps_retweet",
        "graph_two_steps_comment",
    ]

    print(f"Model : {model_id}")
    print(f"Running on labels : {label_1} - {label_2}")

    ip = '34.242.41.76'  # not referenced anywhere below
    output_dir = f"Dataset/Features/{test_dataset}/ensembling"
    output_prefix = f"{output_dir}/nn_predictions"

    # Rows consumed by one model = training batch size * number of batches.
    batch_rows = 192
    batches_per_model = 20000
    rows_per_model = batch_rows * batches_per_model

    prediction_chunk_rows = 2048
    training_set_id = "cherry_train"

    print(f"Test dataset : {test_dataset}")

    checkpoint_path = f"./saved_models/saved_model_{label_1}_{label_2}_{model_id}"

    recommender = DualDistilBertRec(
        epochs=2,
        weight_decay=1e-5,
        lr=2e-5,
        cap_length=128,
        ffnn_params={
            'hidden_size_1': 128,
            'hidden_size_2': 64,
            'hidden_dropout_prob_1': 0.5,
            'hidden_dropout_prob_2': 0.5,
        },
    )

    training_frame = get_dataset(features=features, dataset_id=training_set_id)
    if model_id == 1:
        training_frame = training_frame.head(rows_per_model)
    elif model_id == 2:
        training_frame = training_frame.iloc[rows_per_model:2 * rows_per_model]

    # The result is not used afterwards; presumably the call lets the
    # recommender record training normalization statistics — TODO confirm.
    training_frame = recommender._normalize_features(training_frame, is_train=True)

    ###   PREDICTION
    test_frame = get_dataset(features=features, dataset_id=test_dataset)

    started_at = time.time()
    token_reader = get_feature_reader(
        feature_name="raw_feature_tweet_text_token",
        dataset_id=test_dataset,
        chunksize=prediction_chunk_rows,
    )
    predictions = recommender.get_prediction(
        df_test_features=test_frame,
        df_test_tokens_reader=token_reader,
        pretrained_model_dict_path=checkpoint_path,
    )
    elapsed = time.time() - started_at
    print(f"Prediction time: {elapsed} seconds")

    print(predictions)
    print(predictions.shape)

    first_label_scores, second_label_scores = predictions[:, 0], predictions[:, 1]

    tweet_ids = get_feature("raw_feature_tweet_id", test_dataset)
    tweet_ids = tweet_ids["raw_feature_tweet_id"].array
    engager_ids = get_feature("raw_feature_engager_id", test_dataset)
    engager_ids = engager_ids["raw_feature_engager_id"].array

    # Make sure the output directory exists before writing into it.
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

    create_submission_file(tweet_ids, engager_ids, first_label_scores,
                           f"{output_prefix}_{label_1}_{model_id}.csv")
    create_submission_file(tweet_ids, engager_ids, second_label_scores,
                           f"{output_prefix}_{label_2}_{model_id}.csv")
# Example #5
# 0
def main(class_label, model_id):
    """Train a single-label DistilBERT recommender on one data window.

    Args:
        class_label: ``"comment"`` (base feature set, 10000 training batches)
            or ``"reply"`` (extended feature set, 20000 training batches).
        model_id: ``1`` trains on the first window of rows, ``2`` on the
            second window and additionally tells ``fit`` how many batches to
            skip for the training and validation readers.

    The statistics returned by ``fit`` are printed and written to
    ``stats.txt``, one entry per line.

    Raises:
        ValueError: if ``class_label`` or ``model_id`` is not one of the
            supported values. Previously these cases failed later with an
            unbound local variable, after the data had already been loaded.
    """
    # Validate before any expensive loading takes place.
    if class_label not in ("comment", "reply"):
        raise ValueError(
            f"unsupported class_label {class_label!r}; expected 'comment' or 'reply'")
    if model_id not in (1, 2):
        raise ValueError(f"unsupported model_id {model_id!r}; expected 1 or 2")

    base_features = [
        "raw_feature_creator_follower_count",
        "raw_feature_creator_following_count",
        "raw_feature_engager_follower_count",
        "raw_feature_engager_following_count",
        "raw_feature_creator_is_verified", "raw_feature_engager_is_verified",
        "raw_feature_engagement_creator_follows_engager",
        "tweet_feature_number_of_photo", "tweet_feature_number_of_video",
        "tweet_feature_number_of_gif", "tweet_feature_number_of_media",
        "tweet_feature_is_retweet", "tweet_feature_is_quote",
        "tweet_feature_is_top_level", "tweet_feature_number_of_hashtags",
        "tweet_feature_creation_timestamp_hour",
        "tweet_feature_creation_timestamp_week_day",
        "tweet_feature_token_length", "tweet_feature_token_length_unique",
        "tweet_feature_text_topic_word_count_adult_content",
        "tweet_feature_text_topic_word_count_kpop",
        "tweet_feature_text_topic_word_count_covid",
        "tweet_feature_text_topic_word_count_sport",
        "number_of_engagements_with_language_like",
        "number_of_engagements_with_language_retweet",
        "number_of_engagements_with_language_reply",
        "number_of_engagements_with_language_comment",
        "number_of_engagements_with_language_negative",
        "number_of_engagements_with_language_positive",
        "number_of_engagements_ratio_like",
        "number_of_engagements_ratio_retweet",
        "number_of_engagements_ratio_reply",
        "number_of_engagements_ratio_comment",
        "number_of_engagements_ratio_negative",
        "number_of_engagements_ratio_positive",
        "number_of_engagements_between_creator_and_engager_like",
        "number_of_engagements_between_creator_and_engager_retweet",
        "number_of_engagements_between_creator_and_engager_reply",
        "number_of_engagements_between_creator_and_engager_comment",
        "number_of_engagements_between_creator_and_engager_negative",
        "number_of_engagements_between_creator_and_engager_positive",
        "number_of_engagements_like", "number_of_engagements_retweet",
        "number_of_engagements_reply", "number_of_engagements_comment",
        "number_of_engagements_negative", "number_of_engagements_positive",
        "tweet_feature_creation_timestamp_hour_shifted",
        "tweet_feature_creation_timestamp_day_phase",
        "tweet_feature_creation_timestamp_day_phase_shifted",
    ]

    # The extended set is the base set followed by the engager-history,
    # adjacency and two-step graph features.
    extended_features = base_features + [
        "engager_feature_number_of_previous_like_engagement_ratio",
        "engager_feature_number_of_previous_reply_engagement_ratio",
        "engager_feature_number_of_previous_retweet_engagement_ratio",
        "engager_feature_number_of_previous_comment_engagement_ratio",
        "engager_feature_number_of_previous_positive_engagement_ratio",
        "engager_feature_number_of_previous_negative_engagement_ratio",
        "adjacency_between_creator_and_engager_retweet",
        "adjacency_between_creator_and_engager_reply",
        "adjacency_between_creator_and_engager_comment",
        "adjacency_between_creator_and_engager_like",
        "adjacency_between_creator_and_engager_positive",
        "adjacency_between_creator_and_engager_negative",
        "graph_two_steps_adjacency_positive",
        "graph_two_steps_adjacency_negative", "graph_two_steps_adjacency_like",
        "graph_two_steps_adjacency_reply", "graph_two_steps_adjacency_retweet",
        "graph_two_steps_adjacency_comment", "graph_two_steps_positive",
        "graph_two_steps_negative", "graph_two_steps_like",
        "graph_two_steps_reply", "graph_two_steps_retweet",
        "graph_two_steps_comment",
    ]

    chunksize = 192

    train_dataset = "cherry_train"
    val_dataset = "cherry_val"

    print(f"Training model : {model_id}")
    print(f"Running on label : {class_label}")

    if class_label == "comment":
        feature_list = base_features
        train_batches_number = 10000
    else:  # "reply" — guaranteed by the validation above
        feature_list = extended_features
        train_batches_number = 20000

    n_data_train = chunksize * train_batches_number

    val_batches_number = 10000
    n_data_val = chunksize * val_batches_number

    print(f"n_data_train: {n_data_train}")
    print(f"n_data_val: {n_data_val}")

    print(f"train_dataset: {train_dataset}")
    print(f"val_dataset: {val_dataset}")

    feature_train_df = get_dataset(features=feature_list,
                                   dataset_id=train_dataset)

    label_train_df = get_feature(
        feature_name=f"tweet_feature_engagement_is_{class_label}",
        dataset_id=train_dataset)

    text_train_reader_df = get_feature_reader(
        feature_name="raw_feature_tweet_text_token",
        dataset_id=train_dataset,
        chunksize=chunksize)

    feature_val_df = get_dataset(features=feature_list, dataset_id=val_dataset)

    label_val_df = get_feature(
        feature_name=f"tweet_feature_engagement_is_{class_label}",
        dataset_id=val_dataset)

    text_val_reader_df = get_feature_reader(
        feature_name="raw_feature_tweet_text_token",
        dataset_id=val_dataset,
        chunksize=chunksize)

    # Model 1 uses rows [0, n), model 2 uses rows [n, 2n), for both splits.
    window = 0 if model_id == 1 else 1
    train_rows = slice(window * n_data_train, (window + 1) * n_data_train)
    val_rows = slice(window * n_data_val, (window + 1) * n_data_val)

    feature_train_df = feature_train_df.iloc[train_rows]
    label_train_df = label_train_df.iloc[train_rows]
    feature_val_df = feature_val_df.iloc[val_rows]
    label_val_df = label_val_df.iloc[val_rows]

    ffnn_params = {
        'hidden_size_1': 128,
        'hidden_size_2': 64,
        'hidden_dropout_prob_1': 0.5,
        'hidden_dropout_prob_2': 0.5
    }

    rec_params = {
        'epochs': 1,
        'weight_decay': 1e-5,
        'lr': 2e-5,
        'cap_length': 128,
        'ffnn_params': ffnn_params,
        'class_label': class_label
    }

    print(f"bert_params: {rec_params}")

    rec = DistilBertRec(**rec_params)

    ###   TRAINING
    # Only the second model passes the skip counts; the first model calls fit
    # without them, exactly as before.
    skip_kwargs = {}
    if model_id == 2:
        skip_kwargs = {
            'train_batches_to_skip': train_batches_number,
            'val_batches_to_skip': val_batches_number,
        }

    stats = rec.fit(
        df_train_features=feature_train_df,
        df_train_tokens_reader=text_train_reader_df,
        df_train_label=label_train_df,
        df_val_features=feature_val_df,
        df_val_tokens_reader=text_val_reader_df,
        df_val_label=label_val_df,
        save_filename=f"{class_label}_{model_id}",
        cat_feature_set=set(),
        **skip_kwargs,
    )

    print("STATS: \n")
    print(stats)
    with open('stats.txt', 'w+') as f:
        for s in stats:
            f.write(str(s) + '\n')