# Example #1
0
def prediction(LGBM, dataset_id, df, label):
    """Predict on ``df`` with ``LGBM`` and write the result as a submission file.

    Args:
        LGBM: model object exposing ``get_prediction``.
        dataset_id: dataset whose tweet and engager ids are paired with the
            predictions; also used as the prefix of the output file name.
        df: feature frame; it is converted with ``to_numpy()`` before being
            handed to the model.
        label: label name embedded in the output file name.
    """
    tweet_column = "raw_feature_tweet_id"
    user_column = "raw_feature_engager_id"

    # Identifiers that accompany each prediction in the submission file.
    tweets = Data.get_feature(tweet_column, dataset_id)[tweet_column].array
    users = Data.get_feature(user_column, dataset_id)[user_column].array

    # Time only the model call itself.
    started_at = time.time()
    predictions = LGBM.get_prediction(df.to_numpy())
    print(f"Prediction time: {time.time() - started_at} seconds")

    # Uncomment to plot feature importance at the end of training
    # LGBM.plot_fimportance()

    output_name = f"{dataset_id}_{label}_lgbm_blending_submission_2.csv"
    create_submission_file(tweets, users, predictions, output_name)
def main():
    """Fit a LightGBM blending meta-model for one engagement label.

    The label is read from the command line. The function loads the base
    features of the validation and test datasets, appends the ensembling
    features (``LGBMEnsemblingFeature`` objects, the ``XGBFoldEnsembling*2``
    features and the frames returned by ``get_nn_prediction``) as extra
    columns, splits the extended validation frame with ``test_size=0.3``,
    fits a ``LightGBM`` model on it, prints the evaluation metrics and writes
    the test predictions to ``{LABEL}_lgbm_blending_submission.csv``.
    """
    # Instantiate the parser
    parser = argparse.ArgumentParser()

    # Single positional argument: the engagement label to model.
    parser.add_argument('label', type=str, help='required argument: label')

    args = parser.parse_args()

    # Labels for which get_nn_prediction() frames are added further below.
    nn_labels = ["like", "reply", "retweet", "comment"]

    LABEL = args.label

    # NOTE: `assert` statements are removed when Python runs with -O, so this
    # check does not validate the argument in optimized mode.
    assert LABEL in ["like", "reply", "retweet", "comment"], "LABEL not valid."

    print(f"label is {LABEL}")

    # Base feature columns requested for the validation, test and
    # (if needed) train datasets. Commented-out entries are disabled.
    features = [
        "raw_feature_creator_follower_count",
        "raw_feature_creator_following_count",
        "raw_feature_engager_follower_count",
        "raw_feature_engager_following_count",
        "raw_feature_creator_is_verified",
        "raw_feature_engager_is_verified",
        "raw_feature_engagement_creator_follows_engager",
        "tweet_feature_number_of_photo",
        "tweet_feature_number_of_video",
        "tweet_feature_number_of_gif",
        "tweet_feature_number_of_media",
        "tweet_feature_is_retweet",
        "tweet_feature_is_quote",
        "tweet_feature_is_top_level",
        "tweet_feature_number_of_hashtags",
        "tweet_feature_creation_timestamp_hour",
        "tweet_feature_creation_timestamp_week_day",
        # "tweet_feature_number_of_mentions",
        "tweet_feature_token_length",
        "tweet_feature_token_length_unique",
        "tweet_feature_text_topic_word_count_adult_content",
        "tweet_feature_text_topic_word_count_kpop",
        "tweet_feature_text_topic_word_count_covid",
        "tweet_feature_text_topic_word_count_sport",
        "number_of_engagements_with_language_like",
        "number_of_engagements_with_language_retweet",
        "number_of_engagements_with_language_reply",
        "number_of_engagements_with_language_comment",
        "number_of_engagements_with_language_negative",
        "number_of_engagements_with_language_positive",
        "number_of_engagements_ratio_like",
        "number_of_engagements_ratio_retweet",
        "number_of_engagements_ratio_reply",
        "number_of_engagements_ratio_comment",
        "number_of_engagements_ratio_negative",
        "number_of_engagements_ratio_positive",
        "number_of_engagements_between_creator_and_engager_like",
        "number_of_engagements_between_creator_and_engager_retweet",
        "number_of_engagements_between_creator_and_engager_reply",
        "number_of_engagements_between_creator_and_engager_comment",
        "number_of_engagements_between_creator_and_engager_negative",
        "number_of_engagements_between_creator_and_engager_positive",
        "creator_feature_number_of_like_engagements_received",
        "creator_feature_number_of_retweet_engagements_received",
        "creator_feature_number_of_reply_engagements_received",
        "creator_feature_number_of_comment_engagements_received",
        "creator_feature_number_of_negative_engagements_received",
        "creator_feature_number_of_positive_engagements_received",
        "creator_feature_number_of_like_engagements_given",
        "creator_feature_number_of_retweet_engagements_given",
        "creator_feature_number_of_reply_engagements_given",
        "creator_feature_number_of_comment_engagements_given",
        "creator_feature_number_of_negative_engagements_given",
        "creator_feature_number_of_positive_engagements_given",
        "engager_feature_number_of_like_engagements_received",
        "engager_feature_number_of_retweet_engagements_received",
        "engager_feature_number_of_reply_engagements_received",
        "engager_feature_number_of_comment_engagements_received",
        "engager_feature_number_of_negative_engagements_received",
        "engager_feature_number_of_positive_engagements_received",
        "number_of_engagements_like",
        "number_of_engagements_retweet",
        "number_of_engagements_reply",
        "number_of_engagements_comment",
        "number_of_engagements_negative",
        "number_of_engagements_positive",
        "engager_feature_number_of_previous_like_engagement",
        "engager_feature_number_of_previous_reply_engagement",
        "engager_feature_number_of_previous_retweet_engagement",
        "engager_feature_number_of_previous_comment_engagement",
        "engager_feature_number_of_previous_positive_engagement",
        "engager_feature_number_of_previous_negative_engagement",
        "engager_feature_number_of_previous_engagement",
        "engager_feature_number_of_previous_like_engagement_ratio_1",
        "engager_feature_number_of_previous_reply_engagement_ratio_1",
        "engager_feature_number_of_previous_retweet_engagement_ratio_1",
        "engager_feature_number_of_previous_comment_engagement_ratio_1",
        "engager_feature_number_of_previous_positive_engagement_ratio_1",
        "engager_feature_number_of_previous_negative_engagement_ratio_1",
        "engager_feature_number_of_previous_like_engagement_ratio",
        "engager_feature_number_of_previous_reply_engagement_ratio",
        "engager_feature_number_of_previous_retweet_engagement_ratio",
        "engager_feature_number_of_previous_comment_engagement_ratio",
        "engager_feature_number_of_previous_positive_engagement_ratio",
        "engager_feature_number_of_previous_negative_engagement_ratio",
        "engager_feature_number_of_previous_like_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_reply_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_retweet_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_comment_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_negative_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_positive_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_like_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_reply_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_retweet_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_comment_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_negative_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_positive_engagement_between_creator_and_engager_by_engager",
        # "tweet_feature_number_of_previous_like_engagements",
        # "tweet_feature_number_of_previous_reply_engagements",
        # "tweet_feature_number_of_previous_retweet_engagements",
        # "tweet_feature_number_of_previous_comment_engagements",
        # "tweet_feature_number_of_previous_positive_engagements",
        # "tweet_feature_number_of_previous_negative_engagements",
        "creator_feature_number_of_previous_like_engagements_given",
        "creator_feature_number_of_previous_reply_engagements_given",
        "creator_feature_number_of_previous_retweet_engagements_given",
        "creator_feature_number_of_previous_comment_engagements_given",
        "creator_feature_number_of_previous_positive_engagements_given",
        "creator_feature_number_of_previous_negative_engagements_given",
        "creator_feature_number_of_previous_like_engagements_received",
        "creator_feature_number_of_previous_reply_engagements_received",
        "creator_feature_number_of_previous_retweet_engagements_received",
        "creator_feature_number_of_previous_comment_engagements_received",
        "creator_feature_number_of_previous_positive_engagements_received",
        "creator_feature_number_of_previous_negative_engagements_received",
        "engager_feature_number_of_previous_like_engagement_with_language",
        "engager_feature_number_of_previous_reply_engagement_with_language",
        "engager_feature_number_of_previous_retweet_engagement_with_language",
        "engager_feature_number_of_previous_comment_engagement_with_language",
        "engager_feature_number_of_previous_positive_engagement_with_language",
        "engager_feature_number_of_previous_negative_engagement_with_language",
        "engager_feature_knows_hashtag_positive",
        "engager_feature_knows_hashtag_negative",
        "engager_feature_knows_hashtag_like",
        "engager_feature_knows_hashtag_reply",
        "engager_feature_knows_hashtag_rt",
        "engager_feature_knows_hashtag_comment",
        "creator_and_engager_have_same_main_language",
        "is_tweet_in_creator_main_language",
        "is_tweet_in_engager_main_language",
        # "statistical_probability_main_language_of_engager_engage_tweet_language_1",
        # "statistical_probability_main_language_of_engager_engage_tweet_language_2",
        "creator_and_engager_have_same_main_grouped_language",
        "is_tweet_in_creator_main_grouped_language",
        "is_tweet_in_engager_main_grouped_language",
        # # "hashtag_similarity_fold_ensembling_positive",
        # # "link_similarity_fold_ensembling_positive",
        # # "domain_similarity_fold_ensembling_positive"
        "tweet_feature_creation_timestamp_hour_shifted",
        "tweet_feature_creation_timestamp_day_phase",
        "tweet_feature_creation_timestamp_day_phase_shifted"
    ]

    # Name of the target column, kept as a one-element list so it can be
    # used directly for column selection.
    label = [f"tweet_feature_engagement_is_{LABEL}"]

    train_dataset = "cherry_train"
    val_dataset = "cherry_val"
    test_dataset = "new_test"

    # Ensembling labels used for each target label.
    # NOTE(review): every target maps to the same three labels and 'like'
    # never appears as an ensembling label, so the "like" branch of the
    # XGB loop below is never taken with this table - confirm this is intended.
    ensembling_list_dict = {
        'like': ['reply', 'retweet', 'comment'],
        'reply': ['reply', 'retweet', 'comment'],
        'retweet': ['reply', 'retweet', 'comment'],
        'comment': ['reply', 'retweet', 'comment'],
    }

    ensembling_list = ensembling_list_dict[LABEL]

    # params_by_label() yields a pair per label; the first element is iterated
    # below as a collection of LGBM parameter dicts. The second element
    # (ensembling_xgb_params) is stored but not read again in this function.
    ensembling_lgbm_params = {}
    ensembling_xgb_params = {}
    for ens_label in ensembling_list:
        ensembling_lgbm_params[ens_label], ensembling_xgb_params[ens_label] \
            = params_by_label(ens_label)

    # No categorical features are declared for the ensembling models.
    categorical_features_set = set([])

    # Load train data
    # loading_data_start_time = time.time()
    # df_train, df_train_label = Data.get_dataset_xgb(train_dataset, features, label)
    # print(f"Loading train data time: {loading_data_start_time - time.time()} seconds")

    # Load val data
    df_val, df_val_label = Data.get_dataset_xgb(val_dataset, features, label)

    # Load test data
    df_test = Data.get_dataset(features, test_dataset)

    # Offset the test index by the number of validation rows so the index
    # values of the concatenated val+test frame follow on from validation.
    new_index = pd.Series(df_test.index).map(lambda x: x + len(df_val))
    df_test.set_index(new_index, inplace=True)

    # df to be predicted by the lgbm blending feature
    df_to_predict = pd.concat([df_val, df_test])

    # BLENDING FEATURE DECLARATION

    feature_list = []

    # NEW CODE ADDED

    # Empty placeholders: the train set is only loaded if at least one
    # LGBM ensembling feature reports that it is not available yet.
    df_train = pd.DataFrame(columns=features)
    df_train_label = pd.DataFrame(columns=label)
    need_to_load_train_set = False

    # First pass: probe every (label, params) combination with has_feature().
    # NOTE(review): start_time is assigned here but never read in this loop.
    for ens_label in ensembling_list:
        lgbm_params = ensembling_lgbm_params[ens_label]
        for lgbm_param_dict in lgbm_params:
            start_time = time.time()
            if not LGBMEnsemblingFeature(
                    dataset_id=train_dataset,
                    df_train=df_train,
                    df_train_label=get_ensembling_label(
                        ens_label, train_dataset),
                    df_to_predict=df_to_predict,
                    param_dict=lgbm_param_dict,
                    categorical_features_set=categorical_features_set
            ).has_feature():
                print(f"{ens_label} {lgbm_param_dict}")
                need_to_load_train_set = True

    # Load the train set only when a feature is missing.
    if need_to_load_train_set:
        df_train, df_train_label = get_dataset_xgb_batch(
            total_n_split=1,
            split_n=0,
            dataset_id=train_dataset,
            X_label=features,
            Y_label=label,
            sample=0.3)

    # Second pass: build the feature objects, now with the (possibly loaded)
    # train frame. The label passed to each feature comes from
    # get_ensembling_label(), not from df_train_label.
    # NOTE(review): start_time is again assigned but not read in this loop.
    for ens_label in ensembling_list:
        lgbm_params = ensembling_lgbm_params[ens_label]
        for lgbm_param_dict in lgbm_params:
            start_time = time.time()
            feature_list.append(
                LGBMEnsemblingFeature(
                    dataset_id=train_dataset,
                    df_train=df_train,
                    df_train_label=get_ensembling_label(
                        ens_label, train_dataset),
                    df_to_predict=df_to_predict,
                    param_dict=lgbm_param_dict,
                    categorical_features_set=categorical_features_set))

    # NEW PART
    # ONLY THIS PART IS NEW
    # LOAD THIS PART FIRST
    # Drop the local names for the train frames before the features are
    # loaded or created.
    del df_train, df_train_label

    # One dataframe per LGBM ensembling feature.
    df_feature_list = [x.load_or_create() for x in feature_list]

    # Append one XGB fold-ensembling feature frame per ensembling label,
    # built by stacking the validation frame on top of the test frame.
    for ens_label in ensembling_list:
        start_time = time.time()
        if ens_label == "like":
            val_features_df = XGBFoldEnsemblingLike2(
                val_dataset).load_or_create()
            test_features_df = XGBFoldEnsemblingLike2(
                test_dataset).load_or_create()
        elif ens_label == "retweet":
            val_features_df = XGBFoldEnsemblingRetweet2(
                val_dataset).load_or_create()
            test_features_df = XGBFoldEnsemblingRetweet2(
                test_dataset).load_or_create()
        elif ens_label == "reply":
            val_features_df = XGBFoldEnsemblingReply2(
                val_dataset).load_or_create()
            test_features_df = XGBFoldEnsemblingReply2(
                test_dataset).load_or_create()
        elif ens_label == "comment":
            val_features_df = XGBFoldEnsemblingComment2(
                val_dataset).load_or_create()
            test_features_df = XGBFoldEnsemblingComment2(
                test_dataset).load_or_create()
        else:
            assert False, "oh oh something went wrong. label not found"

        # Give the test rows the same offset index that df_test received.
        test_features_df.set_index(new_index, inplace=True)

        xgb_feature_df = pd.concat([val_features_df, test_features_df])

        df_feature_list.append(xgb_feature_df)

        print(f"time: {time.time() - start_time}")

        del val_features_df, test_features_df

    # check dimensions
    len_val = len(df_val)

    # Every blending feature must cover validation + test rows exactly.
    for df_feat in df_feature_list:
        assert len(df_feat) == (len_val + len(df_test)), \
            f"Blending features are not of dimension expected, len val: {len_val} len test: {len(df_test)}\n " \
            f"obtained len: {len(df_feat)} of {df_feat.columns[0]}\n"

    # split feature dataframe in validation and testing
    # (positional split: the first len_val rows are validation rows)
    df_feat_val_list = [df_feat.iloc[:len_val] for df_feat in df_feature_list]
    df_feat_test_list = [df_feat.iloc[len_val:] for df_feat in df_feature_list]

    # Frames returned by get_nn_prediction(), one per label, for each dataset.
    df_feat_nn_val_list = [
        get_nn_prediction(l, val_dataset) for l in nn_labels
    ]

    df_feat_nn_test_list = [
        get_nn_prediction(l, test_dataset) for l in nn_labels
    ]
    # Shift each test frame's index by len(df_val), as done for df_test.
    # NOTE: this rebinds new_index, which was defined earlier for df_test.
    for df_feat_nn_test in df_feat_nn_test_list:
        new_index = pd.Series(
            df_feat_nn_test.index).map(lambda x: x + len(df_val))
        df_feat_nn_test.set_index(new_index, inplace=True)

    df_feat_val_list += df_feat_nn_val_list
    df_feat_test_list += df_feat_nn_test_list

    # Validation gets base features + blending features + label column;
    # test gets the same columns without the label.
    df_val_to_be_concatenated_list = [df_val
                                      ] + df_feat_val_list + [df_val_label]
    df_test_to_be_concatenated_list = [df_test] + df_feat_test_list

    # creating the new validation set on which we will do meta optimization
    df_val = pd.concat(df_val_to_be_concatenated_list, axis=1)
    df_test = pd.concat(df_test_to_be_concatenated_list, axis=1)

    # now we are in full meta-model mode
    # watch out! the split frames are unsorted, so they are re-sorted by index
    df_metatrain, df_metaval = train_test_split(df_val, test_size=0.3)
    df_metatrain.sort_index(inplace=True)
    df_metaval.sort_index(inplace=True)

    # split dataframe columns in train and label
    # NOTE(review): col_names_list is not read again in this function.
    col_names_list = [df_feat.columns[0] for df_feat in df_feature_list]

    # The model inputs are exactly the test columns, in the test column order.
    extended_features = df_test.columns
    df_metatrain_label = df_metatrain[label]
    df_metatrain = df_metatrain[extended_features]

    df_metaval_label = df_metaval[label]
    df_metaval = df_metaval[extended_features]

    # Sanity check: train and test columns must line up position by position.
    for i in range(len(df_metatrain.columns)):
        assert df_metatrain.columns[i] == df_test.columns[i], f'You f****d yourself. metatrain col {i}: {df_metatrain.columns[i]}' \
                                                              f' test col {i}: {df_test.columns[i]}'

    # NOTE(review): model_name and kind are not read again in this function.
    model_name = "lightgbm_classifier"
    kind = LABEL

    # Fixed hyperparameters for the meta-model.
    params = {
        'num_leaves': 544,
        'max_depth': 7,
        'lambda_l1': 50.0,
        'lambda_l2': 2.841130937148593,
        'colsample_bynode': 0.4,
        'colsample_bytree': 1.0,
        'bagging_fraction': 1.0,
        'bagging_freq': 8,
        'min_data_in_leaf': 611,
    }

    LGBM = LightGBM(
        objective='binary',
        num_threads=-1,
        num_iterations=1000,
        early_stopping_rounds=15,
        **params,
    )

    # LGBM Training
    training_start_time = time.time()
    LGBM.fit(X=df_metatrain,
             Y=df_metatrain_label,
             X_val=df_metaval,
             Y_val=df_metaval_label,
             categorical_feature=set([]))
    print(f"Training time: {time.time() - training_start_time} seconds")

    # LGBM Evaluation
    # The metrics are computed on df_metaval, the same frame that was passed
    # to fit() as X_val, as the printed message below points out.
    evaluation_start_time = time.time()
    prauc, rce, conf, max_pred, min_pred, avg = LGBM.evaluate(
        df_metaval.to_numpy(), df_metaval_label.to_numpy())
    print(
        "since I'm lazy I did the local test on the same test on which I did EarlyStopping"
    )
    print(f"PRAUC:\t{prauc}")
    print(f"RCE:\t{rce}")
    print(f"TN:\t{conf[0, 0]}")
    print(f"FP:\t{conf[0, 1]}")
    print(f"FN:\t{conf[1, 0]}")
    print(f"TP:\t{conf[1, 1]}")
    print(f"MAX_PRED:\t{max_pred}")
    print(f"MIN_PRED:\t{min_pred}")
    print(f"AVG:\t{avg}")
    print(f"Evaluation time: {time.time() - evaluation_start_time} seconds")

    # Identifiers that accompany each test prediction in the submission file.
    tweets = Data.get_feature("raw_feature_tweet_id",
                              test_dataset)["raw_feature_tweet_id"].array
    users = Data.get_feature("raw_feature_engager_id",
                             test_dataset)["raw_feature_engager_id"].array

    # LGBM Prediction
    prediction_start_time = time.time()
    predictions = LGBM.get_prediction(df_test.to_numpy())
    print(f"Prediction time: {time.time() - prediction_start_time} seconds")

    # Uncomment to plot feature importance at the end of training
    # LGBM.plot_fimportance()

    create_submission_file(tweets, users, predictions,
                           f"{LABEL}_lgbm_blending_submission.csv")
def main():
    """Predict all four engagement labels with a saved MultiDistilBertRec model.

    Loads the two follower-count features for the train and test datasets,
    runs ``rec._normalize_features`` on the first 3,840,000 train rows,
    asks the recommender for predictions on the test dataset using the
    model stored at ``saved_model_path`` and writes one submission file per
    label (like, retweet, reply, comment).
    """
    feature_list = [
        "raw_feature_creator_follower_count",  # 0
        "raw_feature_creator_following_count",  # 1
    ]

    print("Running on labels : like - retweet - reply - comment")

    # NOTE(review): predictions are computed for test_dataset ("new_test")
    # but written under the cherry_val feature directory - confirm the
    # destination is intended.
    submission_filename = "Dataset/Features/cherry_val/ensembling/nn_predictions"

    # Number of rows per chunk requested from the token feature reader.
    chunksize = 2048

    train_dataset = "cherry_train"
    test_dataset = "new_test"

    ffnn_params = {
        'hidden_size_1': 128,
        'hidden_size_2': 64,
        'hidden_dropout_prob_1': 0.5,
        'hidden_dropout_prob_2': 0.5
    }
    rec_params = {
        'epochs': 5,
        'weight_decay': 1e-5,
        'lr': 2e-5,
        'cap_length': 128,
        'ffnn_params': ffnn_params
    }

    saved_model_path = "./saved_models/saved_model_multi_label"

    rec = MultiDistilBertRec(**rec_params)

    # The normalised train frame itself is not used afterwards; the call is
    # kept for its effect on `rec` (presumably it records the normalisation
    # statistics applied at prediction time - TODO confirm).
    train_df = get_dataset(features=feature_list, dataset_id=train_dataset)
    train_df = train_df.head(3840000)
    train_df = rec._normalize_features(train_df, is_train=True)

    ###   PREDICTION
    test_df = get_dataset(features=feature_list, dataset_id=test_dataset)

    prediction_start_time = time.time()

    text_test_reader_df = get_feature_reader(
        feature_name="raw_feature_tweet_text_token",
        dataset_id=test_dataset,
        chunksize=chunksize)
    predictions = rec.get_prediction(
        df_test_features=test_df,
        df_test_tokens_reader=text_test_reader_df,
        pretrained_model_dict_path=saved_model_path)
    print(f"Prediction time: {time.time() - prediction_start_time} seconds")

    print(predictions)
    print(predictions.shape)

    tweets = get_feature("raw_feature_tweet_id",
                         test_dataset)["raw_feature_tweet_id"].array
    users = get_feature("raw_feature_engager_id",
                        test_dataset)["raw_feature_engager_id"].array

    # Column i of `predictions` belongs to label i in this order. Each label
    # is written with its own column; previously every file received the
    # "like" column.
    label_order = ("like", "retweet", "reply", "comment")
    for column, label_name in enumerate(label_order):
        create_submission_file(tweets, users, predictions[:, column],
                               f"{submission_filename}_{label_name}.csv")
# Example #4
0
def main(label_1, label_2, test_dataset, model_id):
    """Predict two engagement labels with a saved DualDistilBertRec model.

    Args:
        label_1: name of the label stored in prediction column 0.
        label_2: name of the label stored in prediction column 1.
        test_dataset: id of the dataset to predict on; also selects the
            output directory ``Dataset/Features/{test_dataset}/ensembling``.
        model_id: selects the saved model file and the slice of the train
            set passed to ``rec._normalize_features`` (1: first
            ``n_data_train`` rows, 2: the following ``n_data_train`` rows;
            any other value leaves the train frame unsliced).

    Writes one submission file per label into the output directory, creating
    the directory if needed.
    """
    feature_list = [
        "raw_feature_creator_follower_count",
        "raw_feature_creator_following_count",
        "raw_feature_engager_follower_count",
        "raw_feature_engager_following_count",
        "raw_feature_creator_is_verified", "raw_feature_engager_is_verified",
        "raw_feature_engagement_creator_follows_engager",
        "tweet_feature_number_of_photo", "tweet_feature_number_of_video",
        "tweet_feature_number_of_gif", "tweet_feature_number_of_media",
        "tweet_feature_is_retweet", "tweet_feature_is_quote",
        "tweet_feature_is_top_level", "tweet_feature_number_of_hashtags",
        "tweet_feature_creation_timestamp_hour",
        "tweet_feature_creation_timestamp_week_day",
        "tweet_feature_token_length", "tweet_feature_token_length_unique",
        "tweet_feature_text_topic_word_count_adult_content",
        "tweet_feature_text_topic_word_count_kpop",
        "tweet_feature_text_topic_word_count_covid",
        "tweet_feature_text_topic_word_count_sport",
        "number_of_engagements_with_language_like",
        "number_of_engagements_with_language_retweet",
        "number_of_engagements_with_language_reply",
        "number_of_engagements_with_language_comment",
        "number_of_engagements_with_language_negative",
        "number_of_engagements_with_language_positive",
        "number_of_engagements_ratio_like",
        "number_of_engagements_ratio_retweet",
        "number_of_engagements_ratio_reply",
        "number_of_engagements_ratio_comment",
        "number_of_engagements_ratio_negative",
        "number_of_engagements_ratio_positive",
        "number_of_engagements_between_creator_and_engager_like",
        "number_of_engagements_between_creator_and_engager_retweet",
        "number_of_engagements_between_creator_and_engager_reply",
        "number_of_engagements_between_creator_and_engager_comment",
        "number_of_engagements_between_creator_and_engager_negative",
        "number_of_engagements_between_creator_and_engager_positive",
        "number_of_engagements_like", "number_of_engagements_retweet",
        "number_of_engagements_reply", "number_of_engagements_comment",
        "number_of_engagements_negative", "number_of_engagements_positive",
        "tweet_feature_creation_timestamp_hour_shifted",
        "tweet_feature_creation_timestamp_day_phase",
        "tweet_feature_creation_timestamp_day_phase_shifted",
        "engager_feature_number_of_previous_like_engagement_ratio",
        "engager_feature_number_of_previous_reply_engagement_ratio",
        "engager_feature_number_of_previous_retweet_engagement_ratio",
        "engager_feature_number_of_previous_comment_engagement_ratio",
        "engager_feature_number_of_previous_positive_engagement_ratio",
        "engager_feature_number_of_previous_negative_engagement_ratio",
        "adjacency_between_creator_and_engager_retweet",
        "adjacency_between_creator_and_engager_reply",
        "adjacency_between_creator_and_engager_comment",
        "adjacency_between_creator_and_engager_like",
        "adjacency_between_creator_and_engager_positive",
        "adjacency_between_creator_and_engager_negative",
        "graph_two_steps_adjacency_positive",
        "graph_two_steps_adjacency_negative", "graph_two_steps_adjacency_like",
        "graph_two_steps_adjacency_reply", "graph_two_steps_adjacency_retweet",
        "graph_two_steps_adjacency_comment", "graph_two_steps_positive",
        "graph_two_steps_negative", "graph_two_steps_like",
        "graph_two_steps_reply", "graph_two_steps_retweet",
        "graph_two_steps_comment"
    ]

    print(f"Model : {model_id}")
    print(f"Running on labels : {label_1} - {label_2}")

    submission_dir = f"Dataset/Features/{test_dataset}/ensembling"
    submission_filename = f"{submission_dir}/nn_predictions"

    # Size of the train slice assigned to each model: 192 * 20000 rows.
    training_chunksize = 192
    training_batches_number = 20000
    n_data_train = training_chunksize * training_batches_number

    # Number of rows per chunk requested from the token feature reader.
    test_chunksize = 2048

    train_dataset = "cherry_train"

    print(f"Test dataset : {test_dataset}")

    ffnn_params = {
        'hidden_size_1': 128,
        'hidden_size_2': 64,
        'hidden_dropout_prob_1': 0.5,
        'hidden_dropout_prob_2': 0.5
    }

    rec_params = {
        'epochs': 2,
        'weight_decay': 1e-5,
        'lr': 2e-5,
        'cap_length': 128,
        'ffnn_params': ffnn_params
    }

    saved_model_path = f"./saved_models/saved_model_{label_1}_{label_2}_{model_id}"

    rec = DualDistilBertRec(**rec_params)

    train_df = get_dataset(features=feature_list, dataset_id=train_dataset)

    # Each model id maps to its own consecutive slice of the train set.
    # NOTE(review): any other model_id (including the strings "1"/"2")
    # silently keeps the whole train frame - confirm callers pass ints.
    if model_id == 1:
        train_df = train_df.head(n_data_train)
    elif model_id == 2:
        train_df = train_df.iloc[n_data_train:2 * n_data_train]

    # The normalised train frame itself is not used afterwards; the call is
    # kept for its effect on `rec` (presumably it records the normalisation
    # statistics applied at prediction time - TODO confirm).
    train_df = rec._normalize_features(train_df, is_train=True)

    ###   PREDICTION
    test_df = get_dataset(features=feature_list, dataset_id=test_dataset)

    prediction_start_time = time.time()

    text_test_reader_df = get_feature_reader(
        feature_name="raw_feature_tweet_text_token",
        dataset_id=test_dataset,
        chunksize=test_chunksize)

    predictions = rec.get_prediction(
        df_test_features=test_df,
        df_test_tokens_reader=text_test_reader_df,
        pretrained_model_dict_path=saved_model_path)
    print(f"Prediction time: {time.time() - prediction_start_time} seconds")

    print(predictions)
    print(predictions.shape)

    # Column 0 is written as label_1, column 1 as label_2.
    p_1 = predictions[:, 0]
    p_2 = predictions[:, 1]

    tweets = get_feature("raw_feature_tweet_id",
                         test_dataset)["raw_feature_tweet_id"].array
    users = get_feature("raw_feature_engager_id",
                        test_dataset)["raw_feature_engager_id"].array

    # Make sure the output directory exists before writing into it.
    pathlib.Path(submission_dir).mkdir(parents=True, exist_ok=True)

    create_submission_file(tweets, users, p_1,
                           submission_filename + f"_{label_1}_{model_id}.csv")
    create_submission_file(tweets, users, p_2,
                           submission_filename + f"_{label_2}_{model_id}.csv")