Example #1
    def create_feature(self):
        # Check if the dataset id is train or test
        if is_test_or_val_set(self.dataset_id):
            train_dataset_id = get_train_set_id_from_test_or_val_set(
                self.dataset_id)
            test_dataset_id = self.dataset_id
        else:
            train_dataset_id = self.dataset_id
            test_dataset_id = get_test_or_val_set_id_from_train(
                train_dataset_id)

        import Utils.Data.Data as data
        train_df = data.get_dataset([
            f"mapped_feature_creator_id", f"mapped_feature_engager_id",
            f"tweet_feature_engagement_is_{self._get_suffix()}"
        ], train_dataset_id)
        if is_test_or_val_set(self.dataset_id):
            test_df = data.get_dataset(
                [f"mapped_feature_creator_id", f"mapped_feature_engager_id"],
                test_dataset_id)
            train_df = train_df[
                train_df[f"tweet_feature_engagement_is_{self._get_suffix()}"] == True]
            res = compute(train_df, test_df)
            res.sort_index(inplace=True)
            self._save_test_result(res, test_dataset_id)
        else:
            # Compute the folds
            X_train_folds = np.array_split(train_df.sample(frac=1),
                                           self.number_of_folds)

            result = None

            for i in range(self.number_of_folds):
                local_train = pd.concat([
                    X_train_folds[x] for x in range(self.number_of_folds)
                    if x != i
                ])
                local_train = local_train[
                    local_train[f"tweet_feature_engagement_is_{self._get_suffix()}"] == True]
                local_test = X_train_folds[i]

                res = compute(local_train, local_test)

                if result is None:
                    result = res
                else:
                    result = pd.concat([result, res])

            self._save_train_result_if_not_present(result, train_dataset_id)
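
The method above computes the train-side of this feature out of fold: each fold is scored by a compute call that only saw the other folds, and the per-fold results are concatenated and re-sorted by index. A minimal, self-contained sketch of that pattern (the score helper and the value/is_positive columns are placeholders for this illustration, not part of the project):

import numpy as np
import pandas as pd

def score(train, test):
    # Stand-in for the real `compute`: the mean "value" of the positive
    # training rows, broadcast to every row of the held-out fold.
    return pd.DataFrame({"oof_score": train["value"].mean()}, index=test.index)

def out_of_fold_feature(df, n_folds=5):
    # Shuffle once, split into folds, score each fold using only the other folds.
    folds = np.array_split(df.sample(frac=1, random_state=0), n_folds)
    parts = []
    for i in range(n_folds):
        local_train = pd.concat([folds[x] for x in range(n_folds) if x != i])
        local_train = local_train[local_train["is_positive"]]
        parts.append(score(local_train, folds[i]))
    # Concatenate the per-fold results and restore the original row order.
    return pd.concat(parts).sort_index()

df = pd.DataFrame({"value": np.arange(100), "is_positive": np.arange(100) % 3 == 0})
print(out_of_fold_feature(df).head())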
Example #2
def prediction(LGBM, dataset_id, df, label):

    tweets = Data.get_feature("raw_feature_tweet_id", dataset_id)["raw_feature_tweet_id"].array
    users = Data.get_feature("raw_feature_engager_id", dataset_id)["raw_feature_engager_id"].array

    # LGBM Prediction
    prediction_start_time = time.time()
    predictions = LGBM.get_prediction(df.to_numpy())
    print(f"Prediction time: {time.time() - prediction_start_time} seconds")

    # Uncomment to plot feature importance at the end of training
    # LGBM.plot_fimportance()

    create_submission_file(tweets, users, predictions, f"{dataset_id}_{label}_lgbm_blending_submission_2.csv")
Example #3
def get_ensembling_label(label, dataset_id):
    from Utils.Data import Data
    return Data.get_feature_batch(f"tweet_feature_engagement_is_{label}",
                                  dataset_id,
                                  total_n_split=1,
                                  split_n=0,
                                  sample=0.3)
    def create_feature(self):
        import Utils.Data.Data as data
        df = data.get_dataset([f"number_of_engagements_{self._get_suffix()}"],
                              self.dataset_id)
        support_df = data.get_dataset([
            f"number_of_engagements_positive",
            f"number_of_engagements_negative"
        ], self.dataset_id)

        df['total'] = support_df["number_of_engagements_positive"] + support_df[
            "number_of_engagements_negative"]
        result = pd.DataFrame(
            df[f"number_of_engagements_{self._get_suffix()}"] / df["total"])
        result.fillna(0, inplace=True)
        result.replace([np.inf, -np.inf], 0, inplace=True)
        self.save_feature(result)
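
The ratio above can produce NaN (0/0) or ±inf (x/0), which is why the result is passed through fillna and replace. A tiny standalone illustration of that clean-up, with made-up column names:

import numpy as np
import pandas as pd

df = pd.DataFrame({"positive": [3, 0, 5], "total": [10, 0, 0]})
ratio = pd.DataFrame(df["positive"] / df["total"])   # 0.3, NaN (0/0), inf (5/0)
ratio.fillna(0, inplace=True)                        # 0/0 -> 0
ratio.replace([np.inf, -np.inf], 0, inplace=True)    # x/0 -> 0
print(ratio)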
    def fit(self, X=None, Y=None):

        #Tries to load X and Y if not directly passed
        if (X is None) or (Y is None):
            X, Y = Data.get_dataset_xgb_default_train()
            print("Train set loaded from file.")

        #Learning in a single round
        if self.batch is False:
            #Transforming matrices in DMatrix type
            train = xgb.DMatrix(X, label=Y)

            #Defining and fitting the models
            self.sround_model = xgb.train(self.get_param_dict(),
                                          dtrain=train,
                                          num_boost_round=math.ceil(
                                              self.num_rounds))

        #Learning by consecutive batches
        else:
            #Transforming matrices in DMatrix type
            train = xgb.DMatrix(X, label=Y)

            #Defining and training the model
            self.batch_model = xgb.train(self.get_param_dict(),
                                         dtrain=train,
                                         num_boost_round=math.ceil(
                                             self.num_rounds),
                                         xgb_model=self.batch_model)
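
The batch branch relies on the xgb_model argument of xgb.train, which continues boosting from an existing booster instead of training from scratch. A minimal sketch of that continuation on synthetic data (parameter values are illustrative only):

import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 10))
y = rng.integers(0, 2, size=1000)
params = {"objective": "binary:logistic", "max_depth": 4, "eta": 0.1}

# First batch: 20 boosting rounds from scratch.
booster = xgb.train(params, xgb.DMatrix(X[:500], label=y[:500]), num_boost_round=20)

# Second batch: 20 more rounds stacked on top of the existing booster.
booster = xgb.train(params, xgb.DMatrix(X[500:], label=y[500:]),
                    num_boost_round=20, xgb_model=booster)

print(len(booster.get_dump()))  # 40 trees in total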
    def evaluate(self, X_tst=None, Y_tst=None):
        Y_pred = None

        #Tries to load X and Y if not directly passed
        if (X_tst is None) or (Y_tst is None):
            X_tst, Y_tst = Data.get_dataset_xgb_default_test()
            print("Test set loaded from file.")
        #Y_tst = np.array(Y_tst[Y_tst.columns[0]].astype(float))
        if (self.sround_model is None) and (self.batch_model is None):
            print("No model trained yet.")
        else:
            #Selecting the coherent model for the evaluation
            #According to the initial declaration (batch/single round)
            if self.batch is False:
                model = self.sround_model
            else:
                model = self.batch_model

            #Preparing DMatrix
            #d_test = xgb.DMatrix(X_tst)
            #Making predictions
            #Y_pred = model.predict(d_test)
            Y_pred = self.get_prediction(X_tst)

            # Declaring the class containing the
            # metrics.
            cm = CoMe(Y_pred, Y_tst)

            #Evaluating
            scores = cm.compute_multiclass()

            return scores
    def get_popularity(self):
        import Utils.Data.Data as data
        if self.popularity_path.is_file():
            return np.load(self.popularity_path, allow_pickle=True)
        else:
            x = data.get_dataset(
                [
                    "mapped_feature_tweet_id",
                    "mapped_feature_tweet_hashtags",
                    "raw_feature_tweet_timestamp"
                ], self.dataset_id
            )

            x.columns = ["tweet", "hashtags", "time"]

            x = x.drop_duplicates("tweet")
            x = x.set_index('time', drop=True)
            x = x.sort_index()

            # Group size
            n = self.window_size
            # Overlapping size
            m = self.window_overlap

            chunks = [x[i:i + n] for i in range(0, len(x), n - m)]

            result = process_map(compute_chunk, chunks)
            s = [r[0] for r in result]
            y = data.get_dataset(
                [
                    "mapped_feature_tweet_id",
                    "mapped_feature_tweet_hashtags",
                    "raw_feature_tweet_timestamp"
                ], self.dataset_id
            )

            y.columns = ["tweet", "hashtags", "time"]
            get_popularity_partial = functools.partial(get_popularity, result=result, s=s)
            popularity = pd.concat(process_map(get_popularity_partial, np.array_split(y, 100)))
            self.popularity_path.parent.mkdir(parents=True, exist_ok=True)
            np.save(self.popularity_path, popularity, allow_pickle=True)
            return popularity
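
The chunking above slices the time-sorted frame into windows of n rows that overlap by m rows: range(0, len(x), n - m) advances by the non-overlapping part of each window. A tiny sketch with made-up sizes:

import pandas as pd

x = pd.DataFrame({"tweet": range(10)})
n, m = 4, 2  # window size and overlap
chunks = [x[i:i + n] for i in range(0, len(x), n - m)]
print([list(c["tweet"]) for c in chunks])
# [[0, 1, 2, 3], [2, 3, 4, 5], [4, 5, 6, 7], [6, 7, 8, 9], [8, 9]]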
    def fit(self, X=None, Y=None, X_valid=None, Y_valid=None):

        # Tries to load X and Y if not directly passed
        if (X is None) or (Y is None):
            X, Y = Data.get_dataset_xgb_default_train()
            print("Train set loaded from file.")

        # If no validation set is provided, disable early stopping
        if (X_valid is None) or (Y_valid is None):
            self.early_stopping_rounds = None
            valid = []
        else:
            # xgb.train expects evals as a list of (DMatrix, name) pairs
            valid = [(xgb.DMatrix(X_valid, label=Y_valid), "valid")]

        # Learning in a single round
        if self.batch is False:
            # Transforming matrices in DMatrix type
            train = xgb.DMatrix(X, label=Y)

            # Defining and fitting the models
            self.sround_model = xgb.train(
                self.get_param_dict(),
                early_stopping_rounds=self.early_stopping_rounds,
                evals=valid,
                dtrain=train,
                num_boost_round=math.ceil(self.num_rounds))

        # Learning by consecutive batches
        else:
            # Transforming matrices in DMatrix type
            train = xgb.DMatrix(X, label=Y)

            # if we want to start from a model already saved
            if os.path.exists(self.previous_model_path):
                # Defining and training the model
                model = xgb.train(
                    self.get_param_dict(),
                    early_stopping_rounds=self.early_stopping_rounds,
                    evals=valid,
                    dtrain=train,
                    xgb_model=self.previous_model_path)
                os.remove(self.previous_model_path)
                model.save_model(self.previous_model_path)
                del model

            # if we have no model saved
            else:
                model = xgb.train(
                    self.get_param_dict(),
                    early_stopping_rounds=self.early_stopping_rounds,
                    evals=valid,
                    dtrain=train)
                model.save_model(self.previous_model_path)
                del model
    def get_prediction(self, X_tst=None):
        Y_pred = None
        # Tries to load X and Y if not directly passed
        if (X_tst is None):
            X_tst, _ = Data.get_dataset_xgb_default_test()
            print("Test set loaded from file.")
        if (self.sround_model is None) and (self.batch_model is None):
            print("No model trained yet.")
        else:

            # Preparing DMatrix
            d_test = xgb.DMatrix(X_tst)

            model = self.get_model()

            # Making predictions
            Y_pred = model.predict(d_test)
            return Y_pred
    def get_prediction(self, X_tst=None):
        Y_pred = None
        #Tries to load X and Y if not directly passed
        if (X_tst is None):
            X_tst, _ = Data.get_dataset_xgb_default_test()
            print("Test set loaded from file.")
        if (self.sround_model is None) and (self.batch_model is None):
            print("No model trained yet.")
        else:
            #Selecting the coherent model for the evaluation
            #According to the initial declaration (batch/single round)
            if self.batch is False:
                model = self.sround_model
            else:
                model = self.batch_model

            #Preparing DMatrix
            d_test = xgb.DMatrix(X_tst)

            #Making predictions
            Y_pred = model.predict(d_test)
            return Y_pred
Example #11
    def evaluate(self, X_tst=None, Y_tst=None):
        Y_pred = None

        # Tries to load X and Y if not directly passed
        if (X_tst is None) or (Y_tst is None):
            X_tst, Y_tst = Data.get_dataset_xgb_default_test()
            print("Test set loaded from file.")
        Y_tst = np.array(Y_tst[Y_tst.columns[0]].astype(float))

        if (self.sround_model is None) and (not os.path.exists(
                self.previous_model_path)):
            print("No model trained yet.")
        else:
            # Selecting the coherent model for the evaluation
            # According to the initial declaration (batch/single round)
            model = self.get_model()

            # Preparing DMatrix
            # d_test = xgb.DMatrix(X_tst)
            # Making predictions
            # Y_pred = model.predict(d_test)
            Y_pred = self.get_prediction(X_tst)

            # Declaring the class containing the
            # metrics.
            cm = CoMe(Y_pred, Y_tst)

            # Evaluating
            prauc = cm.compute_prauc()
            rce = cm.compute_rce()
            # Confusion matrix
            conf = confMatrix(Y_tst, Y_pred)
            # Prediction stats
            max_pred = max(Y_pred)
            min_pred = min(Y_pred)
            avg = np.mean(Y_pred)

            return prauc, rce, conf, max_pred, min_pred, avg
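
CoMe and confMatrix are project-specific helpers; a rough equivalent of the metrics above using scikit-learn, assuming a 0.5 threshold for the confusion matrix and the common definition of RCE as the relative cross-entropy improvement over a constant positive-rate predictor (both are assumptions, not taken from the source):

import numpy as np
from sklearn.metrics import average_precision_score, confusion_matrix, log_loss

y_true = np.array([0, 1, 1, 0, 1])
y_pred = np.array([0.1, 0.8, 0.4, 0.3, 0.9])

prauc = average_precision_score(y_true, y_pred)              # PR-AUC
# RCE: cross-entropy improvement (in %) over always predicting the positive rate.
naive = np.full_like(y_pred, y_true.mean())
rce = (1.0 - log_loss(y_true, y_pred) / log_loss(y_true, naive)) * 100.0
conf = confusion_matrix(y_true, (y_pred > 0.5).astype(int))  # rows: true, cols: predicted
max_pred, min_pred, avg = y_pred.max(), y_pred.min(), y_pred.mean()
print(prauc, rce, conf, max_pred, min_pred, avg)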
Example #12
    def create_feature(self):

        # Check if the dataset id is train or test
        if not is_test_or_val_set(self.dataset_id):
            # Compute train and test dataset ids
            train_dataset_id = self.dataset_id

            # Load the dataset and shuffle it
            import Utils.Data.Data as data
            X_train = data.get_dataset(features=self.features,
                                       dataset_id=train_dataset_id,
                                       nthread=64)

            print(X_train)
            print(X_train.memory_usage())

            Y_train = data.get_dataset(features=self.label,
                                       dataset_id=train_dataset_id,
                                       nthread=64)

            print(Y_train)
            print(Y_train.memory_usage())

            # Declare list of scores (of each folds)
            # used for aggregating results
            scores = []
            kf = KFold(n_splits=4, shuffle=True, random_state=8)
            # Train multiple models with 1-fold out strategy
            for train_index, test_index in kf.split(X_train):
                train_index = np.random.choice(train_index,
                                               int(len(train_index) / 20),
                                               replace=True)
                local_X_train = X_train.iloc[train_index]
                local_Y_train = Y_train.iloc[train_index]

                # Compute the test set
                local_X_test = X_train.iloc[test_index]

                # Generate the dataset id for this fold
                fold_dataset_id = f"{self.feature_name}_{self.dataset_id}_fold_{len(scores)}"

                # Create the sub-feature
                feature = XGBEnsembling(fold_dataset_id, local_X_train,
                                        local_Y_train, local_X_test,
                                        self.param_dict)

                # Retrieve the scores
                scores.append(
                    pd.DataFrame(feature.load_or_create(),
                                 index=local_X_test.index))
                print(scores)

            # Compute the resulting dataframe and sort the results
            result = pd.concat(scores).sort_index()

            # Save it as a feature
            self.save_feature(result)

        else:
            test_dataset_id = self.dataset_id
            train_dataset_id = get_train_set_id_from_test_or_val_set(
                test_dataset_id)
            # Load the train dataset
            import Utils.Data.Data as data
            X_train = data.get_dataset_batch(features=self.features,
                                             dataset_id=train_dataset_id,
                                             total_n_split=1,
                                             split_n=0,
                                             sample=0.05)
            Y_train = data.get_dataset_batch(features=self.label,
                                             dataset_id=train_dataset_id,
                                             total_n_split=1,
                                             split_n=0,
                                             sample=0.05)

            # Load the test dataset
            X_test = data.get_dataset(features=self.features,
                                      dataset_id=test_dataset_id,
                                      nthread=64)

            fold_dataset_id = f"{self.feature_name}_{self.dataset_id}"

            # Create the sub-feature
            feature = XGBEnsembling(fold_dataset_id, X_train, Y_train, X_test,
                                    self.param_dict)

            # Retrieve the scores
            result = pd.DataFrame(feature.load_or_create(), index=X_test.index)

            # Save it as a feature
            self.save_feature(result)
Example #13
def main():
    # Instantiate the parser
    parser = argparse.ArgumentParser()

    parser.add_argument('label', type=str, help='required argument: label')

    args = parser.parse_args()

    nn_labels = ["like", "reply", "retweet", "comment"]

    LABEL = args.label

    assert LABEL in ["like", "reply", "retweet", "comment"], "LABEL not valid."

    print(f"label is {LABEL}")

    features = [
        "raw_feature_creator_follower_count",
        "raw_feature_creator_following_count",
        "raw_feature_engager_follower_count",
        "raw_feature_engager_following_count",
        "raw_feature_creator_is_verified",
        "raw_feature_engager_is_verified",
        "raw_feature_engagement_creator_follows_engager",
        "tweet_feature_number_of_photo",
        "tweet_feature_number_of_video",
        "tweet_feature_number_of_gif",
        "tweet_feature_number_of_media",
        "tweet_feature_is_retweet",
        "tweet_feature_is_quote",
        "tweet_feature_is_top_level",
        "tweet_feature_number_of_hashtags",
        "tweet_feature_creation_timestamp_hour",
        "tweet_feature_creation_timestamp_week_day",
        # "tweet_feature_number_of_mentions",
        "tweet_feature_token_length",
        "tweet_feature_token_length_unique",
        "tweet_feature_text_topic_word_count_adult_content",
        "tweet_feature_text_topic_word_count_kpop",
        "tweet_feature_text_topic_word_count_covid",
        "tweet_feature_text_topic_word_count_sport",
        "number_of_engagements_with_language_like",
        "number_of_engagements_with_language_retweet",
        "number_of_engagements_with_language_reply",
        "number_of_engagements_with_language_comment",
        "number_of_engagements_with_language_negative",
        "number_of_engagements_with_language_positive",
        "number_of_engagements_ratio_like",
        "number_of_engagements_ratio_retweet",
        "number_of_engagements_ratio_reply",
        "number_of_engagements_ratio_comment",
        "number_of_engagements_ratio_negative",
        "number_of_engagements_ratio_positive",
        "number_of_engagements_between_creator_and_engager_like",
        "number_of_engagements_between_creator_and_engager_retweet",
        "number_of_engagements_between_creator_and_engager_reply",
        "number_of_engagements_between_creator_and_engager_comment",
        "number_of_engagements_between_creator_and_engager_negative",
        "number_of_engagements_between_creator_and_engager_positive",
        "creator_feature_number_of_like_engagements_received",
        "creator_feature_number_of_retweet_engagements_received",
        "creator_feature_number_of_reply_engagements_received",
        "creator_feature_number_of_comment_engagements_received",
        "creator_feature_number_of_negative_engagements_received",
        "creator_feature_number_of_positive_engagements_received",
        "creator_feature_number_of_like_engagements_given",
        "creator_feature_number_of_retweet_engagements_given",
        "creator_feature_number_of_reply_engagements_given",
        "creator_feature_number_of_comment_engagements_given",
        "creator_feature_number_of_negative_engagements_given",
        "creator_feature_number_of_positive_engagements_given",
        "engager_feature_number_of_like_engagements_received",
        "engager_feature_number_of_retweet_engagements_received",
        "engager_feature_number_of_reply_engagements_received",
        "engager_feature_number_of_comment_engagements_received",
        "engager_feature_number_of_negative_engagements_received",
        "engager_feature_number_of_positive_engagements_received",
        "number_of_engagements_like",
        "number_of_engagements_retweet",
        "number_of_engagements_reply",
        "number_of_engagements_comment",
        "number_of_engagements_negative",
        "number_of_engagements_positive",
        "engager_feature_number_of_previous_like_engagement",
        "engager_feature_number_of_previous_reply_engagement",
        "engager_feature_number_of_previous_retweet_engagement",
        "engager_feature_number_of_previous_comment_engagement",
        "engager_feature_number_of_previous_positive_engagement",
        "engager_feature_number_of_previous_negative_engagement",
        "engager_feature_number_of_previous_engagement",
        "engager_feature_number_of_previous_like_engagement_ratio_1",
        "engager_feature_number_of_previous_reply_engagement_ratio_1",
        "engager_feature_number_of_previous_retweet_engagement_ratio_1",
        "engager_feature_number_of_previous_comment_engagement_ratio_1",
        "engager_feature_number_of_previous_positive_engagement_ratio_1",
        "engager_feature_number_of_previous_negative_engagement_ratio_1",
        "engager_feature_number_of_previous_like_engagement_ratio",
        "engager_feature_number_of_previous_reply_engagement_ratio",
        "engager_feature_number_of_previous_retweet_engagement_ratio",
        "engager_feature_number_of_previous_comment_engagement_ratio",
        "engager_feature_number_of_previous_positive_engagement_ratio",
        "engager_feature_number_of_previous_negative_engagement_ratio",
        "engager_feature_number_of_previous_like_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_reply_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_retweet_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_comment_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_negative_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_positive_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_like_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_reply_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_retweet_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_comment_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_negative_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_positive_engagement_between_creator_and_engager_by_engager",
        # "tweet_feature_number_of_previous_like_engagements",
        # "tweet_feature_number_of_previous_reply_engagements",
        # "tweet_feature_number_of_previous_retweet_engagements",
        # "tweet_feature_number_of_previous_comment_engagements",
        # "tweet_feature_number_of_previous_positive_engagements",
        # "tweet_feature_number_of_previous_negative_engagements",
        "creator_feature_number_of_previous_like_engagements_given",
        "creator_feature_number_of_previous_reply_engagements_given",
        "creator_feature_number_of_previous_retweet_engagements_given",
        "creator_feature_number_of_previous_comment_engagements_given",
        "creator_feature_number_of_previous_positive_engagements_given",
        "creator_feature_number_of_previous_negative_engagements_given",
        "creator_feature_number_of_previous_like_engagements_received",
        "creator_feature_number_of_previous_reply_engagements_received",
        "creator_feature_number_of_previous_retweet_engagements_received",
        "creator_feature_number_of_previous_comment_engagements_received",
        "creator_feature_number_of_previous_positive_engagements_received",
        "creator_feature_number_of_previous_negative_engagements_received",
        "engager_feature_number_of_previous_like_engagement_with_language",
        "engager_feature_number_of_previous_reply_engagement_with_language",
        "engager_feature_number_of_previous_retweet_engagement_with_language",
        "engager_feature_number_of_previous_comment_engagement_with_language",
        "engager_feature_number_of_previous_positive_engagement_with_language",
        "engager_feature_number_of_previous_negative_engagement_with_language",
        "engager_feature_knows_hashtag_positive",
        "engager_feature_knows_hashtag_negative",
        "engager_feature_knows_hashtag_like",
        "engager_feature_knows_hashtag_reply",
        "engager_feature_knows_hashtag_rt",
        "engager_feature_knows_hashtag_comment",
        "creator_and_engager_have_same_main_language",
        "is_tweet_in_creator_main_language",
        "is_tweet_in_engager_main_language",
        # "statistical_probability_main_language_of_engager_engage_tweet_language_1",
        # "statistical_probability_main_language_of_engager_engage_tweet_language_2",
        "creator_and_engager_have_same_main_grouped_language",
        "is_tweet_in_creator_main_grouped_language",
        "is_tweet_in_engager_main_grouped_language",
        # # "hashtag_similarity_fold_ensembling_positive",
        # # "link_similarity_fold_ensembling_positive",
        # # "domain_similarity_fold_ensembling_positive"
        "tweet_feature_creation_timestamp_hour_shifted",
        "tweet_feature_creation_timestamp_day_phase",
        "tweet_feature_creation_timestamp_day_phase_shifted"
    ]

    label = [f"tweet_feature_engagement_is_{LABEL}"]

    train_dataset = "cherry_train"
    val_dataset = "cherry_val"
    test_dataset = "new_test"

    ensembling_list_dict = {
        'like': [],
        'reply': ['reply', 'retweet', 'comment'],
        'retweet': ['reply', 'retweet', 'comment'],
        'comment': ['reply', 'retweet', 'comment'],
    }

    ensembling_list = ensembling_list_dict[LABEL]

    ensembling_lgbm_params = {}
    ensembling_xgb_params = {}
    for ens_label in ensembling_list:
        ensembling_lgbm_params[ens_label], ensembling_xgb_params[ens_label]\
            = params_by_label(ens_label)

    categorical_features_set = set([])

    # Load train data
    # loading_data_start_time = time.time()
    # df_train, df_train_label = Data.get_dataset_xgb(train_dataset, features, label)
    # print(f"Loading train data time: {loading_data_start_time - time.time()} seconds")

    # Load val data
    df_val, df_val_label = Data.get_dataset_xgb(val_dataset, features, label)

    # Load test data
    df_test = Data.get_dataset(features, test_dataset)

    new_index = pd.Series(df_test.index).map(lambda x: x + len(df_val))
    df_test.set_index(new_index, inplace=True)

    # df to be predicted by the lgbm blending feature
    df_to_predict = pd.concat([df_val, df_test])

    # BLENDING FEATURE DECLARATION

    feature_list = []

    df_train = pd.DataFrame(columns=features)
    df_train_label = pd.DataFrame(columns=label)
    need_to_load_train_set = False

    for ens_label in ensembling_list:
        lgbm_params = ensembling_lgbm_params[ens_label]
        for lgbm_param_dict in lgbm_params:
            start_time = time.time()
            if not LGBMEnsemblingFeature(
                    dataset_id=train_dataset,
                    df_train=df_train,
                    df_train_label=get_ensembling_label(
                        ens_label, train_dataset),
                    df_to_predict=df_to_predict,
                    param_dict=lgbm_param_dict,
                    categorical_features_set=categorical_features_set
            ).has_feature():
                print(f"{ens_label} {lgbm_param_dict}")
                need_to_load_train_set = True

    if need_to_load_train_set:
        df_train, df_train_label = get_dataset_xgb_batch(
            total_n_split=1,
            split_n=0,
            dataset_id=train_dataset,
            X_label=features,
            Y_label=label,
            sample=0.3)

    for ens_label in ensembling_list:
        lgbm_params = ensembling_lgbm_params[ens_label]
        for lgbm_param_dict in lgbm_params:
            start_time = time.time()

            feature_list.append(
                LGBMEnsemblingFeature(
                    dataset_id=train_dataset,
                    df_train=df_train,
                    df_train_label=get_ensembling_label(
                        ens_label, train_dataset),
                    df_to_predict=df_to_predict,
                    param_dict=lgbm_param_dict,
                    categorical_features_set=categorical_features_set))
            print(f"time: {time.time()-start_time}")

    del df_train, df_train_label

    # NEW PART II
    # ONLY THIS PART IS NEW
    # LOAD THIS PART FIRST

    df_feature_list = [x.load_or_create() for x in feature_list]

    for ens_label in ensembling_list:
        start_time = time.time()
        if ens_label == "like":
            val_features_df = XGBFoldEnsemblingLike2(
                val_dataset).load_or_create()
            test_features_df = XGBFoldEnsemblingLike2(
                test_dataset).load_or_create()
        elif ens_label == "retweet":
            val_features_df = XGBFoldEnsemblingRetweet2(
                val_dataset).load_or_create()
            test_features_df = XGBFoldEnsemblingRetweet2(
                test_dataset).load_or_create()
        elif ens_label == "reply":
            val_features_df = XGBFoldEnsemblingReply2(
                val_dataset).load_or_create()
            test_features_df = XGBFoldEnsemblingReply2(
                test_dataset).load_or_create()
        elif ens_label == "comment":
            val_features_df = XGBFoldEnsemblingComment2(
                val_dataset).load_or_create()
            test_features_df = XGBFoldEnsemblingComment2(
                test_dataset).load_or_create()
        else:
            assert False, "oh oh something went wrong. label not found"

        test_features_df.set_index(new_index, inplace=True)

        xgb_feature_df = pd.concat([val_features_df, test_features_df])

        df_feature_list.append(xgb_feature_df)

        print(f"time: {time.time() - start_time}")

        del val_features_df, test_features_df

    # check dimensions
    len_val = len(df_val)

    for df_feat in df_feature_list:
        assert len(df_feat) == (len_val + len(df_test)), \
            f"Blending features are not of dimension expected, len val: {len_val} len test: {len(df_test)}\n " \
            f"obtained len: {len(df_feat)} of {df_feat.columns[0]}\n"

    # split feature dataframe in validation and testing
    df_feat_val_list = [df_feat.iloc[:len_val] for df_feat in df_feature_list]
    #df_feat_test_list = [df_feat.iloc[len_val:] for df_feat in df_feature_list]

    df_feat_nn_val_list_1 = [
        get_nn_prediction(l, 1, val_dataset) for l in nn_labels
    ]
    df_feat_nn_val_list_2 = [
        get_nn_prediction(l, 2, val_dataset) for l in nn_labels
    ]

    df_feat_val_list += df_feat_nn_val_list_1 + df_feat_nn_val_list_2

    df_to_be_concatenated_list = [df_val] + df_feat_val_list + [df_val_label]

    # creating the new validation set on which we will do meta optimization
    df_val = pd.concat(df_to_be_concatenated_list, axis=1)

    # now we are in full meta-model mode
    # watch out! they are unsorted now, you have to re-sort the dfs
    df_metatrain, df_metaval = train_test_split(df_val, test_size=0.2)
    df_metatrain.sort_index(inplace=True)
    df_metaval.sort_index(inplace=True)

    # split dataframe columns in train and label
    col_names_list = [df_feat.columns[0] for df_feat in df_feature_list]

    extended_features = features + col_names_list
    df_metatrain_label = df_metatrain[label]
    df_metatrain = df_metatrain[extended_features]

    df_metaval_label = df_metaval[label]
    df_metaval = df_metaval[extended_features]

    model_name = "lightgbm_classifier"
    kind = LABEL

    OP = Optimizer(model_name,
                   kind,
                   mode=0,
                   path=LABEL,
                   path_log=f"blending-lgbm-{LABEL}-twonn-reg",
                   make_log=True,
                   make_save=False,
                   auto_save=False)

    OP.setParameters(n_calls=100, n_random_starts=30)
    OP.loadTrainData(df_metatrain, df_metatrain_label)

    OP.loadValData(df_metaval, df_metaval_label)  # early stopping

    OP.loadTestData(df_metaval, df_metaval_label)  # evaluate objective

    OP.setParamsLGB(objective='binary',
                    early_stopping_rounds=10,
                    eval_metric="binary",
                    is_unbalance=False)
    OP.setCategoricalFeatures(categorical_features_set)
    # OP.loadModelHardCoded()
    res = OP.optimize()
Example #14
def get_ensembling_label(label, dataset_id):
    from Utils.Data import Data
    return Data.get_feature(f"tweet_feature_engagement_is_{label}", dataset_id)
Example #15
def main():
    # Instantiate the parser
    parser = argparse.ArgumentParser()

    parser.add_argument('label', type=str, help='required argument: label')

    args = parser.parse_args()

    nn_labels = ["like", "reply", "retweet", "comment"]

    LABEL = args.label

    assert LABEL in ["like", "reply", "retweet", "comment"], "LABEL not valid."

    print(f"label is {LABEL}")

    features = [
        "raw_feature_creator_follower_count",
        "raw_feature_creator_following_count",
        "raw_feature_engager_follower_count",
        "raw_feature_engager_following_count",
        "raw_feature_creator_is_verified",
        "raw_feature_engager_is_verified",
        "raw_feature_engagement_creator_follows_engager",
        "tweet_feature_number_of_photo",
        "tweet_feature_number_of_video",
        "tweet_feature_number_of_gif",
        "tweet_feature_number_of_media",
        "tweet_feature_is_retweet",
        "tweet_feature_is_quote",
        "tweet_feature_is_top_level",
        "tweet_feature_number_of_hashtags",
        "tweet_feature_creation_timestamp_hour",
        "tweet_feature_creation_timestamp_week_day",
        # "tweet_feature_number_of_mentions",
        "tweet_feature_token_length",
        "tweet_feature_token_length_unique",
        "tweet_feature_text_topic_word_count_adult_content",
        "tweet_feature_text_topic_word_count_kpop",
        "tweet_feature_text_topic_word_count_covid",
        "tweet_feature_text_topic_word_count_sport",
        "number_of_engagements_with_language_like",
        "number_of_engagements_with_language_retweet",
        "number_of_engagements_with_language_reply",
        "number_of_engagements_with_language_comment",
        "number_of_engagements_with_language_negative",
        "number_of_engagements_with_language_positive",
        "number_of_engagements_ratio_like",
        "number_of_engagements_ratio_retweet",
        "number_of_engagements_ratio_reply",
        "number_of_engagements_ratio_comment",
        "number_of_engagements_ratio_negative",
        "number_of_engagements_ratio_positive",
        "number_of_engagements_between_creator_and_engager_like",
        "number_of_engagements_between_creator_and_engager_retweet",
        "number_of_engagements_between_creator_and_engager_reply",
        "number_of_engagements_between_creator_and_engager_comment",
        "number_of_engagements_between_creator_and_engager_negative",
        "number_of_engagements_between_creator_and_engager_positive",
        "creator_feature_number_of_like_engagements_received",
        "creator_feature_number_of_retweet_engagements_received",
        "creator_feature_number_of_reply_engagements_received",
        "creator_feature_number_of_comment_engagements_received",
        "creator_feature_number_of_negative_engagements_received",
        "creator_feature_number_of_positive_engagements_received",
        "creator_feature_number_of_like_engagements_given",
        "creator_feature_number_of_retweet_engagements_given",
        "creator_feature_number_of_reply_engagements_given",
        "creator_feature_number_of_comment_engagements_given",
        "creator_feature_number_of_negative_engagements_given",
        "creator_feature_number_of_positive_engagements_given",
        "engager_feature_number_of_like_engagements_received",
        "engager_feature_number_of_retweet_engagements_received",
        "engager_feature_number_of_reply_engagements_received",
        "engager_feature_number_of_comment_engagements_received",
        "engager_feature_number_of_negative_engagements_received",
        "engager_feature_number_of_positive_engagements_received",
        "number_of_engagements_like",
        "number_of_engagements_retweet",
        "number_of_engagements_reply",
        "number_of_engagements_comment",
        "number_of_engagements_negative",
        "number_of_engagements_positive",
        "engager_feature_number_of_previous_like_engagement",
        "engager_feature_number_of_previous_reply_engagement",
        "engager_feature_number_of_previous_retweet_engagement",
        "engager_feature_number_of_previous_comment_engagement",
        "engager_feature_number_of_previous_positive_engagement",
        "engager_feature_number_of_previous_negative_engagement",
        "engager_feature_number_of_previous_engagement",
        "engager_feature_number_of_previous_like_engagement_ratio_1",
        "engager_feature_number_of_previous_reply_engagement_ratio_1",
        "engager_feature_number_of_previous_retweet_engagement_ratio_1",
        "engager_feature_number_of_previous_comment_engagement_ratio_1",
        "engager_feature_number_of_previous_positive_engagement_ratio_1",
        "engager_feature_number_of_previous_negative_engagement_ratio_1",
        "engager_feature_number_of_previous_like_engagement_ratio",
        "engager_feature_number_of_previous_reply_engagement_ratio",
        "engager_feature_number_of_previous_retweet_engagement_ratio",
        "engager_feature_number_of_previous_comment_engagement_ratio",
        "engager_feature_number_of_previous_positive_engagement_ratio",
        "engager_feature_number_of_previous_negative_engagement_ratio",
        "engager_feature_number_of_previous_like_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_reply_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_retweet_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_comment_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_negative_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_positive_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_like_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_reply_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_retweet_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_comment_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_negative_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_positive_engagement_between_creator_and_engager_by_engager",
        # "tweet_feature_number_of_previous_like_engagements",
        # "tweet_feature_number_of_previous_reply_engagements",
        # "tweet_feature_number_of_previous_retweet_engagements",
        # "tweet_feature_number_of_previous_comment_engagements",
        # "tweet_feature_number_of_previous_positive_engagements",
        # "tweet_feature_number_of_previous_negative_engagements",
        "creator_feature_number_of_previous_like_engagements_given",
        "creator_feature_number_of_previous_reply_engagements_given",
        "creator_feature_number_of_previous_retweet_engagements_given",
        "creator_feature_number_of_previous_comment_engagements_given",
        "creator_feature_number_of_previous_positive_engagements_given",
        "creator_feature_number_of_previous_negative_engagements_given",
        "creator_feature_number_of_previous_like_engagements_received",
        "creator_feature_number_of_previous_reply_engagements_received",
        "creator_feature_number_of_previous_retweet_engagements_received",
        "creator_feature_number_of_previous_comment_engagements_received",
        "creator_feature_number_of_previous_positive_engagements_received",
        "creator_feature_number_of_previous_negative_engagements_received",
        "engager_feature_number_of_previous_like_engagement_with_language",
        "engager_feature_number_of_previous_reply_engagement_with_language",
        "engager_feature_number_of_previous_retweet_engagement_with_language",
        "engager_feature_number_of_previous_comment_engagement_with_language",
        "engager_feature_number_of_previous_positive_engagement_with_language",
        "engager_feature_number_of_previous_negative_engagement_with_language",
        "engager_feature_knows_hashtag_positive",
        "engager_feature_knows_hashtag_negative",
        "engager_feature_knows_hashtag_like",
        "engager_feature_knows_hashtag_reply",
        "engager_feature_knows_hashtag_rt",
        "engager_feature_knows_hashtag_comment",
        "creator_and_engager_have_same_main_language",
        "is_tweet_in_creator_main_language",
        "is_tweet_in_engager_main_language",
        # "statistical_probability_main_language_of_engager_engage_tweet_language_1",
        # "statistical_probability_main_language_of_engager_engage_tweet_language_2",
        "creator_and_engager_have_same_main_grouped_language",
        "is_tweet_in_creator_main_grouped_language",
        "is_tweet_in_engager_main_grouped_language",
        # # "hashtag_similarity_fold_ensembling_positive",
        # # "link_similarity_fold_ensembling_positive",
        # # "domain_similarity_fold_ensembling_positive"
        "tweet_feature_creation_timestamp_hour_shifted",
        "tweet_feature_creation_timestamp_day_phase",
        "tweet_feature_creation_timestamp_day_phase_shifted"
    ]

    label = [f"tweet_feature_engagement_is_{LABEL}"]

    train_dataset = "cherry_train"
    val_dataset = "cherry_val"
    test_dataset = "new_test"
    private_test_dataset = "last_test"

    ensembling_list_dict = {
        'like': ['reply', 'retweet', 'comment'],
        'reply': ['reply', 'retweet', 'comment'],
        'retweet': ['reply', 'retweet', 'comment'],
        'comment': ['reply', 'retweet', 'comment'],
    }

    ensembling_list = ensembling_list_dict[LABEL]

    ensembling_lgbm_params = {}
    ensembling_xgb_params = {}
    for ens_label in ensembling_list:
        ensembling_lgbm_params[ens_label], ensembling_xgb_params[ens_label] \
            = params_by_label(ens_label)

    categorical_features_set = set([])

    # Load train data
    # loading_data_start_time = time.time()
    # df_train, df_train_label = Data.get_dataset_xgb(train_dataset, features, label)
    # print(f"Loading train data time: {loading_data_start_time - time.time()} seconds")

    # Load val data
    df_val, df_val_label = Data.get_dataset_xgb(val_dataset, features, label)

    # Load test data
    df_test = Data.get_dataset(features, test_dataset)
    df_private = Data.get_dataset(features, private_test_dataset)

    new_index = pd.Series(df_test.index).map(lambda x: x + len(df_val))
    df_test.set_index(new_index, inplace=True)

    new_index_private = pd.Series(
        df_private.index).map(lambda x: x + len(df_val) + len(df_test))
    df_private.set_index(new_index_private, inplace=True)

    # df to be predicted by the lgbm blending feature
    df_to_predict = pd.concat([df_val, df_test, df_private])

    # BLENDING FEATURE DECLARATION

    feature_list = []

    # NEW CODE ADDED

    df_train = pd.DataFrame(columns=features)
    df_train_label = pd.DataFrame(columns=label)
    need_to_load_train_set = False

    for ens_label in ensembling_list:
        lgbm_params = ensembling_lgbm_params[ens_label]
        for lgbm_param_dict in lgbm_params:
            start_time = time.time()
            if not LGBMEnsemblingFeature(
                    dataset_id=private_test_dataset,
                    df_train=df_train,
                    df_train_label=get_ensembling_label(
                        ens_label, train_dataset),
                    df_to_predict=df_to_predict,
                    param_dict=lgbm_param_dict,
                    categorical_features_set=categorical_features_set
            ).has_feature():
                print(f"{ens_label} {lgbm_param_dict}")
                need_to_load_train_set = True

    if need_to_load_train_set:
        df_train, df_train_label = get_dataset_xgb_batch(
            total_n_split=1,
            split_n=0,
            dataset_id=train_dataset,
            X_label=features,
            Y_label=label,
            sample=0.3)

    for ens_label in ensembling_list:
        lgbm_params = ensembling_lgbm_params[ens_label]
        for lgbm_param_dict in lgbm_params:
            start_time = time.time()
            feature_list.append(
                LGBMEnsemblingFeature(
                    dataset_id=private_test_dataset,
                    df_train=df_train,
                    df_train_label=get_ensembling_label(
                        ens_label, train_dataset),
                    df_to_predict=df_to_predict,
                    param_dict=lgbm_param_dict,
                    categorical_features_set=categorical_features_set))

    # NEW PART II
    # ONLY THIS PART IS NEW
    # LOAD THIS PART FIRST
    del df_train, df_train_label

    df_feature_list = [x.load_or_create() for x in tqdm(feature_list)]

    for ens_label in ensembling_list:
        start_time = time.time()
        if ens_label == "like":
            val_features_df = XGBFoldEnsemblingLike2(
                val_dataset).load_or_create()
            test_features_df = XGBFoldEnsemblingLike2(
                test_dataset).load_or_create()
            private_features_df = XGBFoldEnsemblingLike2(
                private_test_dataset).load_or_create()
        elif ens_label == "retweet":
            val_features_df = XGBFoldEnsemblingRetweet2(
                val_dataset).load_or_create()
            test_features_df = XGBFoldEnsemblingRetweet2(
                test_dataset).load_or_create()
            private_features_df = XGBFoldEnsemblingRetweet2(
                private_test_dataset).load_or_create()
        elif ens_label == "reply":
            val_features_df = XGBFoldEnsemblingReply2(
                val_dataset).load_or_create()
            test_features_df = XGBFoldEnsemblingReply2(
                test_dataset).load_or_create()
            private_features_df = XGBFoldEnsemblingReply2(
                private_test_dataset).load_or_create()
        elif ens_label == "comment":
            val_features_df = XGBFoldEnsemblingComment2(
                val_dataset).load_or_create()
            test_features_df = XGBFoldEnsemblingComment2(
                test_dataset).load_or_create()
            private_features_df = XGBFoldEnsemblingComment2(
                private_test_dataset).load_or_create()
        else:
            assert False, "Uh oh, something went wrong: label not found"

        test_features_df.set_index(new_index, inplace=True)
        private_features_df.set_index(new_index_private, inplace=True)

        xgb_feature_df = pd.concat(
            [val_features_df, test_features_df, private_features_df])

        df_feature_list.append(xgb_feature_df)

        print(f"time: {time.time() - start_time}")

        del val_features_df, test_features_df, private_features_df

    # check dimensions
    len_val = len(df_val)
    len_test = len(df_test)
    len_private = len(df_private)

    for df_feat in df_feature_list:
        assert len(df_feat) == (len_val + len_test + len_private), \
            f"Blending features are not of dimension expected, len val: {len_val} len test: {len_test}" \
            f" len private test: {len_private}\n " \
            f"obtained len: {len(df_feat)} of {df_feat.columns[0]}\n"

    # split feature dataframe in validation and testing
    df_feat_val_list = [df_feat.iloc[:len_val] for df_feat in df_feature_list]
    df_feat_test_list = [
        df_feat.iloc[len_val:-len_private] for df_feat in df_feature_list
    ]
    df_feat_private_list = [
        df_feat.iloc[-len_private:] for df_feat in df_feature_list
    ]

    df_feat_nn_val_list = [
        get_nn_prediction(l, val_dataset) for l in nn_labels
    ]

    df_feat_nn_test_list = [
        get_nn_prediction(l, test_dataset) for l in nn_labels
    ]

    df_feat_nn_private_list = [
        get_nn_prediction(l, private_test_dataset) for l in nn_labels
    ]

    for df_feat_nn_test in df_feat_nn_test_list:
        new_index = pd.Series(
            df_feat_nn_test.index).map(lambda x: x + len(df_val))
        df_feat_nn_test.set_index(new_index, inplace=True)

    for df_feat_nn_private in df_feat_nn_private_list:
        new_index_private = pd.Series(df_feat_nn_private.index).map(
            lambda x: x + len(df_val) + len(df_test))
        df_feat_nn_private.set_index(new_index_private, inplace=True)

    df_feat_val_list += df_feat_nn_val_list
    df_feat_test_list += df_feat_nn_test_list
    df_feat_private_list += df_feat_nn_private_list

    df_val_to_be_concatenated_list = [df_val] + df_feat_val_list + [df_val_label]
    df_test_to_be_concatenated_list = [df_test] + df_feat_test_list
    df_private_to_be_concatenated_list = [df_private] + df_feat_private_list

    # creating the new validation set on which we will do meta optimization
    df_val = pd.concat(df_val_to_be_concatenated_list, axis=1)
    df_test = pd.concat(df_test_to_be_concatenated_list, axis=1)
    df_private = pd.concat(df_private_to_be_concatenated_list, axis=1)

    # now we are in full meta-model mode
    # watch out! they are unsorted now, you have to re-sort the dfs
    df_metatrain, df_metaval = train_test_split(df_val,
                                                test_size=0.1,
                                                random_state=16 + 1)
    df_metatrain.sort_index(inplace=True)
    df_metaval.sort_index(inplace=True)

    # split dataframe columns in train and label
    col_names_list = [df_feat.columns[0] for df_feat in df_feature_list]

    extended_features = df_test.columns
    df_metatrain_label = df_metatrain[label]
    df_metatrain = df_metatrain[extended_features]

    df_metaval_label = df_metaval[label]
    df_metaval = df_metaval[extended_features]

    for i in range(len(df_metatrain.columns)):
        assert df_metatrain.columns[i] == df_test.columns[i], f'You f****d yourself. metatrain col {i}: {df_metatrain.columns[i]}' \
                                                              f' test col {i}: {df_test.columns[i]}'
        assert df_metatrain.columns[i] == df_private.columns[i], \
            f'You f****d yourself. metatrain col {i}: {df_metatrain.columns[i]} private test col {i}: {df_private.columns[i]}'

    model_name = "lightgbm_classifier"
    kind = LABEL

    params = {
        'num_leaves': 200.4606708311663,
        'learning_rate': 0.02250057258744298,
        'max_depth': 47,
        'lambda_l1': 3.037842501865099,
        'lambda_l2': 1.0,
        'colsample_bynode': 0.4,
        'colsample_bytree': 0.4,
        'bagging_fraction': 0.8,
        'bagging_freq': 10,
        'max_bin': 3344.071500013681,
        'min_data_in_leaf': 10.0
    }

    LGBM = LightGBM(
        objective='binary',
        num_threads=-1,
        num_iterations=1000,
        early_stopping_rounds=15,
        **params,
    )

    # LGBM Training
    training_start_time = time.time()
    LGBM.fit(X=df_metatrain,
             Y=df_metatrain_label,
             X_val=df_metaval,
             Y_val=df_metaval_label,
             categorical_feature=set([]))
    print(f"Training time: {time.time() - training_start_time} seconds")

    # LGBM Evaluation
    evaluation_start_time = time.time()
    prauc, rce, conf, max_pred, min_pred, avg = LGBM.evaluate(
        df_metaval.to_numpy(), df_metaval_label.to_numpy())
    print(
        "since I'm lazy I did the local test on the same test on which I did EarlyStopping"
    )
    print(f"PRAUC:\t{prauc}")
    print(f"RCE:\t{rce}")
    print(f"TN:\t{conf[0, 0]}")
    print(f"FP:\t{conf[0, 1]}")
    print(f"FN:\t{conf[1, 0]}")
    print(f"TP:\t{conf[1, 1]}")
    print(f"MAX_PRED:\t{max_pred}")
    print(f"MIN_PRED:\t{min_pred}")
    print(f"AVG:\t{avg}")
    print(f"Evaluation time: {time.time() - evaluation_start_time} seconds")

    # public prediction
    prediction(LGBM=LGBM, dataset_id=test_dataset, df=df_test, label=LABEL)

    # private prediction
    prediction(LGBM=LGBM,
               dataset_id=private_test_dataset,
               df=df_private,
               label=LABEL)
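
Note that the params dictionary above stores integer LightGBM parameters (num_leaves, max_bin, min_data_in_leaf) as floats, presumably raw output from the optimizer; whether that is accepted depends on the project's LightGBM wrapper. A small defensive sketch that rounds them before use (the set of integer parameters is an assumption based on the dictionary above):

INT_PARAMS = {"num_leaves", "max_depth", "bagging_freq", "max_bin", "min_data_in_leaf"}

def cast_int_params(params: dict) -> dict:
    # Round optimizer-found floats for parameters LightGBM expects as integers.
    return {k: int(round(v)) if k in INT_PARAMS else v for k, v in params.items()}

# e.g. cast_int_params({'num_leaves': 200.46, 'max_bin': 3344.07})
# -> {'num_leaves': 200, 'max_bin': 3344}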
def main():
    # Instantiate the parser
    parser = argparse.ArgumentParser()

    parser.add_argument('label', type=str, help='required argument: label')

    args = parser.parse_args()

    LABEL = args.label

    assert LABEL in ["like", "reply", "retweet", "comment"], "LABEL not valid."

    print(f"label is {LABEL}")

    features = [
        "raw_feature_creator_follower_count",
        "raw_feature_creator_following_count",
        "raw_feature_engager_follower_count",
        "raw_feature_engager_following_count",
        "raw_feature_creator_is_verified",
        "raw_feature_engager_is_verified",
        "raw_feature_engagement_creator_follows_engager",
        "tweet_feature_number_of_photo",
        "tweet_feature_number_of_video",
        "tweet_feature_number_of_gif",
        "tweet_feature_number_of_media",
        "tweet_feature_is_retweet",
        "tweet_feature_is_quote",
        "tweet_feature_is_top_level",
        "tweet_feature_number_of_hashtags",
        "tweet_feature_creation_timestamp_hour",
        "tweet_feature_creation_timestamp_week_day",
        # "tweet_feature_number_of_mentions",
        "tweet_feature_token_length",
        "tweet_feature_token_length_unique",
        "tweet_feature_text_topic_word_count_adult_content",
        "tweet_feature_text_topic_word_count_kpop",
        "tweet_feature_text_topic_word_count_covid",
        "tweet_feature_text_topic_word_count_sport",
        "number_of_engagements_with_language_like",
        "number_of_engagements_with_language_retweet",
        "number_of_engagements_with_language_reply",
        "number_of_engagements_with_language_comment",
        "number_of_engagements_with_language_negative",
        "number_of_engagements_with_language_positive",
        "number_of_engagements_ratio_like",
        "number_of_engagements_ratio_retweet",
        "number_of_engagements_ratio_reply",
        "number_of_engagements_ratio_comment",
        "number_of_engagements_ratio_negative",
        "number_of_engagements_ratio_positive",
        "number_of_engagements_between_creator_and_engager_like",
        "number_of_engagements_between_creator_and_engager_retweet",
        "number_of_engagements_between_creator_and_engager_reply",
        "number_of_engagements_between_creator_and_engager_comment",
        "number_of_engagements_between_creator_and_engager_negative",
        "number_of_engagements_between_creator_and_engager_positive",
        "creator_feature_number_of_like_engagements_received",
        "creator_feature_number_of_retweet_engagements_received",
        "creator_feature_number_of_reply_engagements_received",
        "creator_feature_number_of_comment_engagements_received",
        "creator_feature_number_of_negative_engagements_received",
        "creator_feature_number_of_positive_engagements_received",
        "creator_feature_number_of_like_engagements_given",
        "creator_feature_number_of_retweet_engagements_given",
        "creator_feature_number_of_reply_engagements_given",
        "creator_feature_number_of_comment_engagements_given",
        "creator_feature_number_of_negative_engagements_given",
        "creator_feature_number_of_positive_engagements_given",
        "engager_feature_number_of_like_engagements_received",
        "engager_feature_number_of_retweet_engagements_received",
        "engager_feature_number_of_reply_engagements_received",
        "engager_feature_number_of_comment_engagements_received",
        "engager_feature_number_of_negative_engagements_received",
        "engager_feature_number_of_positive_engagements_received",
        "number_of_engagements_like",
        "number_of_engagements_retweet",
        "number_of_engagements_reply",
        "number_of_engagements_comment",
        "number_of_engagements_negative",
        "number_of_engagements_positive",
        "engager_feature_number_of_previous_like_engagement",
        "engager_feature_number_of_previous_reply_engagement",
        "engager_feature_number_of_previous_retweet_engagement",
        "engager_feature_number_of_previous_comment_engagement",
        "engager_feature_number_of_previous_positive_engagement",
        "engager_feature_number_of_previous_negative_engagement",
        "engager_feature_number_of_previous_engagement",
        "engager_feature_number_of_previous_like_engagement_ratio_1",
        "engager_feature_number_of_previous_reply_engagement_ratio_1",
        "engager_feature_number_of_previous_retweet_engagement_ratio_1",
        "engager_feature_number_of_previous_comment_engagement_ratio_1",
        "engager_feature_number_of_previous_positive_engagement_ratio_1",
        "engager_feature_number_of_previous_negative_engagement_ratio_1",
        "engager_feature_number_of_previous_like_engagement_ratio",
        "engager_feature_number_of_previous_reply_engagement_ratio",
        "engager_feature_number_of_previous_retweet_engagement_ratio",
        "engager_feature_number_of_previous_comment_engagement_ratio",
        "engager_feature_number_of_previous_positive_engagement_ratio",
        "engager_feature_number_of_previous_negative_engagement_ratio",
        "engager_feature_number_of_previous_like_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_reply_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_retweet_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_comment_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_negative_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_positive_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_like_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_reply_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_retweet_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_comment_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_negative_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_positive_engagement_between_creator_and_engager_by_engager",
        # "tweet_feature_number_of_previous_like_engagements",
        # "tweet_feature_number_of_previous_reply_engagements",
        # "tweet_feature_number_of_previous_retweet_engagements",
        # "tweet_feature_number_of_previous_comment_engagements",
        # "tweet_feature_number_of_previous_positive_engagements",
        # "tweet_feature_number_of_previous_negative_engagements",
        "creator_feature_number_of_previous_like_engagements_given",
        "creator_feature_number_of_previous_reply_engagements_given",
        "creator_feature_number_of_previous_retweet_engagements_given",
        "creator_feature_number_of_previous_comment_engagements_given",
        "creator_feature_number_of_previous_positive_engagements_given",
        "creator_feature_number_of_previous_negative_engagements_given",
        "creator_feature_number_of_previous_like_engagements_received",
        "creator_feature_number_of_previous_reply_engagements_received",
        "creator_feature_number_of_previous_retweet_engagements_received",
        "creator_feature_number_of_previous_comment_engagements_received",
        "creator_feature_number_of_previous_positive_engagements_received",
        "creator_feature_number_of_previous_negative_engagements_received",
        "engager_feature_number_of_previous_like_engagement_with_language",
        "engager_feature_number_of_previous_reply_engagement_with_language",
        "engager_feature_number_of_previous_retweet_engagement_with_language",
        "engager_feature_number_of_previous_comment_engagement_with_language",
        "engager_feature_number_of_previous_positive_engagement_with_language",
        "engager_feature_number_of_previous_negative_engagement_with_language",
        "engager_feature_knows_hashtag_positive",
        "engager_feature_knows_hashtag_negative",
        "engager_feature_knows_hashtag_like",
        "engager_feature_knows_hashtag_reply",
        "engager_feature_knows_hashtag_rt",
        "engager_feature_knows_hashtag_comment",
        "creator_and_engager_have_same_main_language",
        "is_tweet_in_creator_main_language",
        "is_tweet_in_engager_main_language",
        # "statistical_probability_main_language_of_engager_engage_tweet_language_1",
        # "statistical_probability_main_language_of_engager_engage_tweet_language_2",
        "creator_and_engager_have_same_main_grouped_language",
        "is_tweet_in_creator_main_grouped_language",
        "is_tweet_in_engager_main_grouped_language",
        # # "hashtag_similarity_fold_ensembling_positive",
        # # "link_similarity_fold_ensembling_positive",
        # # "domain_similarity_fold_ensembling_positive"
        "tweet_feature_creation_timestamp_hour_shifted",
        "tweet_feature_creation_timestamp_day_phase",
        "tweet_feature_creation_timestamp_day_phase_shifted"
    ]

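    # Binary target column for the selected engagement type.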
    label = [f"tweet_feature_engagement_is_{LABEL}"]

    train_dataset = "cherry_train"
    val_dataset = "cherry_val"
    test_dataset = "new_test"

    if LABEL in ["like"]:
        lgbm_params = like_params.lgbm_get_params()
        xgb_params = like_params.xgb_get_params()
    elif LABEL in ["reply"]:
        lgbm_params = reply_params.lgbm_get_params()
        xgb_params = reply_params.xgb_get_params()
    elif LABEL in ["retweet"]:
        lgbm_params = retweet_params.lgbm_get_params()
        xgb_params = retweet_params.xgb_get_params()
    elif LABEL in ["comment"]:
        lgbm_params = comment_params.lgbm_get_params()
        xgb_params = comment_params.xgb_get_params()
    else:
        assert False, "What?"

    categorical_features_set = set()

    # Load train data
    # loading_data_start_time = time.time()
    # df_train, df_train_label = Data.get_dataset_xgb(train_dataset, features, label)
    # print(f"Loading train data time: {time.time() - loading_data_start_time} seconds")

    # Load val data
    df_val, df_val_label = Data.get_dataset_xgb(val_dataset, features, label)

    # Load test data
    df_test = Data.get_dataset(features, test_dataset)

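    # Shift the test indices so they continue after the validation indices; this
    # keeps the validation and test rows separable once they are concatenated.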
    new_index = pd.Series(df_test.index).map(lambda x: x + len(df_val))
    df_test.set_index(new_index, inplace=True)

    # DataFrame on which all base (blending) models will produce their predictions:
    # validation rows followed by test rows.
    df_to_predict = pd.concat([df_val, df_test])

    # BLENDING FEATURE DECLARATION
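    # Each parameter set below defines one base model. The base models are fit on
    # a sample of the train set and predict on df_to_predict (validation + test);
    # their predictions are later appended as extra feature columns for the
    # meta-model.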

    feature_list = []
    df_train, df_train_label = get_dataset_xgb_batch(total_n_split=1,
                                                     split_n=0,
                                                     dataset_id=train_dataset,
                                                     X_label=features,
                                                     Y_label=label,
                                                     sample=0.3)

    # One LightGBM base model per parameter set, trained on the 30% sample of the
    # train set loaded above.
    for lgbm_param_dict in lgbm_params:
        feature_list.append(
            LGBMEnsemblingFeature(
                dataset_id=train_dataset,
                df_train=df_train,
                df_train_label=df_train_label,
                df_to_predict=df_to_predict,
                param_dict=lgbm_param_dict,
                categorical_features_set=categorical_features_set))

    # One XGBoost base model per parameter set; the train set is re-sampled inside
    # the loop with a smaller fraction (10%), so each model sees a different sample.
    for xgb_param_dict in xgb_params:
        df_train, df_train_label = get_dataset_xgb_batch(
            total_n_split=1,
            split_n=0,
            dataset_id=train_dataset,
            X_label=features,
            Y_label=label,
            sample=0.1)
        feature_list.append(
            XGBEnsembling(
                dataset_id=train_dataset,
                df_train=df_train,
                df_train_label=df_train_label,
                df_to_predict=df_to_predict,
                param_dict=xgb_param_dict,
            ))

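    # Materialise each blending feature: train the base model or load a previously
    # saved result, collecting its predictions on validation + test.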
    df_feature_list = [x.load_or_create() for x in feature_list]

    # check dimensions
    len_val = len(df_val)

    for df_feat in df_feature_list:
        assert len(df_feat) == (len_val + len(df_test)), \
            f"Blending feature {df_feat.columns[0]} has unexpected length: " \
            f"expected {len_val + len(df_test)} (val {len_val} + test {len(df_test)}), got {len(df_feat)}"

    # split feature dataframe in validation and testing
    df_feat_val_list = [df_feat.iloc[:len_val] for df_feat in df_feature_list]
    df_feat_test_list = [df_feat.iloc[len_val:] for df_feat in df_feature_list]

    df_val_to_be_concatenated_list = [df_val] + df_feat_val_list + [df_val_label]
    df_test_to_be_concatenated_list = [df_test] + df_feat_test_list

    # creating the new validation set on which we will do meta optimization
    df_val = pd.concat(df_val_to_be_concatenated_list, axis=1)
    df_test = pd.concat(df_test_to_be_concatenated_list, axis=1)
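    # df_val and df_test now hold the original features plus one prediction column
    # per base model (df_val additionally carries the target label).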

    # Meta-model stage: split the enriched validation set into meta-train and
    # meta-validation sets. train_test_split shuffles the rows, so both splits
    # are re-sorted by index afterwards.
    df_metatrain, df_metaval = train_test_split(df_val, test_size=0.3)
    df_metatrain.sort_index(inplace=True)
    df_metaval.sort_index(inplace=True)

    # split dataframe columns in train and label
    col_names_list = [df_feat.columns[0] for df_feat in df_feature_list]

    extended_features = features + col_names_list
    df_metatrain_label = df_metatrain[label]
    df_metatrain = df_metatrain[extended_features]

    df_metaval_label = df_metaval[label]
    df_metaval = df_metaval[extended_features]

    model_name = "lightgbm_classifier"
    kind = LABEL

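    # Hyperparameters for the meta-level LightGBM classifier.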
    params = {
        'num_leaves': 544,
        'max_depth': 7,
        'lambda_l1': 50.0,
        'lambda_l2': 2.841130937148593,
        'colsample_bynode': 0.4,
        'colsample_bytree': 1.0,
        'bagging_fraction': 1.0,
        'bagging_freq': 8,
        'min_data_in_leaf': 611,
    }

    LGBM = LightGBM(
        objective='binary',
        num_threads=-1,
        num_iterations=1000,
        early_stopping_rounds=15,
        **params,
    )

    # LGBM Training
    training_start_time = time.time()
    LGBM.fit(X=df_metatrain,
             Y=df_metatrain_label,
             X_val=df_metaval,
             Y_val=df_metaval_label,
             categorical_feature=set())
    print(f"Training time: {time.time() - training_start_time} seconds")

    # LGBM Evaluation
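    # PRAUC is the area under the precision-recall curve; RCE (relative cross
    # entropy) is the percentage improvement in cross entropy over a naive
    # predictor that always outputs the positive rate.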
    evaluation_start_time = time.time()
    prauc, rce, conf, max_pred, min_pred, avg = LGBM.evaluate(
        df_metaval.to_numpy(), df_metaval_label.to_numpy())
    print(
        "Note: local evaluation uses the same split that was used for early stopping, "
        "so these metrics may be optimistic."
    )
    print(f"PRAUC:\t{prauc}")
    print(f"RCE:\t{rce}")
    print(f"TN:\t{conf[0, 0]}")
    print(f"FP:\t{conf[0, 1]}")
    print(f"FN:\t{conf[1, 0]}")
    print(f"TP:\t{conf[1, 1]}")
    print(f"MAX_PRED:\t{max_pred}")
    print(f"MIN_PRED:\t{min_pred}")
    print(f"AVG:\t{avg}")
    print(f"Evaluation time: {time.time() - evaluation_start_time} seconds")

    tweets = Data.get_feature("raw_feature_tweet_id",
                              test_dataset)["raw_feature_tweet_id"].array
    users = Data.get_feature("raw_feature_engager_id",
                             test_dataset)["raw_feature_engager_id"].array

    # LGBM Prediction
    prediction_start_time = time.time()
    predictions = LGBM.get_prediction(df_test.to_numpy())
    print(f"Prediction time: {time.time() - prediction_start_time} seconds")

    # Uncomment to plot feature importance at the end of training
    # LGBM.plot_fimportance()

    create_submission_file(tweets, users, predictions,
                           f"{LABEL}_lgbm_blending_submission.csv")