def create_feature(self):
    """Compute this engagement feature out-of-fold on train, or whole-set on test/val.

    Resolves the paired train/test dataset ids, loads creator/engager ids plus
    the engagement label, and delegates the actual computation to `compute`.
    On a test/val dataset the result is produced by a single pass over the
    positive train rows; on a train dataset a K-fold scheme is used so each
    fold is scored from the remaining folds only.
    """
    # Check if the dataset id is train or test
    if is_test_or_val_set(self.dataset_id):
        train_dataset_id = get_train_set_id_from_test_or_val_set(self.dataset_id)
        test_dataset_id = self.dataset_id
    else:
        train_dataset_id = self.dataset_id
        test_dataset_id = get_test_or_val_set_id_from_train(train_dataset_id)

    import Utils.Data.Data as data
    train_df = data.get_dataset([
        "mapped_feature_creator_id",
        "mapped_feature_engager_id",
        f"tweet_feature_engagement_is_{self._get_suffix()}",
    ], train_dataset_id)

    if is_test_or_val_set(self.dataset_id):
        test_df = data.get_dataset(
            ["mapped_feature_creator_id", "mapped_feature_engager_id"],
            test_dataset_id)
        # Keep only rows whose engagement label is positive.
        train_df = train_df[
            train_df[f"tweet_feature_engagement_is_{self._get_suffix()}"] == True]
        res = compute(train_df, test_df)
        res.sort_index(inplace=True)
        self._save_test_result(res, test_dataset_id)
    else:
        # Shuffle and split into folds; each fold is scored using only the
        # other folds' (positive) rows, so the train feature is out-of-fold.
        X_train_folds = np.array_split(train_df.sample(frac=1),
                                       self.number_of_folds)
        result = None
        for i in range(self.number_of_folds):
            # BUGFIX: the original used `x is not i` (identity comparison),
            # which is only correct for small interned ints; use `!=`.
            local_train = pd.concat([
                X_train_folds[x]
                for x in range(self.number_of_folds)
                if x != i
            ])
            local_train = local_train[local_train[
                f"tweet_feature_engagement_is_{self._get_suffix()}"] == True]
            local_test = X_train_folds[i]
            res = compute(local_train, local_test)
            result = res if result is None else pd.concat([result, res])
        self._save_train_result_if_not_present(result, train_dataset_id)
def prediction(LGBM, dataset_id, df, label):
    """Score *df* with the trained LGBM model and write a submission file."""
    # Identifiers required by the submission-file format.
    tweet_ids = Data.get_feature("raw_feature_tweet_id",
                                 dataset_id)["raw_feature_tweet_id"].array
    user_ids = Data.get_feature("raw_feature_engager_id",
                                dataset_id)["raw_feature_engager_id"].array

    # Predict and report the elapsed wall-clock time.
    started = time.time()
    scores = LGBM.get_prediction(df.to_numpy())
    print(f"Prediction time: {time.time() - started} seconds")

    # Uncomment to plot feature importance at the end of training
    # LGBM.plot_fimportance()

    create_submission_file(
        tweet_ids, user_ids, scores,
        f"{dataset_id}_{label}_lgbm_blending_submission_2.csv")
def get_ensembling_label(label, dataset_id):
    """Load a 30% sample of the binary engagement label column for *label*."""
    from Utils.Data import Data
    column = f"tweet_feature_engagement_is_{label}"
    return Data.get_feature_batch(column,
                                  dataset_id,
                                  total_n_split=1,
                                  split_n=0,
                                  sample=0.3)
def create_feature(self):
    """Save the ratio of this engagement kind over all engagements (pos + neg)."""
    import Utils.Data.Data as data
    count_column = f"number_of_engagements_{self._get_suffix()}"
    df = data.get_dataset([count_column], self.dataset_id)
    support_df = data.get_dataset(
        ["number_of_engagements_positive", "number_of_engagements_negative"],
        self.dataset_id)
    # Denominator: total engagements of either polarity.
    df['total'] = (support_df["number_of_engagements_positive"]
                   + support_df["number_of_engagements_negative"])
    result = pd.DataFrame(df[count_column] / df["total"])
    # 0/0 yields NaN and x/0 yields +/-inf in pandas: map both to 0.
    result.fillna(0, inplace=True)
    result.replace([np.inf, -np.inf], 0, inplace=True)
    self.save_feature(result)
def fit(self, X=None, Y=None):
    """Train the XGBoost model in one shot, or continue the batch model."""
    # Fall back to the default train set when data is not supplied.
    if X is None or Y is None:
        X, Y = Data.get_dataset_xgb_default_train()
        print("Train set loaded from file.")

    # Both branches train on the same DMatrix with the same round count.
    dtrain = xgb.DMatrix(X, label=Y)
    rounds = math.ceil(self.num_rounds)

    if self.batch is False:
        # Single-round learning: fit a fresh booster.
        self.sround_model = xgb.train(self.get_param_dict(),
                                      dtrain=dtrain,
                                      num_boost_round=rounds)
    else:
        # Batch learning: resume from the previous batch model (None on the
        # first call, which makes xgb.train start from scratch).
        self.batch_model = xgb.train(self.get_param_dict(),
                                     dtrain=dtrain,
                                     num_boost_round=rounds,
                                     xgb_model=self.batch_model)
def evaluate(self, X_tst=None, Y_tst=None):
    """Evaluate the trained model on a test set.

    Loads the default test set when X_tst/Y_tst are not passed.

    Returns:
        The multiclass metrics produced by `CoMe.compute_multiclass()`,
        or None when no model has been trained yet (original behavior).
    """
    # Tries to load X and Y if not directly passed
    if (X_tst is None) or (Y_tst is None):
        X_tst, Y_tst = Data.get_dataset_xgb_default_test()
        print("Test set loaded from file.")

    if (self.sround_model is None) and (self.batch_model is None):
        # Preserve the original best-effort behavior: report and return None.
        print("No model trained yet.")
        return None

    # get_prediction already selects the single-round or batch model, so no
    # selection is needed here. (The original assigned a `model` local from
    # that selection and never used it — dead code, removed.)
    Y_pred = self.get_prediction(X_tst)

    # Declaring the class containing the metrics.
    cm = CoMe(Y_pred, Y_tst)
    # Evaluating
    return cm.compute_multiclass()
def get_popularity(self):
    """Return per-row hashtag popularity, computing and caching it on first use.

    The computed result is persisted at ``self.popularity_path`` with
    ``np.save`` and simply reloaded on subsequent calls.
    """
    import Utils.Data.Data as data
    # Fast path: a previous run already materialized the result.
    if self.popularity_path.is_file():
        return np.load(self.popularity_path, allow_pickle=True)
    else:
        x = data.get_dataset(
            [
                "mapped_feature_tweet_id",
                "mapped_feature_tweet_hashtags",
                "raw_feature_tweet_timestamp"
            ],
            self.dataset_id
        )
        x.columns = ["tweet", "hashtags", "time"]
        # One row per tweet, time-indexed and sorted so the slicing below
        # produces contiguous time windows.
        x = x.drop_duplicates("tweet")
        x = x.set_index('time', drop=True)
        x = x.sort_index()
        # Group size
        n = self.window_size
        # Overlapping size
        m = self.window_overlap
        # Sliding windows of n rows advancing by (n - m) rows, so consecutive
        # windows overlap by m rows.
        # NOTE(review): assumes window_overlap < window_size, otherwise the
        # step is <= 0 — confirm against the constructor.
        chunks = [x[i:i + n] for i in range(0, len(x), n - m)]
        # Each window is processed in parallel by the external compute_chunk.
        result = process_map(compute_chunk, chunks)
        # First element of each per-chunk result; semantics defined by
        # compute_chunk (opaque from here).
        s = [r[0] for r in result]
        # Reload the full table (duplicates included): popularity is looked
        # up for every row, not only unique tweets.
        y = data.get_dataset(
            [
                "mapped_feature_tweet_id",
                "mapped_feature_tweet_hashtags",
                "raw_feature_tweet_timestamp"
            ],
            self.dataset_id
        )
        y.columns = ["tweet", "hashtags", "time"]
        # `get_popularity` here resolves to the module-level helper function
        # (not this method), bound to the chunk results and mapped in
        # parallel over 100 splits of the table.
        get_popularity_partial = functools.partial(get_popularity,
                                                   result=result,
                                                   s=s)
        popularity = pd.concat(
            process_map(get_popularity_partial, np.array_split(y, 100)))
        # Cache the result for later calls.
        self.popularity_path.parent.mkdir(parents=True, exist_ok=True)
        np.save(self.popularity_path, popularity, allow_pickle=True)
        return popularity
def fit(self, X=None, Y=None, X_valid=None, Y_valid=None):
    """Train the XGBoost model, optionally with early stopping.

    Args:
        X, Y: training features/labels; loaded from the default train set
            when omitted.
        X_valid, Y_valid: optional validation features/labels; when omitted,
            early stopping is disabled.

    Single-round mode stores the booster in ``self.sround_model``; batch mode
    resumes from and re-saves ``self.previous_model_path``.
    """
    # Tries to load X and Y if not directly passed
    if (X is None) or (Y is None):
        X, Y = Data.get_dataset_xgb_default_train()
        print("Train set loaded from file.")

    # In case validation set is not provided set early stopping rounds to default
    if (X_valid is None) or (Y_valid is None):
        self.early_stopping_rounds = None
        valid = []
    else:
        # BUGFIX: xgb.train expects `evals` to be a list of (DMatrix, name)
        # pairs; the original passed a bare DMatrix, which xgboost rejects,
        # so early stopping could never actually run.
        valid = [(xgb.DMatrix(X_valid, label=Y_valid), "valid")]

    # Transforming matrices in DMatrix type (shared by both branches)
    train = xgb.DMatrix(X, label=Y)

    # Learning in a single round
    if self.batch is False:
        # Defining and fitting the model
        self.sround_model = xgb.train(
            self.get_param_dict(),
            early_stopping_rounds=self.early_stopping_rounds,
            evals=valid,
            dtrain=train,
            num_boost_round=math.ceil(self.num_rounds))
    # Learning by consecutive batches
    else:
        # if we want to start from a model already saved
        if os.path.exists(self.previous_model_path):
            # Continue training from the serialized booster, then re-save it.
            model = xgb.train(
                self.get_param_dict(),
                early_stopping_rounds=self.early_stopping_rounds,
                evals=valid,
                dtrain=train,
                xgb_model=self.previous_model_path)
            os.remove(self.previous_model_path)
            model.save_model(self.previous_model_path)
            del model
        # if we have no model saved
        else:
            model = xgb.train(
                self.get_param_dict(),
                early_stopping_rounds=self.early_stopping_rounds,
                evals=valid,
                dtrain=train)
            model.save_model(self.previous_model_path)
            del model
def get_prediction(self, X_tst=None):
    """Predict on X_tst (default test set when omitted); None if untrained."""
    # Fall back to the default test set when no features are supplied.
    if X_tst is None:
        X_tst, _ = Data.get_dataset_xgb_default_test()
        print("Test set loaded from file.")

    # Without any trained model there is nothing to predict.
    if self.sround_model is None and self.batch_model is None:
        print("No model trained yet.")
        return None

    # Wrap the features in a DMatrix and score them with the active model.
    return self.get_model().predict(xgb.DMatrix(X_tst))
def get_prediction(self, X_tst=None):
    """Predict with the single-round or batch model; None if untrained."""
    # Fall back to the default test set when no features are supplied.
    if X_tst is None:
        X_tst, _ = Data.get_dataset_xgb_default_test()
        print("Test set loaded from file.")

    # Without any trained model there is nothing to predict.
    if self.sround_model is None and self.batch_model is None:
        print("No model trained yet.")
        return None

    # Select the model matching how this instance was trained
    # (batch/single round, per the initial declaration).
    active = self.sround_model if self.batch is False else self.batch_model
    return active.predict(xgb.DMatrix(X_tst))
def evaluate(self, X_tst=None, Y_tst=None):
    """Evaluate the model on a test set.

    Returns (prauc, rce, confusion_matrix, max_pred, min_pred, avg_pred),
    or None when no model is available.
    """
    Y_pred = None
    # Load the default test set when none is supplied.
    if X_tst is None or Y_tst is None:
        X_tst, Y_tst = Data.get_dataset_xgb_default_test()
        print("Test set loaded from file.")

    # Flatten the label frame's first column into a float ndarray.
    Y_tst = np.array(Y_tst[Y_tst.columns[0]].astype(float))

    if self.sround_model is None and not os.path.exists(self.previous_model_path):
        print("No model trained yet.")
    else:
        # Selecting the coherent model for the evaluation.
        # NOTE(review): `model` is never used below — get_prediction performs
        # its own selection; kept in case get_model() has loading side
        # effects. Confirm and drop if it does not.
        model = self.get_model()

        Y_pred = self.get_prediction(X_tst)

        # Metrics helper over predictions vs. ground truth.
        cm = CoMe(Y_pred, Y_tst)
        prauc = cm.compute_prauc()
        rce = cm.compute_rce()
        # Confusion matrix
        conf = confMatrix(Y_tst, Y_pred)
        # Summary statistics of the raw prediction scores.
        return prauc, rce, conf, max(Y_pred), min(Y_pred), np.mean(Y_pred)
def create_feature(self):
    """Create the XGB fold-ensembling feature for this dataset.

    Train case: out-of-fold predictions via 4-fold CV over the train set
    (each fold is scored by a model fitted on a subsample of the other
    folds). Test/val case: one model trained on a 5% sample of the paired
    train set scores the whole test/val set.
    """
    # Check if the dataset id is train or test
    if not is_test_or_val_set(self.dataset_id):
        # Compute train and test dataset ids
        train_dataset_id = self.dataset_id
        # Load the dataset and shuffle it
        import Utils.Data.Data as data
        X_train = data.get_dataset(features=self.features,
                                   dataset_id=train_dataset_id,
                                   nthread=64)
        print(X_train)
        print(X_train.memory_usage())
        Y_train = data.get_dataset(features=self.label,
                                   dataset_id=train_dataset_id,
                                   nthread=64)
        print(Y_train)
        print(Y_train.memory_usage())
        # Declare list of scores (of each folds)
        # used for aggregating results
        scores = []
        kf = KFold(n_splits=4, shuffle=True, random_state=8)
        # Train multiple models with 1-fold out strategy
        for train_index, test_index in kf.split(X_train):
            # Subsample ~5% of the fold's training indices *with replacement*.
            # NOTE(review): replace=True allows duplicated rows — confirm this
            # is intentional rather than a typo for replace=False.
            train_index = np.random.choice(train_index,
                                           int(len(train_index) / 20),
                                           replace=True)
            local_X_train = X_train.iloc[train_index]
            local_Y_train = Y_train.iloc[train_index]
            # Compute the test set
            local_X_test = X_train.iloc[test_index]
            # Generate the dataset id for this fold
            fold_dataset_id = f"{self.feature_name}_{self.dataset_id}_fold_{len(scores)}"
            # Create the sub-feature
            feature = XGBEnsembling(fold_dataset_id, local_X_train,
                                    local_Y_train, local_X_test,
                                    self.param_dict)
            # Retrieve the scores (load_or_create caches per fold_dataset_id)
            scores.append(
                pd.DataFrame(feature.load_or_create(),
                             index=local_X_test.index))
            print(scores)
        # Compute the resulting dataframe and sort the results
        result = pd.concat(scores).sort_index()
        # Save it as a feature
        self.save_feature(result)
    else:
        test_dataset_id = self.dataset_id
        train_dataset_id = get_train_set_id_from_test_or_val_set(
            test_dataset_id)
        # Load the train dataset (only a 5% sample is used here)
        import Utils.Data.Data as data
        X_train = data.get_dataset_batch(features=self.features,
                                         dataset_id=train_dataset_id,
                                         total_n_split=1,
                                         split_n=0,
                                         sample=0.05)
        Y_train = data.get_dataset_batch(features=self.label,
                                         dataset_id=train_dataset_id,
                                         total_n_split=1,
                                         split_n=0,
                                         sample=0.05)
        # Load the test dataset
        X_test = data.get_dataset(features=self.features,
                                  dataset_id=test_dataset_id,
                                  nthread=64)
        fold_dataset_id = f"{self.feature_name}_{self.dataset_id}"
        # Create the sub-feature
        feature = XGBEnsembling(fold_dataset_id, X_train, Y_train, X_test,
                                self.param_dict)
        # Retrieve the scores
        result = pd.DataFrame(feature.load_or_create(), index=X_test.index)
        # Save it as a feature
        self.save_feature(result)
def main():
    """Blending meta-model pipeline for one engagement label.

    Reads the target label from the command line, computes LGBM blending
    features and XGB fold-ensembling features over the validation + test
    sets, optionally adds NN prediction features, then builds a meta
    train/val split and runs the project `Optimizer` over a LightGBM
    classifier.
    """
    # Instantiate the parser
    parser = argparse.ArgumentParser()
    parser.add_argument('label', type=str, help='required argument: label')
    args = parser.parse_args()

    nn_labels = ["like", "reply", "retweet", "comment"]

    LABEL = args.label
    assert LABEL in ["like", "reply", "retweet", "comment"], "LABEL not valid."
    print(f"label is {LABEL}")

    # Base (non-blending) feature columns fed to every model.
    features = [
        "raw_feature_creator_follower_count",
        "raw_feature_creator_following_count",
        "raw_feature_engager_follower_count",
        "raw_feature_engager_following_count",
        "raw_feature_creator_is_verified",
        "raw_feature_engager_is_verified",
        "raw_feature_engagement_creator_follows_engager",
        "tweet_feature_number_of_photo",
        "tweet_feature_number_of_video",
        "tweet_feature_number_of_gif",
        "tweet_feature_number_of_media",
        "tweet_feature_is_retweet",
        "tweet_feature_is_quote",
        "tweet_feature_is_top_level",
        "tweet_feature_number_of_hashtags",
        "tweet_feature_creation_timestamp_hour",
        "tweet_feature_creation_timestamp_week_day",
        # "tweet_feature_number_of_mentions",
        "tweet_feature_token_length",
        "tweet_feature_token_length_unique",
        "tweet_feature_text_topic_word_count_adult_content",
        "tweet_feature_text_topic_word_count_kpop",
        "tweet_feature_text_topic_word_count_covid",
        "tweet_feature_text_topic_word_count_sport",
        "number_of_engagements_with_language_like",
        "number_of_engagements_with_language_retweet",
        "number_of_engagements_with_language_reply",
        "number_of_engagements_with_language_comment",
        "number_of_engagements_with_language_negative",
        "number_of_engagements_with_language_positive",
        "number_of_engagements_ratio_like",
        "number_of_engagements_ratio_retweet",
        "number_of_engagements_ratio_reply",
        "number_of_engagements_ratio_comment",
        "number_of_engagements_ratio_negative",
        "number_of_engagements_ratio_positive",
        "number_of_engagements_between_creator_and_engager_like",
        "number_of_engagements_between_creator_and_engager_retweet",
        "number_of_engagements_between_creator_and_engager_reply",
        "number_of_engagements_between_creator_and_engager_comment",
        "number_of_engagements_between_creator_and_engager_negative",
        "number_of_engagements_between_creator_and_engager_positive",
        "creator_feature_number_of_like_engagements_received",
        "creator_feature_number_of_retweet_engagements_received",
        "creator_feature_number_of_reply_engagements_received",
        "creator_feature_number_of_comment_engagements_received",
        "creator_feature_number_of_negative_engagements_received",
        "creator_feature_number_of_positive_engagements_received",
        "creator_feature_number_of_like_engagements_given",
        "creator_feature_number_of_retweet_engagements_given",
        "creator_feature_number_of_reply_engagements_given",
        "creator_feature_number_of_comment_engagements_given",
        "creator_feature_number_of_negative_engagements_given",
        "creator_feature_number_of_positive_engagements_given",
        "engager_feature_number_of_like_engagements_received",
        "engager_feature_number_of_retweet_engagements_received",
        "engager_feature_number_of_reply_engagements_received",
        "engager_feature_number_of_comment_engagements_received",
        "engager_feature_number_of_negative_engagements_received",
        "engager_feature_number_of_positive_engagements_received",
        "number_of_engagements_like",
        "number_of_engagements_retweet",
        "number_of_engagements_reply",
        "number_of_engagements_comment",
        "number_of_engagements_negative",
        "number_of_engagements_positive",
        "engager_feature_number_of_previous_like_engagement",
        "engager_feature_number_of_previous_reply_engagement",
        "engager_feature_number_of_previous_retweet_engagement",
        "engager_feature_number_of_previous_comment_engagement",
        "engager_feature_number_of_previous_positive_engagement",
        "engager_feature_number_of_previous_negative_engagement",
        "engager_feature_number_of_previous_engagement",
        "engager_feature_number_of_previous_like_engagement_ratio_1",
        "engager_feature_number_of_previous_reply_engagement_ratio_1",
        "engager_feature_number_of_previous_retweet_engagement_ratio_1",
        "engager_feature_number_of_previous_comment_engagement_ratio_1",
        "engager_feature_number_of_previous_positive_engagement_ratio_1",
        "engager_feature_number_of_previous_negative_engagement_ratio_1",
        "engager_feature_number_of_previous_like_engagement_ratio",
        "engager_feature_number_of_previous_reply_engagement_ratio",
        "engager_feature_number_of_previous_retweet_engagement_ratio",
        "engager_feature_number_of_previous_comment_engagement_ratio",
        "engager_feature_number_of_previous_positive_engagement_ratio",
        "engager_feature_number_of_previous_negative_engagement_ratio",
        "engager_feature_number_of_previous_like_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_reply_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_retweet_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_comment_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_negative_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_positive_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_like_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_reply_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_retweet_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_comment_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_negative_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_positive_engagement_between_creator_and_engager_by_engager",
        # "tweet_feature_number_of_previous_like_engagements",
        # "tweet_feature_number_of_previous_reply_engagements",
        # "tweet_feature_number_of_previous_retweet_engagements",
        # "tweet_feature_number_of_previous_comment_engagements",
        # "tweet_feature_number_of_previous_positive_engagements",
        # "tweet_feature_number_of_previous_negative_engagements",
        "creator_feature_number_of_previous_like_engagements_given",
        "creator_feature_number_of_previous_reply_engagements_given",
        "creator_feature_number_of_previous_retweet_engagements_given",
        "creator_feature_number_of_previous_comment_engagements_given",
        "creator_feature_number_of_previous_positive_engagements_given",
        "creator_feature_number_of_previous_negative_engagements_given",
        "creator_feature_number_of_previous_like_engagements_received",
        "creator_feature_number_of_previous_reply_engagements_received",
        "creator_feature_number_of_previous_retweet_engagements_received",
        "creator_feature_number_of_previous_comment_engagements_received",
        "creator_feature_number_of_previous_positive_engagements_received",
        "creator_feature_number_of_previous_negative_engagements_received",
        "engager_feature_number_of_previous_like_engagement_with_language",
        "engager_feature_number_of_previous_reply_engagement_with_language",
        "engager_feature_number_of_previous_retweet_engagement_with_language",
        "engager_feature_number_of_previous_comment_engagement_with_language",
        "engager_feature_number_of_previous_positive_engagement_with_language",
        "engager_feature_number_of_previous_negative_engagement_with_language",
        "engager_feature_knows_hashtag_positive",
        "engager_feature_knows_hashtag_negative",
        "engager_feature_knows_hashtag_like",
        "engager_feature_knows_hashtag_reply",
        "engager_feature_knows_hashtag_rt",
        "engager_feature_knows_hashtag_comment",
        "creator_and_engager_have_same_main_language",
        "is_tweet_in_creator_main_language",
        "is_tweet_in_engager_main_language",
        # "statistical_probability_main_language_of_engager_engage_tweet_language_1",
        # "statistical_probability_main_language_of_engager_engage_tweet_language_2",
        "creator_and_engager_have_same_main_grouped_language",
        "is_tweet_in_creator_main_grouped_language",
        "is_tweet_in_engager_main_grouped_language",
        # # "hashtag_similarity_fold_ensembling_positive",
        # # "link_similarity_fold_ensembling_positive",
        # # "domain_similarity_fold_ensembling_positive"
        "tweet_feature_creation_timestamp_hour_shifted",
        "tweet_feature_creation_timestamp_day_phase",
        "tweet_feature_creation_timestamp_day_phase_shifted"
    ]

    # Target column for the chosen engagement kind.
    label = [f"tweet_feature_engagement_is_{LABEL}"]

    train_dataset = "cherry_train"
    val_dataset = "cherry_val"
    test_dataset = "new_test"

    # Which auxiliary labels get blended in for each target label.
    ensembling_list_dict = {
        'like': [],
        'reply': ['reply', 'retweet', 'comment'],
        'retweet': ['reply', 'retweet', 'comment'],
        'comment': ['reply', 'retweet', 'comment'],
    }
    ensembling_list = ensembling_list_dict[LABEL]

    ensembling_lgbm_params = {}
    ensembling_xgb_params = {}
    for ens_label in ensembling_list:
        ensembling_lgbm_params[ens_label], ensembling_xgb_params[ens_label]\
            = params_by_label(ens_label)

    categorical_features_set = set([])

    # Load train data
    # loading_data_start_time = time.time()
    # df_train, df_train_label = Data.get_dataset_xgb(train_dataset, features, label)
    # print(f"Loading train data time: {loading_data_start_time - time.time()} seconds")

    # Load val data
    df_val, df_val_label = Data.get_dataset_xgb(val_dataset, features, label)

    # Load test data
    df_test = Data.get_dataset(features, test_dataset)

    # Shift the test index past the validation rows so val + test can be
    # concatenated without index collisions.
    new_index = pd.Series(df_test.index).map(lambda x: x + len(df_val))
    df_test.set_index(new_index, inplace=True)

    # df to be predicted by the lgbm blending feature
    df_to_predict = pd.concat([df_val, df_test])

    # BLENDING FEATURE DECLARATION

    feature_list = []

    # Empty placeholders: the real (sampled) train set is loaded below only
    # if at least one blending feature still has to be computed.
    df_train = pd.DataFrame(columns=features)
    df_train_label = pd.DataFrame(columns=label)
    need_to_load_train_set = False
    for ens_label in ensembling_list:
        lgbm_params = ensembling_lgbm_params[ens_label]
        for lgbm_param_dict in lgbm_params:
            start_time = time.time()
            if not LGBMEnsemblingFeature(
                    dataset_id=train_dataset,
                    df_train=df_train,
                    df_train_label=get_ensembling_label(
                        ens_label, train_dataset),
                    df_to_predict=df_to_predict,
                    param_dict=lgbm_param_dict,
                    categorical_features_set=categorical_features_set
            ).has_feature():
                print(f"{ens_label} {lgbm_param_dict}")
                need_to_load_train_set = True

    if need_to_load_train_set:
        df_train, df_train_label = get_dataset_xgb_batch(
            total_n_split=1,
            split_n=0,
            dataset_id=train_dataset,
            X_label=features,
            Y_label=label,
            sample=0.3)

    for ens_label in ensembling_list:
        lgbm_params = ensembling_lgbm_params[ens_label]
        for lgbm_param_dict in lgbm_params:
            start_time = time.time()
            feature_list.append(
                LGBMEnsemblingFeature(
                    dataset_id=train_dataset,
                    df_train=df_train,
                    df_train_label=get_ensembling_label(
                        ens_label, train_dataset),
                    df_to_predict=df_to_predict,
                    param_dict=lgbm_param_dict,
                    categorical_features_set=categorical_features_set))
            print(f"time: {time.time()-start_time}")

    del df_train, df_train_label

    # NEW PART
    # ONLY THIS PART IS NEW
    # LOAD THIS PART FIRST
    df_feature_list = [x.load_or_create() for x in feature_list]

    # Append the XGB fold-ensembling feature (val + shifted test) for each
    # auxiliary label.
    for ens_label in ensembling_list:
        start_time = time.time()
        if ens_label == "like":
            val_features_df = XGBFoldEnsemblingLike2(
                val_dataset).load_or_create()
            test_features_df = XGBFoldEnsemblingLike2(
                test_dataset).load_or_create()
        elif ens_label == "retweet":
            val_features_df = XGBFoldEnsemblingRetweet2(
                val_dataset).load_or_create()
            test_features_df = XGBFoldEnsemblingRetweet2(
                test_dataset).load_or_create()
        elif ens_label == "reply":
            val_features_df = XGBFoldEnsemblingReply2(
                val_dataset).load_or_create()
            test_features_df = XGBFoldEnsemblingReply2(
                test_dataset).load_or_create()
        elif ens_label == "comment":
            val_features_df = XGBFoldEnsemblingComment2(
                val_dataset).load_or_create()
            test_features_df = XGBFoldEnsemblingComment2(
                test_dataset).load_or_create()
        else:
            assert False, "oh oh something went wrong. label not found"

        test_features_df.set_index(new_index, inplace=True)
        xgb_feature_df = pd.concat([val_features_df, test_features_df])
        df_feature_list.append(xgb_feature_df)
        print(f"time: {time.time() - start_time}")
        del val_features_df, test_features_df

    # check dimensions
    len_val = len(df_val)
    for df_feat in df_feature_list:
        assert len(df_feat) == (len_val + len(df_test)), \
            f"Blending features are not of dimension expected, len val: {len_val} len test: {len(df_test)}\n " \
            f"obtained len: {len(df_feat)} of {df_feat.columns[0]}\n"

    # split feature dataframe in validation and testing
    df_feat_val_list = [df_feat.iloc[:len_val] for df_feat in df_feature_list]
    #df_feat_test_list = [df_feat.iloc[len_val:] for df_feat in df_feature_list]

    # NN prediction features (two versions) for the validation rows only.
    df_feat_nn_val_list_1 = [
        get_nn_prediction(l, 1, val_dataset) for l in nn_labels
    ]
    df_feat_nn_val_list_2 = [
        get_nn_prediction(l, 2, val_dataset) for l in nn_labels
    ]
    df_feat_val_list += df_feat_nn_val_list_1 + df_feat_nn_val_list_2

    df_to_be_concatenated_list = [df_val] + df_feat_val_list + [df_val_label]

    # creating the new validation set on which we will do meta optimization
    df_val = pd.concat(df_to_be_concatenated_list, axis=1)

    # now we are in full meta-model mode
    # watchout! they are unsorted now, you got to re-sort the dfs
    df_metatrain, df_metaval = train_test_split(df_val, test_size=0.2)
    df_metatrain.sort_index(inplace=True)
    df_metaval.sort_index(inplace=True)

    # split dataframe columns in train and label
    col_names_list = [df_feat.columns[0] for df_feat in df_feature_list]
    extended_features = features + col_names_list
    df_metatrain_label = df_metatrain[label]
    df_metatrain = df_metatrain[extended_features]
    df_metaval_label = df_metaval[label]
    df_metaval = df_metaval[extended_features]

    model_name = "lightgbm_classifier"
    kind = LABEL

    OP = Optimizer(model_name,
                   kind,
                   mode=0,
                   path=LABEL,
                   path_log=f"blending-lgbm-{LABEL}-twonn-reg",
                   make_log=True,
                   make_save=False,
                   auto_save=False)
    OP.setParameters(n_calls=100, n_random_starts=30)
    OP.loadTrainData(df_metatrain, df_metatrain_label)
    OP.loadValData(df_metaval, df_metaval_label)  # early stopping
    OP.loadTestData(df_metaval, df_metaval_label)  # evaluate objective
    OP.setParamsLGB(objective='binary',
                    early_stopping_rounds=10,
                    eval_metric="binary",
                    is_unbalance=False)
    OP.setCategoricalFeatures(categorical_features_set)
    # OP.loadModelHardCoded()
    res = OP.optimize()
def get_ensembling_label(label, dataset_id):
    """Fetch the full binary engagement label column used as ensembling target."""
    from Utils.Data import Data
    column = f"tweet_feature_engagement_is_{label}"
    return Data.get_feature(column, dataset_id)
def main():
    """Train a LightGBM meta-model over blended base-model predictions and
    write submission files for both the public ("new_test") and private
    ("last_test") test sets.

    Pipeline (as visible in this function):
      1. Parse the target engagement label from the command line.
      2. Load validation / public-test / private-test feature frames and
         shift test indices so the three frames can share one index space.
      3. Build LGBM ensembling blending features (training data is only
         loaded if at least one blending feature is missing from cache).
      4. Add XGB fold-ensembling features and NN predictions as further
         meta-features.
      5. Meta train/val split, LightGBM training, evaluation printout.
      6. Predict on public and private test sets via ``prediction``.
    """
    # Instantiate the parser
    parser = argparse.ArgumentParser()
    parser.add_argument('label', type=str, help='required argument: label')
    args = parser.parse_args()

    nn_labels = ["like", "reply", "retweet", "comment"]

    LABEL = args.label
    assert LABEL in ["like", "reply", "retweet", "comment"], "LABEL not valid."
    print(f"label is {LABEL}")

    # Raw + engineered feature columns fed to the base and meta models.
    features = [
        "raw_feature_creator_follower_count",
        "raw_feature_creator_following_count",
        "raw_feature_engager_follower_count",
        "raw_feature_engager_following_count",
        "raw_feature_creator_is_verified",
        "raw_feature_engager_is_verified",
        "raw_feature_engagement_creator_follows_engager",
        "tweet_feature_number_of_photo",
        "tweet_feature_number_of_video",
        "tweet_feature_number_of_gif",
        "tweet_feature_number_of_media",
        "tweet_feature_is_retweet",
        "tweet_feature_is_quote",
        "tweet_feature_is_top_level",
        "tweet_feature_number_of_hashtags",
        "tweet_feature_creation_timestamp_hour",
        "tweet_feature_creation_timestamp_week_day",
        # "tweet_feature_number_of_mentions",
        "tweet_feature_token_length",
        "tweet_feature_token_length_unique",
        "tweet_feature_text_topic_word_count_adult_content",
        "tweet_feature_text_topic_word_count_kpop",
        "tweet_feature_text_topic_word_count_covid",
        "tweet_feature_text_topic_word_count_sport",
        "number_of_engagements_with_language_like",
        "number_of_engagements_with_language_retweet",
        "number_of_engagements_with_language_reply",
        "number_of_engagements_with_language_comment",
        "number_of_engagements_with_language_negative",
        "number_of_engagements_with_language_positive",
        "number_of_engagements_ratio_like",
        "number_of_engagements_ratio_retweet",
        "number_of_engagements_ratio_reply",
        "number_of_engagements_ratio_comment",
        "number_of_engagements_ratio_negative",
        "number_of_engagements_ratio_positive",
        "number_of_engagements_between_creator_and_engager_like",
        "number_of_engagements_between_creator_and_engager_retweet",
        "number_of_engagements_between_creator_and_engager_reply",
        "number_of_engagements_between_creator_and_engager_comment",
        "number_of_engagements_between_creator_and_engager_negative",
        "number_of_engagements_between_creator_and_engager_positive",
        "creator_feature_number_of_like_engagements_received",
        "creator_feature_number_of_retweet_engagements_received",
        "creator_feature_number_of_reply_engagements_received",
        "creator_feature_number_of_comment_engagements_received",
        "creator_feature_number_of_negative_engagements_received",
        "creator_feature_number_of_positive_engagements_received",
        "creator_feature_number_of_like_engagements_given",
        "creator_feature_number_of_retweet_engagements_given",
        "creator_feature_number_of_reply_engagements_given",
        "creator_feature_number_of_comment_engagements_given",
        "creator_feature_number_of_negative_engagements_given",
        "creator_feature_number_of_positive_engagements_given",
        "engager_feature_number_of_like_engagements_received",
        "engager_feature_number_of_retweet_engagements_received",
        "engager_feature_number_of_reply_engagements_received",
        "engager_feature_number_of_comment_engagements_received",
        "engager_feature_number_of_negative_engagements_received",
        "engager_feature_number_of_positive_engagements_received",
        "number_of_engagements_like",
        "number_of_engagements_retweet",
        "number_of_engagements_reply",
        "number_of_engagements_comment",
        "number_of_engagements_negative",
        "number_of_engagements_positive",
        "engager_feature_number_of_previous_like_engagement",
        "engager_feature_number_of_previous_reply_engagement",
        "engager_feature_number_of_previous_retweet_engagement",
        "engager_feature_number_of_previous_comment_engagement",
        "engager_feature_number_of_previous_positive_engagement",
        "engager_feature_number_of_previous_negative_engagement",
        "engager_feature_number_of_previous_engagement",
        "engager_feature_number_of_previous_like_engagement_ratio_1",
        "engager_feature_number_of_previous_reply_engagement_ratio_1",
        "engager_feature_number_of_previous_retweet_engagement_ratio_1",
        "engager_feature_number_of_previous_comment_engagement_ratio_1",
        "engager_feature_number_of_previous_positive_engagement_ratio_1",
        "engager_feature_number_of_previous_negative_engagement_ratio_1",
        "engager_feature_number_of_previous_like_engagement_ratio",
        "engager_feature_number_of_previous_reply_engagement_ratio",
        "engager_feature_number_of_previous_retweet_engagement_ratio",
        "engager_feature_number_of_previous_comment_engagement_ratio",
        "engager_feature_number_of_previous_positive_engagement_ratio",
        "engager_feature_number_of_previous_negative_engagement_ratio",
        "engager_feature_number_of_previous_like_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_reply_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_retweet_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_comment_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_negative_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_positive_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_like_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_reply_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_retweet_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_comment_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_negative_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_positive_engagement_between_creator_and_engager_by_engager",
        # "tweet_feature_number_of_previous_like_engagements",
        # "tweet_feature_number_of_previous_reply_engagements",
        # "tweet_feature_number_of_previous_retweet_engagements",
        # "tweet_feature_number_of_previous_comment_engagements",
        # "tweet_feature_number_of_previous_positive_engagements",
        # "tweet_feature_number_of_previous_negative_engagements",
        "creator_feature_number_of_previous_like_engagements_given",
        "creator_feature_number_of_previous_reply_engagements_given",
        "creator_feature_number_of_previous_retweet_engagements_given",
        "creator_feature_number_of_previous_comment_engagements_given",
        "creator_feature_number_of_previous_positive_engagements_given",
        "creator_feature_number_of_previous_negative_engagements_given",
        "creator_feature_number_of_previous_like_engagements_received",
        "creator_feature_number_of_previous_reply_engagements_received",
        "creator_feature_number_of_previous_retweet_engagements_received",
        "creator_feature_number_of_previous_comment_engagements_received",
        "creator_feature_number_of_previous_positive_engagements_received",
        "creator_feature_number_of_previous_negative_engagements_received",
        "engager_feature_number_of_previous_like_engagement_with_language",
        "engager_feature_number_of_previous_reply_engagement_with_language",
        "engager_feature_number_of_previous_retweet_engagement_with_language",
        "engager_feature_number_of_previous_comment_engagement_with_language",
        "engager_feature_number_of_previous_positive_engagement_with_language",
        "engager_feature_number_of_previous_negative_engagement_with_language",
        "engager_feature_knows_hashtag_positive",
        "engager_feature_knows_hashtag_negative",
        "engager_feature_knows_hashtag_like",
        "engager_feature_knows_hashtag_reply",
        "engager_feature_knows_hashtag_rt",
        "engager_feature_knows_hashtag_comment",
        "creator_and_engager_have_same_main_language",
        "is_tweet_in_creator_main_language",
        "is_tweet_in_engager_main_language",
        # "statistical_probability_main_language_of_engager_engage_tweet_language_1",
        # "statistical_probability_main_language_of_engager_engage_tweet_language_2",
        "creator_and_engager_have_same_main_grouped_language",
        "is_tweet_in_creator_main_grouped_language",
        "is_tweet_in_engager_main_grouped_language",
        # "hashtag_similarity_fold_ensembling_positive",
        # "link_similarity_fold_ensembling_positive",
        # "domain_similarity_fold_ensembling_positive"
        "tweet_feature_creation_timestamp_hour_shifted",
        "tweet_feature_creation_timestamp_day_phase",
        "tweet_feature_creation_timestamp_day_phase_shifted"
    ]

    # Single-column target; kept as a list so it can be used as a column selector.
    label = [f"tweet_feature_engagement_is_{LABEL}"]

    train_dataset = "cherry_train"
    val_dataset = "cherry_val"
    test_dataset = "new_test"
    private_test_dataset = "last_test"

    # Every label blends the same three base targets (like is intentionally absent).
    ensembling_list_dict = {
        'like': ['reply', 'retweet', 'comment'],
        'reply': ['reply', 'retweet', 'comment'],
        'retweet': ['reply', 'retweet', 'comment'],
        'comment': ['reply', 'retweet', 'comment'],
    }
    ensembling_list = ensembling_list_dict[LABEL]

    ensembling_lgbm_params = {}
    ensembling_xgb_params = {}
    for ens_label in ensembling_list:
        ensembling_lgbm_params[ens_label], ensembling_xgb_params[ens_label] \
            = params_by_label(ens_label)

    categorical_features_set = set([])

    # Load train data
    # loading_data_start_time = time.time()
    # df_train, df_train_label = Data.get_dataset_xgb(train_dataset, features, label)
    # print(f"Loading train data time: {loading_data_start_time - time.time()} seconds")

    # Load val data
    df_val, df_val_label = Data.get_dataset_xgb(val_dataset, features, label)

    # Load test data. The test/private indices are shifted past the end of the
    # previous frame(s) so val/test/private occupy disjoint index ranges and
    # can be concatenated, then split back apart by position later.
    df_test = Data.get_dataset(features, test_dataset)

    df_private = Data.get_dataset(features, private_test_dataset)

    new_index = pd.Series(df_test.index).map(lambda x: x + len(df_val))
    df_test.set_index(new_index, inplace=True)

    new_index_private = pd.Series(
        df_private.index).map(lambda x: x + len(df_val) + len(df_test))
    df_private.set_index(new_index_private, inplace=True)

    # df to be predicted by the lgbm blending feature
    df_to_predict = pd.concat([df_val, df_test, df_private])

    # BLENDING FEATURE DECLARATION
    feature_list = []

    # NEW CODE ADDED
    # Start with empty train frames; the real (sampled) training data is only
    # loaded if at least one blending feature is missing from its cache.
    df_train = pd.DataFrame(columns=features)
    df_train_label = pd.DataFrame(columns=label)

    need_to_load_train_set = False
    for ens_label in ensembling_list:
        lgbm_params = ensembling_lgbm_params[ens_label]
        for lgbm_param_dict in lgbm_params:
            start_time = time.time()
            # Probe the cache only: has_feature() with empty train frames.
            if not LGBMEnsemblingFeature(
                    dataset_id=private_test_dataset,
                    df_train=df_train,
                    df_train_label=get_ensembling_label(
                        ens_label, train_dataset),
                    df_to_predict=df_to_predict,
                    param_dict=lgbm_param_dict,
                    categorical_features_set=categorical_features_set
            ).has_feature():
                print(f"{ens_label} {lgbm_param_dict}")
                need_to_load_train_set = True

    if need_to_load_train_set:
        # 30% sample of the training set, single batch.
        df_train, df_train_label = get_dataset_xgb_batch(
            total_n_split=1,
            split_n=0,
            dataset_id=train_dataset,
            X_label=features,
            Y_label=label,
            sample=0.3)

    for ens_label in ensembling_list:
        lgbm_params = ensembling_lgbm_params[ens_label]
        for lgbm_param_dict in lgbm_params:
            start_time = time.time()
            feature_list.append(
                LGBMEnsemblingFeature(
                    dataset_id=private_test_dataset,
                    df_train=df_train,
                    df_train_label=get_ensembling_label(
                        ens_label, train_dataset),
                    df_to_predict=df_to_predict,
                    param_dict=lgbm_param_dict,
                    categorical_features_set=categorical_features_set))

    # NEW PARTll
    # ONLY THIS PART IS NEW
    # LOAD THIS PART FIRST
    # NOTE(review): this only drops the local names; the feature objects above
    # still hold references to the train frames until load_or_create() is done.
    del df_train, df_train_label

    df_feature_list = [x.load_or_create() for x in tqdm(feature_list)]

    # XGB fold-ensembling meta-features, one per base label, loaded for all
    # three datasets and re-indexed into the shared index space.
    for ens_label in ensembling_list:
        start_time = time.time()
        if ens_label == "like":
            val_features_df = XGBFoldEnsemblingLike2(
                val_dataset).load_or_create()
            test_features_df = XGBFoldEnsemblingLike2(
                test_dataset).load_or_create()
            private_features_df = XGBFoldEnsemblingLike2(
                private_test_dataset).load_or_create()
        elif ens_label == "retweet":
            val_features_df = XGBFoldEnsemblingRetweet2(
                val_dataset).load_or_create()
            test_features_df = XGBFoldEnsemblingRetweet2(
                test_dataset).load_or_create()
            private_features_df = XGBFoldEnsemblingRetweet2(
                private_test_dataset).load_or_create()
        elif ens_label == "reply":
            val_features_df = XGBFoldEnsemblingReply2(
                val_dataset).load_or_create()
            test_features_df = XGBFoldEnsemblingReply2(
                test_dataset).load_or_create()
            private_features_df = XGBFoldEnsemblingReply2(
                private_test_dataset).load_or_create()
        elif ens_label == "comment":
            val_features_df = XGBFoldEnsemblingComment2(
                val_dataset).load_or_create()
            test_features_df = XGBFoldEnsemblingComment2(
                test_dataset).load_or_create()
            private_features_df = XGBFoldEnsemblingComment2(
                private_test_dataset).load_or_create()
        else:
            assert False, "oh oh something went wrong. label not found"

        test_features_df.set_index(new_index, inplace=True)
        private_features_df.set_index(new_index_private, inplace=True)
        xgb_feature_df = pd.concat(
            [val_features_df, test_features_df, private_features_df])
        df_feature_list.append(xgb_feature_df)
        print(f"time: {time.time() - start_time}")
        del val_features_df, test_features_df, private_features_df

    # check dimensions
    len_val = len(df_val)
    len_test = len(df_test)
    len_private = len(df_private)
    for df_feat in df_feature_list:
        assert len(df_feat) == (len_val + len_test + len_private), \
            f"Blending features are not of dimension expected, len val: {len_val} len test: {len_test}" \
            f" len private test: {len_private}\n " \
            f"obtained len: {len(df_feat)} of {df_feat.columns[0]}\n"

    # split feature dataframe in validation and testing (positional slices of
    # the concatenated val+test+private frames)
    df_feat_val_list = [df_feat.iloc[:len_val] for df_feat in df_feature_list]
    df_feat_test_list = [
        df_feat.iloc[len_val:-len_private] for df_feat in df_feature_list
    ]
    df_feat_private_list = [
        df_feat.iloc[-len_private:] for df_feat in df_feature_list
    ]

    # NN predictions as additional meta-features; test/private ones must be
    # re-indexed into the shared index space.
    df_feat_nn_val_list = [
        get_nn_prediction(l, val_dataset) for l in nn_labels
    ]
    df_feat_nn_test_list = [
        get_nn_prediction(l, test_dataset) for l in nn_labels
    ]
    df_feat_nn_private_list = [
        get_nn_prediction(l, private_test_dataset) for l in nn_labels
    ]

    for df_feat_nn_test in df_feat_nn_test_list:
        new_index = pd.Series(
            df_feat_nn_test.index).map(lambda x: x + len(df_val))
        df_feat_nn_test.set_index(new_index, inplace=True)

    for df_feat_nn_private in df_feat_nn_private_list:
        new_index_private = pd.Series(df_feat_nn_private.index).map(
            lambda x: x + len(df_val) + len(df_test))
        df_feat_nn_private.set_index(new_index_private, inplace=True)

    df_feat_val_list += df_feat_nn_val_list
    df_feat_test_list += df_feat_nn_test_list
    df_feat_private_list += df_feat_nn_private_list

    df_val_to_be_concatenated_list = [df_val
                                      ] + df_feat_val_list + [df_val_label]
    df_test_to_be_concatenated_list = [df_test] + df_feat_test_list
    df_private_to_be_concatenated_list = [df_private] + df_feat_private_list

    # creating the new validation set on which we will do meta optimization
    df_val = pd.concat(df_val_to_be_concatenated_list, axis=1)
    df_test = pd.concat(df_test_to_be_concatenated_list, axis=1)
    df_private = pd.concat(df_private_to_be_concatenated_list, axis=1)

    # now we are in full meta-model mode
    # watchout! they are unsorted now, you got to re-sort the dfs
    df_metatrain, df_metaval = train_test_split(df_val,
                                                test_size=0.1,
                                                random_state=16 + 1)
    df_metatrain.sort_index(inplace=True)
    df_metaval.sort_index(inplace=True)

    # split dataframe columns in train and label
    col_names_list = [df_feat.columns[0] for df_feat in df_feature_list]
    # feature order is taken from df_test so train/test columns line up
    extended_features = df_test.columns
    df_metatrain_label = df_metatrain[label]
    df_metatrain = df_metatrain[extended_features]
    df_metaval_label = df_metaval[label]
    df_metaval = df_metaval[extended_features]

    # Sanity check: meta-train columns must align positionally with both test frames.
    for i in range(len(df_metatrain.columns)):
        assert df_metatrain.columns[i] == df_test.columns[i], f'You f****d yourself. metatrain col {i}: {df_metatrain.columns[i]}' \
            f' test col {i}: {df_test.columns[i]}'
        # NOTE(review): the message below prints df_test.columns[i] where
        # df_private.columns[i] was presumably intended — verify.
        assert df_metatrain.columns[i] == df_private.columns[i], \
            f'You f****d yourself. metatrain col {i}: {df_metatrain.columns[i]} private test col {i}: {df_test.columns[i]}'

    model_name = "lightgbm_classifier"
    kind = LABEL

    # NOTE(review): num_leaves / max_bin / min_data_in_leaf are floats here
    # (likely raw Bayesian-optimization output); presumably the LightGBM
    # wrapper coerces them to int — confirm.
    params = {
        'num_leaves': 200.4606708311663,
        'learning_rate': 0.02250057258744298,
        'max_depth': 47,
        'lambda_l1': 3.037842501865099,
        'lambda_l2': 1.0,
        'colsample_bynode': 0.4,
        'colsample_bytree': 0.4,
        'bagging_fraction': 0.8,
        'bagging_freq': 10,
        'max_bin': 3344.071500013681,
        'min_data_in_leaf': 10.0
    }

    LGBM = LightGBM(
        objective='binary',
        num_threads=-1,
        num_iterations=1000,
        early_stopping_rounds=15,
        **params,
    )

    # LGBM Training
    training_start_time = time.time()
    LGBM.fit(X=df_metatrain,
             Y=df_metatrain_label,
             X_val=df_metaval,
             Y_val=df_metaval_label,
             categorical_feature=set([]))
    print(f"Training time: {time.time() - training_start_time} seconds")

    # LGBM Evaluation (on the same split used for early stopping — optimistic)
    evaluation_start_time = time.time()
    prauc, rce, conf, max_pred, min_pred, avg = LGBM.evaluate(
        df_metaval.to_numpy(), df_metaval_label.to_numpy())
    print(
        "since I'm lazy I did the local test on the same test on which I did EarlyStopping"
    )
    print(f"PRAUC:\t{prauc}")
    print(f"RCE:\t{rce}")
    print(f"TN:\t{conf[0, 0]}")
    print(f"FP:\t{conf[0, 1]}")
    print(f"FN:\t{conf[1, 0]}")
    print(f"TP:\t{conf[1, 1]}")
    print(f"MAX_PRED:\t{max_pred}")
    print(f"MIN_PRED:\t{min_pred}")
    print(f"AVG:\t{avg}")
    print(f"Evaluation time: {time.time() - evaluation_start_time} seconds")

    # public prediction
    prediction(LGBM=LGBM, dataset_id=test_dataset, df=df_test, label=LABEL)

    # private prediction
    prediction(LGBM=LGBM,
               dataset_id=private_test_dataset,
               df=df_private,
               label=LABEL)
def main():
    """Train a LightGBM meta-model over LGBM+XGB blending features and write
    a submission file for the public test set ("new_test") only.

    Pipeline (as visible in this function):
      1. Parse the target engagement label from the command line.
      2. Load validation and test frames; shift the test index past the end
         of validation so the two frames share one index space.
      3. Build LGBM and XGB ensembling blending features on sampled train data.
      4. Meta train/val split, LightGBM training, evaluation printout.
      5. Predict on the public test set and create the submission file.
    """
    # Instantiate the parser
    parser = argparse.ArgumentParser()
    parser.add_argument('label', type=str, help='required argument: label')
    args = parser.parse_args()

    LABEL = args.label
    assert LABEL in ["like", "reply", "retweet", "comment"], "LABEL not valid."
    print(f"label is {LABEL}")

    # Raw + engineered feature columns fed to the base and meta models.
    features = [
        "raw_feature_creator_follower_count",
        "raw_feature_creator_following_count",
        "raw_feature_engager_follower_count",
        "raw_feature_engager_following_count",
        "raw_feature_creator_is_verified",
        "raw_feature_engager_is_verified",
        "raw_feature_engagement_creator_follows_engager",
        "tweet_feature_number_of_photo",
        "tweet_feature_number_of_video",
        "tweet_feature_number_of_gif",
        "tweet_feature_number_of_media",
        "tweet_feature_is_retweet",
        "tweet_feature_is_quote",
        "tweet_feature_is_top_level",
        "tweet_feature_number_of_hashtags",
        "tweet_feature_creation_timestamp_hour",
        "tweet_feature_creation_timestamp_week_day",
        # "tweet_feature_number_of_mentions",
        "tweet_feature_token_length",
        "tweet_feature_token_length_unique",
        "tweet_feature_text_topic_word_count_adult_content",
        "tweet_feature_text_topic_word_count_kpop",
        "tweet_feature_text_topic_word_count_covid",
        "tweet_feature_text_topic_word_count_sport",
        "number_of_engagements_with_language_like",
        "number_of_engagements_with_language_retweet",
        "number_of_engagements_with_language_reply",
        "number_of_engagements_with_language_comment",
        "number_of_engagements_with_language_negative",
        "number_of_engagements_with_language_positive",
        "number_of_engagements_ratio_like",
        "number_of_engagements_ratio_retweet",
        "number_of_engagements_ratio_reply",
        "number_of_engagements_ratio_comment",
        "number_of_engagements_ratio_negative",
        "number_of_engagements_ratio_positive",
        "number_of_engagements_between_creator_and_engager_like",
        "number_of_engagements_between_creator_and_engager_retweet",
        "number_of_engagements_between_creator_and_engager_reply",
        "number_of_engagements_between_creator_and_engager_comment",
        "number_of_engagements_between_creator_and_engager_negative",
        "number_of_engagements_between_creator_and_engager_positive",
        "creator_feature_number_of_like_engagements_received",
        "creator_feature_number_of_retweet_engagements_received",
        "creator_feature_number_of_reply_engagements_received",
        "creator_feature_number_of_comment_engagements_received",
        "creator_feature_number_of_negative_engagements_received",
        "creator_feature_number_of_positive_engagements_received",
        "creator_feature_number_of_like_engagements_given",
        "creator_feature_number_of_retweet_engagements_given",
        "creator_feature_number_of_reply_engagements_given",
        "creator_feature_number_of_comment_engagements_given",
        "creator_feature_number_of_negative_engagements_given",
        "creator_feature_number_of_positive_engagements_given",
        "engager_feature_number_of_like_engagements_received",
        "engager_feature_number_of_retweet_engagements_received",
        "engager_feature_number_of_reply_engagements_received",
        "engager_feature_number_of_comment_engagements_received",
        "engager_feature_number_of_negative_engagements_received",
        "engager_feature_number_of_positive_engagements_received",
        "number_of_engagements_like",
        "number_of_engagements_retweet",
        "number_of_engagements_reply",
        "number_of_engagements_comment",
        "number_of_engagements_negative",
        "number_of_engagements_positive",
        "engager_feature_number_of_previous_like_engagement",
        "engager_feature_number_of_previous_reply_engagement",
        "engager_feature_number_of_previous_retweet_engagement",
        "engager_feature_number_of_previous_comment_engagement",
        "engager_feature_number_of_previous_positive_engagement",
        "engager_feature_number_of_previous_negative_engagement",
        "engager_feature_number_of_previous_engagement",
        "engager_feature_number_of_previous_like_engagement_ratio_1",
        "engager_feature_number_of_previous_reply_engagement_ratio_1",
        "engager_feature_number_of_previous_retweet_engagement_ratio_1",
        "engager_feature_number_of_previous_comment_engagement_ratio_1",
        "engager_feature_number_of_previous_positive_engagement_ratio_1",
        "engager_feature_number_of_previous_negative_engagement_ratio_1",
        "engager_feature_number_of_previous_like_engagement_ratio",
        "engager_feature_number_of_previous_reply_engagement_ratio",
        "engager_feature_number_of_previous_retweet_engagement_ratio",
        "engager_feature_number_of_previous_comment_engagement_ratio",
        "engager_feature_number_of_previous_positive_engagement_ratio",
        "engager_feature_number_of_previous_negative_engagement_ratio",
        "engager_feature_number_of_previous_like_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_reply_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_retweet_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_comment_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_negative_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_positive_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_like_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_reply_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_retweet_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_comment_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_negative_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_positive_engagement_between_creator_and_engager_by_engager",
        # "tweet_feature_number_of_previous_like_engagements",
        # "tweet_feature_number_of_previous_reply_engagements",
        # "tweet_feature_number_of_previous_retweet_engagements",
        # "tweet_feature_number_of_previous_comment_engagements",
        # "tweet_feature_number_of_previous_positive_engagements",
        # "tweet_feature_number_of_previous_negative_engagements",
        "creator_feature_number_of_previous_like_engagements_given",
        "creator_feature_number_of_previous_reply_engagements_given",
        "creator_feature_number_of_previous_retweet_engagements_given",
        "creator_feature_number_of_previous_comment_engagements_given",
        "creator_feature_number_of_previous_positive_engagements_given",
        "creator_feature_number_of_previous_negative_engagements_given",
        "creator_feature_number_of_previous_like_engagements_received",
        "creator_feature_number_of_previous_reply_engagements_received",
        "creator_feature_number_of_previous_retweet_engagements_received",
        "creator_feature_number_of_previous_comment_engagements_received",
        "creator_feature_number_of_previous_positive_engagements_received",
        "creator_feature_number_of_previous_negative_engagements_received",
        "engager_feature_number_of_previous_like_engagement_with_language",
        "engager_feature_number_of_previous_reply_engagement_with_language",
        "engager_feature_number_of_previous_retweet_engagement_with_language",
        "engager_feature_number_of_previous_comment_engagement_with_language",
        "engager_feature_number_of_previous_positive_engagement_with_language",
        "engager_feature_number_of_previous_negative_engagement_with_language",
        "engager_feature_knows_hashtag_positive",
        "engager_feature_knows_hashtag_negative",
        "engager_feature_knows_hashtag_like",
        "engager_feature_knows_hashtag_reply",
        "engager_feature_knows_hashtag_rt",
        "engager_feature_knows_hashtag_comment",
        "creator_and_engager_have_same_main_language",
        "is_tweet_in_creator_main_language",
        "is_tweet_in_engager_main_language",
        # "statistical_probability_main_language_of_engager_engage_tweet_language_1",
        # "statistical_probability_main_language_of_engager_engage_tweet_language_2",
        "creator_and_engager_have_same_main_grouped_language",
        "is_tweet_in_creator_main_grouped_language",
        "is_tweet_in_engager_main_grouped_language",
        # "hashtag_similarity_fold_ensembling_positive",
        # "link_similarity_fold_ensembling_positive",
        # "domain_similarity_fold_ensembling_positive"
        "tweet_feature_creation_timestamp_hour_shifted",
        "tweet_feature_creation_timestamp_day_phase",
        "tweet_feature_creation_timestamp_day_phase_shifted"
    ]

    # Single-column target; kept as a list so it can be used as a column selector.
    label = [f"tweet_feature_engagement_is_{LABEL}"]

    train_dataset = "cherry_train"
    val_dataset = "cherry_val"
    test_dataset = "new_test"

    # Per-label hyper-parameter sets for the base (blending) models.
    if LABEL in ["like"]:
        lgbm_params = like_params.lgbm_get_params()
        xgb_params = like_params.xgb_get_params()
    elif LABEL in ["reply"]:
        lgbm_params = reply_params.lgbm_get_params()
        xgb_params = reply_params.xgb_get_params()
    elif LABEL in ["retweet"]:
        lgbm_params = retweet_params.lgbm_get_params()
        xgb_params = retweet_params.xgb_get_params()
    elif LABEL in ["comment"]:
        lgbm_params = comment_params.lgbm_get_params()
        xgb_params = comment_params.xgb_get_params()
    else:
        assert False, "What?"

    categorical_features_set = set([])

    # Load train data
    # loading_data_start_time = time.time()
    # df_train, df_train_label = Data.get_dataset_xgb(train_dataset, features, label)
    # print(f"Loading train data time: {loading_data_start_time - time.time()} seconds")

    # Load val data
    df_val, df_val_label = Data.get_dataset_xgb(val_dataset, features, label)

    # Load test data; shift its index past validation so val+test occupy
    # disjoint index ranges and can be concatenated / split positionally.
    df_test = Data.get_dataset(features, test_dataset)

    new_index = pd.Series(df_test.index).map(lambda x: x + len(df_val))
    df_test.set_index(new_index, inplace=True)

    # df to be predicted by the lgbm blending feature
    df_to_predict = pd.concat([df_val, df_test])

    # BLENDING FEATURE DECLARATION
    feature_list = []

    # 30% sample of the training set, single batch, for the LGBM blenders.
    df_train, df_train_label = get_dataset_xgb_batch(total_n_split=1,
                                                     split_n=0,
                                                     dataset_id=train_dataset,
                                                     X_label=features,
                                                     Y_label=label,
                                                     sample=0.3)

    for lgbm_param_dict in lgbm_params:
        start_time = time.time()
        feature_list.append(
            LGBMEnsemblingFeature(
                dataset_id=train_dataset,
                df_train=df_train,
                df_train_label=df_train_label,
                df_to_predict=df_to_predict,
                param_dict=lgbm_param_dict,
                categorical_features_set=categorical_features_set))

    # NOTE(review): the 10% train sample is re-loaded on every iteration of
    # this loop — presumably intentional (fresh sample per XGB blender), but
    # it looks like it could be hoisted; verify before changing.
    for xgb_param_dict in xgb_params:
        start_time = time.time()
        df_train, df_train_label = get_dataset_xgb_batch(
            total_n_split=1,
            split_n=0,
            dataset_id=train_dataset,
            X_label=features,
            Y_label=label,
            sample=0.1)
        feature_list.append(
            XGBEnsembling(
                dataset_id=train_dataset,
                df_train=df_train,
                df_train_label=df_train_label,
                df_to_predict=df_to_predict,
                param_dict=xgb_param_dict,
            ))

    # Materialize (or load cached) predictions for every blending feature.
    df_feature_list = [x.load_or_create() for x in feature_list]

    # check dimensions
    len_val = len(df_val)
    for df_feat in df_feature_list:
        assert len(df_feat) == (len_val + len(df_test)), \
            f"Blending features are not of dimension expected, len val: {len_val} len test: {len(df_test)}\n " \
            f"obtained len: {len(df_feat)} of {df_feat.columns[0]}\n"

    # split feature dataframe in validation and testing (positional slices)
    df_feat_val_list = [df_feat.iloc[:len_val] for df_feat in df_feature_list]
    df_feat_test_list = [df_feat.iloc[len_val:] for df_feat in df_feature_list]

    df_val_to_be_concatenated_list = [df_val
                                      ] + df_feat_val_list + [df_val_label]
    df_test_to_be_concatenated_list = [df_test] + df_feat_test_list

    # creating the new validation set on which we will do meta optimization
    df_val = pd.concat(df_val_to_be_concatenated_list, axis=1)
    df_test = pd.concat(df_test_to_be_concatenated_list, axis=1)

    # now we are in full meta-model mode
    # watchout! they are unsorted now, you got to re-sort the dfs
    df_metatrain, df_metaval = train_test_split(df_val, test_size=0.3)
    df_metatrain.sort_index(inplace=True)
    df_metaval.sort_index(inplace=True)

    # split dataframe columns in train and label
    col_names_list = [df_feat.columns[0] for df_feat in df_feature_list]
    extended_features = features + col_names_list
    df_metatrain_label = df_metatrain[label]
    df_metatrain = df_metatrain[extended_features]
    df_metaval_label = df_metaval[label]
    df_metaval = df_metaval[extended_features]

    # NOTE(review): model_name and kind are assigned but not used below.
    model_name = "lightgbm_classifier"
    kind = LABEL

    params = {
        'num_leaves': 544,
        'max_depth': 7,
        'lambda_l1': 50.0,
        'lambda_l2': 2.841130937148593,
        'colsample_bynode': 0.4,
        'colsample_bytree': 1.0,
        'bagging_fraction': 1.0,
        'bagging_freq': 8,
        'min_data_in_leaf': 611,
    }

    LGBM = LightGBM(
        objective='binary',
        num_threads=-1,
        num_iterations=1000,
        early_stopping_rounds=15,
        **params,
    )

    # LGBM Training
    training_start_time = time.time()
    LGBM.fit(X=df_metatrain,
             Y=df_metatrain_label,
             X_val=df_metaval,
             Y_val=df_metaval_label,
             categorical_feature=set([]))
    print(f"Training time: {time.time() - training_start_time} seconds")

    # LGBM Evaluation (on the same split used for early stopping — optimistic)
    evaluation_start_time = time.time()
    prauc, rce, conf, max_pred, min_pred, avg = LGBM.evaluate(
        df_metaval.to_numpy(), df_metaval_label.to_numpy())
    print(
        "since I'm lazy I did the local test on the same test on which I did EarlyStopping"
    )
    print(f"PRAUC:\t{prauc}")
    print(f"RCE:\t{rce}")
    print(f"TN:\t{conf[0, 0]}")
    print(f"FP:\t{conf[0, 1]}")
    print(f"FN:\t{conf[1, 0]}")
    print(f"TP:\t{conf[1, 1]}")
    print(f"MAX_PRED:\t{max_pred}")
    print(f"MIN_PRED:\t{min_pred}")
    print(f"AVG:\t{avg}")
    print(f"Evaluation time: {time.time() - evaluation_start_time} seconds")

    # Identifiers for the submission rows.
    tweets = Data.get_feature("raw_feature_tweet_id",
                              test_dataset)["raw_feature_tweet_id"].array
    users = Data.get_feature("raw_feature_engager_id",
                             test_dataset)["raw_feature_engager_id"].array

    # LGBM Prediction
    prediction_start_time = time.time()
    predictions = LGBM.get_prediction(df_test.to_numpy())
    print(f"Prediction time: {time.time() - prediction_start_time} seconds")

    # Uncomment to plot feature importance at the end of training
    # LGBM.plot_fimportance()

    create_submission_file(tweets, users, predictions,
                           f"{LABEL}_lgbm_blending_submission.csv")