def dynamic_threshold_hs_score(user2pred: pd.DataFrame, labeled_users: pd.DataFrame,
                               output_path: str, test_ratio: float):
    """
    Grid-search a "soft" (dynamic/adjusted) threshold: the per-user post-count threshold is
    adjusted according to the user's average HS score.

    :param user2pred: per-post HS predictions per user (columns: user_id, predictions).
    :param labeled_users: DataFrame with user_id and label columns.
    :param output_path: directory to write the grid-search results to.
    :param test_ratio: ratio of users held out for testing.
    :return: DataFrame with one row per evaluated threshold configuration.
    """
    logger.info("Executing dynamic/adjusted threshold...")
    output_path = os.path.join(output_path, "soft_threshold")
    create_dir_if_missing(output_path)
    user2pred["user_id"] = user2pred["user_id"].astype(str)
    labeled_users["user_id"] = labeled_users["user_id"].astype(str)
    user2pred = user2pred[user2pred["user_id"].isin(
        list(labeled_users["user_id"]))].reset_index(drop=True)
    avg_hs_score_per_user = user2pred.groupby('user_id').agg({"predictions": "mean"}).reset_index() \
        .rename(columns={"predictions": "avg_hs_score"})
    avg_hs_score_per_user_with_true = pd.merge(labeled_users, avg_hs_score_per_user, on='user_id')
    hs_count_per_user = user2pred.groupby('user_id').predictions.agg(
        get_hs_count).reset_index().rename(columns={"predictions": "hs_count"})
    res = pd.DataFrame(columns=[
        "lower_bound", "higher_bound", "low_th", "medium_th", "high_th",
        "f1_score", "precision_score", "recall_score", "accuracy_score"
    ])
    # note: num=1 means only a single (LOWER_BOUND, HIGHER_BOUND) pair is currently searched
    for LOWER_BOUND in tqdm(np.linspace(0.1, 0.4, 1)):
        for HIGHER_BOUND in np.linspace(0.2, 0.6, 1):
            if LOWER_BOUND >= HIGHER_BOUND:
                continue
            for low_th in range(1, 10, 2):
                for medium_th in range(2, 50, 3):
                    for high_th in range(3, 300, 2):
                        if low_th >= medium_th or low_th >= high_th or medium_th >= high_th:
                            continue
                        kwargs = {
                            "LOWER_BOUND": LOWER_BOUND,
                            "HIGHER_BOUND": HIGHER_BOUND,
                            "low_th": low_th,
                            "medium_th": medium_th,
                            "high_th": high_th
                        }
                        # avg_hs_score_per_user_with_true_copy = avg_hs_score_per_user_with_true.copy()
                        avg_hs_score_per_user_with_true[
                            f"soft_threshold_{LOWER_BOUND}_{HIGHER_BOUND}_{low_th}_{medium_th}_{high_th}"] = \
                            avg_hs_score_per_user_with_true["avg_hs_score"]. \
                            apply(lambda avg_hs_score: calc_soft_threhold(avg_hs_score, **kwargs))
    bound_cols = [
        c for c in avg_hs_score_per_user_with_true.columns if 'soft' in c
    ]
    y_preds_cols = [f"y_pred_{b_col}" for b_col in bound_cols]
    avg_hs_score_per_user_with_true = pd.merge(avg_hs_score_per_user_with_true,
                                               hs_count_per_user, on='user_id')
    y_true = avg_hs_score_per_user_with_true["label"]

    def apply_soft_th_pred(col, hs_count):
        return hs_count >= col

    avg_hs_score_per_user_with_true[y_preds_cols] = avg_hs_score_per_user_with_true[bound_cols]. \
        apply(lambda col: apply_soft_th_pred(col, avg_hs_score_per_user_with_true['hs_count']), axis=0)
    for col in tqdm(bound_cols):
        current_bound = col.split("soft_threshold_")[1]
        avg_hs_score_per_user_with_true[
            f"y_pred_{current_bound}"] = avg_hs_score_per_user_with_true.apply(
                lambda row: 1 if row["hs_count"] >= row[col] else 0, axis=1)
        y_pred = avg_hs_score_per_user_with_true[f"y_pred_{current_bound}"]
        f1 = f1_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        accuracy = accuracy_score(y_true, y_pred)
        scb = current_bound.split('_')
        res = res.append(
            {
                "lower_bound": scb[0],
                "higher_bound": scb[1],
                "low_th": scb[2],
                "medium_th": scb[3],
                "high_th": scb[4],
                "f1_score": f1,
                "precision_score": precision,
                "recall_score": recall,
                "accuracy_score": accuracy
            },
            ignore_index=True)
    res.to_csv(os.path.join(output_path, "soft_threshold.csv"), index=False)
    return res
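
# calc_soft_threhold is referenced above but defined elsewhere in the repo. A minimal sketch of
# what it presumably does, inferred only from its call site: map a user's average HS score to a
# per-user post-count threshold using the two bounds. The exact mapping rule is an assumption,
# not the repo's implementation.
def _calc_soft_threshold_sketch(avg_hs_score, LOWER_BOUND, HIGHER_BOUND, low_th, medium_th, high_th):
    """Hypothetical re-implementation for illustration only."""
    if avg_hs_score <= LOWER_BOUND:
        # low average HS score -> demand many hateful posts before flagging the user
        return high_th
    elif avg_hs_score <= HIGHER_BOUND:
        return medium_th
    # high average HS score -> a few hateful posts are enough
    return low_th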
def relational_threshold(user2pred: pd.DataFrame, labeled_users: pd.DataFrame,
                         output_path: str, dataset_name: str, test_ratio: float):
    """
    Here we consider the assumption that relations to followers/followees affect the users' behaviour.
    For each user - get his average HS score and the average HS scores of his followers and followees,
    then search for the optimal relational threshold that yields the best f1-score.
    This threshold is a weighted combination of a self-TH, a followers-TH and a followees-TH.

    :param user2pred: per-post HS predictions per user (columns: user_id, predictions).
    :param labeled_users: DataFrame with user_id and label columns.
    :param output_path: directory to write the grid-search results to.
    :param dataset_name: name of the dataset (used to locate the mention-network files).
    :param test_ratio: ratio of users held out for testing.
    :return:
    """
    logger.info("Executing relational threshold...")
    output_path = os.path.join(output_path, "relational_threshold")
    create_dir_if_missing(output_path)
    user2pred["user_id"] = user2pred["user_id"].astype(str)
    labeled_users["user_id"] = labeled_users["user_id"].astype(str)
    min_mention_threshold = 3
    avg_hs_score_per_user = user2pred.groupby('user_id').agg({"predictions": "mean"}).reset_index() \
        .rename(columns={"predictions": "avg_hs_score"})
    hs_count_per_user = user2pred.groupby('user_id').predictions.agg(
        get_hs_count).reset_index().rename(columns={"predictions": "hs_count"})
    # get followers/followees
    network_dir = f"hate_networks/outputs/{dataset_name.split('_')[0]}_networks/network_data/"
    edges_dir = os.path.join(network_dir, "edges")
    mentions_df = pd.read_csv(os.path.join(edges_dir, "data_users_mention_edges_df.tsv"), sep='\t')
    for col in ['source', 'dest']:
        mentions_df[col] = mentions_df[col].astype(str)
    # keep only mentions above the minimal threshold
    mentions_df = mentions_df[
        mentions_df["weight"] >= min_mention_threshold].reset_index(drop=True)
    mentions_dict = {}  # users mentioned by the observed user
    mentioned_by_dict = {}  # users mentioning the observed user
    for idx, row in mentions_df.iterrows():
        src = row['source']
        dest = row['dest']
        if src not in mentions_dict.keys():
            mentions_dict[src] = []
        if dest not in mentioned_by_dict.keys():
            mentioned_by_dict[dest] = []
        mentions_dict[src].append(dest)
        mentioned_by_dict[dest].append(src)
    res = pd.DataFrame()
    # SELF_WEIGHT = 0.5
    # FOLLOWERS_WEIGHT = 0.25
    # FOLLOWEES_WEIGHT = 0.25
    for SELF_WEIGHT in np.linspace(0, 1, num=5):
        for FOLLOWERS_WEIGHT in np.linspace(0, 1, num=5):
            if SELF_WEIGHT + FOLLOWERS_WEIGHT >= 1:
                continue
            else:
                FOLLOWEES_WEIGHT = 1.0 - SELF_WEIGHT - FOLLOWERS_WEIGHT
            # logger.info(f"self-weight: {SELF_WEIGHT:.2f}, followers-weight: {FOLLOWERS_WEIGHT:.2f}, followees-weight: {FOLLOWEES_WEIGHT:.2f}")
            user_ids = []
            relational_scores = []
            type_counts = {1: 0, 2: 0, 3: 0, 4: 0}
            for user_id in labeled_users["user_id"].tolist():
                user_ids.append(user_id)
                has_followees = True
                has_followers = True
                if user_id in mentions_dict.keys():
                    current_followees = mentions_dict[user_id]
                    followees_df = hs_count_per_user.loc[
                        hs_count_per_user["user_id"].isin(current_followees), "hs_count"]
                    if len(followees_df) == 0:
                        has_followees = False
                    else:
                        followees_hs_counts = followees_df.mean()
                else:
                    has_followees = False
                if user_id in mentioned_by_dict.keys():
                    current_followers = mentioned_by_dict[user_id]
                    followers_df = hs_count_per_user.loc[
                        hs_count_per_user["user_id"].isin(current_followers), "hs_count"]
                    if len(followers_df) == 0:
                        has_followers = False
                    else:
                        followers_hs_counts = followers_df.mean()
                else:
                    has_followers = False
                user_hs_count = int(hs_count_per_user.loc[
                    hs_count_per_user["user_id"] == user_id, "hs_count"].iloc[0])
                if has_followers and has_followees:
                    type_counts[1] += 1
                    current_score = SELF_WEIGHT * user_hs_count + \
                        FOLLOWEES_WEIGHT * followees_hs_counts + FOLLOWERS_WEIGHT * followers_hs_counts
                elif has_followees and not has_followers:
                    type_counts[2] += 1
                    current_score = SELF_WEIGHT * user_hs_count + FOLLOWEES_WEIGHT * followees_hs_counts
                elif not has_followees and has_followers:
                    type_counts[3] += 1
                    current_score = SELF_WEIGHT * user_hs_count + FOLLOWERS_WEIGHT * followers_hs_counts
                else:
                    type_counts[4] += 1
                    current_score = SELF_WEIGHT * user_hs_count
                relational_scores.append(current_score)
            logger.info(type_counts)
            user2relational_score = pd.DataFrame({
                "user_id": user_ids,
                "relational_score": relational_scores
            })
            train_idx = user2relational_score.sample(frac=(1 - test_ratio)).index
            train_user2relational_score = user2relational_score.loc[train_idx]
            max_f1 = 0.0
            best_th = 0
            for threshold in tqdm(range(1, 300)):
                train_user2relational_score["y_pred"] = train_user2relational_score[
                    "relational_score"].apply(lambda rs: 1 if rs >= threshold else 0)
                true_pred = pd.merge(labeled_users, train_user2relational_score, on='user_id')
                y_true = true_pred["label"]
                y_pred = true_pred["y_pred"]
                current_f1_score = f1_score(y_true, y_pred)
                if max_f1 < current_f1_score:
                    max_f1 = current_f1_score
                    best_th = threshold
            logger.info(f"Max f1-score: {max_f1}")
            logger.info(f"Best threshold: {best_th}")
            res = res.append(
                {
                    "self_weight": SELF_WEIGHT,
                    "followers_weight": FOLLOWERS_WEIGHT,
                    "followees_weight": FOLLOWEES_WEIGHT,
                    "best_f1_score": max_f1,
                    "best_th": best_th
                },
                ignore_index=True)
    res.to_csv(os.path.join(output_path, "relational_threshold_grid_search.csv"), index=False)
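
# Worked example of the relational score above (illustrative numbers, not taken from the data):
# with SELF_WEIGHT=0.5, FOLLOWERS_WEIGHT=0.25 and FOLLOWEES_WEIGHT=0.25, a user with 12 hateful
# posts whose followees average 8 hateful posts and whose followers average 20 gets
#   0.5 * 12 + 0.25 * 8 + 0.25 * 20 = 6 + 2 + 5 = 13,
# and that score is then compared against the grid-searched relational threshold.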
def fixed_threshold_num_of_posts(user2pred: pd.DataFrame, labeled_users: pd.DataFrame,
                                 output_path: str, test_ratio: float):
    """
    Hard (fixed) threshold on the number of HS predictions per user. The threshold is an integer >= 1.

    :param user2pred: per-post HS predictions per user (columns: user_id, predictions).
    :param labeled_users: DataFrame with user_id and label columns.
    :param output_path: directory to write the plot, grid results and evaluation files to.
    :param test_ratio: ratio of users held out for testing.
    :return:
    """
    logger.info("Executing fixed threshold...")
    output_path = os.path.join(output_path, "hard_threshold")
    create_dir_if_missing(output_path)
    user2pred["user_id"] = user2pred["user_id"].astype(str)
    labeled_users["user_id"] = labeled_users["user_id"].astype(str)
    train_idx = labeled_users.sample(frac=(1 - test_ratio)).index
    train_labeled_users = labeled_users.loc[train_idx]
    test_labeled_users = labeled_users.drop(train_labeled_users.index, axis=0)
    train_user2pred = user2pred[user2pred["user_id"].isin(
        list(train_labeled_users["user_id"]))].reset_index(drop=True)
    test_user2pred = user2pred[user2pred["user_id"].isin(
        list(test_labeled_users["user_id"]))].reset_index(drop=True)
    train_g_df = train_user2pred.groupby('user_id').predictions.agg(
        get_hs_count).reset_index().rename(columns={"predictions": "hs_count"})
    test_g_df = test_user2pred.groupby('user_id').predictions.agg(
        get_hs_count).reset_index().rename(columns={"predictions": "hs_count"})
    to_plot = {
        "thresholds": [],
        "f-scores": [],
        "precisions": [],
        "recalls": [],
        "accuracies": []
    }
    max_f1 = 0.0
    best_th = 0
    for threshold in tqdm(range(1, 300)):
        to_plot["thresholds"].append(threshold)
        train_g_df["y_pred"] = train_g_df["hs_count"].apply(
            lambda h_count: 1 if h_count >= threshold else 0)
        true_pred = pd.merge(train_labeled_users, train_g_df, on='user_id')
        y_true = true_pred["label"]
        y_pred = true_pred["y_pred"]
        current_f1_score = f1_score(y_true, y_pred)
        if max_f1 < current_f1_score:
            max_f1 = current_f1_score
            best_th = threshold
        to_plot["f-scores"].append(current_f1_score)
        to_plot["precisions"].append(precision_score(y_true, y_pred))
        to_plot["recalls"].append(recall_score(y_true, y_pred))
        to_plot["accuracies"].append(accuracy_score(y_true, y_pred))
    plt.figure()
    sns.set(rc={'figure.figsize': (6, 6)}, font_scale=1.7)
    for score_ in ["f-score", "precision", "recall", "accuracy"]:
        current_score_name = "accuracies" if score_.endswith("y") else f"{score_}s"
        if score_ != "recall":
            sns.lineplot(to_plot["thresholds"],
                         to_plot[current_score_name],
                         label=f"{score_}" if score_ != 'f-score' else
                         f"{score_} (max: {max(to_plot['f-scores']):.3f})")
        else:
            sns.lineplot(to_plot["thresholds"],
                         to_plot[current_score_name],
                         label=f"{score_}")
    plt.title("Fixed threshold")
    plt.xlabel('Threshold')
    plt.ylabel('Measurement score')
    plt.savefig(os.path.join(output_path, "hard_threshold_plot.png"))
    pd.DataFrame(to_plot).to_csv(os.path.join(output_path, "hard_threshold.csv"), index=False)
    logger.info(f"Max f1-score: {max_f1}")
    logger.info(f"Best threshold: {best_th}")
    # evaluate on test
    test_g_df["y_pred"] = test_g_df["hs_count"].apply(
        lambda h_count: 1 if h_count >= best_th else 0)
    true_pred = pd.merge(test_labeled_users, test_g_df, on='user_id')
    y_true = true_pred["label"]
    y_pred = true_pred["y_pred"]
    with open(os.path.join(output_path, "evaluation.txt"), "w") as fout:
        fout.write(f"F1-score: {f1_score(y_true, y_pred):.3f}\n")
        fout.write(f"Precision: {precision_score(y_true, y_pred):.3f}\n")
        fout.write(f"Recall: {recall_score(y_true, y_pred):.3f}\n")
        fout.write(f"Accuracy: {accuracy_score(y_true, y_pred):.3f}\n")
        fout.write(f"Balanced Accuracy: {balanced_accuracy_score(y_true, y_pred):.3f}")
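
# get_hs_count is used as a groupby aggregator above and in the other threshold functions, but it
# is defined elsewhere in the repo. A minimal sketch, assuming it counts the posts whose predicted
# HS probability exceeds a fixed cut-off (the 0.5 value is an assumed placeholder, not taken from
# the source):
def _get_hs_count_sketch(predictions, hs_prob_cutoff=0.5):
    """Hypothetical aggregator: number of a user's posts predicted as hate speech."""
    return int((pd.Series(predictions) >= hs_prob_cutoff).sum())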
def run_ulm_experiment():
    """
    Main function to run the ULM (user-level model) experiment.

    :return:
    """
    trained_data = user_level_execution_config["trained_data"]
    inference_data = user_level_execution_config["inference_data"]
    model = post_level_execution_config["kwargs"]["model_name"]
    model_path = f"detection/outputs/{trained_data}/{model}/"
    user_level_path = os.path.join(model_path, "user_level")
    create_dir_if_missing(user_level_path)
    if trained_data != inference_data:
        inference_path = os.path.join(user_level_path, inference_data)
    else:
        inference_path = user_level_path
    create_dir_if_missing(inference_path)
    if 'split_by_posts' in os.listdir(inference_path):
        all_posts_probs_df_path = os.path.join(inference_path, "split_by_posts", "no_text")
    else:
        all_posts_probs_df_path = os.path.join(inference_path, "user2pred.parquet")
    if not os.path.exists(all_posts_probs_df_path):
        # first time running this data - predict all posts for all users.
        predict_all_users(trained_data, inference_data, inference_path)
    io_path = os.path.join(inference_path, "io")
    create_dir_if_missing(io_path)
    data_conf = user_level_conf[inference_data]
    data_path = data_conf["data_path"]
    user_column = data_conf["user_unique_column"]
    file_ending = data_path.split(".")[-1]
    if file_ending == 'csv':
        sep = ','
    elif file_ending == 'tsv':
        sep = '\t'
    else:
        raise ValueError(f"wrong ending for file {data_path}")
    user_df = pd.read_csv(data_path, sep=sep)
    user_df[user_column] = user_df[user_column].astype(str)
    user_df.to_csv(os.path.join(io_path, "labeled_users_df.tsv"), sep='\t', index=False)
    if os.path.exists(os.path.join(io_path, "inputs.pkl")):
        logger.info(
            f"Inputs-outputs already exist for dataset {inference_data}. Loading them from {io_path}..."
        )
        inputs = pickle.load(open(os.path.join(io_path, "inputs.pkl"), "rb"))
        output = pickle.load(open(os.path.join(io_path, "outputs.pkl"), "rb"))
    else:
        inputs, output = prepare_inputs_outputs(inference_data,
                                                all_posts_probs_df_path,
                                                output_path=user_level_path,
                                                only_inference=False)
    features_to_use = ["self", "followings", "followers", "network"]
    test_size = 0.2
    normalize_features = True
    PREDICT_ALL_DATA_USERS = False
    if not PREDICT_ALL_DATA_USERS:
        result = pd.DataFrame(columns=features_to_use + ["model"] + [
            metric.__name__ for metric in
            [f1_score, accuracy_score, recall_score, precision_score, auc]
        ])
        # logger.info(run_user_model(inputs, output, features_to_use=["self", "followings"],
        #                            output_path=user_level_path, model_type="nn", normalize_features=True))
        for model_type in ["nn"]:  # "lr", "catboost", "lightgbm", "xgboost"
            logger.info(f"Executing {model_type} model...")
            for r in range(1, len(features_to_use) + 1):
                for fc in combinations(features_to_use, r):
                    res_row = run_user_model(inputs,
                                             output,
                                             features_to_use=fc,
                                             output_path=user_level_path,
                                             model_type=model_type,
                                             normalize_features=normalize_features,
                                             test_size=test_size)
                    result = result.append(res_row, ignore_index=True)
        result.to_csv(os.path.join(
            user_level_path,
            f"user_level_results__{int(test_size * 100)}_test.tsv"),
                      sep='\t',
                      index=False)
    else:
        if not os.path.exists(
                os.path.join(user_level_path, "best_user_model.model")
        ):  # best model doesn't exist - search for it
            result = pd.DataFrame(columns=features_to_use + ["model"] + [
                metric.__name__ for metric in
                [f1_score, accuracy_score, recall_score, precision_score, auc]
            ])
            for model_type in ["lr", "catboost", "lightgbm", "xgboost", "nn"]:
                logger.info(f"Executing {model_type} model...")
                for r in range(1, len(features_to_use) + 1):
                    for fc in combinations(features_to_use, r):
                        res_row = run_user_model(inputs,
                                                 output,
                                                 features_to_use=fc,
                                                 output_path=user_level_path,
                                                 model_type=model_type,
                                                 normalize_features=normalize_features,
                                                 test_size=test_size)
                        result = result.append(res_row, ignore_index=True)
            result.to_csv(os.path.join(
                user_level_path,
                f"user_level_results__{int(test_size * 100)}_test_power_transformed.tsv"),
                          sep='\t',
                          index=False)
            sorted_f1_results = result.sort_values('f1_score', ascending=False).reset_index(drop=True)
            best_row = sorted_f1_results.iloc[0]
            best_model = best_row["model"]
            best_features = features_to_use.copy()
            if not best_row["self"]:
                best_features.remove('self')
            if not best_row["followings"]:
                best_features.remove('followings')
            if not best_row["followers"]:
                best_features.remove('followers')
            if not best_row["network"]:
                best_features.remove('network')
            # fit the best performing model on all the data and save it.
            run_user_model(inputs,
                           output,
                           features_to_use=best_features,
                           output_path=user_level_path,
                           model_type=best_model,
                           normalize_features=normalize_features,
                           test_size=None)
        else:
            models_results = pd.read_csv(os.path.join(
                user_level_path,
                f"user_level_results__{int(test_size * 100)}_test_power_transformed.tsv"),
                                         sep='\t')
            sorted_f1_results = models_results.sort_values('f1_score', ascending=False).reset_index(drop=True)
            best_row = sorted_f1_results.iloc[0]
            best_model = best_row["model"]
            best_features = features_to_use.copy()
            if not best_row["self"]:
                best_features.remove('self')
            if not best_row["followings"]:
                best_features.remove('followings')
            if not best_row["followers"]:
                best_features.remove('followers')
            if not best_row["network"]:
                best_features.remove('network')
        if os.path.exists(os.path.join(io_path, "only_inputs.pkl")):
            logger.info(
                f"only_inputs already exists for dataset {inference_data}. Loading them from {io_path}..."
            )
            only_inputs = pickle.load(open(os.path.join(io_path, "only_inputs.pkl"), "rb"))
        else:
            only_inputs, _ = prepare_inputs_outputs(inference_data,
                                                    all_posts_probs_df_path,
                                                    output_path=user_level_path,
                                                    only_inference=True)
        predict_all_users_labels(only_inputs,
                                 best_features,
                                 model_type=best_model,
                                 model_path=os.path.join(user_level_path, "best_user_model.model"),
                                 normalize_features=normalize_features,
                                 output_path=user_level_path)
def predict_all_users(trained_dataset_name, inference_dataset_name, user_level_path):
    logger.info(
        f"predicting all users for dataset: {inference_dataset_name} "
        f"using models trained on {trained_dataset_name} data")
    post_level_data_conf = post_level_conf[trained_dataset_name]
    labels = post_level_data_conf["labels"]
    labels_interpretation = post_level_data_conf["labels_interpretation"]
    post_level_execution_config["kwargs"]["labels"] = labels
    post_level_execution_config["kwargs"]["labels_interpretation"] = labels_interpretation
    # load preprocessor and model
    pt, post_model = load_post_model(**post_level_execution_config)
    # get posts per user
    if not os.path.exists(os.path.join(user_level_path, "all_users_tweets.parquet")):
        logger.info(f"reading all posts of {inference_dataset_name} dataset...")
        posts_per_user_dict = pickle.load(
            open(user_level_conf[inference_dataset_name]["posts_per_user_path"], "rb"))
        for k, v in posts_per_user_dict.items():
            posts_per_user_dict[k] = [p.strip() for p in v if p.strip() != '']  # omit empty posts
        logger.info(f"finished reading all posts of {inference_dataset_name} dataset")
        if len(posts_per_user_dict[list(posts_per_user_dict.keys())[0]][0]) == 2:
            # posts are stored as (post_id, text) tuples
            full_df = pd.DataFrame(columns=['user_id', 'post_id', 'text'])
            for user_id, user_posts in tqdm(posts_per_user_dict.items()):
                user_id = str(user_id)
                current_user_df = pd.DataFrame({
                    'user_id': [user_id for _ in range(len(user_posts))],
                    'post_id': [user_post_tup[0] for user_post_tup in user_posts],
                    'text': [
                        user_post_tup[1].strip() for user_post_tup in user_posts
                        if user_post_tup[1].strip() != ''
                    ],
                })
                full_df = full_df.append(current_user_df, ignore_index=True)
        else:
            # posts are stored as plain strings
            full_df = pd.DataFrame(columns=['user_id', 'text'])
            for user_id, user_posts in tqdm(posts_per_user_dict.items()):
                user_id = str(user_id)
                user_posts = [p.strip() for p in user_posts if p.strip() != '']
                current_user_df = pd.DataFrame({
                    'user_id': [user_id for _ in range(len(user_posts))],
                    'text': user_posts,
                })
                full_df = full_df.append(current_user_df, ignore_index=True)
        full_df.to_parquet(os.path.join(user_level_path, "all_users_tweets.parquet"), index=False)
    else:  # tweets-per-user df already exists
        logger.info("reading all_users_tweets.parquet file...")
        full_df = pd.read_parquet(os.path.join(user_level_path, "all_users_tweets.parquet"))
    logger.info(f"full_df shape: {full_df.shape}")
    full_df = full_df[full_df["text"].apply(lambda t: t.strip() != "")].reset_index(drop=True)
    logger.info(f"full_df shape after removing empty posts: {full_df.shape}")
    logger.info("file read.")
    # SPLITTING TO CHUNKS BY TWEETS
    chunk_size = 1000000
    logger.info(f"preprocessing in chunks of {chunk_size}...")
    logger.info(f"Length of full_df: {len(full_df)}")
    for user_range in range(0, len(full_df), chunk_size):
        current_full_df = full_df.loc[user_range:user_range + chunk_size - 1]
        current_X = current_full_df["text"]
        _, X_test, _, _, _, _ = pt.full_preprocessing(current_X, None, mode='test')
        logger.info(
            f"predicting users tweets; indices: {user_range} to {user_range + chunk_size - 1}...")
        y_proba = post_model.predict_proba(X_test)
        current_full_df.loc[:, 'predictions'] = y_proba
        create_dir_if_missing(os.path.join(user_level_path, "split_by_posts"))
        create_dir_if_missing(os.path.join(user_level_path, "split_by_posts", "no_text"))
        create_dir_if_missing(os.path.join(user_level_path, "split_by_posts", "with_text"))
        logger.info(f"saving predictions to {user_level_path}")
        current_full_df[['user_id', 'predictions']].to_parquet(os.path.join(
            user_level_path, "split_by_posts", "no_text",
            f"user2pred_min_idx_{user_range}_max_idx_{user_range + chunk_size - 1}.parquet"),
                                                               index=False)
        current_full_df.to_parquet(os.path.join(
            user_level_path, "split_by_posts", "with_text",
            f"user2pred_with_text_min_idx_{user_range}_max_idx_{user_range + chunk_size - 1}.parquet"),
                                   index=False)
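
# The chunked outputs written above land under split_by_posts/no_text as multiple parquet files.
# A minimal sketch of rebuilding a single user2pred frame from that directory; _load_user2pred_sketch
# is a hypothetical helper, not part of the repo, and simply concatenates the per-chunk files:
def _load_user2pred_sketch(no_text_dir):
    """Hypothetical helper: concatenate the per-chunk user2pred parquet files."""
    parts = sorted(
        os.path.join(no_text_dir, fn)
        for fn in os.listdir(no_text_dir) if fn.endswith(".parquet"))
    return pd.concat((pd.read_parquet(p) for p in parts), ignore_index=True)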
def prepare_inputs_outputs(dataset_name, all_posts_probs_df_path, output_path, only_inference=False):
    """
    Predict whether a user is a hatemonger or not by his tweets and his friends'
    (mentioned by the user and mentioning the user) tweets.

    :return:
    """
    logger.info(f"Preparing inputs outputs for {dataset_name} dataset...")
    # user level config
    data_conf = user_level_conf[dataset_name]
    if "data_path" in data_conf.keys():
        data_path = data_conf["data_path"]
        user_column = data_conf["user_unique_column"]
        label_column = data_conf["label_column"]
        labels = data_conf["labels"]
        labels_interpretation = data_conf["labels_interpretation"]
    if only_inference:
        # get all users in the network for inference
        user_df = pd.read_csv(
            f"hate_networks/{dataset_name.split('_')[0]}_networks/tsv_data/users.tsv", sep="\t")
        if dataset_name == 'gab':
            user_df = user_df.sample(n=10000)  # too many users in gab net... sample 10K
        user_df[user_column] = user_df[user_column].astype(str)
    else:
        file_ending = data_path.split(".")[-1]
        if file_ending == 'csv':
            sep = ','
        elif file_ending == 'tsv':
            sep = '\t'
        else:
            raise ValueError(f"wrong ending for file {data_path}")
        user_df = pd.read_csv(data_path, sep=sep)
        user_df[user_column] = user_df[user_column].astype(str)
    network_dir = f"hate_networks/outputs/{dataset_name.split('_')[0]}_networks/network_data/"
    # load centralities features for each user
    centralities_df = pd.read_csv(
        os.path.join(network_dir, "features",
                     "centralities_mention_edges_filtered_singletons_filtered.tsv"),
        sep='\t')  # centralities_mention_all_edges_all_nodes.tsv
    centralities_df[user_column] = centralities_df[user_column].astype(str)
    centrality_measurements = [col for col in centralities_df.columns if col != 'user_id']
    min_mention_threshold = 3
    following_fn = data_conf["following_fn"]
    mentions_dict, mentioned_by_dict = get_followers_followees_dicts(
        network_dir, following_fn, min_mention_threshold)
    # take mean of lengths as maximum length of feature array;
    # consider average of lengths only from labeled users!
    labeled_mentions_dict = {
        key: val for key, val in mentions_dict.items() if key in user_df.user_id.tolist()
    }
    labeled_mentioned_by_dict = {
        key: val for key, val in mentioned_by_dict.items() if key in user_df.user_id.tolist()
    }
    max_followings_num = int(np.mean([len(x) for x in labeled_mentions_dict.values()]))
    max_followers_num = int(np.mean([len(x) for x in labeled_mentioned_by_dict.values()]))
    if all_posts_probs_df_path.endswith("parquet") or os.path.isdir(all_posts_probs_df_path):
        all_users_probs = pd.read_parquet(all_posts_probs_df_path)
    else:
        all_users_probs = pd.read_csv(all_posts_probs_df_path, sep='\t', engine='python')
    labeled_users_predictions = all_users_probs[
        all_users_probs['user_id'].isin(user_df.user_id.tolist())]
    max_user_tweets = int(
        labeled_users_predictions.groupby('user_id').size().reset_index()[0].mean())
    self_input = []
    followings_input = []
    followers_input = []
    network_features_input = []
    outputs = []
    all_users_list = list(all_users_probs["user_id"].unique())
    for idx, row in tqdm(user_df.iterrows()):
        user_id = str(row[user_column])
        if user_id not in centralities_df["user_id"].tolist():
            current_network_features = np.zeros(len(centrality_measurements))
        else:
            current_network_features = []
            for cm in centrality_measurements:
                current_network_features.append(
                    centralities_df.loc[centralities_df["user_id"] == user_id, cm].iloc[0])
        network_features_input.append(np.array(current_network_features))
        if not only_inference:
            outputs.append(row[label_column])
        avg_followings_predictions = []
        avg_followers_predictions = []
        if user_id in mentions_dict.keys():
            followings = mentions_dict[user_id]  # users mentioned by the observed user
            followings = [followee for followee in followings if followee in all_users_list]
            if len(followings) > 0:
                followings_predictions = all_users_probs.loc[
                    all_users_probs["user_id"].isin(followings)]
                avg_followings_predictions = followings_predictions.groupby(
                    'user_id').agg({'predictions': 'mean'})['predictions']
        if user_id in mentioned_by_dict.keys():
            followers = mentioned_by_dict[user_id]  # users mentioning the observed user
            followers = [follower for follower in followers if follower in all_users_list]
            if len(followers) > 0:
                followers_predictions = all_users_probs.loc[
                    all_users_probs["user_id"].isin(followers)]
                avg_followers_predictions = followers_predictions.groupby(
                    'user_id').agg({'predictions': 'mean'})['predictions']
        self_predictions = all_users_probs.loc[
            all_users_probs["user_id"] == user_id, "predictions"]
        # handle followings/followers predictions (average them)
        self_predictions = list(self_predictions)
        avg_followings_predictions = list(avg_followings_predictions)
        avg_followers_predictions = list(avg_followers_predictions)
        # pad with zeros; enough when max is really max (not average)
        if len(self_predictions) < max_user_tweets:
            self_predictions.extend([0.0] * (max_user_tweets - len(self_predictions)))
        if len(avg_followings_predictions) < max_followings_num:
            avg_followings_predictions.extend(
                [0.0] * (max_followings_num - len(avg_followings_predictions)))
        if len(avg_followers_predictions) < max_followers_num:
            avg_followers_predictions.extend(
                [0.0] * (max_followers_num - len(avg_followers_predictions)))
        # when max is an average, we need to also remove some of the predictions for users with more than the avg
        if len(self_predictions) > max_user_tweets:
            self_predictions = self_predictions[:max_user_tweets]
        if len(avg_followings_predictions) > max_followings_num:
            avg_followings_predictions = avg_followings_predictions[:max_followings_num]
        if len(avg_followers_predictions) > max_followers_num:
            avg_followers_predictions = avg_followers_predictions[:max_followers_num]
        self_input.append(np.array(self_predictions))
        followings_input.append(np.array(avg_followings_predictions))
        followers_input.append(np.array(avg_followers_predictions))
    self_input = np.array(self_input, dtype='float32')
    followings_input = np.array(followings_input, dtype='float32')
    followers_input = np.array(followers_input, dtype='float32')
    network_features_input = np.array(network_features_input, dtype='float32')
    # user_model = build_user_model(max_user_tweets, max_followings_num, max_followers_num, len(centrality_measurements))
    all_inputs = [self_input, followings_input, followers_input, network_features_input]
    io_path = os.path.join(output_path, "io")
    create_dir_if_missing(io_path)
    if not only_inference:
        outputs = np.array(outputs)
        pickle.dump(all_inputs, open(os.path.join(io_path, "inputs.pkl"), "wb"))
        pickle.dump(outputs, open(os.path.join(io_path, "outputs.pkl"), "wb"))
        user_df.to_csv(os.path.join(io_path, "labeled_users_df.tsv"), sep='\t', index=False)
    else:
        pickle.dump(all_inputs, open(os.path.join(io_path, "only_inputs.pkl"), "wb"))
        user_df.to_csv(os.path.join(io_path, "all_users_df.tsv"), sep='\t', index=False)
    return all_inputs, outputs
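
# Shape reminder for the list returned above (illustrative, with N users):
#   all_inputs[0] -> (N, max_user_tweets)        per-post HS probabilities of the user, padded/truncated
#   all_inputs[1] -> (N, max_followings_num)     mean HS probability per mentioned user (followees)
#   all_inputs[2] -> (N, max_followers_num)      mean HS probability per mentioning user (followers)
#   all_inputs[3] -> (N, n_centrality_features)  network centrality features
# run_user_model() below selects among these positions via input_features_mapping.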
def run_user_model(X, y, features_to_use, output_path, model_type="nn",
                   normalize_features=True, test_size=0.2):
    input_features_mapping = {
        "self": 0,
        "followings": 1,
        "followers": 2,
        "network": 3
    }
    relevant_features_idx = [
        v for k, v in input_features_mapping.items() if k in features_to_use
    ]
    res_row = {f: False for f in input_features_mapping.keys()}
    for f in features_to_use:
        res_row[f] = True
    res_row["model"] = model_type
    if test_size is None:  # train with all data for the best performing configuration.
        X_train, _, y_train, _ = prepare_data_for_modeling(
            X, y, relevant_features_idx, normalize_features, test_size, output_path)
    else:
        X_train, X_test, y_train, y_test = prepare_data_for_modeling(
            X, y, relevant_features_idx, normalize_features, test_size, output_path)
    if model_type == "nn":
        user_model = build_user_model(X[0].shape[1], X[1].shape[1], X[2].shape[1],
                                      X[3].shape[1], relevant_features_idx)
        # note: early_stopping is defined here but not passed to fit (callbacks=[])
        early_stopping = EarlyStopping(monitor='val_loss',
                                       min_delta=0.0001,
                                       patience=10,
                                       verbose=1,
                                       mode='auto',
                                       restore_best_weights=True)
        class_weight = compute_class_weight('balanced', np.unique(y_train), y_train)
        hist = user_model.fit(x=X_train,
                              y=y_train,
                              batch_size=128,
                              epochs=60,
                              validation_split=0.2,
                              verbose=0,
                              callbacks=[],
                              class_weight={i: class_weight[i] for i in range(2)})
        if test_size is None:
            user_model.save(os.path.join(output_path, "best_user_model.model"),
                            save_format='tf')
    else:
        # the non-NN models require concatenation of the feature groups in advance
        X_train = np.hstack(tuple([input for input in X_train]))
        if model_type == "catboost":
            user_model = CatBoostClassifier()
            user_model.fit(X_train, y_train, verbose=False)
        elif model_type == "lightgbm":
            user_model = lgb.LGBMClassifier()
            user_model.fit(X_train, y_train, verbose=False)
        elif model_type == "lr":
            user_model = LogisticRegressionCV(cv=5, max_iter=10000)
            user_model.fit(X_train, y_train)
        elif model_type == "xgboost":
            user_model = XGBClassifier()
            user_model.fit(X_train, y_train, verbose=False)
        if test_size is None:
            pickle.dump(user_model,
                        open(os.path.join(output_path, "best_user_model.model"), "wb"))
    # evaluation phase
    if test_size is not None:
        if model_type == "nn":
            y_score = user_model.predict(X_test)
            fp, tp, th = roc_curve(y_true=y_test, y_score=y_score[:, -1])
            res_row[auc.__name__] = auc(fp, tp)
            y_pred = (y_score > 0.5).astype('int32')
            for metric in [f1_score, accuracy_score, recall_score, precision_score]:
                res_row[metric.__name__] = metric(y_test, y_pred)
            test_loss, test_f1 = user_model.evaluate(X_test, y_test)
            logger.info(f"test loss: {test_loss}")
            logger.info(f"test f1: {test_f1}")
            # save plots of the loss during training
            training_output_path = os.path.join(output_path, "training")
            create_dir_if_missing(training_output_path)
            plt.figure()
            plt.title('Loss')
            plt.plot(hist.history['loss'], label='train')
            plt.plot(hist.history['val_loss'], label='validation')
            plt.legend()
            loss_fn = os.path.join(training_output_path, "loss_graph.png")
            plt.savefig(loss_fn)
        else:
            X_test = np.hstack(tuple([input for input in X_test]))
            y_score = user_model.predict_proba(X_test)[:, -1]
            fp, tp, th = roc_curve(y_true=y_test, y_score=y_score)
            res_row[auc.__name__] = auc(fp, tp)
            y_pred = user_model.predict(X_test)
            for metric in [f1_score, accuracy_score, recall_score, precision_score]:
                res_row[metric.__name__] = metric(y_test, y_pred)
    return res_row
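
# build_user_model is defined elsewhere; from its call site above it receives the four input
# widths plus the indices of the feature groups actually used. A minimal Keras sketch under those
# assumptions; the layer sizes, loss and the single sigmoid output are illustrative, not the
# repo's exact architecture:
def _build_user_model_sketch(self_dim, followings_dim, followers_dim, network_dim,
                             relevant_features_idx):
    """Hypothetical multi-input user model for illustration only."""
    from tensorflow.keras import layers, models  # assumed to match the Keras already used here
    dims = [self_dim, followings_dim, followers_dim, network_dim]
    # one input branch per selected feature group (self / followings / followers / network)
    inputs = [layers.Input(shape=(dims[i],)) for i in relevant_features_idx]
    x = layers.concatenate(inputs) if len(inputs) > 1 else inputs[0]
    x = layers.Dense(64, activation='relu')(x)
    out = layers.Dense(1, activation='sigmoid')(x)
    model = models.Model(inputs=inputs, outputs=out)
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model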
def fit(self, X_train, y_train):
    super().fit(X_train, y_train)
    train_output_path = self.paths['train_output']
    model_output_path = self.paths['model_output']
    create_dir_if_missing(train_output_path)
    create_dir_if_missing(model_output_path)
    log_dir = os.path.join(
        train_output_path, 'logs',
        datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S"))
    tensorboard_callback = TensorBoard(log_dir=log_dir)
    epochs = self.kwargs['epochs']
    validation_split = self.kwargs['validation_split']
    model_weights_file_path = os.path.join(model_output_path, "weights_best.h5")
    # model_file_path = os.path.join(model_output_path, "model.h5")
    # full_model_file_path = os.path.join(model_output_path, "full_model.pkl")
    checkpoint = ModelCheckpoint(model_weights_file_path,
                                 monitor='val_loss',
                                 verbose=2,
                                 save_best_only=True,
                                 mode='min')
    if self.name == 'BertAttentionLSTM':
        reduce_lr = self.create_learning_rate_scheduler(max_learn_rate=1e-5,
                                                        end_learn_rate=1e-7,
                                                        warmup_epoch_count=20,
                                                        total_epoch_count=epochs)
    else:
        reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                      factor=0.5,
                                      patience=3,
                                      min_lr=0.0001,
                                      verbose=2)
    early_stopping = EarlyStopping(monitor='val_loss',
                                   min_delta=0.0001,
                                   patience=5,
                                   verbose=2,
                                   mode='auto',
                                   restore_best_weights=True)
    # not using the checkpoint callback since it saves the model fully
    callbacks = [reduce_lr, early_stopping]
    # class weights (handling imbalanced data)
    if len(y_train.shape) > 1:
        y_train_for_class_weight = y_train[y_train == 1].stack().reset_index().drop(
            0, 1).set_index('level_0').rename(columns={"level_1": "label"})["label"]
    else:
        y_train_for_class_weight = y_train.copy()
    class_weights = compute_class_weight("balanced",
                                         np.unique(y_train_for_class_weight),
                                         list(y_train_for_class_weight))
    train_class_weights = dict(zip(np.unique(y_train_for_class_weight), class_weights))
    print(f"Using GPU: {tf.config.list_physical_devices('GPU')}")
    # fit the model
    # with tf.device("/device:GPU:0"):
    hist = self.model.fit(X_train,
                          y_train,
                          batch_size=32,
                          epochs=epochs,
                          validation_split=validation_split,
                          class_weight=train_class_weights,
                          callbacks=callbacks)
    # save the model's weights
    self.model.save_weights(model_weights_file_path)
    # self.model.save(model_file_path, save_format='tf')  # save full model
    # pickle.dump(self, open(full_model_file_path, "wb"))
    # save plots of loss and accuracy during training
    hist_path = os.path.join(train_output_path, "history.pkl")
    with open(hist_path, "wb") as fout:
        pickle.dump(hist.history, fout)
    plt.figure()
    plt.title('Loss per epoch')
    plt.plot(hist.history['loss'], label='train')
    plt.plot(hist.history['val_loss'], label='validation')
    plt.legend()
    loss_fn = os.path.join(train_output_path, "loss_graph.png")
    plt.savefig(loss_fn)
    if 'accuracy' in hist.history.keys():
        plt.figure()
        plt.title('Accuracy per epoch')
        plt.plot(hist.history['accuracy'], label='train')
        plt.plot(hist.history['val_accuracy'], label='validation')
        plt.legend()
        acc_fn = os.path.join(train_output_path, "acc_graph.png")
        plt.savefig(acc_fn)
    if 'f1_m' in hist.history.keys():
        plt.figure()
        plt.title('F1-score per epoch')
        plt.plot(hist.history['f1_m'], label='train')
        plt.plot(hist.history['val_f1_m'], label='validation')
        plt.legend()
        f1_fn = os.path.join(train_output_path, "f1_graph.png")
        plt.savefig(f1_fn)
    plt.figure()
    plt.title('Learning rate per epoch')
    plt.plot(hist.history['lr'], label='lr')
    plt.legend()
    lr_fn = os.path.join(train_output_path, "lr_graph.png")
    plt.savefig(lr_fn)
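
# create_learning_rate_scheduler is referenced above for the BertAttentionLSTM case but not shown
# in this section. A minimal sketch, assuming the common warmup-then-exponential-decay schedule
# wrapped in a Keras LearningRateScheduler callback; the exact curve used in the repo may differ:
def _create_learning_rate_scheduler_sketch(max_learn_rate, end_learn_rate,
                                           warmup_epoch_count, total_epoch_count):
    """Hypothetical learning-rate schedule for illustration only."""
    import math
    from tensorflow.keras.callbacks import LearningRateScheduler

    def lr_for_epoch(epoch):
        if epoch < warmup_epoch_count:
            # linear warmup up to max_learn_rate
            return max_learn_rate * (epoch + 1) / warmup_epoch_count
        # exponential decay from max_learn_rate towards end_learn_rate
        decay = math.log(end_learn_rate / max_learn_rate)
        progress = (epoch - warmup_epoch_count) / max(1, total_epoch_count - warmup_epoch_count)
        return max_learn_rate * math.exp(decay * progress)

    return LearningRateScheduler(lr_for_epoch, verbose=1)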