def dynamic_threshold_hs_score(user2pred: pd.DataFrame,
                               labeled_users: pd.DataFrame, output_path: str,
                               test_ratio: float):
    """

    :param user2pred:
    :param labeled_users:
    :param output_path:
    :return:
    """
    logger.info("Executing dynamic/adjusted threshold...")
    output_path = os.path.join(output_path, "soft_threshold")
    create_dir_if_missing(output_path)
    user2pred["user_id"] = user2pred["user_id"].astype(str)
    labeled_users["user_id"] = labeled_users["user_id"].astype(str)
    user2pred = user2pred[user2pred["user_id"].isin(
        list(labeled_users["user_id"]))].reset_index(drop=True)
    avg_hs_score_per_user = user2pred.groupby('user_id').agg({"predictions": "mean"}).reset_index() \
        .rename(columns={"predictions": "avg_hs_score"})
    avg_hs_score_per_user_with_true = pd.merge(labeled_users,
                                               avg_hs_score_per_user,
                                               on='user_id')

    hs_count_per_user = user2pred.groupby('user_id').predictions.agg(
        get_hs_count).reset_index().rename(columns={"predictions": "hs_count"})

    res = pd.DataFrame(columns=[
        "lower_bound", "higher_bound", "low_th", "medium_th", "high_th",
        "f1_score", "precision_score", "recall_score", "accuracy_score"
    ])

    # note: with num=1, only a single (LOWER_BOUND, HIGHER_BOUND) pair is evaluated; increase num to widen the grid
    for LOWER_BOUND in tqdm(np.linspace(0.1, 0.4, 1)):
        for HIGHER_BOUND in np.linspace(0.2, 0.6, 1):
            if LOWER_BOUND >= HIGHER_BOUND:
                continue
            for low_th in range(1, 10, 2):
                for medium_th in range(2, 50, 3):
                    for high_th in range(3, 300, 2):
                        if low_th >= medium_th or low_th >= high_th or medium_th >= high_th:
                            continue
                        kwargs = {
                            "LOWER_BOUND": LOWER_BOUND,
                            "HIGHER_BOUND": HIGHER_BOUND,
                            "low_th": low_th,
                            "medium_th": medium_th,
                            "high_th": high_th
                        }
                        #                         avg_hs_score_per_user_with_true_copy = avg_hs_score_per_user_with_true.copy()
                        col_name = f"soft_threshold_{LOWER_BOUND}_{HIGHER_BOUND}_{low_th}_{medium_th}_{high_th}"
                        avg_hs_score_per_user_with_true[col_name] = \
                            avg_hs_score_per_user_with_true["avg_hs_score"].apply(
                                lambda avg_hs_score: calc_soft_threhold(avg_hs_score, **kwargs))

    bound_cols = [
        c for c in avg_hs_score_per_user_with_true.columns if 'soft' in c
    ]
    y_preds_cols = [f"y_pred_{b_col}" for b_col in bound_cols]
    avg_hs_score_per_user_with_true = pd.merge(avg_hs_score_per_user_with_true,
                                               hs_count_per_user,
                                               on='user_id')
    y_true = avg_hs_score_per_user_with_true["label"]

    def apply_soft_th_pred(col, hs_count):
        return hs_count >= col
    avg_hs_score_per_user_with_true[y_preds_cols] = avg_hs_score_per_user_with_true[bound_cols].\
        apply(lambda col: apply_soft_th_pred(col, avg_hs_score_per_user_with_true['hs_count']), axis=0)

    for col in tqdm(bound_cols):

        current_bound = col.split("soft_threshold_")[1]
        avg_hs_score_per_user_with_true[
            f"y_pred_{current_bound}"] = avg_hs_score_per_user_with_true.apply(
                lambda row: 1 if row["hs_count"] >= row[col] else 0, axis=1)

        y_pred = avg_hs_score_per_user_with_true[f"y_pred_{current_bound}"]

        f1 = f1_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        accuracy = accuracy_score(y_true, y_pred)
        scb = current_bound.split('_')
        res = res.append(
            {
                "lower_bound": scb[0],
                "higher_bound": scb[1],
                "low_th": scb[2],
                "medium_th": scb[3],
                "high_th": scb[4],
                "f1_score": f1,
                "precision_score": precision,
                "recall_score": recall,
                "accuracy_score": accuracy
            },
            ignore_index=True)

    res.to_csv(os.path.join(output_path, "soft_threshold.csv"), index=False)
    return res
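
# NOTE: `calc_soft_threhold` used above is defined elsewhere in this repo. The sketch below is a
# hypothetical reconstruction of its piecewise logic, inferred only from how it is called in
# dynamic_threshold_hs_score (avg_hs_score plus LOWER_BOUND/HIGHER_BOUND/low_th/medium_th/high_th):
# users with a high average HS score should need only a few HS posts to be flagged, users with a
# low average score should need many. It is an illustration, not the repo's actual implementation.
def _calc_soft_threshold_sketch(avg_hs_score: float, LOWER_BOUND: float,
                                HIGHER_BOUND: float, low_th: int,
                                medium_th: int, high_th: int) -> int:
    if avg_hs_score >= HIGHER_BOUND:
        return low_th  # hateful on average -> a few HS posts suffice
    elif avg_hs_score >= LOWER_BOUND:
        return medium_th  # borderline average score -> require more HS posts
    return high_th  # low average score -> require many HS posts
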
def relational_threshold(user2pred: pd.DataFrame, labeled_users: pd.DataFrame,
                         output_path: str, dataset_name: str,
                         test_ratio: float):
    """
    Here we consider the assumption that relation to followers/followees effect the users' behaviour.
    For each user - get his average HS score, and the average HS scores of his followers and followees.
    then search for the optimal relational threshold to yield the best f1-score.
    This threshold will be combined from a self-TH + followers-TH + followees-TH.

    :param user2pred:
    :param labeled_users:
    :return:
    """
    logger.info("Executing relational threshold...")
    output_path = os.path.join(output_path, "relational_threshold")
    create_dir_if_missing(output_path)
    user2pred["user_id"] = user2pred["user_id"].astype(str)
    labeled_users["user_id"] = labeled_users["user_id"].astype(str)

    min_mention_threshold = 3
    avg_hs_score_per_user = user2pred.groupby('user_id').agg({"predictions": "mean"}).reset_index() \
        .rename(columns={"predictions": "avg_hs_score"})
    hs_count_per_user = user2pred.groupby('user_id').predictions.agg(
        get_hs_count).reset_index().rename(columns={"predictions": "hs_count"})

    # get followers/followees
    network_dir = f"hate_networks/outputs/{dataset_name.split('_')[0]}_networks/network_data/"
    edges_dir = os.path.join(network_dir, "edges")
    mentions_df = pd.read_csv(os.path.join(edges_dir,
                                           "data_users_mention_edges_df.tsv"),
                              sep='\t')
    for col in ['source', 'dest']:
        mentions_df[col] = mentions_df[col].astype(str)
    # keep only mentions above the minimal threshold
    mentions_df = mentions_df[
        mentions_df["weight"] >= min_mention_threshold].reset_index(drop=True)
    mentions_dict = {}  # users mentioned by the observed user
    mentioned_by_dict = {}  # users mentioning the observed user
    for idx, row in mentions_df.iterrows():
        src = row['source']
        dest = row['dest']
        if src not in mentions_dict.keys():
            mentions_dict[src] = []
        if dest not in mentioned_by_dict.keys():
            mentioned_by_dict[dest] = []
        mentions_dict[src].append(dest)
        mentioned_by_dict[dest].append(src)
    res = pd.DataFrame()
    # SELF_WEIGHT = 0.5
    # FOLLOWERS_WEIGHT = 0.25
    # FOLLOWEES_WEIGHT = 0.25
    for SELF_WEIGHT in np.linspace(0, 1, num=5):
        for FOLLOWERS_WEIGHT in np.linspace(0, 1, num=5):
            if SELF_WEIGHT + FOLLOWERS_WEIGHT >= 1:
                continue
            else:
                FOLLOWEES_WEIGHT = 1.0 - SELF_WEIGHT - FOLLOWERS_WEIGHT
                # logger.info(f"self-weight: {SELF_WEIGHT:.2f}, followers-weight: {FOLLOWERS_WEIGHT:.2f}, followees-weight: {FOLLOWEES_WEIGHT:.2f}")
                user_ids = []
                relational_scores = []
                type_counts = {1: 0, 2: 0, 3: 0, 4: 0}
                for user_id in labeled_users["user_id"].tolist():
                    user_ids.append(user_id)
                    has_followees = True
                    has_followers = True
                    if user_id in mentions_dict.keys():
                        current_followees = mentions_dict[user_id]
                        followees_df = hs_count_per_user.loc[
                            hs_count_per_user["user_id"].isin(current_followees
                                                              ), "hs_count"]
                        if len(followees_df) == 0:
                            has_followees = False
                        else:
                            followees_hs_counts = followees_df.mean()
                    else:
                        has_followees = False
                    if user_id in mentioned_by_dict.keys():
                        current_followers = mentioned_by_dict[user_id]
                        followers_df = hs_count_per_user.loc[
                            hs_count_per_user["user_id"].isin(current_followers
                                                              ), "hs_count"]
                        if len(followers_df) == 0:
                            has_followers = False
                        else:
                            followers_hs_counts = followers_df.mean()
                    else:
                        has_followers = False

                    user_hs_count = int(hs_count_per_user.loc[
                        hs_count_per_user["user_id"] == user_id,
                        "hs_count"].iloc[0])
                    if has_followers and has_followees:
                        type_counts[1] += 1
                        current_score = SELF_WEIGHT * user_hs_count + FOLLOWEES_WEIGHT * followees_hs_counts + FOLLOWERS_WEIGHT * followers_hs_counts
                    elif has_followees and not has_followers:
                        type_counts[2] += 1
                        current_score = SELF_WEIGHT * user_hs_count + FOLLOWEES_WEIGHT * followees_hs_counts
                    elif not has_followees and has_followers:
                        type_counts[3] += 1
                        current_score = SELF_WEIGHT * user_hs_count + FOLLOWERS_WEIGHT * followers_hs_counts
                    else:
                        type_counts[4] += 1
                        current_score = SELF_WEIGHT * user_hs_count

                    relational_scores.append(current_score)
                logger.info(type_counts)
                user2relational_score = pd.DataFrame({
                    "user_id":
                    user_ids,
                    "relational_score":
                    relational_scores
                })

                train_idx = user2relational_score.sample(
                    frac=(1 - test_ratio)).index
                train_user2relational_score = user2relational_score.loc[
                    train_idx].copy()

                max_f1 = 0.0
                best_th = 0
                for threshold in tqdm(range(1, 300)):
                    # to_plot["thresholds"].append(threshold)
                    train_user2relational_score[
                        "y_pred"] = train_user2relational_score[
                            "relational_score"].apply(
                                lambda rs: 1 if rs >= threshold else 0)
                    true_pred = pd.merge(labeled_users,
                                         train_user2relational_score,
                                         on='user_id')
                    y_true = true_pred["label"]
                    y_pred = true_pred["y_pred"]
                    current_f1_score = f1_score(y_true, y_pred)
                    if max_f1 < current_f1_score:
                        max_f1 = current_f1_score
                        best_th = threshold
                logger.info(f"Max f1-score: {max_f1}")
                logger.info(f"Best threshold: {best_th}")
                res = res.append(
                    {
                        "self_weight": SELF_WEIGHT,
                        "followers_weight": FOLLOWERS_WEIGHT,
                        "followees_weight": FOLLOWEES_WEIGHT,
                        "best_f1_score": max_f1,
                        "best_th": best_th
                    },
                    ignore_index=True)
    res.to_csv(os.path.join(output_path,
                            "relational_threshold_grid_search.csv"),
               index=False)
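
# Illustration of the relational score computed above: a convex combination of the user's own
# HS-post count and the mean HS-post counts of their followees (users they mention) and
# followers (users mentioning them). The numbers are made up for demonstration only.
def _relational_score_example():
    SELF_WEIGHT, FOLLOWERS_WEIGHT = 0.5, 0.25
    FOLLOWEES_WEIGHT = 1.0 - SELF_WEIGHT - FOLLOWERS_WEIGHT  # the three weights sum to 1
    user_hs_count = 12  # hypothetical HS-post count of the observed user
    followers_hs_counts = 4.0  # hypothetical mean HS-post count of their followers
    followees_hs_counts = 8.0  # hypothetical mean HS-post count of their followees
    # 0.5 * 12 + 0.25 * 4 + 0.25 * 8 = 9.0; the user is flagged if this exceeds the searched threshold
    return (SELF_WEIGHT * user_hs_count + FOLLOWERS_WEIGHT * followers_hs_counts +
            FOLLOWEES_WEIGHT * followees_hs_counts)
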
def fixed_threshold_num_of_posts(user2pred: pd.DataFrame,
                                 labeled_users: pd.DataFrame, output_path: str,
                                 test_ratio: float):
    """
    Hard threshold of number of HS predictions per user. Threshold is an integer and above 1.
    :param user2pred:
    :param labeled_users:
    :return:
    """
    logger.info("Executing fixed threshold...")
    output_path = os.path.join(output_path, "hard_threshold")
    create_dir_if_missing(output_path)
    user2pred["user_id"] = user2pred["user_id"].astype(str)
    labeled_users["user_id"] = labeled_users["user_id"].astype(str)
    train_idx = labeled_users.sample(frac=(1 - test_ratio)).index
    train_labeled_users = labeled_users.loc[train_idx]
    test_labeled_users = labeled_users.drop(train_labeled_users.index, axis=0)

    train_user2pred = user2pred[user2pred["user_id"].isin(
        list(train_labeled_users["user_id"]))].reset_index(drop=True)
    test_user2pred = user2pred[user2pred["user_id"].isin(
        list(test_labeled_users["user_id"]))].reset_index(drop=True)

    train_g_df = train_user2pred.groupby('user_id').predictions.agg(
        get_hs_count).reset_index().rename(columns={"predictions": "hs_count"})
    test_g_df = test_user2pred.groupby('user_id').predictions.agg(
        get_hs_count).reset_index().rename(columns={"predictions": "hs_count"})

    to_plot = {
        "thresholds": [],
        "f-scores": [],
        "precisions": [],
        "recalls": [],
        "accuracies": []
    }
    max_f1 = 0.0
    best_th = 0
    for threshold in tqdm(range(1, 300)):
        to_plot["thresholds"].append(threshold)
        train_g_df["y_pred"] = train_g_df["hs_count"].apply(
            lambda h_count: 1 if h_count >= threshold else 0)

        true_pred = pd.merge(train_labeled_users, train_g_df, on='user_id')
        y_true = true_pred["label"]
        y_pred = true_pred["y_pred"]
        current_f1_score = f1_score(y_true, y_pred)
        if max_f1 < current_f1_score:
            max_f1 = current_f1_score
            best_th = threshold
        to_plot["f-scores"].append(current_f1_score)
        to_plot["precisions"].append(precision_score(y_true, y_pred))
        to_plot["recalls"].append(recall_score(y_true, y_pred))
        to_plot["accuracies"].append(accuracy_score(y_true, y_pred))
    plt.figure()
    sns.set(rc={'figure.figsize': (6, 6)}, font_scale=1.7)

    for score_ in ["f-score", "precision", "recall", "accuracy"]:
        current_score_name = "accuracies" if score_.endswith(
            "y") else f"{score_}s"
        if score_ != "recall":
            sns.lineplot(to_plot["thresholds"],
                         to_plot[current_score_name],
                         label=f"{score_}" if score_ != 'f-score' else
                         f"{score_} (max: {max(to_plot['f-scores']):.3f})")
        else:
            sns.lineplot(to_plot["thresholds"],
                         to_plot[current_score_name],
                         label=f"{score_}")
    plt.title("Fixed threshold")
    plt.xlabel('Threshold')
    plt.ylabel('Measurement score')
    plt.savefig(os.path.join(output_path, "hard_threshold_plot.png"))
    pd.DataFrame(to_plot).to_csv(os.path.join(output_path,
                                              "hard_threshold.csv"),
                                 index=False)
    logger.info(f"Max f1-score: {max_f1}")
    logger.info(f"Best threshold: {best_th}")
    # evaluate on test
    test_g_df["y_pred"] = test_g_df["hs_count"].apply(
        lambda h_count: 1 if h_count >= best_th else 0)
    true_pred = pd.merge(test_labeled_users, test_g_df, on='user_id')
    y_true = true_pred["label"]
    y_pred = true_pred["y_pred"]
    with open(os.path.join(output_path, "evaluation.txt"), "w") as fout:
        fout.write(f"F1-score: {f1_score(y_true, y_pred):.3f}\n")
        fout.write(f"Precision: {precision_score(y_true, y_pred):.3f}\n")
        fout.write(f"Recall: {recall_score(y_true, y_pred):.3f}\n")
        fout.write(f"Accuracy: {accuracy_score(y_true, y_pred):.3f}\n")
        fout.write(
            f"Balanced Accuracy: {balanced_accuracy_score(y_true, y_pred):.3f}"
        )
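
# NOTE: `get_hs_count` used as a groupby aggregator above is defined elsewhere in this repo.
# A minimal sketch of such an aggregator, assuming posts with a predicted HS probability of at
# least 0.5 are counted as hate speech (the repo's actual cut-off may differ):
def _get_hs_count_sketch(predictions) -> int:
    return int((pd.Series(predictions) >= 0.5).sum())  # number of posts predicted as HS
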
def run_ulm_experiment():
    """
    Main function to run the ULM
    :return:
    """
    trained_data = user_level_execution_config["trained_data"]
    inference_data = user_level_execution_config["inference_data"]
    model = post_level_execution_config["kwargs"]["model_name"]
    model_path = f"detection/outputs/{trained_data}/{model}/"
    user_level_path = os.path.join(model_path, "user_level")
    create_dir_if_missing(user_level_path)
    if trained_data != inference_data:
        inference_path = os.path.join(user_level_path, inference_data)
    else:
        inference_path = user_level_path
    create_dir_if_missing(inference_path)
    if 'split_by_posts' in os.listdir(inference_path):
        all_posts_probs_df_path = os.path.join(inference_path,
                                               "split_by_posts", "no_text")
    else:
        all_posts_probs_df_path = os.path.join(inference_path,
                                               "user2pred.parquet")

    if not os.path.exists(
            all_posts_probs_df_path
    ):  # for the first time running this data - predict all posts for all users.
        predict_all_users(trained_data, inference_data, inference_path)

    io_path = os.path.join(inference_path, "io")
    create_dir_if_missing(io_path)

    data_conf = user_level_conf[inference_data]
    data_path = data_conf["data_path"]
    user_column = data_conf["user_unique_column"]
    file_ending = data_path.split(".")[-1]
    if file_ending == 'csv':
        sep = ','
    elif file_ending == 'tsv':
        sep = '\t'
    else:
        raise ValueError(f"wrong ending for file {data_path}")
    user_df = pd.read_csv(data_path, sep=sep)
    user_df[user_column] = user_df[user_column].astype(str)

    user_df.to_csv(os.path.join(io_path, "labeled_users_df.tsv"),
                   sep='\t',
                   index=False)
    if os.path.exists(os.path.join(io_path, "inputs.pkl")):
        logger.info(
            f"Inputs-outputs already exists for dataset {inference_data}. Loading them from {io_path}..."
        )
        inputs = pickle.load(open(os.path.join(io_path, "inputs.pkl"), "rb"))
        output = pickle.load(open(os.path.join(io_path, "outputs.pkl"), "rb"))
    else:
        inputs, output = prepare_inputs_outputs(inference_data,
                                                all_posts_probs_df_path,
                                                output_path=user_level_path,
                                                only_inference=False)

    features_to_use = ["self", "followings", "followers", "network"
                       ]  # "self", "followings", "followers", "network"
    test_size = 0.2
    normalize_features = True
    PREDICT_ALL_DATA_USERS = False
    if not PREDICT_ALL_DATA_USERS:
        result = pd.DataFrame(columns=features_to_use + ["model"] + [
            metric.__name__ for metric in
            [f1_score, accuracy_score, recall_score, precision_score, auc]
        ])
        # logger.info(run_user_model(inputs, output, features_to_use=["self", "followings"], output_path=user_level_path,
        #                model_type="nn", normalize_features=True))

        for model_type in ["nn"]:  # ,"lr", "catboost", "lightgbm", "xgboost",
            logger.info(f"Executing {model_type} model...")
            logger.info(model_type)
            for r in range(1, len(features_to_use) + 1):
                for fc in combinations(features_to_use, r):
                    res_row = run_user_model(
                        inputs,
                        output,
                        features_to_use=fc,
                        output_path=user_level_path,
                        model_type=model_type,
                        normalize_features=normalize_features,
                        test_size=test_size)
                    result = result.append(res_row, ignore_index=True)

        result.to_csv(os.path.join(
            user_level_path,
            f"user_level_results__{int(test_size * 100)}_test.tsv"),
                      sep='\t',
                      index=False)
    else:
        if not os.path.exists(
                os.path.join(user_level_path, "best_user_model.model")
        ):  # best model doesn't exist - search for it

            result = pd.DataFrame(columns=features_to_use + ["model"] + [
                metric.__name__ for metric in
                [f1_score, accuracy_score, recall_score, precision_score, auc]
            ])
            # logger.info(run_user_model(inputs, output, features_to_use=["self", "followings"], output_path=user_level_path,
            #                model_type="nn", normalize_features=True))

            for model_type in ["lr", "catboost", "lightgbm", "xgboost",
                               "nn"]:  # ,
                logger.info(f"Executing {model_type} model...")
                logger.info(model_type)
                for r in range(1, len(features_to_use) + 1):
                    for fc in combinations(features_to_use, r):
                        res_row = run_user_model(
                            inputs,
                            output,
                            features_to_use=fc,
                            output_path=user_level_path,
                            model_type=model_type,
                            normalize_features=normalize_features,
                            test_size=test_size)
                        result = result.append(res_row, ignore_index=True)

            result.to_csv(os.path.join(
                user_level_path,
                f"user_level_results__{int(test_size*100)}_test_power_transformed.tsv"
            ),
                          sep='\t',
                          index=False)
            sorted_f1_results = result.sort_values(
                'f1_score', ascending=False).reset_index(drop=True)
            best_row = sorted_f1_results.iloc[0]
            best_model = best_row["model"]
            best_features = features_to_use.copy()
            if not best_row["self"]:
                best_features.remove('self')
            if not best_row["followings"]:
                best_features.remove('followings')
            if not best_row["followers"]:
                best_features.remove('followers')
            if not best_row["network"]:
                best_features.remove('network')
            # fit best performing model on all data and save it.
            run_user_model(inputs,
                           output,
                           features_to_use=best_features,
                           output_path=user_level_path,
                           model_type=best_model,
                           normalize_features=normalize_features,
                           test_size=None)
        else:
            models_results = pd.read_csv(os.path.join(
                user_level_path,
                f"user_level_results__{int(test_size*100)}_test_power_transformed.tsv"
            ),
                                         sep='\t')
            sorted_f1_results = models_results.sort_values(
                'f1_score', ascending=False).reset_index(drop=True)
            best_row = sorted_f1_results.iloc[0]
            best_model = best_row["model"]
            best_features = features_to_use.copy()
            if not best_row["self"]:
                best_features.remove('self')
            if not best_row["followings"]:
                best_features.remove('followings')
            if not best_row["followers"]:
                best_features.remove('followers')
            if not best_row["network"]:
                best_features.remove('network')
        if os.path.exists(os.path.join(io_path, "only_inputs.pkl")):
            logger.info(
                f"only_inputs already exists for dataset {inference_data}. Loading them from {io_path}..."
            )
            only_inputs = pickle.load(
                open(os.path.join(io_path, "only_inputs.pkl"), "rb"))
        else:
            only_inputs, _ = prepare_inputs_outputs(
                inference_data,
                all_posts_probs_df_path,
                output_path=user_level_path,
                only_inference=True)

        predict_all_users_labels(only_inputs,
                                 best_features,
                                 model_type=best_model,
                                 model_path=os.path.join(
                                     user_level_path, "best_user_model.model"),
                                 normalize_features=normalize_features,
                                 output_path=user_level_path)
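
# For orientation only: the configuration keys read by run_ulm_experiment (and, further below,
# by predict_all_users). The values here are placeholders; the real configs live in the repo's
# config module.
_example_user_level_execution_config = {
    "trained_data": "<dataset the post-level model was trained on>",
    "inference_data": "<dataset to run user-level inference on>",
}
_example_post_level_execution_config = {
    "kwargs": {
        "model_name": "<post-level model name>",
        # "labels" and "labels_interpretation" are filled in by predict_all_users at runtime
    },
}
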
def predict_all_users(trained_dataset_name, inference_dataset_name,
                      user_level_path):
    """
    Run the trained post-level model over every post of every user in the inference dataset and
    store the per-post HS probabilities under user_level_path, in chunks, with and without text.
    """
    logger.info(
        f"predicting all users for dataset: {inference_dataset_name} using models trained on {trained_dataset_name} data"
    )
    post_level_data_conf = post_level_conf[trained_dataset_name]
    labels = post_level_data_conf["labels"]
    labels_interpretation = post_level_data_conf["labels_interpretation"]
    post_level_execution_config["kwargs"]["labels"] = labels
    post_level_execution_config["kwargs"][
        "labels_interpretation"] = labels_interpretation

    # load preprocessor and model
    pt, post_model = load_post_model(**post_level_execution_config)
    # get posts per user
    # if os.path.exists(os.path.join(user_level_path, "X_test.pkl")):
    #     X_test = pickle.load(open(os.path.join(user_level_path, "X_test.pkl"), "rb"))
    # else:
    if not os.path.exists(
            os.path.join(user_level_path, "all_users_tweets.parquet")):
        logger.info(
            f"reading all posts of {inference_dataset_name} dataset...")
        posts_per_user_dict = pickle.load(
            open(
                user_level_conf[inference_dataset_name]["posts_per_user_path"],
                "rb"))
        for k, v in posts_per_user_dict.items():
            # omit empty posts (posts are either plain strings or (post_id, text) tuples)
            if len(v) > 0 and isinstance(v[0], (tuple, list)):
                posts_per_user_dict[k] = [p for p in v if p[1].strip() != '']
            else:
                posts_per_user_dict[k] = [p.strip() for p in v if p.strip() != '']
        logger.info(
            f"finished reading all posts of {inference_dataset_name} dataset")

        # posts stored as (post_id, text) tuples vs. plain text strings
        if len(posts_per_user_dict[list(
                posts_per_user_dict.keys())[0]][0]) == 2:
            full_df = pd.DataFrame(columns=['user_id', 'post_id', 'text'])
            for user_id, user_posts in tqdm(posts_per_user_dict.items()):
                user_id = str(user_id)
                current_user_df = pd.DataFrame({
                    'user_id': [user_id for _ in range(len(user_posts))],
                    'post_id':
                    [user_post_tup[0] for user_post_tup in user_posts],
                    'text': [
                        user_post_tup[1].strip()
                        for user_post_tup in user_posts
                        if user_post_tup[1].strip() != ''
                    ],
                })
                full_df = full_df.append(current_user_df, ignore_index=True)
        else:
            full_df = pd.DataFrame(columns=['user_id', 'text'])
            for user_id, user_posts in tqdm(posts_per_user_dict.items()):
                user_id = str(user_id)
                user_posts = [p.strip() for p in user_posts if p.strip() != '']
                current_user_df = pd.DataFrame({
                    'user_id': [user_id for _ in range(len(user_posts))],
                    'text':
                    user_posts,
                })
                full_df = full_df.append(current_user_df, ignore_index=True)
        full_df.to_parquet(os.path.join(user_level_path,
                                        "all_users_tweets.parquet"),
                           index=False)
    else:  # tweets per user df already exists
        logger.info("reading all_users_tweets.parquet file...")

        full_df = pd.read_parquet(
            os.path.join(user_level_path, "all_users_tweets.parquet"))
        logger.info(f"full_df shape: {full_df.shape}")
        full_df = full_df[full_df["text"].apply(
            lambda t: t.strip() != "")].reset_index(drop=True)
        # first_10000_users = list(full_df["user_id"].unique())[:10000]
        # full_df = full_df[full_df["user_id"].isin(first_10000_users)]
        logger.info(
            f"full_df shape after removing empty posts: {full_df.shape}")
        logger.info("file read.")

    # SPLITTING TO CHUNKS BY TWEETS
    chunk_size = 1000000
    logger.info(f"preprocessing in chunks of {chunk_size}...")
    logger.info(f"Length of full_df: {len(full_df)}")
    for user_range in range(0, len(full_df), chunk_size):
        current_full_df = full_df.loc[user_range:user_range + chunk_size - 1]
        current_X = current_full_df["text"]

        _, X_test, _, _, _, _ = pt.full_preprocessing(current_X,
                                                      None,
                                                      mode='test')
        logger.info(
            f"predicting users tweets; indices: {user_range} to {user_range+chunk_size-1}..."
        )
        y_proba = post_model.predict_proba(X_test)
        current_full_df.loc[:, 'predictions'] = y_proba

        create_dir_if_missing(os.path.join(user_level_path, "split_by_posts"))
        create_dir_if_missing(
            os.path.join(user_level_path, "split_by_posts", "no_text"))
        create_dir_if_missing(
            os.path.join(user_level_path, "split_by_posts", "with_text"))
        logger.info(f"saving predictions to {user_level_path}")

        current_full_df[['user_id', 'predictions']].to_parquet(os.path.join(
            user_level_path, "split_by_posts", "no_text",
            f"user2pred_min_idx_{user_range}_max_idx_{user_range+chunk_size-1}.parquet"
        ),
                                                               index=False)

        current_full_df.to_parquet(os.path.join(
            user_level_path, "split_by_posts", "with_text",
            f"user2pred_with_text_min_idx_{user_range}_max_idx_{user_range+chunk_size-1}.parquet"
        ),
                                   index=False)
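
# Usage note: the chunked files written above (split_by_posts/no_text/*.parquet) are later read
# back as a single DataFrame; pandas can read a whole directory of parquet files in one call,
# which is how prepare_inputs_outputs (below) consumes all_posts_probs_df_path.
def _load_chunked_user2pred(user_level_path: str) -> pd.DataFrame:
    no_text_dir = os.path.join(user_level_path, "split_by_posts", "no_text")
    return pd.read_parquet(no_text_dir)  # columns: user_id, predictions
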
def prepare_inputs_outputs(dataset_name,
                           all_posts_probs_df_path,
                           output_path,
                           only_inference=False):
    """
    predict whether a user is a hatemonger or not by his tweets and his friends' (mentioned by the user and mentioning
     the user) tweets.
    :return:
    """
    logger.info(f"Preparing inputs outputs for {dataset_name} dataset...")
    # user level config
    data_conf = user_level_conf[dataset_name]
    if "data_path" in data_conf.keys():
        data_path = data_conf["data_path"]
        user_column = data_conf["user_unique_column"]
        label_column = data_conf["label_column"]
        labels = data_conf["labels"]
        labels_interpretation = data_conf["labels_interpretation"]
        if only_inference:  # get all users in the network for inference
            user_df = pd.read_csv(
                f"hate_networks/{dataset_name.split('_')[0]}_networks/tsv_data/users.tsv",
                sep="\t")
            if dataset_name == 'gab':
                user_df = user_df.sample(
                    n=10000)  # too many users in gab net... sample 10K
            user_df[user_column] = user_df[user_column].astype(str)
        else:
            file_ending = data_path.split(".")[-1]
            if file_ending == 'csv':
                sep = ','
            elif file_ending == 'tsv':
                sep = '\t'
            else:
                raise ValueError(f"wrong ending for file {data_path}")
            user_df = pd.read_csv(data_path, sep=sep)
            user_df[user_column] = user_df[user_column].astype(str)

    network_dir = f"hate_networks/outputs/{dataset_name.split('_')[0]}_networks/network_data/"
    # load centralities features for each user
    centralities_df = pd.read_csv(
        os.path.join(
            network_dir, "features",
            "centralities_mention_edges_filtered_singletons_filtered.tsv"),
        sep='\t')  # centralities_mention_all_edges_all_nodes.tsv
    centralities_df[user_column] = centralities_df[user_column].astype(str)
    centrality_measurements = [
        col for col in centralities_df.columns if col != 'user_id'
    ]
    min_mention_threshold = 3
    following_fn = data_conf["following_fn"]
    mentions_dict, mentioned_by_dict = get_followers_followees_dicts(
        network_dir, following_fn, min_mention_threshold)

    # take mean of lengths as maximum length of feature array
    # consider average of lengths only from labeled users!
    labeled_mentions_dict = {
        key: val
        for key, val in mentions_dict.items()
        if key in user_df.user_id.tolist()
    }
    labeled_mentioned_by_dict = {
        key: val
        for key, val in mentioned_by_dict.items()
        if key in user_df.user_id.tolist()
    }

    max_followings_num = int(
        np.mean([len(x) for x in labeled_mentions_dict.values()]))
    max_followers_num = int(
        np.mean([len(x) for x in labeled_mentioned_by_dict.values()]))
    if all_posts_probs_df_path.endswith("parquet") or os.path.isdir(
            all_posts_probs_df_path):
        all_users_probs = pd.read_parquet(all_posts_probs_df_path)
    else:
        all_users_probs = pd.read_csv(all_posts_probs_df_path,
                                      sep='\t',
                                      engine='python')
    labeled_users_predictions = all_users_probs[
        all_users_probs['user_id'].isin(user_df.user_id.tolist())]
    max_user_tweets = int(
        labeled_users_predictions.groupby('user_id').size().reset_index()
        [0].mean())
    self_input = []
    followings_input = []
    followers_input = []
    network_features_input = []
    outputs = []
    all_users_list = list(all_users_probs["user_id"].unique())

    for idx, row in tqdm(user_df.iterrows()):
        user_id = str(row[user_column])

        if user_id not in centralities_df["user_id"].tolist():
            current_network_features = np.zeros(len(centrality_measurements))
        else:
            current_network_features = []
            for cm in centrality_measurements:
                current_network_features.append(
                    centralities_df.loc[centralities_df["user_id"] == user_id,
                                        cm].iloc[0])
        network_features_input.append(np.array(current_network_features))
        if not only_inference:
            outputs.append(row[label_column])
        avg_followings_predictions = []
        avg_followers_predictions = []
        if user_id in mentions_dict.keys():
            followings = mentions_dict[
                user_id]  # users mentioned by the observed user
            followings = [
                followee for followee in followings
                if followee in all_users_list
            ]
            if len(followings) > 0:
                followings_predictions = all_users_probs.loc[
                    all_users_probs["user_id"].isin(followings)]
                avg_followings_predictions = followings_predictions.groupby(
                    'user_id').agg({'predictions': 'mean'})['predictions']

        if user_id in mentioned_by_dict.keys():
            followers = mentioned_by_dict[
                user_id]  # users mentioning the observed user
            followers = [
                follower for follower in followers
                if follower in all_users_list
            ]
            if len(followers) > 0:
                followers_predictions = all_users_probs.loc[
                    all_users_probs["user_id"].isin(followers)]
                avg_followers_predictions = followers_predictions.groupby(
                    'user_id').agg({'predictions': 'mean'})['predictions']

        self_predictions = all_users_probs.loc[all_users_probs["user_id"] ==
                                               user_id, "predictions"]

        # handle followings/followers predictions (average them)
        self_predictions = list(self_predictions)
        avg_followings_predictions = list(avg_followings_predictions)
        avg_followers_predictions = list(avg_followers_predictions)

        # pad shorter prediction lists with zeros (sufficient when the max length is a true maximum)
        if len(self_predictions) < max_user_tweets:
            self_predictions.extend([0.0] *
                                    (max_user_tweets - len(self_predictions)))
        if len(avg_followings_predictions) < max_followings_num:
            avg_followings_predictions.extend(
                [0.0] * (max_followings_num - len(avg_followings_predictions)))
        if len(avg_followers_predictions) < max_followers_num:
            avg_followers_predictions.extend(
                [0.0] * (max_followers_num - len(avg_followers_predictions)))
        # since the "max" lengths are actually averages, truncate users with more predictions than the average
        if len(self_predictions) > max_user_tweets:
            self_predictions = self_predictions[:max_user_tweets]
        if len(avg_followings_predictions) > max_followings_num:
            avg_followings_predictions = avg_followings_predictions[:
                                                                    max_followings_num]
        if len(avg_followers_predictions) > max_followers_num:
            avg_followers_predictions = avg_followers_predictions[:
                                                                  max_followers_num]

        self_input.append(np.array(self_predictions))
        followings_input.append(np.array(avg_followings_predictions))
        followers_input.append(np.array(avg_followers_predictions))

    self_input = np.array(self_input, dtype='float32')
    followings_input = np.array(followings_input, dtype='float32')
    followers_input = np.array(followers_input, dtype='float32')
    network_features_input = np.array(network_features_input, dtype='float32')
    # user_model = build_user_model(max_user_tweets, max_followings_num, max_followers_num, len(centrality_measurements))
    all_inputs = [
        self_input, followings_input, followers_input, network_features_input
    ]

    io_path = os.path.join(output_path, "io")
    create_dir_if_missing(io_path)
    if not only_inference:
        outputs = np.array(outputs)
        pickle.dump(all_inputs, open(os.path.join(io_path, "inputs.pkl"),
                                     "wb"))
        pickle.dump(outputs, open(os.path.join(io_path, "outputs.pkl"), "wb"))
        user_df.to_csv(os.path.join(io_path, "labeled_users_df.tsv"),
                       sep='\t',
                       index=False)
    else:
        pickle.dump(all_inputs,
                    open(os.path.join(io_path, "only_inputs.pkl"), "wb"))
        user_df.to_csv(os.path.join(io_path, "all_users_df.tsv"),
                       sep='\t',
                       index=False)
    return all_inputs, outputs
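
# The padding/truncation applied to each prediction list above, factored out for clarity: every
# (self / followings / followers) prediction vector is brought to a fixed target length, zero-padded
# when shorter and truncated when longer (the target lengths are averages over the labeled users).
def _pad_or_truncate(predictions, target_len: int) -> list:
    predictions = list(predictions)
    if len(predictions) < target_len:
        predictions.extend([0.0] * (target_len - len(predictions)))
    return predictions[:target_len]
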
def run_user_model(X,
                   y,
                   features_to_use,
                   output_path,
                   model_type="nn",
                   normalize_features=True,
                   test_size=0.2):
    """
    Train and evaluate a single user-level model on the selected feature groups. Returns a dict
    row with the feature flags, model type and evaluation metrics (f1/accuracy/recall/precision/auc).
    When test_size is None, the model is fitted on all the data and saved as best_user_model.model
    instead of being evaluated.
    """
    input_features_mapping = {
        "self": 0,
        "followings": 1,
        "followers": 2,
        "network": 3
    }
    relevant_features_idx = [
        v for k, v in input_features_mapping.items() if k in features_to_use
    ]
    res_row = {f: False for f in input_features_mapping.keys()}
    for f in features_to_use:
        res_row[f] = True
    res_row["model"] = model_type
    if test_size is None:
        ## train with all data for best performing configuration.
        X_train, _, y_train, _ = prepare_data_for_modeling(
            X, y, relevant_features_idx, normalize_features, test_size,
            output_path)
    else:
        X_train, X_test, y_train, y_test = prepare_data_for_modeling(
            X, y, relevant_features_idx, normalize_features, test_size,
            output_path)
    if model_type == "nn":
        user_model = build_user_model(X[0].shape[1], X[1].shape[1],
                                      X[2].shape[1], X[3].shape[1],
                                      relevant_features_idx)
        early_stopping = EarlyStopping(monitor='val_loss',
                                       min_delta=0.0001,
                                       patience=10,
                                       verbose=1,
                                       mode='auto',
                                       restore_best_weights=True)
        class_weight = compute_class_weight(class_weight='balanced',
                                            classes=np.unique(y_train),
                                            y=y_train)
        hist = user_model.fit(
            x=X_train,
            y=y_train,
            batch_size=128,
            epochs=60,
            validation_split=0.2,
            verbose=0,
            callbacks=[],
            class_weight={i: class_weight[i]
                          for i in range(2)})
        if test_size is None:
            user_model.save(os.path.join(output_path, "best_user_model.model"),
                            save_format='tf')
    else:
        X_train = np.hstack(X_train)
        if model_type == "catboost":  # requires concatenation in advance
            user_model = CatBoostClassifier()
            user_model.fit(X_train, y_train, verbose=False)
        elif model_type == "lightgbm":
            user_model = lgb.LGBMClassifier()
            user_model.fit(X_train, y_train, verbose=False)
        elif model_type == "lr":
            user_model = LogisticRegressionCV(cv=5, max_iter=10000)
            user_model.fit(X_train, y_train)
        elif model_type == "xgboost":
            user_model = XGBClassifier()
            user_model.fit(X_train, y_train, verbose=False)
        if test_size is None:
            pickle.dump(
                user_model,
                open(os.path.join(output_path, "best_user_model.model"), "wb"))
    # evaluation phase
    if test_size is not None:
        if model_type == "nn":
            y_score = user_model.predict(X_test)
            fp, tp, th = roc_curve(y_true=y_test, y_score=y_score[:, -1])
            res_row[auc.__name__] = auc(fp, tp)

            y_pred = (y_score[:, -1] > 0.5).astype('int32')
            for metric in [
                    f1_score, accuracy_score, recall_score, precision_score
            ]:
                # logger.info(f"{metric.__name__}: {metric(y_test, y_pred):.2f}")
                res_row[metric.__name__] = metric(y_test, y_pred)

            # test_loss, test_acc, test_f1, test_precision, test_recall = user_model.evaluate(X_test, y_test)
            test_loss, test_f1 = user_model.evaluate(X_test, y_test)
            logger.info(f"test loss: {test_loss}")
            # logger.info(f"test acc: {test_acc}")
            logger.info(f"test f1: {test_f1}")
            # logger.info(f"test precision: {test_precision}")
            # logger.info(f"test recall: {test_recall}")
            ## save the model
            # user_model.save_weights(model_weights_file_path)  # save model's weights
            # self.model.save(model_file_path, save_format='tf')  # save full model
            # pickle.dump(self, open(full_model_file_path, "wb"))

            ## save plots of loss and accuracy during training
            training_output_path = os.path.join(output_path, "training")
            create_dir_if_missing(training_output_path)
            plt.figure()
            plt.title('Loss')
            plt.plot(hist.history['loss'], label='train')
            plt.plot(hist.history['val_loss'], label='validation')
            plt.legend()
            loss_fn = os.path.join(training_output_path, "loss_graph.png")
            plt.savefig(loss_fn)
            # plt.figure()
            # plt.title('Accuracy')
            # plt.plot(hist.history['accuracy'], label='train')
            # plt.plot(hist.history['val_accuracy'], label='validation')
            # plt.legend()
            # acc_fn = os.path.join(training_output_path, "acc_graph.png")
            # plt.savefig(acc_fn)
        else:
            X_test = np.hstack(X_test)
            y_score = user_model.predict_proba(X_test)[:, -1]
            fp, tp, th = roc_curve(y_true=y_test, y_score=y_score)
            res_row[auc.__name__] = auc(fp, tp)
            y_pred = user_model.predict(X_test)
            for metric in [
                    f1_score, accuracy_score, recall_score, precision_score
            ]:
                # logger.info(f"{metric.__name__}: {metric(y_test, y_pred):.2f}")
                res_row[metric.__name__] = metric(y_test, y_pred)
    return res_row
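
# Example call (mirroring run_ulm_experiment above): X is the list of four input arrays
# [self_input, followings_input, followers_input, network_features_input] returned by
# prepare_inputs_outputs, and y is the matching labels array.
# res_row = run_user_model(X, y,
#                          features_to_use=("self", "network"),
#                          output_path=user_level_path,
#                          model_type="lr",
#                          normalize_features=True,
#                          test_size=0.2)
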
    def fit(self, X_train, y_train):
        super().fit(X_train, y_train)
        train_output_path = self.paths['train_output']
        model_output_path = self.paths['model_output']

        create_dir_if_missing(train_output_path)
        create_dir_if_missing(model_output_path)

        log_dir = os.path.join(
            train_output_path, 'logs',
            datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S"))
        tensorboard_callback = TensorBoard(log_dir=log_dir)

        epochs = self.kwargs['epochs']
        validation_split = self.kwargs['validation_split']
        model_weights_file_path = os.path.join(model_output_path,
                                               "weights_best.h5")
        # model_file_path = os.path.join(model_output_path, "model.h5")
        # full_model_file_path = os.path.join(model_output_path, "full_model.pkl")
        checkpoint = ModelCheckpoint(model_weights_file_path,
                                     monitor='val_loss',
                                     verbose=2,
                                     save_best_only=True,
                                     mode='min')

        if self.name == 'BertAttentionLSTM':
            reduce_lr = self.create_learning_rate_scheduler(
                max_learn_rate=1e-5,
                end_learn_rate=1e-7,
                warmup_epoch_count=20,
                total_epoch_count=epochs)
        else:
            reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                          factor=0.5,
                                          patience=3,
                                          min_lr=0.0001,
                                          verbose=2)

        early_stopping = EarlyStopping(monitor='val_loss',
                                       min_delta=0.0001,
                                       patience=5,
                                       verbose=2,
                                       mode='auto',
                                       restore_best_weights=True)
        callbacks = [
            reduce_lr, early_stopping
        ]  # not using checkpoint since it's saving the model fully

        # class weights (handling imbalanced data)
        if len(y_train.shape) > 1:
            y_train_for_class_weight = y_train[
                y_train == 1].stack().reset_index().drop(
                    columns=0).set_index('level_0').rename(
                        columns={"level_1": "label"})["label"]
        else:
            y_train_for_class_weight = y_train.copy()
        class_weights = compute_class_weight(
            class_weight="balanced",
            classes=np.unique(y_train_for_class_weight),
            y=list(y_train_for_class_weight))
        train_class_weights = dict(
            zip(np.unique(y_train_for_class_weight), class_weights))
        print(f"Using GPU: {tf.config.list_physical_devices('GPU')}")

        # fit the model
        # with tf.device("/device:GPU:0"):
        hist = self.model.fit(X_train,
                              y_train,
                              batch_size=32,
                              epochs=epochs,
                              validation_split=validation_split,
                              class_weight=train_class_weights,
                              callbacks=callbacks)
        # save the model
        self.model.save_weights(
            model_weights_file_path)  # save model's weights
        # self.model.save(model_file_path, save_format='tf')  # save full model
        # pickle.dump(self, open(full_model_file_path, "wb"))

        # save plots of loss and accuracy during training
        hist_path = os.path.join(train_output_path, "history.pkl")
        with open(hist_path, "wb") as fout:
            pickle.dump(hist.history, fout)

        plt.figure()
        plt.title('Loss per epoch')
        plt.plot(hist.history['loss'], label='train')
        plt.plot(hist.history['val_loss'], label='validation')
        plt.legend()
        loss_fn = os.path.join(train_output_path, "loss_graph.png")
        plt.savefig(loss_fn)
        if 'accuracy' in hist.history.keys():

            plt.figure()
            plt.title('Accuracy per epoch')
            plt.plot(hist.history['accuracy'], label='train')
            plt.plot(hist.history['val_accuracy'], label='validation')
            plt.legend()
            acc_fn = os.path.join(train_output_path, "acc_graph.png")
            plt.savefig(acc_fn)
        if 'f1_m' in hist.history.keys():
            plt.figure()
            plt.title('F1-score per epoch')
            plt.plot(hist.history['f1_m'], label='train')
            plt.plot(hist.history['val_f1_m'], label='validation')
            plt.legend()
            f1_fn = os.path.join(train_output_path, "f1_graph.png")
            plt.savefig(f1_fn)
        plt.figure()
        plt.title('Learning rate per epoch')
        plt.plot(hist.history['lr'], label='lr')
        plt.legend()
        lr_fn = os.path.join(train_output_path, "lr_graph.png")
        plt.savefig(lr_fn)
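
    # NOTE: `create_learning_rate_scheduler` is defined elsewhere on this class. Below is a
    # hypothetical sketch of the warmup-then-exponential-decay schedule such a helper typically
    # implements, matching the keyword arguments used in fit() above; it is not necessarily the
    # repo's exact implementation.
    @staticmethod
    def _create_learning_rate_scheduler_sketch(max_learn_rate=1e-5,
                                               end_learn_rate=1e-7,
                                               warmup_epoch_count=20,
                                               total_epoch_count=90):
        import math

        from tensorflow.keras.callbacks import LearningRateScheduler

        def lr_scheduler(epoch):
            if epoch < warmup_epoch_count:
                # linear warmup from ~0 up to max_learn_rate
                return (max_learn_rate / warmup_epoch_count) * (epoch + 1)
            # exponential decay from max_learn_rate down to end_learn_rate at the final epoch
            decay = math.log(end_learn_rate / max_learn_rate) / (total_epoch_count - warmup_epoch_count)
            return max_learn_rate * math.exp(decay * (epoch - warmup_epoch_count + 1))

        return LearningRateScheduler(lr_scheduler, verbose=1)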