def calc_corr(week):
    filename = "DataSet/nmf/nmf-for-week-{}.pkl".format(week)

    # NMF-> compute pairwise -> find clusters
    users, values = read_nmf_data(filename)
    data = compute_coor(values)  # returns the correlation matrix
    return data, users
def calc_corr(week):
    filename = "DataSet/nmf/nmf-for-week-{}.pkl".format(week)

    # NMF-> compute pairwise -> find clusters
    users, values = read_nmf_data(filename)
    data = compute_coor(values)  # returns the correlation matrix

    users_dict = {u: idx for idx, u in enumerate(users)}  # user -> row index lookup
    return data, users, users_dict
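
# Both calc_corr variants above rely on helpers that are not part of this
# listing. A minimal sketch of what they might look like, assuming the pickle
# holds a {user: nmf_topic_vector} dict (the file layout is an assumption):
import pickle
import pandas as pd

def read_nmf_data(filename):
    # assumed layout: dict mapping user id -> per-topic NMF weight vector
    with open(filename, "rb") as f:
        data = pickle.load(f)
    users = list(data.keys())
    values = [data[u] for u in users]
    return users, values

def compute_coor(values):
    # pairwise Pearson correlation between users' NMF vectors, mirroring
    # the df.corr(method='pearson') call in the pyvis script below
    df = pd.DataFrame(values).transpose()  # one column per user
    return df.corr(method="pearson")
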
from pyvis.network import Network
import pandas as pd
from pairwise_corr import read_nmf_data

#ref: https://pyvis.readthedocs.io/en/latest/tutorial.html

# #dummy data
# users=["u1","u2","u3","u4"]
# coor_matrix=[[1,1,0,0],
#       [1,1,0,0],
#       [0,0,1,1],
#       [0,0,1,1]]

# real data
filename = "/home/smollfish/Desktop/CSE573_Twitter_bot_detection/DataSet_filtered/nmf/nmf-for-week-1.pkl"
users, values = read_nmf_data(filename)
df = pd.DataFrame(values).transpose()
coor_matrix = df.corr(method='pearson')
coor_matrix[coor_matrix < .995] = 0  # keep only near-perfect correlations (>= 0.995)

net = Network()
net.add_nodes(users, label=users)

for i in range(len(coor_matrix)):
    for j in range(i + 1, len(coor_matrix)):  # upper triangle only; skip self-pairs
        if coor_matrix.iloc[i, j] != 0:
            net.add_edge(users[i], users[j], label=str(round(coor_matrix.iloc[i, j], 3)))
net.toggle_physics(True)
net.show("network.html")  # render the interactive graph to an HTML file
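
# One plausible way to read clusters off the thresholded correlation matrix
# built above is as connected components of the same edge graph (networkx is
# used here for illustration and is an assumption, not taken from the source):
import networkx as nx

g = nx.Graph()
g.add_nodes_from(users)
for i in range(len(coor_matrix)):
    for j in range(i + 1, len(coor_matrix)):
        if coor_matrix.iloc[i, j] != 0:
            g.add_edge(users[i], users[j])
clusters = [sorted(c) for c in nx.connected_components(g)]
print("Found {} connected-component clusters".format(len(clusters)))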
from sklearn.cluster import KMeans
import pandas as pd

def run_kmeans(filepath, ith):
    print("Reading...", filepath)
    users, values = read_nmf_data(filepath)

    # optimal_k = find_optimal_k(values)
    # n_cluster = optimal_k.elbow
    optimal_k = best_k_precomputed[ith]
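    # best_k_precomputed is assumed to be a module-level list holding one
    # precomputed elbow value per weekly file (see the find_optimal_k sketch
    # after this function)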

    # cluster data
    n_cluster = optimal_k
    kmeans = KMeans(n_clusters=n_cluster, random_state=0).fit(values)

    # find the users in each cluster + count the data points in each cluster
    clusters_of_users = [[] for k in range(n_cluster)]
    clusters_of_values = [[] for k in range(n_cluster)]  # NMF data corresponding to clusters_of_users
    for i in range(len(kmeans.labels_)):
        clusters_of_users[kmeans.labels_[i]].append(users[i])
        clusters_of_values[kmeans.labels_[i]].append(values[i])
    print("Number of users in each cluster k={:}:".format(n_cluster),
          [len(clusters_of_users[k]) for k in range(n_cluster)])

    # find bots in each cluster
    bots_in_cluster = find_bots(clusters_of_users, users)
    print("Number of Russian bots in each cluster k={:}:".format(n_cluster),
          [len(bots_in_cluster[k]) for k in range(n_cluster)])

    # compute pairwise correlation of clusters
    avg_corr_of_clusters = []
    for cluster in clusters_of_values:
        avg_corr = compute_coor_for_cluster(cluster)
        avg_corr_of_clusters.append(avg_corr)
    for k in range(n_cluster):
        print("Average Correlation of Cluster {}: {}".format(k, round(avg_corr_of_clusters[k], 2)))

    # drop clusters whose average correlation is below 0.90
    for ix in range(len(avg_corr_of_clusters)):
        if avg_corr_of_clusters[ix] < 0.90:
            clusters_of_users[ix] = []
            clusters_of_values[ix] = []

    # find bots in each cluster after removing low correlation clusters
    bots_in_cluster = find_bots(clusters_of_users, users)
    print("Number of Russian bots in each cluster after removing low correlation clusters\n",
          [len(bots_in_cluster[k]) for k in range(n_cluster)])

    # calculate precision of each cluster
    # precision 1: russian bots in each cluster/ total users in cluster
    precision_of_clusters = []
    for k in range(n_cluster):
        if len(clusters_of_users[k]) != 0:
            precision_of_clusters.append(len(bots_in_cluster[k]) / len(clusters_of_users[k]))
    if precision_of_clusters:
        average_precision = sum(precision_of_clusters) / len(precision_of_clusters)
    else:
        average_precision = 0

    print("Average Cluster Precision:", average_precision)

    # metric 2 (recall): detected Russian bots / total Russian bots this week
    botnames = pd.read_csv("/home/smollfish/Desktop/CSE573_Twitter_bot_detection/DataSet/botnames.csv")
    week_bots = set(botnames["BotName"].tolist()).intersection(set(users))
    calculated_bots = 0
    for cluster in bots_in_cluster:
        calculated_bots = calculated_bots + len(cluster)

    print(
        "Total Russian bots detected: {}, Total Russian bots in the week: {}".format(calculated_bots, len(week_bots)))
    print("Russian bot recall:", calculated_bots / len(week_bots))
    print()
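
# run_kmeans calls several helpers that are not in this listing. A minimal
# sketch of versions consistent with how they are used above; the kneed
# library for find_optimal_k and the "botnames.csv" default path are
# assumptions, not taken from the source:
import numpy as np

def compute_coor_for_cluster(cluster_values):
    # average pairwise Pearson correlation of the NMF vectors in one cluster
    if len(cluster_values) < 2:
        return 0.0
    corr = pd.DataFrame(cluster_values).transpose().corr(method="pearson").values
    upper = corr[np.triu_indices(len(cluster_values), k=1)]  # skip the diagonal of 1s
    return float(upper.mean())

def find_bots(clusters_of_users, users, botnames_csv="botnames.csv"):
    # intersect each cluster with the known Russian-bot handles active this week
    known_bots = set(pd.read_csv(botnames_csv)["BotName"]).intersection(users)
    return [[u for u in cluster if u in known_bots] for cluster in clusters_of_users]

def find_optimal_k(values, k_range=range(2, 12)):
    # elbow method over KMeans inertia; kneed's KneeLocator exposes .elbow,
    # matching the commented-out optimal_k.elbow usage above
    from kneed import KneeLocator
    inertias = [KMeans(n_clusters=k, random_state=0).fit(values).inertia_ for k in k_range]
    return KneeLocator(list(k_range), inertias, curve="convex", direction="decreasing")
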
from sklearn.cluster import SpectralClustering
import pandas as pd

def sc(filepath):
    print("Reading...", filepath)
    users, values = read_nmf_data(filepath)
    coor_matrix = compute_coor_for_specClust(values)

    # find best k
    nb_clusters, eigenvalues, eigenvectors = eigenDecomposition(
        coor_matrix.values)
    n_cluster = min(nb_clusters)

    #cluster
    clustering = SpectralClustering(n_clusters=n_cluster,
                                    affinity="precomputed",
                                    assign_labels="discretize",
                                    random_state=0).fit(coor_matrix)

    # find the users in each cluster + count the data points in each cluster
    clusters_of_users = [[] for k in range(n_cluster)]
    clusters_of_values = [[] for k in range(n_cluster)]  # NMF data corresponding to clusters_of_users
    for i in range(len(clustering.labels_)):
        clusters_of_users[clustering.labels_[i]].append(users[i])
        clusters_of_values[clustering.labels_[i]].append(values[i])
    print("Number of users in each cluster k={:}:".format(n_cluster),
          [len(clusters_of_users[k]) for k in range(n_cluster)])

    # find bots in each cluster
    bots_in_cluster = find_bots(clusters_of_users, users)
    print("Number of Russian bots in each cluster k={:}:".format(n_cluster),
          [len(bots_in_cluster[k]) for k in range(n_cluster)])

    # compute pairwise correlation of clusters
    avg_corr_of_clusters = []
    for cluster in clusters_of_values:
        avg_corr = compute_coor_for_cluster(cluster)
        avg_corr_of_clusters.append(avg_corr)
    for k in range(n_cluster):
        print("Average Correlation of Cluster {}: {}".format(k, round(avg_corr_of_clusters[k], 2)))

    # drop clusters whose average correlation is below 0.90
    for ix in range(len(avg_corr_of_clusters)):
        if avg_corr_of_clusters[ix] < 0.90:
            clusters_of_users[ix] = []
            clusters_of_values[ix] = []

    # find bots in each cluster after removing low correlation clusters
    bots_in_cluster = find_bots(clusters_of_users, users)
    print(
        "Number of Russian bots in each cluster after removing low correlation clusters\n",
        [len(bots_in_cluster[k]) for k in range(n_cluster)])

    # calculate precision of each cluster
    # precision 1: russian bots in each cluster/ total users in cluster
    precision_of_clusters = []
    for k in range(n_cluster):
        if len(clusters_of_users[k]) != 0:
            precision_of_clusters.append(len(bots_in_cluster[k]) / len(clusters_of_users[k]))
    if precision_of_clusters:
        average_precision = sum(precision_of_clusters) / len(precision_of_clusters)
    else:
        average_precision = 0

    print("Average Cluster Precision:", average_precision)

    # metric 2 (recall): detected Russian bots / total Russian bots this week
    botnames = pd.read_csv(
        "/home/smollfish/Desktop/CSE573_Twitter_bot_detection/DataSet/botnames.csv"
    )
    week_bots = set(botnames["BotName"].tolist()).intersection(set(users))
    calculated_bots = 0
    for cluster in bots_in_cluster:
        calculated_bots = calculated_bots + len(cluster)

    print(
        "Total Russian bots detected: {}, Total Russian bots in the week: {}"
        .format(calculated_bots, len(week_bots)))
    print("Russian bot recall:", calculated_bots / len(week_bots))
    print()
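
# sc likewise depends on two helpers not shown here. A hedged sketch: the
# eigengap heuristic below is a standard way to pick k for spectral clustering
# and fits the (nb_clusters, eigenvalues, eigenvectors) return signature, but
# the original implementation may differ:
import numpy as np
from scipy.sparse.csgraph import laplacian

def compute_coor_for_specClust(values):
    # pairwise correlation as an affinity matrix; affinity="precomputed"
    # expects non-negative entries, hence the clip
    corr = pd.DataFrame(values).transpose().corr(method="pearson")
    return corr.clip(lower=0)

def eigenDecomposition(affinity, topK=5):
    # eigengap heuristic on the normalized graph Laplacian: large gaps between
    # consecutive eigenvalues suggest candidate cluster counts
    lap = laplacian(affinity, normed=True)
    eigenvalues, eigenvectors = np.linalg.eigh(lap)
    index_largest_gap = np.argsort(np.diff(eigenvalues))[::-1][:topK]
    nb_clusters = index_largest_gap + 1
    return nb_clusters, eigenvalues, eigenvectors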