Пример #1
0
def visualize_results(save_plot=False):
    """
    Plot the embeddings of all models side by side on one figure.

    Every embedding is loaded from ``ut.embedding_path`` and passed to
    ``visualise_single_embedding`` together with the column it occupies.

    :param save_plot: when truthy, the figure is written to
        ``embedding.png`` (dpi=200) before it is shown
    """
    # (embedding file prefix, name used in the plot titles), in column order
    models = (
        ("gae_first", "gae_first"),
        ("gae_concat", "gae_concat"),
        ("gae_mixed", "gae_mixed"),
        ("gae_l1_sum", "gae_l1_sum"),
        ("matrix_factorization", "MF"),
    )

    # NOTE(review): the grid has 10 columns but only len(models) of them are
    # filled below - confirm whether the extra columns are intended.
    fig, axs = plt.subplots(6, 10)
    fig.set_figheight(16)
    fig.set_figwidth(16)

    for column, (file_prefix, display_name) in enumerate(models):
        embedding = ut.load_numpy_file(ut.embedding_path + file_prefix +
                                       "_embedding.npy")
        visualise_single_embedding(embedding, display_name, column, axs)

    for ax in axs.flat:
        ax.label_outer()
    plt.tight_layout()
    if save_plot:
        plt.savefig('embedding.png', dpi=200)
    plt.show()
Пример #2
0
def cluster_embeddings(embedding_model_name):
    """
    Cluster the embeddings of a model with K-means and with FINCH, and record
    the clustering accuracy, NMI and ARI of each result against
    ``ut.node_labels`` through ``add_score``.

    :param embedding_model_name: the name of the model that generated the
        embeddings (``<name>_embedding.npy`` under ``ut.embedding_path``)
    """
    x = ut.load_numpy_file(ut.embedding_path + embedding_model_name +
                           "_embedding.npy")
    y = ut.node_labels

    def record_scores(prefix, predicted):
        # Score one clustering against the labels and store the three
        # metrics under "<prefix>-acc", "<prefix>-nmi" and "<prefix>-ari".
        arindex = sk.adjusted_rand_score(y, predicted)
        clustering_accuracy = score_clustering_accuracy(y, predicted)
        nmi = sk.normalized_mutual_info_score(y, predicted)

        add_score(embedding_model_name, prefix + '-acc', clustering_accuracy)
        add_score(embedding_model_name, prefix + '-nmi', nmi)
        add_score(embedding_model_name, prefix + '-ari', arindex)

    clusters = KMeans(n_clusters=ut.number_classes).fit(x)
    record_scores('kmeans', clusters.labels_)

    # Request as many clusters as there are classes, matching K-means above
    # (this used to be a hard-coded 7).
    _, _, predicted = FINCH(x, req_clust=ut.number_classes, verbose=False)
    record_scores('finch', predicted)
Пример #3
0
def visualize_Large(model_name, feature):
    """
    Show one large t-SNE scatter plot of a model's embedding, with the points
    colored by the classes of a single label set.

    :param model_name: prefix of the embedding file
        (``<model_name>_embedding.npy`` under ``ut.embedding_path``)
    :param feature: name of the feature whose labels are loaded from
        ``<feature>_lables.npy`` under ``ut.topo_features_labels_path``;
        ``"node"`` or ``"node_labels"`` selects ``ut.node_labels`` instead
    """
    embedding = ut.load_numpy_file(ut.embedding_path + model_name +
                                   "_embedding.npy")
    # reduce dimensionality with TSNE to 2 dimensions
    tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
    tsne_results = tsne.fit_transform(embedding)
    x = tsne_results[:, 0]
    y = tsne_results[:, 1]

    area = np.pi * 3
    dot_colors = [
        "blue", "red", "orange", "green", "yellow", "cyan", "purple", "black",
        "pink"
    ]

    # The previous check compared feature + "_lables" with "node_labels",
    # which can never match, so the node labels were unreachable.
    if feature in ("node", "node_labels"):
        labels = ut.node_labels
    else:
        # "_lables" (sic) is part of the file name and must stay as is
        labels = ut.load_numpy_file(ut.topo_features_labels_path + feature +
                                    "_lables.npy")
    number_classes = len(set(labels))

    # Only the first len(ut.graph.nodes) entries are plotted, as before.
    node_count = len(ut.graph.nodes)
    labels = np.asarray(labels)[:node_count]
    x = x[:node_count]
    y = y[:node_count]

    for c in range(number_classes):
        members = labels == c
        # cycle through the palette so more than len(dot_colors) classes
        # no longer raise an IndexError
        plt.scatter(x[members],
                    y[members],
                    s=area,
                    c=dot_colors[c % len(dot_colors)],
                    alpha=0.5)
    plt.show()
Пример #4
0
def generate_features_labels(bins):
    """
    Divide every stored topological feature into classes (bins).

    The feature arrays are loaded from ``ut.topo_features_path`` and each one
    is passed to ``bin_feature``. The triangles feature is only processed
    when ``ut.is_directed`` is falsy.

    :param bins: The number of bins
    """
    feature_names = [
        "degree",
        "clustering",
        "eigenvector_centrality",
        "betweenness_centrality",
    ]
    if not ut.is_directed:
        feature_names.append("triangles")

    # load every feature first, then bin them in the same order
    loaded = [
        ut.load_numpy_file(ut.topo_features_path + name + ".npy")
        for name in feature_names
    ]
    for name, values in zip(feature_names, loaded):
        bin_feature(values, name, False, bins)
Пример #5
0
def save_topo_features():
    """
    Compute the topological features of ``ut.graph``, save each of them to
    its own ``.npy`` file and save their column-wise concatenation to
    ``topofeatures.npy`` (all under ``ut.topo_features_path``).

    The triangles feature is only computed when ``ut.is_directed`` is falsy.
    The concatenated array is read back and printed.
    """
    graph = ut.graph

    # feature name -> one value per node; insertion order is the column order
    features = {
        "degree":
        np.array(list(dict(nx.degree(graph)).values())),
        "clustering":
        np.array(list(nx.clustering(graph).values())),
        "eigenvector_centrality":
        np.array(list(nx.eigenvector_centrality_numpy(graph).values())),
        "betweenness_centrality":
        np.array(list(nx.betweenness_centrality(graph).values())),
    }
    if not ut.is_directed:
        features["triangles"] = np.array(list(nx.triangles(graph).values()))

    for name, values in features.items():
        np.save(ut.topo_features_path + name + ".npy", values)

    # concatenate the features (in case to be used as entry):
    # every feature becomes one column
    columns = [np.reshape(values, (-1, 1)) for values in features.values()]
    topofeatures = np.concatenate(columns, axis=1)

    combined_path = ut.topo_features_path + "topofeatures.npy"
    np.save(combined_path, topofeatures)
    topofeatures = ut.load_numpy_file(combined_path)

    print("Toplogical Features Calculated:")
    print(topofeatures)
    print("-------------")
Пример #6
0
def calculate_similarity(embedding_model_name):
    """
    Score the embedding of a model with three cluster-quality metrics, using
    ``ut.node_labels`` as the cluster assignment, and record each result
    through ``add_score``:
        -davies_bouldin_score
        -calinski_harabasz
        -silhouette_score

    :param embedding_model_name: the name of the model that generated the
        embeddings (``<name>_embedding.npy`` under ``ut.embedding_path``)
    :return: None
    """
    x = ut.load_numpy_file(ut.embedding_path + embedding_model_name +
                           "_embedding.npy")
    y = ut.node_labels

    # (score key, metric function), evaluated and recorded in this order
    metrics = (
        ('davies_bouldin', sk.davies_bouldin_score),
        ('calinski_harabasz', sk.calinski_harabasz_score),
        ('silhouette_score', sk.silhouette_score),
    )

    # compute everything first so nothing is recorded if a metric fails
    results = [(key, metric(x, y)) for key, metric in metrics]
    for key, value in results:
        add_score(embedding_model_name, key, value)
Пример #7
0
def classify_embeddings(embedding_model_name):
    """
    Cross-validate several classifiers that predict ``ut.node_labels`` from
    the embeddings of a model, and record the mean macro and micro F1 of
    each classifier through ``add_score``.

    :param embedding_model_name: the name of the embedding model
    :return: None
    """
    features = ut.load_numpy_file(ut.embedding_path + embedding_model_name + "_embedding.npy")
    targets = ut.node_labels
    iteration_cap = 200

    # (score-name prefix, estimator), evaluated in this order
    candidates = (
        # logistic regression
        ('lr', LogisticRegression(max_iter=iteration_cap)),
        # svm with a linear kernel
        ('svm_ln', SVC(kernel='linear', C=1, max_iter=iteration_cap)),
        # svm with an rbf kernel
        ('svm_rbf', SVC(kernel='rbf', C=1, max_iter=iteration_cap)),
        # NOTE(review): hidden_layer_sizes=2 is a single hidden layer of two
        # units in scikit-learn - confirm that this is what is wanted.
        ('mlp', MLPClassifier(hidden_layer_sizes=2, activation='relu', solver='adam',
                              max_iter=iteration_cap)),
    )

    for prefix, estimator in candidates:
        cv_result = cross_validate(estimator, features, targets, scoring=["f1_macro", "f1_micro"])
        for average in ('macro', 'micro'):
            add_score(embedding_model_name, prefix + '_f1_' + average,
                      np.mean(cv_result['test_f1_' + average]))
Пример #8
0
def visualise_single_embedding(embedding, model_name, plot_index, axs):
    """
    Visualize the embeddings of a model as one column of scatter plots, one
    row per label set (node labels, then the binned topological features).

    :param embedding: the embedding matrix
    :param model_name: name of the model, used as prefix of the plot titles
    :param plot_index: the index of the sub plot (column)
    :param axs: axs object of the subplot, indexed as ``axs[row, column]``
    :return: None
    """

    # reduce dimensionality with TSNE to 2 dimension
    tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
    tsne_results = tsne.fit_transform(embedding)

    # only the first len(ut.graph.nodes) points are plotted
    node_count = len(ut.graph.nodes)
    x = tsne_results[:, 0][:node_count]
    y = tsne_results[:, 1][:node_count]

    area = np.pi * 3
    dot_colors = [
        "blue", "red", "orange", "green", "yellow", "cyan", "purple", "black",
        "pink"
    ]

    # (label set, title suffix) for each row, top to bottom
    panels = [
        ("node_labels", "label"),
        ("degree_lables", "degree"),
        ("betweenness_centrality_lables", "betweenness"),
        ("clustering_lables", "clustering"),
        ("eigenvector_centrality_lables", "eigenvector"),
        ("triangles_lables", "triangles"),
    ]

    # loop through the label types and plot it
    for row, (label_type, title_suffix) in enumerate(panels):
        ax = axs[row, plot_index]

        if label_type == "triangles_lables" and ut.is_directed:
            # generate_features_labels only bins the triangles for
            # undirected graphs, so there is nothing to load here;
            # leave the panel blank instead of failing on the missing file.
            ax.axis('off')
            continue

        if label_type == "node_labels":
            labels = ut.node_labels
        else:
            labels = ut.load_numpy_file(ut.topo_features_labels_path +
                                        label_type + ".npy")
        number_classes = len(set(labels))
        labels = np.asarray(labels)[:node_count]

        for c in range(number_classes):
            members = labels == c
            # cycle through the palette so more than len(dot_colors)
            # classes no longer raise an IndexError
            ax.scatter(x[members],
                       y[members],
                       s=area,
                       c=dot_colors[c % len(dot_colors)],
                       alpha=0.5)

        # title and frame are per panel, not per class
        ax.set_title(model_name + " " + title_suffix)
        ax.axis('off')
Пример #9
0
def classify_features(embedding_model_name, feature_name):
    """
    Cross-validate several classifiers that predict the binned labels of a
    topological feature from the embeddings of a model, and record the mean
    scores through ``add_score``.

    :param embedding_model_name: the model that generated the embeddings
    :param feature_name: the topo feature that we are classifying; its labels
        are read from ``<feature_name>_lables.npy``
    """
    x = ut.load_numpy_file(ut.embedding_path + embedding_model_name +
                           "_embedding.npy")
    y = ut.load_numpy_file(ut.topo_features_labels_path + feature_name +
                           "_lables.npy")

    max_iter = 200

    # Error metrics. The estimator is a LogisticRegression (not a linear
    # regression); its predictions are scored with MSE and MAE, and the
    # negated scores are turned back into positive values with abs().
    error_metrics = (
        ('r_mse', 'neg_mean_squared_error'),
        ('r_mae', 'neg_mean_absolute_error'),
    )
    error_result = cross_validate(
        LogisticRegression(max_iter=max_iter),
        x,
        y,
        scoring=[scoring for _, scoring in error_metrics])
    for score_name, scoring in error_metrics:
        add_score(embedding_model_name, feature_name, score_name,
                  abs(np.mean(error_result['test_' + scoring])))

    # (score-name prefix, estimator) for the F1 evaluation, in this order
    classifiers = (
        # logistic regression
        ('lr', LogisticRegression(max_iter=max_iter)),
        # SVM linear
        ('svm_ln', SVC(kernel='linear', C=1, max_iter=max_iter)),
        # SVM Kernel RBF
        ('svm_rbf', SVC(kernel='rbf', C=1, max_iter=max_iter)),
        # MLP. NOTE(review): hidden_layer_sizes=2 is a single hidden layer
        # of two units in scikit-learn, not two layers - confirm the intent.
        ('mlp',
         MLPClassifier(hidden_layer_sizes=2,
                       activation='relu',
                       solver='adam',
                       max_iter=max_iter)),
    )
    for prefix, classifier in classifiers:
        f1_result = cross_validate(classifier,
                                   x,
                                   y,
                                   scoring=["f1_macro", "f1_micro"])
        for average in ('macro', 'micro'):
            add_score(embedding_model_name, feature_name,
                      prefix + '_f1_' + average,
                      np.mean(f1_result['test_f1_' + average]))