def visualize_results(save_plot=False):
    """
    Visualize the embeddings of multiple models in a scatter plot.

    :param save_plot: if True, also save the figure as embedding.png
    """
    # 6 rows (one per label type) x 10 columns (one per model; unused columns stay empty)
    fig, axs = plt.subplots(6, 10)
    fig.set_figheight(16)
    fig.set_figwidth(16)

    models = [
        ("gae_first_embedding.npy", "gae_first"),
        ("gae_concat_embedding.npy", "gae_concat"),
        ("gae_mixed_embedding.npy", "gae_mixed"),
        ("gae_l1_sum_embedding.npy", "gae_l1_sum"),
        ("matrix_factorization_embedding.npy", "MF"),
    ]
    for plot_index, (file_name, model_name) in enumerate(models):
        embedding = ut.load_numpy_file(ut.embedding_path + file_name)
        visualise_single_embedding(embedding, model_name, plot_index, axs)

    # hide tick labels on inner subplots
    for ax in axs.flat:
        ax.label_outer()

    plt.tight_layout()
    if save_plot:
        plt.savefig('embedding.png', dpi=200)
    plt.show()
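
# Usage sketch: assumes the five *_embedding.npy files referenced above have
# already been written to ut.embedding_path by their respective models.
# visualize_results(save_plot=True)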
def cluster_embeddings(embedding_model_name):
    """
    Cluster the embeddings of a model using K-means and FINCH.

    :param embedding_model_name: the name of the model that generated the embeddings
    """
    x = ut.load_numpy_file(ut.embedding_path + embedding_model_name + "_embedding.npy")
    y = ut.node_labels

    # K-means with as many clusters as there are ground-truth classes
    clusters = KMeans(n_clusters=ut.number_classes).fit(x)
    predicted = clusters.labels_
    arindex = sk.adjusted_rand_score(y, predicted)
    clustering_accuracy = score_clustering_accuracy(y, predicted)
    nmi = sk.normalized_mutual_info_score(y, predicted)
    add_score(embedding_model_name, 'kmeans-acc', clustering_accuracy)
    add_score(embedding_model_name, 'kmeans-nmi', nmi)
    add_score(embedding_model_name, 'kmeans-ari', arindex)

    # FINCH, requesting the same number of clusters as K-means
    c, num_clust, predicted = FINCH(x, req_clust=ut.number_classes, verbose=False)
    arindex = sk.adjusted_rand_score(y, predicted)
    clustering_accuracy = score_clustering_accuracy(y, predicted)
    nmi = sk.normalized_mutual_info_score(y, predicted)
    add_score(embedding_model_name, 'finch-acc', clustering_accuracy)
    add_score(embedding_model_name, 'finch-nmi', nmi)
    add_score(embedding_model_name, 'finch-ari', arindex)
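
# Usage sketch: run once per embedding model so the K-means and FINCH scores
# accumulate via add_score (model names follow visualize_results and are
# assumptions here).
# for name in ["gae_first", "gae_concat", "gae_mixed", "gae_l1_sum",
#              "matrix_factorization"]:
#     cluster_embeddings(name)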
def visualize_Large(model_name, feature):
    """
    Visualize one large plot for a model and color it according to the bin
    labels of a given topological feature.

    :param model_name: the name of the model that generated the embeddings
    :param feature: the topological feature whose bin labels color the points
    """
    embedding = ut.load_numpy_file(ut.embedding_path + model_name + "_embedding.npy")

    # reduce dimensionality to 2 with t-SNE
    tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
    tsne_results = tsne.fit_transform(embedding)
    x = tsne_results[:, 0]
    y = tsne_results[:, 1]
    area = np.pi * 3
    dot_colors = [
        "blue", "red", "orange", "green", "yellow", "cyan", "purple",
        "black", "pink"
    ]

    # "_lables" spelling matches the filenames written by bin_feature
    labels = ut.load_numpy_file(ut.topo_features_labels_path + feature + "_lables.npy")
    number_classes = len(set(labels))

    # group the t-SNE coordinates by class and plot one color per class
    xc = [[] for _ in range(number_classes)]
    yc = [[] for _ in range(number_classes)]
    for c in range(number_classes):
        for i in range(len(ut.graph.nodes)):
            if labels[i] == c:
                xc[c].append(x[i])
                yc[c].append(y[i])
        plt.scatter(xc[c], yc[c], s=area, c=dot_colors[c], alpha=0.5)
    plt.show()
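
# Usage sketch (hypothetical arguments): one large scatter of a single model's
# embedding, colored by its binned degree labels.
# visualize_Large("gae_first", "degree")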
def generate_features_labels(bins):
    """
    Divide each topological feature into classes (bins).

    :param bins: the number of bins
    """
    degree = ut.load_numpy_file(ut.topo_features_path + "degree.npy")
    clustering = ut.load_numpy_file(ut.topo_features_path + "clustering.npy")
    eigenvector_centrality = ut.load_numpy_file(ut.topo_features_path + "eigenvector_centrality.npy")
    betweenness_centrality = ut.load_numpy_file(ut.topo_features_path + "betweenness_centrality.npy")
    if not ut.is_directed:
        triangles = ut.load_numpy_file(ut.topo_features_path + "triangles.npy")

    bin_feature(degree, "degree", False, bins)
    bin_feature(clustering, "clustering", False, bins)
    bin_feature(eigenvector_centrality, "eigenvector_centrality", False, bins)
    bin_feature(betweenness_centrality, "betweenness_centrality", False, bins)
    if not ut.is_directed:
        bin_feature(triangles, "triangles", False, bins)
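
# Usage sketch: the bin count is an assumption; keep it at or below the nine
# colors available in the plotting helpers.
# generate_features_labels(bins=5)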
def save_topo_features():
    """
    Compute the topological features, save each one to its own file, and save
    them concatenated into one array.
    """
    degree = np.array(list(dict(nx.degree(ut.graph)).values()))
    clustering = np.array(list(nx.clustering(ut.graph).values()))
    eigenvector_centrality = np.array(
        list(nx.eigenvector_centrality_numpy(ut.graph).values()))
    betweenness_centrality = np.array(
        list(nx.betweenness_centrality(ut.graph).values()))
    if not ut.is_directed:
        # triangle counts are only defined for undirected graphs
        triangles = np.array(list(nx.triangles(ut.graph).values()))

    np.save(ut.topo_features_path + "degree.npy", degree)
    np.save(ut.topo_features_path + "clustering.npy", clustering)
    np.save(ut.topo_features_path + "eigenvector_centrality.npy",
            eigenvector_centrality)
    np.save(ut.topo_features_path + "betweenness_centrality.npy",
            betweenness_centrality)
    if not ut.is_directed:
        np.save(ut.topo_features_path + "triangles.npy", triangles)

    # concatenate the features into one (n_nodes, n_features) matrix
    # (in case they are to be used as model input)
    features = [degree, clustering, eigenvector_centrality, betweenness_centrality]
    if not ut.is_directed:
        features.append(triangles)
    topofeatures = np.concatenate([np.reshape(f, (-1, 1)) for f in features], axis=1)
    np.save(ut.topo_features_path + "topofeatures.npy", topofeatures)

    topofeatures = ut.load_numpy_file(ut.topo_features_path + "topofeatures.npy")
    print("Topological Features Calculated:")
    print(topofeatures)
    print("-------------")
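
# Sanity-check sketch: the concatenated matrix has one row per node and four
# columns for directed graphs (five for undirected, which add triangles).
# save_topo_features()
# topo = ut.load_numpy_file(ut.topo_features_path + "topofeatures.npy")
# assert topo.shape[0] == len(ut.graph.nodes)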
def calculate_similarity(embedding_model_name):
    """
    Calculate internal cluster validity scores for the embedding, using the
    ground-truth node labels as the partition:
      - davies_bouldin_score
      - calinski_harabasz_score
      - silhouette_score

    :param embedding_model_name: the name of the model that generated the embeddings
    """
    x = ut.load_numpy_file(ut.embedding_path + embedding_model_name + "_embedding.npy")
    y = ut.node_labels

    davies_bouldin = sk.davies_bouldin_score(x, y)
    calinski_harabasz = sk.calinski_harabasz_score(x, y)
    silhouette_score = sk.silhouette_score(x, y)

    add_score(embedding_model_name, 'davies_bouldin', davies_bouldin)
    add_score(embedding_model_name, 'calinski_harabasz', calinski_harabasz)
    add_score(embedding_model_name, 'silhouette_score', silhouette_score)
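
# Usage sketch (model name is an example): higher calinski_harabasz and
# silhouette values and a lower davies_bouldin value mean the true classes
# are better separated in the embedding space.
# calculate_similarity("gae_first")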
def classify_embeddings(embedding_model_name):
    """
    Classify the node labels from the embeddings of an embedding model.

    :param embedding_model_name: the name of the embedding model
    """
    # load embedding
    x = ut.load_numpy_file(ut.embedding_path + embedding_model_name + "_embedding.npy")
    # load classes
    y = ut.node_labels
    max_iter = 200

    # logistic regression
    lr = LogisticRegression(max_iter=max_iter)
    scores = cross_validate(lr, x, y, scoring=["f1_macro", "f1_micro"])
    add_score(embedding_model_name, 'lr_f1_macro', np.mean(scores['test_f1_macro']))
    add_score(embedding_model_name, 'lr_f1_micro', np.mean(scores['test_f1_micro']))

    # SVM, linear kernel
    svm = SVC(kernel='linear', C=1, max_iter=max_iter)
    scores = cross_validate(svm, x, y, scoring=["f1_macro", "f1_micro"])
    add_score(embedding_model_name, 'svm_ln_f1_macro', np.mean(scores['test_f1_macro']))
    add_score(embedding_model_name, 'svm_ln_f1_micro', np.mean(scores['test_f1_micro']))

    # SVM, RBF kernel
    svm = SVC(kernel='rbf', C=1, max_iter=max_iter)
    scores = cross_validate(svm, x, y, scoring=["f1_macro", "f1_micro"])
    add_score(embedding_model_name, 'svm_rbf_f1_macro', np.mean(scores['test_f1_macro']))
    add_score(embedding_model_name, 'svm_rbf_f1_micro', np.mean(scores['test_f1_micro']))

    # MLP with a single hidden layer of 2 units
    mlp = MLPClassifier(hidden_layer_sizes=2, activation='relu', solver='adam',
                        max_iter=max_iter)
    scores = cross_validate(mlp, x, y, scoring=["f1_macro", "f1_micro"])
    add_score(embedding_model_name, 'mlp_f1_macro', np.mean(scores['test_f1_macro']))
    add_score(embedding_model_name, 'mlp_f1_micro', np.mean(scores['test_f1_micro']))
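
# Usage sketch: cross_validate defaults to 5-fold cross-validation, so each
# reported score is a mean over five held-out folds.
# classify_embeddings("gae_first")  # model name is an example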
def visualise_single_embedding(embedding, model_name, plot_index, axs):
    """
    Visualize the embeddings of a model, one subplot row per label type.

    :param embedding: the embedding matrix
    :param model_name: the name of the model (used in the subplot titles)
    :param plot_index: the index of the subplot (column)
    :param axs: axs object of the subplot
    """
    # reduce dimensionality to 2 with t-SNE
    tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
    tsne_results = tsne.fit_transform(embedding)
    x = tsne_results[:, 0]
    y = tsne_results[:, 1]
    area = np.pi * 3
    # dot_colors = list(mcolors.CSS4_COLORS)
    # random.shuffle(dot_colors)
    dot_colors = [
        "blue", "red", "orange", "green", "yellow", "cyan", "purple",
        "black", "pink"
    ]
    # "_lables" spelling matches the filenames written by bin_feature
    label_types = [
        "node_labels", "degree_lables", "betweenness_centrality_lables",
        "clustering_lables", "eigenvector_centrality_lables",
        "triangles_lables"
    ]
    titles = [
        model_name + " label", model_name + " degree",
        model_name + " betweenness", model_name + " clustering",
        model_name + " eigenvector", model_name + " triangles"
    ]

    # loop through the label types and plot one row per label type
    for plot_secondary_index, label_type in enumerate(label_types):
        if label_type == "node_labels":
            labels = ut.node_labels
        else:
            labels = ut.load_numpy_file(ut.topo_features_labels_path + label_type + ".npy")
        number_classes = len(set(labels))

        # group the t-SNE coordinates by class and plot one color per class
        xc = [[] for _ in range(number_classes)]
        yc = [[] for _ in range(number_classes)]
        ax = axs[plot_secondary_index, plot_index]
        for c in range(number_classes):
            for i in range(len(ut.graph.nodes)):
                if labels[i] == c:
                    xc[c].append(x[i])
                    yc[c].append(y[i])
            ax.scatter(xc[c], yc[c], s=area, c=dot_colors[c], alpha=0.5)
        ax.set_title(titles[plot_secondary_index])
        ax.axis('off')
def classify_features(embedding_model_name, feature_name):
    """
    Classify the topological feature labels using the model embeddings.

    :param embedding_model_name: the model that generated the embeddings
    :param feature_name: the topological feature being classified
    """
    x = ut.load_numpy_file(ut.embedding_path + embedding_model_name + "_embedding.npy")
    y = ut.load_numpy_file(ut.topo_features_labels_path + feature_name + "_lables.npy")
    max_iter = 200

    # logistic regression scored with regression metrics (MSE/MAE on the bin indices)
    r = LogisticRegression(max_iter=max_iter)
    scores = cross_validate(r, x, y, scoring=[
        "neg_mean_squared_error",
        "neg_mean_absolute_error",
    ])
    add_score(embedding_model_name, feature_name, 'r_mse',
              abs(np.mean(scores['test_neg_mean_squared_error'])))
    add_score(embedding_model_name, feature_name, 'r_mae',
              abs(np.mean(scores['test_neg_mean_absolute_error'])))

    # logistic regression
    lr = LogisticRegression(max_iter=max_iter)
    scores = cross_validate(lr, x, y, scoring=["f1_macro", "f1_micro"])
    add_score(embedding_model_name, feature_name, 'lr_f1_macro',
              np.mean(scores['test_f1_macro']))
    add_score(embedding_model_name, feature_name, 'lr_f1_micro',
              np.mean(scores['test_f1_micro']))

    # SVM, linear kernel
    svm = SVC(kernel='linear', C=1, max_iter=max_iter)
    scores = cross_validate(svm, x, y, scoring=["f1_macro", "f1_micro"])
    add_score(embedding_model_name, feature_name, 'svm_ln_f1_macro',
              np.mean(scores['test_f1_macro']))
    add_score(embedding_model_name, feature_name, 'svm_ln_f1_micro',
              np.mean(scores['test_f1_micro']))

    # SVM, RBF kernel
    svm = SVC(kernel='rbf', C=1, max_iter=max_iter)
    scores = cross_validate(svm, x, y, scoring=["f1_macro", "f1_micro"])
    add_score(embedding_model_name, feature_name, 'svm_rbf_f1_macro',
              np.mean(scores['test_f1_macro']))
    add_score(embedding_model_name, feature_name, 'svm_rbf_f1_micro',
              np.mean(scores['test_f1_micro']))

    # MLP with a single hidden layer of 2 units
    mlp = MLPClassifier(hidden_layer_sizes=2, activation='relu', solver='adam',
                        max_iter=max_iter)
    scores = cross_validate(mlp, x, y, scoring=["f1_macro", "f1_micro"])
    add_score(embedding_model_name, feature_name, 'mlp_f1_macro',
              np.mean(scores['test_f1_macro']))
    add_score(embedding_model_name, feature_name, 'mlp_f1_micro',
              np.mean(scores['test_f1_micro']))
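
# Usage sketch: probe how much of each topological feature an embedding
# retains (feature names follow save_topo_features; triangles exist only for
# undirected graphs).
# for feature in ["degree", "clustering", "eigenvector_centrality",
#                 "betweenness_centrality"]:
#     classify_features("gae_first", feature)  # model name is an example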