def perform_clustering(seed, m_data, labels, n_clusters):
    """Cluster two data views with singleview and multiview spherical k-means.

    Fits spherical k-means on each view alone, on the concatenated views,
    and finally with multiview spherical k-means, printing the NMI of each
    clustering against the true class labels.

    Parameters
    ----------
    seed : int
        Random state for reproducible initialisation.
    m_data : list of array-like
        The two data views, ``m_data[0]`` and ``m_data[1]``.
    labels : array-like
        Ground-truth class labels used only for NMI scoring.
    n_clusters : int
        Number of clusters to fit.

    Returns
    -------
    array-like
        Cluster assignments produced by the multiview model.
    """
    single_km = SphericalKMeans(n_clusters=n_clusters, random_state=seed,
                                n_init=100)

    # Singleview baselines: each view on its own.
    view1_assignments = single_km.fit_predict(m_data[0])
    view2_assignments = single_km.fit_predict(m_data[1])

    # Naive fusion baseline: stack the views side by side into one matrix.
    stacked = np.hstack(m_data)
    stacked_assignments = single_km.fit_predict(stacked)

    print('Singleview View 1 NMI Score: {0:.3f}\n'.format(
        nmi_score(labels, view1_assignments)))
    print('Singleview View 2 NMI Score: {0:.3f}\n'.format(
        nmi_score(labels, view2_assignments)))
    print('Singleview Concatenated NMI Score: {0:.3f}\n'.format(
        nmi_score(labels, stacked_assignments)))

    # Multiview spherical k-means over both views jointly.
    multi_km = MultiviewSphericalKMeans(n_clusters=n_clusters, n_init=100,
                                        random_state=seed)
    multi_assignments = multi_km.fit_predict(m_data)
    print('Multiview NMI Score: {0:.3f}\n'.format(
        nmi_score(labels, multi_assignments)))

    return multi_assignments
def Silhouette(X, seguradora):
    """Pick the cluster count with the best average silhouette score.

    Runs spherical k-means on ``X`` for every cluster count from 2 up to
    ``min(len(X), 11) - 1``, saves a plot of the mean silhouette per count
    to ``../analytics/<label>/<label>_silhuette.png``, and returns the
    count with the highest mean silhouette.

    Parameters
    ----------
    X : array-like
        Samples to cluster.
    seguradora : object
        Insurer identifier, resolved to a display label via
        ``dbm.GetAccountLabel``.

    Returns
    -------
    int or None
        Cluster count with the maximum mean silhouette score, or ``None``
        when there are fewer than 3 samples (no candidate counts).
    """
    insurance_label = dbm.GetAccountLabel(seguradora)

    # Never try more clusters than samples; cap the search at 10 clusters.
    maxx = min(len(X), 11)

    clusters_silhouette = dict()
    for n_clusters in range(2, maxx):
        # Fixed random_state for reproducibility across runs.
        clusterer = SKMeans(n_clusters=n_clusters, random_state=0)
        cluster_labels = clusterer.fit_predict(X)
        # Mean silhouette over all samples: a single number reflecting the
        # density and separation of the resulting clusters.
        # (The per-sample silhouette_samples() call of the original was
        # removed: its result was never used.)
        clusters_silhouette[n_clusters] = silhouette_score(X, cluster_labels)

    plt.title('Silhueta media de %s' % insurance_label)
    plt.xlabel('Numero de clusters', fontsize=16)
    plt.ylabel("Silhueta media", fontsize=16)
    # Materialise the dict views: matplotlib cannot plot dict_keys/values.
    plt.plot(list(clusters_silhouette.keys()),
             list(clusters_silhouette.values()))
    plt.savefig("../analytics/%s/%s_silhuette.png"
                % (insurance_label, insurance_label))
    plt.close()

    if not clusters_silhouette:
        # Matches the original's implicit None when the range was empty.
        return None
    # BUG FIX: the original iterated dict.iteritems(), which does not exist
    # in Python 3 (AttributeError).  Return the argmax directly instead.
    return max(clusters_silhouette, key=clusters_silhouette.get)
meta_file = "maggot_models/data/processed/2019-09-18-v2/BP_metadata.csv" meta_df = pd.read_csv(meta_file, index_col=0) print(meta_df.head()) class_labels = meta_df.loc[nodelist.astype(int), "BP_Class"].values class_labels[class_labels == "LH2N"] = "Other" uni_class, class_counts = np.unique(class_labels, return_counts=True) inds = np.argsort(class_counts)[::-1] uni_class = uni_class[inds] class_counts = class_counts[inds] n_clusters = 12 for k in range(2, n_clusters): skmeans = SphericalKMeans(n_clusters=k, **skmeans_kws) pred_labels = skmeans.fit_predict(latent) pred_labels = relabel(pred_labels) models.append(skmeans) # gridplot( # [adj], inner_hier_labels=pred_labels, hier_label_fontsize=18, sizes=(2, 10) # ) fig, ax = plt.subplots(1, 2, figsize=(30, 18)) heatmap( binarize(adj), inner_hier_labels=pred_labels, # outer_hier_labels=side_labels, hier_label_fontsize=18, ax=ax[0], cbar=False, sort_nodes=True,
# The model was trained once with the commented call below and is now
# loaded from disk.
'''
model = gensim.models.Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
model.save('/home/sdp/Downloads/Movie/mymodel')
'''
new_model = gensim.models.Word2Vec.load('/home/sdp/Downloads/Movie/mymodel')
word_vectors = new_model.wv.syn0

num_clusters = 10

# Cluster the word vectors with spherical k-means (cosine geometry).
kmeans_clustering = SphericalKMeans(n_clusters=num_clusters)
idx = kmeans_clustering.fit(word_vectors)
# BUG FIX: the original called fit() and then a second, independent
# fit_predict(), re-fitting with a different random initialisation — so
# the centroids written below need not correspond to the labels in idx2.
# Take the labels from the same fit as the centroids instead.
idx2 = idx.labels_

# Save the word2vec centroids: first the number of centroids, then the
# vector dimensionality, then every component, one value per line.
word_centroids = idx.cluster_centers_
with open("wordCentroids.txt", "w") as centroid_file:
    centroid_file.write(str(len(word_centroids)) + "\n")
    centroid_file.write(str(len(word_centroids[0])) + "\n")
    for cen in word_centroids:
        for component in cen:
            centroid_file.write(str(component) + "\n")

# NOTE(review): `file` shadows the builtin and is presumably written to and
# closed further down the script — kept open here so later code still works.
file = open("vecSummary.txt", "w")
# Map each vocabulary word to the id of the cluster it landed in.
word_centroid_map = dict(zip(new_model.wv.index2word, idx2))
document_vectors = list()
def run(self, X, n_clusters, distance):
    """Cluster ``X`` into ``n_clusters`` groups with spherical k-means.

    ``distance`` is accepted for interface compatibility but is unused:
    spherical k-means is inherently cosine-based.
    """
    return SphericalKMeans(n_clusters).fit_predict(X)
def spherical_clustering(X, n_clusters):
    """Return spherical k-means cluster labels for the samples in ``X``."""
    model = SphericalKMeans(n_clusters)
    assignments = model.fit_predict(X)
    return assignments