def get_result_clustering_by_ap(self, text_list):
    cluster = Cluster()
    Xn, text_vector_list = cluster.get_text_data(text_list)
    # Pairwise similarity matrix plus its max/min/median values.
    x_sims, max_sim, min_sim, mid_sim = cluster.get_data_sims(Xn)
    print('x_sims:', x_sims)
    text_clustering_list = []
    if min_sim > max_clusters_sim:
        # Every pair is already similar enough: put all texts in one cluster.
        for i in range(len(text_vector_list)):
            text_id = text_vector_list[i].get("text_id")
            text_clustering_list.append({
                "text_id": text_id,
                "class_num": "0"
            })
    else:
        ap = AP(
            damping=apdampling,
            max_iter=1000,
            convergence_iter=100,
            preference=mid_sim,  # how similar items must be to fall into one cluster
            affinity="precomputed").fit(x_sims)
        # labels: the clustering result, one label per text
        labels = ap.labels_
        monitor_set = set()
        if -1 not in labels:
            for i in range(len(labels)):
                monitor_set.add(labels[i])
                class_num = str(labels[i])
                text_id = text_vector_list[i].get("text_id")
                text_clustering_list.append({
                    "text_id": text_id,
                    "class_num": class_num
                })
            return text_clustering_list
        else:
            # The first fit did not converge; retry with the default damping.
            adapt_ap_damping = 0.5
            ap = AP(max_iter=1000,
                    convergence_iter=100,
                    preference=mid_sim,
                    affinity="precomputed",
                    damping=adapt_ap_damping).fit(x_sims)
            labels = ap.labels_  # refresh labels from the refitted model
            for i in range(len(labels)):
                monitor_set.add(labels[i])
                class_num = str(labels[i])
                text_id = text_vector_list[i].get("text_id")
                text_clustering_list.append({
                    "text_id": text_id,
                    "class_num": class_num
                })
            return text_clustering_list
    return text_clustering_list
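# A standalone sketch of the precomputed-affinity pattern the function above
# relies on. Cluster and get_data_sims are external helpers not shown in this
# excerpt, so the similarity matrix is built here with TF-IDF + cosine
# similarity instead; only the sklearn calls are real API.
import numpy as np
from sklearn.cluster import AffinityPropagation as AP
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

texts = ["apple banana", "banana fruit", "car engine", "engine oil"]
Xn = TfidfVectorizer().fit_transform(texts)
x_sims = cosine_similarity(Xn)   # pairwise similarity matrix
mid_sim = np.median(x_sims)      # median similarity used as the preference

ap = AP(damping=0.9, max_iter=1000, convergence_iter=100,
        preference=mid_sim, affinity="precomputed").fit(x_sims)
# On non-convergence sklearn labels every sample -1, which is exactly
# what the retry branch in the function above guards against.
print(ap.labels_)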
def make_cluster_map(damping=0.992):
    test_labels, prediction = pickle.load(open(f_path_pred, 'rb'))
    # Average predicted class probabilities for each true label (121 classes).
    prob_conf = np.zeros((121, 121))
    for l in range(121):
        inds = np.squeeze(np.array(np.where(test_labels == l)))
        class_conf = prediction[inds, :].mean(axis=0)
        prob_conf[l, :] = class_conf
    F = prob_conf
    D = (1 - F)
    np.fill_diagonal(D, 0)
    # Symmetrize the distance matrix before clustering.
    D_p = 0.5 * (D + D.T)
    clst = AP(
        damping=damping,  # damping determines # of clusters
        max_iter=500,
        convergence_iter=15,
        affinity='euclidean',
        verbose=False)
    clst.fit(D_p)
    print('Number of clusters:', len(clst.cluster_centers_))
    # Map each fine label to its coarse cluster, and invert the mapping.
    membership = np.c_[range(121), clst.labels_]
    fine_to_coarse = dict(membership)
    coarse_to_fine = {l: [] for l in clst.labels_}
    for k, v in fine_to_coarse.items():
        coarse_to_fine[v].append(k)
    pickle.dump(coarse_to_fine,
                open(os.path.join(curdir, 'coarse_to_fine.p'), 'wb'))
    pickle.dump(fine_to_coarse,
                open(os.path.join(curdir, 'fine_to_coarse.p'), 'wb'))
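# The inline comment above flags damping as the knob that changed the cluster
# count in this setup. A toy sweep on synthetic blobs (made up here purely for
# illustration) to inspect how the count reacts to different damping values:
import numpy as np
from sklearn.cluster import AffinityPropagation as AP

rng = np.random.RandomState(0)
pts = np.vstack([rng.randn(20, 2), rng.randn(20, 2) + 5])
for d in (0.5, 0.9, 0.99):
    fitted = AP(damping=d, max_iter=500).fit(pts)
    print(d, len(fitted.cluster_centers_indices_))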
def get_fitted_affinity(self, user_to_rwf, distance):
    aff_prop = AP(affinity='precomputed')
    similarity_mtx = np.zeros((len(user_to_rwf), len(user_to_rwf)))
    # Pick the pairwise similarity function once, then fill the matrix.
    sim = cosine_sim if distance == 'cosine' else word_overlap
    for i, user1 in enumerate(user_to_rwf):
        for j, user2 in enumerate(user_to_rwf):
            similarity_mtx[i][j] = sim(user_to_rwf[user1], user_to_rwf[user2])
    # Use the median similarity as the preference (the diagonal) so the
    # number of clusters is not dominated by perfect self-similarity.
    median = np.median(similarity_mtx)
    for i in range(len(user_to_rwf)):
        similarity_mtx[i][i] = median
    return aff_prop.fit(similarity_mtx)
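# cosine_sim and word_overlap are not defined in this excerpt. A hypothetical
# sketch of cosine_sim, assuming each user's rwf value is a {word: weight}
# dict; word_overlap would follow the same shape:
import math

def cosine_sim(a, b):
    # Dot product over shared words, normalised by both vector magnitudes.
    dot = sum(a[w] * b[w] for w in a if w in b)
    na = math.sqrt(sum(v * v for v in a.values()))
    nb = math.sqrt(sum(v * v for v in b.values()))
    return dot / (na * nb) if na and nb else 0.0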
def create_stratum(self, column_names, **kwargs):
    '''
    Use affinity propagation to find the number of strata for each column.

    column_names is a list of the covariates to be split into strata and
    used for classification. This function adds a column to the data frame
    for each covariate, named column_name_strata, that gives the strata
    designation for that variable. The whole data frame is returned.
    '''
    for colname in column_names:
        X = self.data[colname].values.reshape(-1, 1)
        if np.isnan(X).any():
            raise ValueError(
                "There are NaN values in self.data[%s] that the "
                "clustering algorithm can't handle" % colname)
        elif np.unique(self.data[colname]).shape[0] <= 2:
            # Binary (or constant) columns already define their own strata.
            string_name = colname + '_strata'
            self.data[string_name] = self.data[colname].astype(int)
        else:
            af_model = AP(damping=0.9)
            strata_groups = af_model.fit(X)
            string_name = colname + '_strata'
            self.data[string_name] = strata_groups.labels_
    return self.data
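# A toy run of the stratification above on a hypothetical DataFrame;
# affinity propagation picks the number of strata by itself, which is the
# point of using it here:
import numpy as np
import pandas as pd
from sklearn.cluster import AffinityPropagation as AP

df = pd.DataFrame({'age': [21, 22, 23, 45, 46, 47, 70, 71]})
X = df['age'].values.reshape(-1, 1)
df['age_strata'] = AP(damping=0.9).fit(X).labels_
print(df)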
def ap_cluster_and_plot(row_number, row):
    filename = str(row_number) + "-" + str(row[0]) + "-color" + ".png"
    #matrix = np.matrix(np.array(row[1:])).reshape(28,28)
    #plot(filename, matrix)
    data_entry = row_to_data(row[1:])
    af = AP(verbose=True, damping=0.5, max_iter=1000).fit(data_entry)
    al = af.labels_
    ac = af.cluster_centers_indices_
    print("Row:", row_number, " Digit:", row[0], " Clusters:", len(ac), ":", len(al))
    plot_color(filename, data_entry, al)
def propagacion_de_afinidad(T):
    # The only thing we set ourselves is the affinity; if the affinity is
    # 'precomputed' we must pass our preference matrix to fit instead of T.
    model = AP(affinity='euclidean')
    # Build the distance relationships and find the exemplars (centroids).
    model.fit(T)
    # Assign every sample to its stable position in its respective cluster.
    targets = model.predict(T)
    return targets
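# Example call with toy 2-D points; with affinity='euclidean', predict()
# assigns each sample to its nearest learned exemplar:
import numpy as np

targets = propagacion_de_afinidad(
    np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 5.0]]))
print(targets)  # two tight pairs, e.g. [0 0 1 1]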
def find_number_of_sources(cosine_distance):
    cos_dist = np.resize(cosine_distance,
                         new_shape=(len(cosine_distance), len(cosine_distance)))
    ap = AP(affinity='precomputed').fit(cos_dist)
    counter = Counter(ap.labels_).most_common()
    source = 0
    for i in range(len(counter)):
        if counter[i][1] == counter[0][1]:
            source += 1
    return source
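# Note: affinity='precomputed' expects *similarities* (larger = more alike),
# while the variable above is named cosine_distance. If the input really is a
# distance matrix, negating it first is the usual conversion; a minimal
# sketch with made-up values:
import numpy as np
from sklearn.cluster import AffinityPropagation as AP

dist = np.array([[0.0, 0.2, 0.9],
                 [0.2, 0.0, 0.8],
                 [0.9, 0.8, 0.0]])
ap = AP(affinity='precomputed').fit(-dist)  # negated distances act as similarities
print(ap.labels_)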
def __affinityPropagation(self):
    print("Affinity Propagation Clustering on PCA-reduced data")
    reduced_data = PCA(n_components=2).fit_transform(self.learn[2])
    af = AP().fit(reduced_data)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_
    # print the image file names in the terminal
    for obj in self.images:
        for key, value in obj.items():
            if value[key] in cluster_centers_indices:
                print("cp " + key + " ../APClustersImagesFile/")
    self.affPropResult = af
    self.affPropData = reduced_data
    n_clusters_ = len(cluster_centers_indices)
    plt.figure(1)
    plt.clf()
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_), colors):
        class_members = labels == k
        cluster_center = reduced_data[cluster_centers_indices[k]]
        plt.plot(reduced_data[class_members, 0],
                 reduced_data[class_members, 1], col + '.')
        plt.plot(cluster_center[0], cluster_center[1], 'o',
                 markerfacecolor=col, markeredgecolor='k', markersize=14)
        for x in reduced_data[class_members]:
            plt.plot([cluster_center[0], x[0]],
                     [cluster_center[1], x[1]], col)
    plt.title('Affinity Propagation clustering on the 9 dataset (PCA-reduced data)')
    path = self.path + '/ClusteringImages/AffinityPropagation/Number9.png'
    plt.savefig(path)
import random

import numpy as np
from sklearn.cluster import KMeans, SpectralClustering, AffinityPropagation as AP
from sklearn.decomposition import PCA, FastICA, NMF, FactorAnalysis as FA
from sklearn.mixture import GMM  # old sklearn API; newer versions use GaussianMixture

import mld

random.seed(1234)
HOLDING_PERIOD = 20

d = mld.getData()
X, y, FeatureNames, returns, actual_y = mld.createMLData(HOLDING_PERIOD)

clusters = [
    "KMeans", "GMM", "SpectralClustering", "AffinityProp", "PCA", "ICA", "NMF"
]

n = 8
clustering = np.array([
    KMeans(n_clusters=n, max_iter=500),
    GMM(n_components=n, n_iter=1000, n_init=15, random_state=1),
    SpectralClustering(n_clusters=n),
    AP(damping=0.5, max_iter=200),
])

num = 10
reduction = np.array([
    PCA(n_components=num),
    FastICA(n_components=num),
    NMF(n_components=num, max_iter=500),
    FA(n_components=num, max_iter=500)
])

# Define classifier
class_id = 0
red_id = 3
clf = clustering[class_id]
red = reduction[red_id]
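# Typical downstream use of the selections above, assuming X from
# mld.createMLData is a 2-D numeric feature array (mld itself is not shown):
X_red = red.fit_transform(X)      # FactorAnalysis reduction to `num` dims
labels = clf.fit(X_red).labels_   # KMeans exposes labels_ after fit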
logging.info("RMSD calculation time is %s" % t)
t = dt.datetime.now()
logging.info("Starting clusterization")

# Now the clusterization routine
from sklearn.cluster import AffinityPropagation as AP

#S = np.memmap('aff.raw', dtype='float32', shape=(ln, ln), mode='write')
##Prepare values
#S = -1 * matrix ** 2
##Calc average for logfile
#d_avg = np.mean(cl_matrix)
##Convert all values to float

af = AP(affinity="precomputed", max_iter=2000, convergence_iter=50).fit(cl_matrix)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_
n_clusters_ = len(cluster_centers_indices)

with open('aff.res', 'w') as out:
    for i in range(ln):
        out.write("%s\t%d\n" % (pdb_list[i], labels[i]))
with open('aff.ref', 'w') as out:
    for i in cluster_centers_indices:
        out.write('%s\n' % pdb_list[i])

logging.info("Info about run:")
#logging.info('%s: %s' % ("Average distance", d_avg))
logging.info('%s: %d' % ("Number of clusters", n_clusters_))
logging.info('%s\t%s' % ("Center of cluster", "Number of cluster"))
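# The commented-out block above hints at the usual preparation step:
# affinity="precomputed" wants similarities, and the negative squared
# distance (here, pairwise RMSD) is the conventional choice. A minimal
# sketch with made-up RMSD values:
import numpy as np
from sklearn.cluster import AffinityPropagation as AP

rmsd = np.array([[0.0, 1.2, 4.0],
                 [1.2, 0.0, 3.8],
                 [4.0, 3.8, 0.0]])
cl_matrix = -1 * rmsd ** 2  # negative squared RMSD as similarity
af = AP(affinity="precomputed", max_iter=2000, convergence_iter=50).fit(cl_matrix)
print(af.cluster_centers_indices_, af.labels_)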