# Load the data
data = pd.read_csv("three_class_data.csv", header=0)
x = data[["x", "y"]]

# Name each clustering method in order
cluster_names = ['KMeans', 'MiniBatchKMeans', 'AffinityPropagation', 'MeanShift',
                 'SpectralClustering', 'AgglomerativeClustering', 'Birch', 'DBSCAN']

# Set the corresponding parameters for each clustering method
cluster_estimators = [
    cluster.KMeans(n_clusters=3),
    cluster.MiniBatchKMeans(n_clusters=3),
    cluster.AffinityPropagation(),
    cluster.MeanShift(),
    cluster.SpectralClustering(n_clusters=3),
    cluster.AgglomerativeClustering(n_clusters=3),
    cluster.Birch(n_clusters=3),
    cluster.DBSCAN()
]

# Prepare for plotting subplots
plot_num = 1

# Run each clustering method in turn
for name, model in zip(cluster_names, cluster_estimators):
    tic = time.time()
    # Fit the model
    model.fit(x)
connectivity = kneighbors_graph(X, n_neighbors=params['n_neighbors'],
                                include_self=False)
# make connectivity symmetric
connectivity = 0.5 * (connectivity + connectivity.T)

# ============
# Create cluster objects
# ============
ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'],
                                       linkage='ward',
                                       connectivity=connectivity)
spectral = cluster.SpectralClustering(n_clusters=params['n_clusters'],
                                      eigen_solver='arpack',
                                      affinity="nearest_neighbors")
dbscan = cluster.DBSCAN(eps=params['eps'])
optics = cluster.OPTICS(min_samples=params['min_samples'],
                        xi=params['xi'],
                        min_cluster_size=params['min_cluster_size'])
affinity_propagation = cluster.AffinityPropagation(
    damping=params['damping'], preference=params['preference'])
average_linkage = cluster.AgglomerativeClustering(
    linkage="average", affinity="cityblock",
    n_clusters=params['n_clusters'], connectivity=connectivity)
birch = cluster.Birch(n_clusters=params['n_clusters'])
gmm = mixture.GaussianMixture(n_components=params['n_clusters'],
                              covariance_type='full')
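# A minimal sketch of how estimators built this way are usually run (the blob
# data below is an assumption; ward and average_linkage are skipped here because
# their connectivity matrix is tied to the original X):
import numpy as np
from sklearn import datasets

X_demo, _ = datasets.make_blobs(n_samples=300, centers=3, random_state=0)
for demo_name, demo_algo in (('MiniBatchKMeans', two_means), ('DBSCAN', dbscan),
                             ('SpectralClustering', spectral), ('Birch', birch),
                             ('GaussianMixture', gmm)):
    demo_algo.fit(X_demo)
    # estimators expose either labels_ after fit or a predict() method
    if hasattr(demo_algo, 'labels_'):
        y_demo = demo_algo.labels_.astype(int)
    else:
        y_demo = demo_algo.predict(X_demo)
    print(demo_name, np.unique(y_demo))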
def run_config(fname, data, index, algo): #print('Launching', fname, file=sys.stderr) today = datetime.datetime.now() print(today.strftime("%Y-%m-%d %H.%M.%S") ) # 2017-04-05-00.18.00 with open(fname, 'a') as result: try: X = [] with open(data, 'r') as f: content = f.readlines() for x in content: row = x.split() res = [] for i in row: res.append(float(i)) X.append(res) #n_clusters = 15 X1 = StandardScaler().fit_transform(np.array(X)) #connectivity = kneighbors_graph(X1, n_neighbors=2, include_self=False) res = cluster.SpectralClustering(eigen_solver="arpack", affinity="nearest_neighbors").fit(X1) labels = res.labels_ n_clusters = len(set(labels)) - (1 if -1 in labels else 0) cl = clusterization(X1, labels, n_clusters, index) m = cl.init_measure() for run_num in range(0, 3): #iterable CVI computation strategy = algo(deepcopy(cl), m) new_measure_iter, iters, t = strategy.run() # print number of run result.write("Run {}\n".format((run_num + 1))) #print('Launching iterable computation: ', fname, file=sys.stderr) #result.write('Launching iterable computation: ' + fname) result.write("Measure improvement {}\n".format(abs(m - new_measure_iter))) result.write("from {}\n".format(m)) result.write("to {}\n".format(new_measure_iter)) result.write("Iterations performed {}\n".format(iters)) result.write("Time spent {}\n".format(t)) # full CVI computation with time limit strategy = algo(deepcopy(cl), m) new_measure_full, iters, t = strategy.run_full() #print('Launching full without CVI limit: ', fname, file=sys.stderr) #result.write('Launching full without CVI limit: ' + fname) result.write("Measure improvement {}\n".format(abs(m - new_measure_full))) result.write("from {}\n".format(m)) result.write("to {}\n".format(new_measure_full)) result.write("Iterations performed {}\n".format(iters)) result.write("Time spent {}\n".format(t)) # full CVI computation with measure limit on CVI value # obtained from iterable computation launch strategy = algo(deepcopy(cl), m) new_measure_full_CVI_limit, iters, t = strategy.run_full_CVI_limit(new_measure_iter) #print('Launching full with CVI limit: ', fname, file=sys.stderr) #result.write('Launching full with CVI limit: ' + fname) result.write("Measure improvement {}\n".format(abs(m - new_measure_full_CVI_limit))) result.write("from {}\n".format(m)) result.write("to {}\n".format(new_measure_full_CVI_limit)) result.write("Iterations performed {}\n".format(iters)) result.write("Time spent {}\n\n".format(t)) except: traceback.print_exc(file=result)
def test_clustering(data, lons, lats, N_CLUSTERS):
    pred_dict = {}
    np.random.seed(0)
    n_samples = 1500
    colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
    colors = np.hstack([colors] * 20)

    plt.figure(figsize=(20, 15))
    plt.subplots_adjust(left=.001, right=.999, bottom=.001, top=.96,
                        wspace=.05, hspace=.01)
    plot_num = 1

    for i_dataset, dataset in enumerate([data]):
        X = dataset
        # normalize dataset for easier parameter selection
        X = StandardScaler().fit_transform(X)

        # estimate bandwidth for mean shift
        bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)

        # connectivity matrix for structured Ward
        connectivity = kneighbors_graph(X, n_neighbors=10)
        # make connectivity symmetric
        connectivity = 0.5 * (connectivity + connectivity.T)

        # Compute distances
        # distances = np.exp(-euclidean_distances(X))
        distances = euclidean_distances(X)

        # create clustering estimators
        ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
        two_means = cluster.MiniBatchKMeans(n_clusters=N_CLUSTERS)
        ward = cluster.AgglomerativeClustering(n_clusters=N_CLUSTERS,
                                               linkage='ward',
                                               connectivity=connectivity)
        spectral = cluster.SpectralClustering(n_clusters=N_CLUSTERS,
                                              eigen_solver='arpack',
                                              affinity="nearest_neighbors")
        dbscan = cluster.DBSCAN(eps=.2)
        affinity_propagation = cluster.AffinityPropagation(damping=.9,
                                                           preference=-200)
        average_linkage = cluster.AgglomerativeClustering(
            linkage="average", affinity="cityblock",
            n_clusters=N_CLUSTERS, connectivity=connectivity)

        for name, algorithm in [('MiniBatchKMeans', two_means),
                                ('AffinityPropagation', affinity_propagation),
                                ('MeanShift', ms),
                                ('SpectralClustering', spectral),
                                ('Ward', ward),
                                ('AgglomerativeClustering', average_linkage),
                                ('DBSCAN', dbscan)]:
            # predict cluster memberships
            t0 = time.time()
            algorithm.fit(X)
            t1 = time.time()
            if hasattr(algorithm, 'labels_'):
                y_pred = algorithm.labels_.astype(int)
            else:
                y_pred = algorithm.predict(X)

            # plot
            plt.subplot(4, 7, plot_num)
            if i_dataset == 0:
                plt.title(name, size=18)
            plt.scatter(lons, lats, color=colors[y_pred].tolist(), s=10)

            if hasattr(algorithm, 'cluster_centers_'):
                try:
                    centers = algorithm.cluster_centers_
                    center_colors = colors[:len(centers)]
                    plt.scatter(centers[:, 0], centers[:, 1], s=100, c=center_colors)
                except Exception:
                    continue
            plt.xlim(min(lons) - 1, max(lons) + 1)
            plt.ylim(min(lats) - 1, max(lats) + 1)
            plt.xticks(())
            plt.yticks(())
            plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                     transform=plt.gca().transAxes, size=15,
                     horizontalalignment='right')
            plot_num += 1
            pred_dict[name] = [lons, lats, colors[y_pred].tolist()]
    plt.show()
    return pred_dict
def cluster_matrices(submatrices_dict, k, method='kmeans', how='full'): """ clusters the submatrices per chromosome Parameters ---------- submatrices_dict key: chrom name, values, a list of submatrices k number of clusters method either kmeans, hierarchical or spectral how how to cluster. Options are 'full', 'center' and 'diagonal'. More info in the argparse options Returns ------- indices dict key: chrom_name, value: list of list, with one list per cluster with the ids of the submatrices that belong to that list """ clustered_dict = {} for chrom in submatrices_dict: log.info("Length of entry: {}".format(len(submatrices_dict[chrom]))) if len(submatrices_dict[chrom]) < k: log.info("number of the submatrices on chromosome {} is less than {}. Clustering is skipped.".format(chrom, k)) k = 1 submat_vectors = [] shape = submatrices_dict[chrom][0].shape center_bin = (shape[0] + 1) // 2 for submatrix in submatrices_dict[chrom]: if how == 'diagonal': # take from each matrix the diagonal submat_vectors.append(submatrix.diagonal()) elif how == 'center': # take the mean of a smaller submatrix of 3 x 3 centered on the submatrix submat_vectors.append( submatrix[center_bin - 2:center_bin + 1, center_bin - 2:center_bin + 1].reshape((1, 9)).mean()) else: # Transform list of submatrices in an array of shape: # shape = (num_submatrices, submatrix.shape[0] * submatrix.shape[1] # In other words, each submatrix is converted into a row of the matrix submat_vectors.append(submatrix.reshape((1, shape[0] * shape[1]))) matrix = np.vstack(submat_vectors) if how == 'diagonal': assert matrix.shape == (len(submatrices_dict[chrom]), shape[0]) elif how == 'center': assert matrix.shape == (len(submatrices_dict[chrom]), 1) else: assert matrix.shape == (len(submatrices_dict[chrom]), shape[0] * shape[1]) # remove outliers out_ind = get_outlier_indices(matrix, max_deviation=2) if out_ind is not None and len(np.flatnonzero(out_ind)) > 0: log.info("Outliers detected in chrom: {}. Number of outliers: {}". format(chrom, len(np.flatnonzero(out_ind)))) # keep in matrix all indices that are not outliers matrix = matrix[np.logical_not(out_ind), :] if np.any(np.isnan(matrix)): # replace nans for 0 otherwise kmeans produces a weird behaviour log.warning("For clustering nan values have to be replaced by zeros.") matrix[np.isnan(matrix)] = 0 if method == 'kmeans': clustering = skclust.KMeans(n_clusters=k, random_state=0).fit(matrix) cluster_labels = clustering.labels_ if method == 'hierarchical': clustering = skclust.AgglomerativeClustering(n_clusters=k).fit(matrix) cluster_labels = clustering.labels_ if method == 'spectral': clustering = skclust.SpectralClustering(n_clusters=k, assign_labels="discretize", random_state=0).fit(matrix) cluster_labels = clustering.labels_ # sort clusters clustered_dict[chrom] = [] for cluster in range(k): cluster_ids = np.flatnonzero(cluster_labels == cluster) clustered_dict[chrom].append(cluster_ids) return clustered_dict
print("\nDATASET NORMALIZADO:\n") print(X) # técnicas de agrupamento, NECESSÁRIO ESTUDAR E OTIMIZAR OS PARÂMETROS DE CADA TÉCNICA two_means = cluster.MiniBatchKMeans(n_clusters=2, init='random', n_init=10, max_iter=300, tol=1e-04, random_state=0) bandwidth = cluster.estimate_bandwidth(X, quantile=0.95) ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True) spectral = cluster.SpectralClustering(n_clusters=8, eigen_solver='arpack', affinity="nearest_neighbors", random_state=0) connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False) connectivity = 0.5 * (connectivity + connectivity.T) ward = cluster.AgglomerativeClustering(n_clusters=3, linkage='ward', connectivity=connectivity) average_linkage = cluster.AgglomerativeClustering(linkage="average", affinity="cityblock", n_clusters=3, connectivity=connectivity) dbscan = cluster.DBSCAN(eps=5) birch = cluster.Birch(n_clusters=3, threshold=0.7) gmm = mixture.GaussianMixture(n_components=2,
def do():
    ai = AI()
    ai.load()
    # ai.learn()
    params = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }
    bandwidth = cluster.estimate_bandwidth(ai.x, quantile=params['quantile'])
    connectivity = kneighbors_graph(ai.x, n_neighbors=params['n_neighbors'],
                                    include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)

    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
    ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'],
                                           linkage='ward',
                                           connectivity=connectivity)
    spectral = cluster.SpectralClustering(n_clusters=params['n_clusters'],
                                          eigen_solver='arpack',
                                          affinity="nearest_neighbors")
    dbscan = cluster.DBSCAN(eps=params['eps'])
    affinity_propagation = cluster.AffinityPropagation(
        damping=params['damping'], preference=params['preference'])
    average_linkage = cluster.AgglomerativeClustering(
        linkage="average", affinity="cityblock",
        n_clusters=params['n_clusters'], connectivity=connectivity)
    birch = cluster.Birch(n_clusters=params['n_clusters'])
    gmm = mixture.GaussianMixture(n_components=params['n_clusters'],
                                  covariance_type='full')

    clustering_algorithms = (('MiniBatchKMeans', two_means),
                             ('AffinityPropagation', affinity_propagation),
                             ('MeanShift', ms),
                             ('SpectralClustering', spectral),
                             ('Ward', ward),
                             ('AgglomerativeClustering', average_linkage),
                             ('DBSCAN', dbscan),
                             ('Birch', birch),
                             ('GaussianMixture', gmm))

    for name, algorithm in clustering_algorithms:
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                message="the number of connected components of the " +
                "connectivity matrix is [0-9]{1,2}" +
                " > 1. Completing it to avoid stopping the tree early.",
                category=UserWarning)
            warnings.filterwarnings(
                "ignore",
                message="Graph is not fully connected, spectral embedding" +
                " may not work as expected.",
                category=UserWarning)
            try:
                algorithm.fit(ai.x)
            except Exception as e:
                continue

        if hasattr(algorithm, 'labels_'):
            y_pred = algorithm.labels_.astype(int)
        else:
            y_pred = algorithm.predict(ai.x)

        if max(y_pred) > 3:
            continue

        known_groups = {}
        for i, group in enumerate(ai.y):
            group = int(group)
            if group not in known_groups:
                known_groups[group] = []
            known_groups[group].append(i)

        guessed_groups = {}
        for i, group in enumerate(y_pred):
            if group not in guessed_groups:
                guessed_groups[group] = []
            guessed_groups[group].append(i)

        for k in known_groups:
            for g in guessed_groups:
                print(k, g,
                      len(set(known_groups[k]).intersection(guessed_groups[g])))
kchosen = 15  # based on visual inspection of distortion.png and spectrum.png
eigvecs_k, codebook, distortion = results[kchosen]
print('chose k=', kchosen, ' distortion ', distortion)
membership, _ = scipy.cluster.vq.vq(eigvecs_k, codebook)
for i in range(kchosen):
    print('cluster', i)
    print(','.join(names[membership == i]))

names, X = load_data()
ga_ind = np.flatnonzero(names == 'GA')
me_ind = np.flatnonzero(names == 'ME')
X[ga_ind, me_ind] = 0.  # hackityhackhackhack
X[me_ind, ga_ind] = 0.

spectral = cluster.SpectralClustering(n_clusters=16, eigen_solver='arpack',
                                      affinity="precomputed")
spectral.fit(X)
spectral.labels_

print('')
for label in np.unique(spectral.labels_):
    clust = names[spectral.labels_ == label]
    for cl in clust:
        if cl == clust[-1]:
            print(cl, end=' ')
        else:
            print(cl + ",", end=' ')
    print('')

if __name__ == "__main__":
    spectral_cluster(names, X)
# In[75]:
print("Explained variance: ", explained_variance_score(y, predictions))

# In[76]:
print("R2 score: ", r2_score(y, predictions))

# ### Clustering

# In[77]:
from sklearn import cluster

spectral = cluster.SpectralClustering(n_clusters=4, eigen_solver='arpack',
                                      affinity='nearest_neighbors')

# In[78]:
spectral.fit(boston.data)

# In[79]:
boston_df['category'] = spectral.labels_
boston_df['price'] = boston.target
house_clusters = boston_df.groupby('category').mean().sort_values('price')
house_clusters.index = ['low', 'mid_low', 'mid_high', 'high']
house_clusters[['price', 'CRIM', 'RM', 'AGE', 'DIS']]
def main(csps_data): global db_time, feature_set, global_args # n_clusters = cluster_options['n_clusters'] # id measure response year organisation group score # print(csps_data.head()) # First get data fram into the right shape if (feature_set == 'demographics'): csps_data = csps_data.set_index(['organisation', 'org', 'year']) else: csps_data = pd.pivot_table( csps_data, values='score', index=['organisation', 'year', 'headcount', 'org', 'par'], columns=['measure'], aggfunc=np.sum) #csps_data = pd.pivot_table(csps_data, values='score', index=['organisation', 'year', 'org', 'par'], columns=['measure', 'headcount'], aggfunc=np.sum) #print(feature_set) #print(csps_data.head()) # Now, because the EEI is required later and therefore retrieved, but is to be excluded from the questions and demographics, split the EEI column out and delete eei = csps_data['EEI'] #print(eei.tolist()) if (feature_set != 'zzzzthemes'): csps_data = csps_data.drop('EEI', 1) #print( '*' * 44 ) #print(csps_data.head()) # The data should always be a 2D array, shape (n_samples, n_features) # print(csps_data.head()) # To get the boolean mask where values are nan # cpvnm: CSPS data, pivoted, null mask # csps_data = pd.isnull(csps_data) # print(csps_data.head()) ''' if (feature_set == 'themes'): dist_test1 = csps_data['EEI'].tolist() dist_test2 = csps_data['MW'].tolist() # print(dist_test.head()) # dist_test.reset_index(True) # print(dist_test.head()) print(dist_test1) print(dist_test2) zz = zip(dist_test1, dist_test2) print(map(list, zz)) from sklearn.metrics.pairwise import euclidean_distances X_pairs = [[0, 1], [1, 1]] # distance between rows of X print(euclidean_distances(dist_test1, dist_test2)) # print(euclidean_distances(X_pairs, X_pairs)) # array([[ 0., 1.], [ 1., 0.]]) # get distance to origin # print(euclidean_distances(X_pairs, [[0, 0]])) # array([[ 1. 
], [ 1.41421356]]) ''' #print(csps_data.columns) # Filling missing data: CSPS data, pivoted, no-null # csps_data = csps_data.fillna(value=0) # print(csps_data.head()) #'KMeans', 'MiniBatchKMeans', 'AffinityPropagation', 'MeanShift', 'SpectralClustering', 'AgglomerativeClustering', 'DBSCAN', 'Birch' # normalize dataset for easier parameter selection X = StandardScaler().fit_transform(csps_data) start_cluster_time = timer() # connectivity matrix for structured Ward connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False) # make connectivity symmetric connectivity = 0.5 * (connectivity + connectivity.T) if (algorithm == 'KMeans'): clustered = cluster.KMeans(n_clusters=cluster_options['n_clusters']) elif (algorithm == 'MiniBatchKMeans'): clustered = cluster.MiniBatchKMeans( n_clusters=cluster_options['n_clusters']) elif (algorithm == 'AffinityPropagation'): clustered = cluster.AffinityPropagation() elif (algorithm == 'MeanShift'): bandwidth = cluster.estimate_bandwidth(X, quantile=0.3) clustered = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True) elif (algorithm == 'SpectralClustering'): clustered = cluster.SpectralClustering( n_clusters=cluster_options['n_clusters'], eigen_solver='arpack', affinity="nearest_neighbors") elif (algorithm == 'AffinityPropagation'): clustered = cluster.AffinityPropagation(damping=.9, preference=-200) elif (algorithm == 'AgglomerativeClustering'): clustered = cluster.AgglomerativeClustering( linkage='ward', n_clusters=cluster_options['n_clusters'], connectivity=connectivity) elif (algorithm == 'AC_average_linkage'): clustered = cluster.AgglomerativeClustering( linkage="average", affinity="cityblock", n_clusters=cluster_options['n_clusters'], connectivity=connectivity) elif (algorithm == 'DBSCAN'): clustered = cluster.DBSCAN(eps=.5, algorithm='auto', leaf_size=40) elif (algorithm == 'Birch'): clustered = cluster.Birch(n_clusters=cluster_options['n_clusters']) else: clustered = cluster.KMeans(n_clusters=cluster_options['n_clusters']) clustered.fit(X) if (algorithm == 'MeanShift' or algorithm == 'DBSCAN'): silhouette_score = -1 else: silhouette_score = metrics.silhouette_score(X, clustered.labels_, metric='euclidean') # neigh = NearestNeighbors(2, 0.4) # neigh.fit(X) # NearestNeighbors(algorithm='auto', leaf_size=30) # nbrs = neigh.radius_neighbors([[0, 0, 1.3]], 0.4, return_distance=False) # rng = neigh.radius_neighbors([X[1]]) # print('NearestNeighbors') # print(X.shape[1]) # print(np.asarray(rng[0][0])) end_cluster_time = timer() # this works, but isn't useful any more # csps_data['cluster_id'] = clustered.labels_ if (feature_set != 'demographics'): # org_year = zip(*csps_data.index.values) #['organisation', 'year', 'org', 'par'] # orgs = pd.Series(org_year[0]) # years = pd.Series(org_year[1]) # org_acronym = pd.Series(org_year[2]) # par_acronym = pd.Series(org_year[3]) # clusters = pd.Series(clustered.labels_.tolist()) org_year = zip(*csps_data.index.values ) #['organisation', 'year', 'headcount', 'org', 'par'] orgs = pd.Series(org_year[0]) years = pd.Series(org_year[1]) headcount = pd.Series(org_year[2]) org_acronym = pd.Series(org_year[3]) par_acronym = pd.Series(org_year[4]) clusters = pd.Series(clustered.labels_.tolist()) else: org_year = zip( *csps_data.index.values) #['organisation', 'org', 'year'] orgs = pd.Series(org_year[0]) years = pd.Series(org_year[2]) org_acronym = pd.Series(org_year[1]) clusters = pd.Series(clustered.labels_.tolist()) org_year.append(clustered.labels_.tolist()) #1 - organisation df = pd.DataFrame(orgs) #, 
'organisation' #2 - year df['year'] = years #3 - headcount if (feature_set != 'demographics'): df['headcount'] = headcount else: df['headcount'] = np.array([0] * len(df)) #csps_data['headcount'] #4 - cluster id df['cluster'] = clusters #5 - acronym df['org'] = org_acronym #6 - parent if (feature_set != 'demographics'): df['parent'] = par_acronym else: df['parent'] = np.array(['x'] * len(df)) #7 - EEI df['EEI'] = eei.tolist() # if (feature_set != 'themes'): # df['EEI'] = eei.tolist() # else: # df['EEI'] = csps_data['EEI'] category_labels = ['EEI', 'headcount', 'year'] # descriptive statistics for each cluster #df[df.A > 0] #df.groupby('cluster') #cluster_info = df.groupby(['cluster']).get_group(1) #grouped = df(['EEI', 'headcount', 'cluster']).groupby('cluster') grouped = df.groupby('cluster') cluster_info = grouped.describe().fillna('missing') # for name, group in grouped: # print(name) # print(group) #df = df.sort_values(by='cluster') # use describe to show quick summary statistics of the data #df.describe(); end_time = timer() cluster_time = (end_cluster_time - start_cluster_time) total_time = (end_time - start_time) if (algorithm == 'AffinityPropagation'): other_output = json.dumps([{ 'silhouette_score': silhouette_score, 'db_time': db_time, 'cluster_time': cluster_time, 'total_time': total_time, 'feature_set': feature_set, 'cluster_info': cluster_info.values.tolist(), 'category_labels': category_labels }, clustered.cluster_centers_indices_.tolist() ]) output = json.dumps([{ 'silhouette_score': silhouette_score, 'db_time': db_time, 'cluster_time': cluster_time, 'total_time': total_time, 'feature_set': feature_set, 'cluster_info': cluster_info.values.tolist(), 'category_labels': category_labels }, df.values.tolist()]) # output = json.dumps(other_output) print output
def learn_Clustering(args):
    model = cluster.SpectralClustering(n_clusters=3)
    model.fit(args["data_in"].data.numpy())
    print("nClusters view " + str(args["id_in"]) + " : " +
          str(len(set(model.labels_))))
    return model
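# A minimal usage sketch (the torch tensor and view id are assumptions; the
# function only needs something exposing .data.numpy() under "data_in"):
import torch

demo_args = {"data_in": torch.randn(60, 8), "id_in": 0}
demo_model = learn_Clustering(demo_args)
print(demo_model.labels_[:10])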
# encode categorical data and assign X to a matrix of floats
X = pd.get_dummies(X, prefix=['cg', 'vr', 'or']).astype(float)

# Methods which don't need K as an input
# 1. Affinity propagation
aprop = skc.AffinityPropagation().fit_predict(X)
# 4. DBSCAN
dbscan = skc.DBSCAN().fit_predict(X)

# Define number of clusters for those methods that need it
K = 3
# 0. Kmeans
kmeans = skc.KMeans(n_clusters=K).fit_predict(X)
# 2. Spectral clustering
spclus = skc.SpectralClustering(n_clusters=K).fit_predict(X)
# 3. Agglomerative clustering
aggclus = skc.AgglomerativeClustering(n_clusters=K).fit_predict(X)

# Create data frame
cols = ['Kmeans', 'Spec', 'Agglo']
clusters = pd.DataFrame(np.vstack((kmeans, spclus, aggclus)).T, columns=cols)
#clusters.Aff_Prop.unique().plot()
#len(np.unique(aprop))

# Functions to use in the jupyter notebook
def describe_no_K_needed():
    print(20 * '==')
    print('Affinity Propagation found ' + str(len(np.unique(aprop))) +
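# A small follow-up sketch (assumes X and the label arrays above): when no
# ground truth is available, an internal index such as the silhouette score
# gives a rough way to compare the partitions produced by each method.
from sklearn import metrics

for col_name, col_labels in [('Kmeans', kmeans), ('Spec', spclus), ('Agglo', aggclus)]:
    # silhouette_score needs at least two distinct labels
    if len(np.unique(col_labels)) > 1:
        print(col_name, metrics.silhouette_score(X, col_labels))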
# k-means
from sklearn.cluster import KMeans
clf_kmeans = KMeans(n_clusters=5)
kmeans_cluster = clf_kmeans.fit_predict(all_df)

# hierarchical clustering
from sklearn import cluster
clf_hc = cluster.AgglomerativeClustering(n_clusters=4)
hc_cluster = clf_hc.fit_predict(all_df)

# DBSCAN
clf_dbscan = cluster.DBSCAN(eps=0.4)
db_cluster = clf_dbscan.fit_predict(all_df)

# spectral clustering
clf_sc = cluster.SpectralClustering(n_clusters=4, n_neighbors=20)
sc_cluster = clf_sc.fit_predict(all_df)

# test
test = pd.read_csv('test.csv')
test = test[['0', '1']].values

kmeans_predict = list()
hc_predict = list()
db_predict = list()
sc_predict = list()
for i in range(400):
    if kmeans_cluster[(test[i][0])] == kmeans_cluster[(test[i][1])]:
        kmeans_predict.append(1)
    elif kmeans_cluster[(test[i][0])] != kmeans_cluster[(test[i][1])]:
        kmeans_predict.append(0)
    model = GaussianMixture(n_components=nclust, init_params='kmeans')
    model.fit(X)
    clust_labels3 = model.predict(X)
    return (clust_labels3)
#y_pred = doGMM(data2,4)

def MeanShift(x, y):
    ms = cluster.MeanShift(x)
    ms_result = ms.fit_predict(y)
    return (ms_result)
#y_pred=MeanShift(0.1,data2)

def MiniKmeans(x, y):
    mb = cluster.MiniBatchKMeans(x)
    mb_result = mb.fit_predict(y)
    return (mb_result)
#y_pred = MiniKmeans(4,data)

spectral = cluster.SpectralClustering(n_clusters=4)
#y_pred= spectral.fit_predict(data2)

def Dbscan(x, y):
    db = cluster.DBSCAN(eps=x)
    db_result = db.fit_predict(y)
    return (db_result)
#y_pred = Dbscan(0.3,data2)

def Affinity(x, y, z):
    ap = cluster.AffinityPropagation(damping=x, preference=y)
    ap_result = ap.fit_predict(z)
    return (ap_result)
#y_pred = Affinity(0.9,-200,data2)

#Birch Clustering
def Bir(x, y):
    bi = cluster.Birch(n_clusters=x)
    bi_result = bi.fit_predict(y)
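# A minimal usage sketch for the helpers above (the synthetic blobs are an
# assumption; data2 in the commented-out calls is whatever array the original
# script loads):
import numpy as np
from sklearn import datasets

demo_X, _ = datasets.make_blobs(n_samples=200, centers=4, random_state=0)
print(np.unique(MiniKmeans(4, demo_X)))         # four MiniBatchKMeans labels
print(np.unique(Dbscan(0.5, demo_X)))           # DBSCAN labels, -1 marks noise
print(np.unique(spectral.fit_predict(demo_X)))  # four spectral labels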
sys.path.append(path)

import numpy as np
from sklearn import cluster, metrics
from common_utils import *
from clustering_utils import *
from classification_utils import *

scoring = 's_score'

X, y = generate_synthetic_data_2d_clusters(n_samples=300, n_centers=4,
                                            cluster_std=0.60)
plot_data_2d(X)

spectral_estimator = cluster.SpectralClustering(affinity='nearest_neighbors',
                                                assign_labels='kmeans')
spectral_grid = {'n_clusters': list(range(3, 7))}
grid_search_plot_models_2d_clustering(spectral_estimator, spectral_grid, X)
grid_search_plot_one_parameter_curves_clustering(spectral_estimator, spectral_grid,
                                                 X, scoring=scoring)
spectral_final_model = grid_search_best_model_clustering(spectral_estimator,
                                                          spectral_grid, X,
                                                          scoring=scoring)
plot_model_2d_clustering(spectral_final_model, X)

X, y = generate_synthetic_data_3d_clusters(n_samples=300, n_centers=5,
                                           cluster_std=0.60)
def clustering(Xsvd, cells, dataset, suffix, labels=None, tlabels=None, method='knn', istsne=True, name='', batch_labels=None, seed=42): tsne = TSNE(n_jobs=24).fit_transform(Xsvd) for n_components in [15]: if method == 'gmm': clf = mixture.GaussianMixture(n_components=n_components).fit(mat) labels_pred = clf.predict(tsne) elif method == 'knn': labels_pred = KMeans(n_components, n_init=200).fit_predict(tsne) # n_jobs>1 ? elif method == 'dbscan': labels_pred = DBSCAN(eps=0.3, min_samples=10).fit(tsne).labels_ elif method == 'spectral': spectral = cluster.SpectralClustering(n_clusters=n_components, eigen_solver='arpack', affinity="nearest_neighbors") labels_pred = spectral.fit_predict(tsne) elif method == 'louvain': from scipy.spatial import distance for louvain in [30]: print('****', louvain) mat = kneighbors_graph(Xsvd, louvain, mode='distance', include_self=True).todense() G = nx.from_numpy_matrix(mat) partition = community.best_partition(G, random_state=seed) labels_pred = [] for i in range(mat.shape[0]): labels_pred.append(partition[i]) labels_pred = np.array(labels_pred) print('louvain', louvain, tsne[:5], len(labels), len(labels_pred)) #print(np.unique(labels_pred)) if labels is not None: nmi_score = NMI(labels, labels_pred) ari_score = ARI(labels, labels_pred) print( n_components, method, "Clustering Scores:\nNMI: %.4f\nARI: %.4f\n" % (nmi_score, ari_score)) if istsne: n_components = len(np.unique(labels_pred)) vis_x = tsne[:, 0] vis_y = tsne[:, 1] colors = [ 'blue', 'orange', 'green', 'red', 'purple', 'brown', 'pink', 'yellow', 'black', 'teal', 'plum', 'tan', 'bisque', 'beige', 'slategray', 'brown', 'darkred', 'salmon', 'coral', 'olive', 'lightpink', 'teal', 'darkcyan', 'BlueViolet', 'CornflowerBlue', 'DarkKhaki', 'DarkTurquoise' ] show_tsne(tsne, labels, 'result/%s/%s-%s-LSI-true.png' % (dataset, name, suffix), tlabels=tlabels) show_tsne(tsne, labels_pred, 'result/%s/%s-%s-LSI-pred.png' % (dataset, name, suffix)) with open('result/%s-LSI-cluster_result.csv' % (dataset), 'w') as f: f.write('cell,predicted label,tsne-1,tsne-2\n') for cell, pred, t in zip(cells, labels_pred, tsne): f.write('%s,%d,%f,%f\n' % (cell, pred, t[0], t[1])) if batch_labels is not None: show_tsne( tsne, batch_labels, 'result/%s/%s-GMVAE-%s-%s-batch.png' % (dataset, dataset, suffix, name))
import matplotlib.colors as colors

# In[8]:
n_samples = 500
varied = pd.DataFrame(
    datasets.make_blobs(n_samples=n_samples,
                        centers=4,
                        cluster_std=[1.0, 2.5, 1, 1],
                        random_state=5)[0])

# In[9]:
kmeans = cluster.KMeans(n_clusters=4)
ward = cluster.AgglomerativeClustering(n_clusters=4)
spectral = cluster.SpectralClustering()
dbscan = cluster.DBSCAN()
affinity_propagation = cluster.AffinityPropagation()
birch = cluster.Birch(n_clusters=4)
gmm = mixture.GaussianMixture(n_components=4)

# In[10]:
algo = (('kmeans', kmeans), ('Agnes-ward', ward), ('spectral', spectral),
        ('dbscan', dbscan), ('affinity_propagation', affinity_propagation),
        ('birch', birch))

# In[11]:
label = 'kmeans'
#    sig1 = data_V[i,0:150]
#    sig2 = data_V[j,0:150]
#    sigp1 = data_V[i,150:]
#    sigp2 = data_V[j,150:]
#    cc1[i,j] = max(np.correlate(sig1,sig2)/(sum(sig1**2)*sum(sig2**2))**0.5)
#    cc1[j,i] = cc1[i,j]
#    cc2[i,j] = max(np.correlate(sigp1,sigp2)/(sum(sigp1**2)*sum(sigp2**2))**0.5)
#    cc2[j,i] = cc2[i,j]
#
#dis = np.zeros((l,l))
#dis = 1-cc2
#np.save('disM',dis)

from sklearn import cluster

db = cluster.SpectralClustering(n_clusters=10, affinity='precomputed').fit(dis)
labels = db.labels_
print(max(labels))

for j in range(max(labels) + 1):
    ind = np.where(labels == j)[0]
    print(len(ind))
    stack_signal = np.median(data_V[ind, :], axis=0)
    plt.figure()
    for ii in range(len(ind)):
        plt.plot(data_V[ind[ii], 0:150], c=[0, 0, 0.6, 0.1], linewidth=0.5)
        plt.plot(data_V[ind[ii], 150:], c=[0.6, 0, 0, 0.1], linewidth=0.5)
    plt.plot(stack_signal[0:150], 'b')
    plt.plot(data_V[ind[0], 0:150], 'b--')
    plt.plot(stack_signal[150:], 'r')
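# Note: with affinity='precomputed', SpectralClustering interprets the supplied
# matrix as similarities (larger = more alike). The commented-out code above
# suggests dis = 1 - cc2 is a dissimilarity, so a conversion such as the
# Gaussian kernel below is the usual intermediate step (the kernel width
# `delta` is an assumption to tune, not part of the original script):
delta = dis.std()
affinity_matrix = np.exp(-dis ** 2 / (2.0 * delta ** 2))
db_sim = cluster.SpectralClustering(n_clusters=10,
                                    affinity='precomputed').fit(affinity_matrix)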
# Load and Store both data and groundtruth of Zachary's Karate Club
G = nx.karate_club_graph()
groundTruth = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0,
               1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

# Transform our graph data into matrix form
edgeMat = graphToEdgeMatrix(G)

# Positions the nodes using Fruchterman-Reingold force-directed algorithm
# Too technical to discuss right now, just go with it
pos = nx.spring_layout(G)
drawCommunities(G, listToDict(groundTruth), pos)

# -----------------------------------------
# Spectral Clustering Model
spectral = cluster.SpectralClustering(n_clusters=kClusters,
                                      affinity="precomputed",
                                      n_init=200)
spectral.fit(edgeMat)

# Transform our data to list form and store them in results list
results.append(list(spectral.labels_))

# -----------------------------------------
# Agglomerative Clustering Model
agglomerative = cluster.AgglomerativeClustering(n_clusters=kClusters,
                                                linkage="ward")
agglomerative.fit(edgeMat)

# Transform our data to list form and store them in results list
results.append(list(agglomerative.labels_))

# -----------------------------------------
y_true = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

edge_mat = graph_to_edge_matrix(G)

k_clusters = 2
results = []
algorithms = {}

algorithms['kmeans'] = cluster.KMeans(n_clusters=k_clusters, n_init=200)
algorithms['agglom'] = cluster.AgglomerativeClustering(n_clusters=k_clusters,
                                                       linkage="ward")
algorithms['spectral'] = cluster.SpectralClustering(n_clusters=k_clusters,
                                                    affinity="precomputed",
                                                    n_init=200)
algorithms['affinity'] = cluster.AffinityPropagation(damping=0.6)

for model in algorithms.values():
    model.fit(edge_mat)
    results.append(list(model.labels_))

kmeans = cluster.KMeans(n_clusters=k_clusters, n_init=200)
kmeans.fit(graph_to_edge_matrix(G))

# Transform our data to list form and store them in results list
results.append(list(kmeans.labels_))

draw_communities(G, list(kmeans.labels_), pos)
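# A small follow-up sketch (y_true and the fitted models above are assumed in
# scope): since ground-truth community labels exist for the karate club, each
# partition can be scored with external indices such as ARI and NMI.
from sklearn import metrics

for alg_name, model in algorithms.items():
    ari = metrics.adjusted_rand_score(y_true, model.labels_)
    nmi = metrics.normalized_mutual_info_score(y_true, model.labels_)
    print('{:<10} ARI={:.3f} NMI={:.3f}'.format(alg_name, ari, nmi))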
def _cluster(self, acts, method='KM', param_dict=None): """Runs unsupervised clustering algorithm on concept actiavtations. Args: acts: activation vectors of datapoints points in the bottleneck layer. E.g. (number of clusters,) for Kmeans method: clustering method. We have: 'KM': Kmeans Clustering 'AP': Affinity Propagation 'SC': Spectral Clustering 'MS': Mean Shift clustering 'DB': DBSCAN clustering method param_dict: Contains superpixl method's parameters. If an empty dict is given, default parameters are used. Returns: asg: The cluster assignment label of each data points cost: The clustering cost of each data point centers: The cluster centers. For methods like Affinity Propagetion where they do not return a cluster center or a clustering cost, it calculates the medoid as the center and returns distance to center as each data points clustering cost. Raises: ValueError: if the clustering method is invalid. """ if param_dict is None: param_dict = {} centers = None if method == 'KM': n_clusters = param_dict.pop('n_clusters', 25) km = cluster.KMeans(n_clusters) d = km.fit(acts) centers = km.cluster_centers_ d = np.linalg.norm( np.expand_dims(acts, 1) - np.expand_dims(centers, 0), ord=2, axis=-1) asg, cost = np.argmin(d, -1), np.min(d, -1) elif method == 'AP': damping = param_dict.pop('damping', 0.5) ca = cluster.AffinityPropagation(damping) ca.fit(acts) centers = ca.cluster_centers_ d = np.linalg.norm( np.expand_dims(acts, 1) - np.expand_dims(centers, 0), ord=2, axis=-1) asg, cost = np.argmin(d, -1), np.min(d, -1) elif method == 'MS': ms = cluster.MeanShift(n_jobs=self.num_workers) asg = ms.fit_predict(acts) elif method == 'SC': n_clusters = param_dict.pop('n_clusters', 25) sc = cluster.SpectralClustering( n_clusters=n_clusters, n_jobs=self.num_workers) asg = sc.fit_predict(acts) elif method == 'DB': eps = param_dict.pop('eps', 0.5) min_samples = param_dict.pop('min_samples', 20) sc = cluster.DBSCAN(eps, min_samples, n_jobs=self.num_workers) asg = sc.fit_predict(acts) else: raise ValueError('Invalid Clustering Method!') if centers is None: ## If clustering returned cluster centers, use medoids centers = np.zeros((asg.max() + 1, acts.shape[1])) cost = np.zeros(len(acts)) for cluster_label in range(asg.max() + 1): cluster_idxs = np.where(asg == cluster_label)[0] cluster_points = acts[cluster_idxs] pw_distances = metrics.euclidean_distances(cluster_points) centers[cluster_label] = cluster_points[np.argmin( np.sum(pw_distances, -1))] cost[cluster_idxs] = np.linalg.norm( acts[cluster_idxs] - np.expand_dims(centers[cluster_label], 0), ord=2, axis=-1) return asg, cost, centers
def cluster_business(businesses):
    NClusters = 50
    np.random.seed(0)

    # Generate datasets. We choose the size big enough to see the scalability
    # of the algorithms, but not too big to avoid too long running times
    n_samples = 1500
    noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5, noise=.05)
    noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
    blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
    no_structure = np.random.rand(n_samples, 2), None

    colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
    colors = np.hstack([colors] * 20)

    plt.figure(1)
    plt.subplots_adjust(left=.001, right=.999, bottom=.001, top=.96,
                        wspace=.05, hspace=.01)
    plot_num = 1

    X = np.ndarray(shape=(0, 2))
    count = 0
    for b in businesses:
        X = vstack([X, [b.longitude, b.latitude]])
        # if(count>1000):
        #     break
        count += 1
    # print type(X)
    # print X

    k_means = cluster.MiniBatchKMeans(n_clusters=NClusters)
    dbscan = cluster.DBSCAN(eps=.03)
    affinity_propagation = cluster.AffinityPropagation(damping=.9, preference=-200)
    spectral = cluster.SpectralClustering(n_clusters=2, eigen_solver='arpack',
                                          affinity="nearest_neighbors")

    for name, algorithm in [
            ('MiniBatchKMeans', k_means),
            # ('DBSCAN', dbscan),
            # ('SpectralClustering', spectral)
            # ('AffinityPropagation', affinity_propagation),
    ]:
        # predict cluster memberships
        t0 = time.time()
        algorithm.fit(X)
        t1 = time.time()
        if hasattr(algorithm, 'labels_'):
            y_pred = algorithm.labels_.astype(int)
        else:
            y_pred = algorithm.predict(X)

        # plot
        ax = plt.subplot(1, 2, plot_num)
        plt.title(name, size=16)
        plt.scatter(X[:, 0], X[:, 1], color=colors[y_pred].tolist(), s=10)

        if hasattr(algorithm, 'cluster_centers_'):
            centers = algorithm.cluster_centers_
            center_colors = colors[:len(centers)]
            plt.scatter(centers[:, 0], centers[:, 1], s=100, c=center_colors)

        ax.spines["top"].set_visible(False)
        ax.spines["bottom"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.spines["left"].set_visible(False)
        plt.xticks(())
        plt.yticks(())
        plot_num += 1

    plt.show()

    clusters = []
    for index in range(NClusters):
        clusters.append(Cluster([]))
    for index in range(len(businesses)):
        businesses[index].cluster_id = y_pred[index]
        clusters[y_pred[index]].businesses.append(businesses[index])
    return clusters
def spectral(feat, n_clusters, **kwargs):
    spectral = cluster.SpectralClustering(n_clusters=n_clusters,
                                          assign_labels="discretize",
                                          affinity="nearest_neighbors",
                                          random_state=0).fit(feat)
    return spectral.labels_
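# A minimal usage sketch for the helper above (the blob data is an assumption,
# not part of the original module):
import numpy as np
from sklearn import cluster, datasets

feat_demo, _ = datasets.make_blobs(n_samples=300, centers=3, random_state=0)
labels_demo = spectral(feat_demo, n_clusters=3)
print(np.bincount(labels_demo))  # size of each of the three clusters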
def cluster_annotation(long_turns, embeddings, speakers, algorithm='SpectralClustering'):
    X = []
    for segment in long_turns:
        # "strict" only keeps embeddings strictly included in the segment
        x = embeddings.crop(segment, mode='strict')
        # average speech turn embedding
        X.append(np.mean(x, axis=0))
    X = np.vstack(X)

    # impute missing values in the embeddings (NaNs replaced by the column mean)
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    X = imp.fit_transform(X)
    if (X.shape[1] == 0):
        return Annotation(), [], []

    no_clusters = int(speakers)
    if no_clusters == 0:
        range_n_clusters = list(range(2, 10))
        silhouette_dict = {}
        for n_clusters in range_n_clusters:
            clusterer = cluster.SpectralClustering(n_clusters=n_clusters)
            cluster_labels = clusterer.fit_predict(X)
            silhouette_avg = silhouette_score(X, cluster_labels)
            print("For n_clusters =", n_clusters,
                  "The average silhouette_score is :", silhouette_avg)
            silhouette_dict[n_clusters] = silhouette_avg
        if (all(value == 0 for value in silhouette_dict.values())):
            no_clusters = 2
        else:
            max_val = 0
            max_index = 0
            for clusters in silhouette_dict:
                if (silhouette_dict[clusters] > max_val):
                    max_val = silhouette_dict[clusters]
                    max_index = clusters
            no_clusters = max_index

    c = select_cluster_algorithm(algorithm, no_clusters)
    labels = c.fit_predict(X)

    labeled_data = []
    for i, turn in enumerate(long_turns):
        labeled_data.append([labels[i], turn])

    annotation = Annotation()
    for i in labeled_data:
        label = int(i[0])
        segment = i[1]
        annotation[segment] = label
    return annotation
# remove duplicate entities detected
entity_text_array = np.unique(entity_text_array)

# Construct TfidfVectorizer
vect = TfidfVectorizer(sublinear_tf=True, max_df=0.5, analyzer='word',
                       stop_words='english', vocabulary=entity_text_array)
corpus_tf_idf = vect.fit_transform(corpus)

# change n_clusters to equal the number of clusters desired
n_clusters = 7
n_components = n_clusters

# spectral clustering
spectral = cluster.SpectralClustering(n_clusters=n_clusters,
                                      eigen_solver='arpack',
                                      affinity="nearest_neighbors",
                                      n_neighbors=17)
spectral.fit(corpus_tf_idf)

if hasattr(spectral, 'labels_'):
    cluster_assignments = spectral.labels_.astype(int)

for i in range(0, 40):  # len(cluster_assignments)
    # removed topic cluster here because the site I used (yahoo)
    # didn't have very good topics by default
    print('Document number : {}'.format(i))
    print('Cluster Assignment : {}'.format(cluster_assignments[i]))
    print('Document title : {}'.format(titles_array[i]))
    print('------------------------')
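# A small sketch for inspecting the clusters found above (assumes vect and
# corpus_tf_idf from the snippet; get_feature_names_out is get_feature_names on
# older scikit-learn releases): print the highest-weight terms per cluster.
terms = np.array(vect.get_feature_names_out())
for c in np.unique(cluster_assignments):
    rows = np.flatnonzero(cluster_assignments == c)
    mean_tfidf = np.asarray(corpus_tf_idf[rows].mean(axis=0)).ravel()
    top_terms = terms[np.argsort(mean_tfidf)[::-1][:10]]
    print('Cluster {} top terms: {}'.format(c, ', '.join(top_terms)))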
evoked = epochs['1'].average()
EVOKED = evoked

EPOCHS, EVOKED

# %%
CHANNELS = EPOCHS.info['chs']
TIMES = EPOCHS.times

# %%
x = EVOKED.data
x_embedded = TSNE(n_components=2).fit_transform(x)

n_clusters = N_CLUSTERS
spectral = CLUSTER.SpectralClustering(n_clusters=n_clusters,
                                      eigen_solver='arpack',
                                      affinity='nearest_neighbors')
labels = spectral.fit_predict(x)

fig, ax = plt.subplots()
for label in np.unique(labels):
    ax.scatter(x_embedded[labels == label, 0], x_embedded[labels == label, 1])
FIGURES.append(fig)

times = np.array([0.2, 0.3, 0.4, 0.5, 0.6])
evoked_labels = EVOKED.copy()
evoked_labels.data = evoked_labels.data * 0
for label in np.unique(labels):
    print(label)
from pandas.core.frame import DataFrame
import numpy as np
from sklearn import preprocessing

# Read in the data first
data = pd.read_csv('./data.csv').values[:, 1:]
test_data = pd.read_csv('./test.csv').values[:, 1:]

# Normalize the values
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
data = scaler.fit_transform(data)
data = pd.DataFrame(data)

# Cluster with four different methods and record each set of labels
labels = []
labels.append(cluster.SpectralClustering(n_clusters=6, random_state=1, affinity='rbf',
                                         gamma=0.3, n_init=100).fit(data).labels_)
labels.append(cluster.AgglomerativeClustering(n_clusters=5, linkage='ward',
                                              compute_full_tree=True).fit(data).labels_)
labels.append(cluster.KMeans(n_clusters=3, init='k-means++', n_init=100, max_iter=30000,
                             tol=1e-4, random_state=1,
                             precompute_distances=True).fit(data).labels_)
labels.append(cluster.MiniBatchKMeans(n_clusters=6, n_init=100).fit(data).labels_)

# Finally vote: a pair counts as the same class if at least two methods agree
ans = np.zeros((len(test_data), len(labels)))
for i in range(len(test_data)):
    for j in range(len(labels)):
        if (labels[j][test_data[i][0]] == labels[j][test_data[i][1]]):
            ans[i][j] = 1
ans = np.array(ans)
ans = np.sum(ans, axis=1)
ans = (ans) > 1

# Output the result
for element in context_name_ent:
    avg_ent = []
    list_avg = element + named_entity[i][3:].split(" ")
    for item in list_avg:
        try:
            avg_ent.append(model[item])
        except:
            pass
    if avg_ent != []:
        avg_ent = np.array(avg_ent)
        avg_ent = np.mean(avg_ent, axis=0)
        entity_embedding.append(avg_ent)
        for element in list_avg:
            thefile.write("%s\t" % element.encode('utf-8'))
        thefile.write("%s\n" % named_entity[i][:3].encode('utf-8'))
    i += 1

entity_embedding = np.array(entity_embedding)
print(len(named_entity_f))

# Spectral Clustering
pickle.dump(named_entity_f, open("true_labels.p", "wb"))
print("starting spectral")
spectral = cluster.SpectralClustering(n_clusters=10, eigen_solver='arpack',
                                      n_init=1)  # , affinity="nearest_neighbors"
spectral.fit(entity_embedding)
print(spectral.labels_)
pickle.dump(spectral.labels_, open("predicted_labels.p", "wb"))
#labels = pickle.load(open("labels.p", "rb"))
i = 0
for line in f:
    similarityMatrix[i] = line.split(",")[:-1]
    i += 1

# Make the matrix symmetric
print("Making matrix symmetric")
for i in range(len(ciks)):
    similarityMatrix[i][i] = 1
    for j in range(i):
        similarityMatrix[i][j] = similarityMatrix[j][i]

# TODO Clustering
print("Clustering")
mat = np.matrix(similarityMatrix).astype(np.float64)
#eigen_values, eigen_vectors = np.linalg.eigh(mat)
#result = cluster.KMeans(n_clusters=200, init='k-means++').fit_predict(eigen_vectors[:, 2:4])
#result = cluster.DBSCAN().fit_predict(mat)
result = cluster.SpectralClustering(300).fit_predict(mat)
print(result)

with open(join(inpath, "ClusterResult.txt"), 'w') as f:
    for cik in ciks:
        f.write("%s," % (cik))
    f.write("\n")
    for i in range(len(result)):
        f.write("%s," % (result[i]))
    f.write("\n")
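# Note: SpectralClustering(300) keeps the default affinity='rbf', so each row of
# `mat` is treated as a feature vector and a fresh RBF kernel is computed from
# those rows. If similarityMatrix already holds the desired pairwise similarity,
# the precomputed variant below uses it directly (a sketch, assuming `mat` is
# square, symmetric and non-negative):
result_precomputed = cluster.SpectralClustering(
    n_clusters=300, affinity='precomputed').fit_predict(np.asarray(mat))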
def spectral(X):
    return cluster.SpectralClustering(n_clusters=n_clusters,
                                      eigen_solver='arpack',
                                      affinity="nearest_neighbors").fit_predict(X)
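# A minimal usage sketch (the module-level n_clusters the helper reads is an
# assumption here, as is the synthetic data):
import numpy as np
from sklearn import cluster, datasets

n_clusters = 3
X_demo, _ = datasets.make_blobs(n_samples=200, centers=3, random_state=0)
print(np.bincount(spectral(X_demo)))  # size of each cluster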