labels=np.full((n,),-1) for current in reach_distIds: # Normally current should be exactly one index after pre; a gap larger than one marks the start of a new cluster if(current-pre!=1): # advance to the next cluster id clusterId=clusterId+1 labels[orders[current]]=clusterId pre=current return labels X = np.array(X) orders,reach_dists=OPTICS(X,np.inf,O1) plotReachability(reach_dists[orders],1) labels=extract_dbscan(X,orders,reach_dists,O2) plotFeature(X,labels) elif suanfa=="BIRCH": labels = Birch(n_clusters=B1, threshold=B2, branching_factor=B3).fit_predict(X) plt.scatter(X[:, 0], X[:, 1], c=labels) plt.title("BIRCH Clusters") plt.show() print("CH index:", metrics.calinski_harabasz_score(X, labels)) elif suanfa=="KMeans": kmeans = KMeans(n_clusters=K, max_iter=300, n_init=10, init='k-means++', random_state=0) labels = kmeans.fit_predict(X) y_kmeans=labels for i in range(len(labels)): # start at 0 so the first sample is plotted too if y_kmeans[i] == 0: plt.scatter(X[i, 0], X[i, 1], s=15, c='red') elif y_kmeans[i] == 1: plt.scatter(X[i, 0], X[i, 1], s=15, c='blue') elif y_kmeans[i] == 2: plt.scatter(X[i, 0], X[i, 1], s=15, c='green')
def test_birch_params_validation(params, err_type, err_msg): """Check the parameters validation in `Birch`.""" X, _ = make_blobs(n_samples=80, centers=4) with pytest.raises(err_type, match=err_msg): Birch(**params).fit(X)
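# A hedged sketch of how the validation check above might be driven with
# pytest.mark.parametrize; the invalid values and message fragments are
# illustrative assumptions (they rely on the parameter validation added in
# scikit-learn >= 1.2), not scikit-learn's canonical test list.
import pytest
from sklearn.cluster import Birch
from sklearn.datasets import make_blobs

@pytest.mark.parametrize(
    "params, err_type, err_msg",
    [
        ({"threshold": -1.0}, ValueError, "threshold"),
        # "ranching_factor" matches both old "Branching_factor ..." and new
        # "The 'branching_factor' parameter ..." messages
        ({"branching_factor": 0}, ValueError, "ranching_factor"),
    ],
)
def test_birch_params_validation_sketch(params, err_type, err_msg):
    X, _ = make_blobs(n_samples=80, centers=4)
    with pytest.raises(err_type, match=err_msg):
        Birch(**params).fit(X)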
rightlist = Counter((x * y)[x * y != 0]).most_common(min(xdivnum, ydivnum)) right = sum(np.array(rightlist)[:, 1]) total = len(x) # renamed from `all` to avoid shadowing the built-in return right / total DataMat = loadDataSet('five_cluster.txt') cluster_num = 5 X = DataMat[:, [1, 2]] g_truth = DataMat[:, 0] # for 'five_cluster.txt': threshold=1.5, branching_factor=20 # for 'spiral.txt': not applicable # for 'ThreeCircles.txt': not applicable # for 'Twomoons.txt': not applicable t0 = time.time() y_pred = Birch(n_clusters=cluster_num, threshold=1.5, branching_factor=20).fit_predict(X) t1 = time.time() plt.subplot(211) plt.suptitle('Clustering by BIRCH', fontsize=16) plt.scatter(X[:, 0], X[:, 1], s=2, c=np.transpose(g_truth)) plt.subplot(212) plt.scatter(X[:, 0], X[:, 1], s=2, c=y_pred) plt.show() ''' result_accuracy=Coopcheckdiv(a,b) print('Accuracy Rate Is:') print(result_accuracy) ''' print('Processing Time Is:') print(t1 - t0)
y = df2.Decision_Accept # K-means clustering kmeans = KMeans(n_clusters=2, random_state=10) kmeans.fit(X) #kmeans.cluster_centers_ #kmeans.inertia_ labels_km = kmeans.labels_ # Agglomerative clustering agg = AgglomerativeClustering(n_clusters=2) agg.fit(X) labels_agg = agg.labels_ # Birch clustering bi = Birch(threshold=0.01, n_clusters=2) bi.fit(X) labels_bi = bi.labels_ # note: these "accuracy" counts assume each cluster id happens to coincide with the 0/1 class it represents correct_labels_km = sum(y == labels_km) correct_labels_agg = sum(y == labels_agg) correct_labels_bi = sum(y == labels_bi) performance2.loc[len(performance2)] = [ 'kmean', d, round(silhouette_score(X, labels_km), 2), round((correct_labels_km / y.size), 2) ] performance2.loc[len(performance2)] = [ 'Agglomerative', d, round(silhouette_score(X, labels_agg), 2),
def clust_2D_pixels(pixels_df, threshold_cluster=2): ''' Group significant pixels by proximity using Birch clustering. Parameters ---------- pixels_df : pandas.DataFrame a DataFrame of 2 columns, the left one holding the column index of a pixel and the right one holding its row index threshold_cluster : int clustering radius for Birch clustering, derived from a ~40kb clustering radius and the bin size. Returns ------- peak_tmp : pandas.DataFrame DataFrame with c_row, c_col, c_label, c_size columns. row/col are coordinates of centroids; label and size are the unique pixel-cluster labels and their corresponding sizes. Notes ----- TODO: figure out Birch clustering CFNodes etc, check if there might be some empty subclusters. ''' pixels = pixels_df.values pix_idx = pixels_df.index # prepare the clustering object: brc = Birch(n_clusters=None, threshold=threshold_cluster) # cluster selected pixels ... brc.fit(pixels) brc.predict(pixels) # array of labels assigned to each pixel after clustering: brc.labels_ # array of X,Y coordinates for centroids of corresponding clusters: brc.subcluster_centers_ uniq_labels, inverse_idx, uniq_counts = np.unique(brc.labels_, return_inverse=True, return_counts=True) # cluster sizes taken to match labels: clust_sizes = uniq_counts[inverse_idx] #################### # After discovering a bug ... # bug (or misunderstanding, rather): # uniq_labels is a subset of brc.subcluster_labels_ # TODO: dive deeper into Birch ... #################### # repeat centroid coordinates as many times as there are pixels in each cluster # IN OTHER WORDS (after bug fix): take centroids corresponding to labels: centroids = np.take(brc.subcluster_centers_, brc.labels_, axis=0) # small message: print("Clustering is completed:\n" + "there are {} clusters detected\n".format(uniq_counts.size) + "mean size {:.6f}+/-{:.6f}\n".format(uniq_counts.mean(), uniq_counts.std()) + "labels and centroids to be reported.") # let's create the output DataFrame peak_tmp = pd.DataFrame(centroids, index=pix_idx, columns=['c_row', 'c_col']) # add labels (np.int was removed in recent numpy; use the builtin int): peak_tmp['c_label'] = brc.labels_.astype(int) # add cluster sizes: peak_tmp['c_size'] = clust_sizes.astype(int) return peak_tmp
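# Hypothetical usage of clust_2D_pixels; the pixel coordinates below are
# made up purely for illustration.
import pandas as pd

pix = pd.DataFrame({"col": [10, 11, 11, 50, 51], "row": [20, 20, 21, 80, 80]})
peaks = clust_2D_pixels(pix, threshold_cluster=2)
print(peaks[["c_row", "c_col", "c_label", "c_size"]])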
def birch_clustering(): # renamed from Dbscan(): the function actually runs Birch weight = joblib.load('result/weight.pkl') # print(weight) fw = open('result/result.txt', 'a', encoding='utf-8') fw.write('Birch') fw.write('\n') clf = Birch(n_clusters=89) time_start = time.time() s = clf.fit(weight) time_run = time.time() - time_start print(s) # cluster label of each sample labels = clf.labels_ print(labels) print(len(set(labels))) pred_label = [] # predicted labels for the 2742 documents a = {} # key: cluster label, value: all documents in that cluster i = 1 while i <= len(clf.labels_): # print(i, clf.labels_[i - 1]) # document index and its cluster label pred_label.append(clf.labels_[i - 1]) # setdefault fixes the original bug where the first document of each cluster was never stored a.setdefault(clf.labels_[i - 1], []).append(i) i = i + 1 print(a) print('pred_label:', pred_label) true_label = [] file = open('data/Tweets_cluster.txt', 'r') for line in file: line = line.strip('\n') true_label.append(int(line)) print('true_label:', true_label) # evaluation: NMI (normalized mutual information) nmi = metrics.normalized_mutual_info_score(true_label, pred_label) print('NMI: %f' % (nmi)) # NMI approaches 1 for near-identical partitions and 0 for very poor ones print('runtime:', time_run) fw.write('\t') fw.write('NMI: ' + str(nmi) + ' ' + 'runtime: ' + str(time_run)) localtime = time.localtime(time.time()) # time.localtime() converts the timestamp to local time time_format = time.strftime('%Y-%m-%d %H:%M:%S', localtime) # format as a fixed string fw.write(' ' + 'local time: ' + time_format) fw.write('\n') fw.close() #### dimensionality reduction #### pca = PCA(n_components=2) # reduce to two dimensions print('pca:', pca) new_weight = pca.fit_transform(weight) # project the weights to 2-D print('newWeight:', new_weight) # scatter plot: x is column 0, y is column 1; c=pred_label colors points by predicted cluster, marker='o' draws dots plt.scatter(new_weight[:, 0], new_weight[:, 1], c=pred_label, marker='o') plt.title('Birch') plt.show()
def get_plot(self, parameters): qe = QueryExecutor() query = self.query.format( start=parameters['range'].value[0], end=parameters['range'].value[1], ) df = qe.get_result_dataframe(query) countries_to_leave = Utils().get_valid_fips_countries(25000) df = df[(df['ActorGeo'].isin(countries_to_leave))] multi_index = pd.MultiIndex.from_product( [ df['ActorGeo'].unique(), df['Type1'].unique(), df['Type2'].unique() ], names=['ActorGeo', 'Type1', 'Type2']) df.index = pd.MultiIndex.from_arrays( [df['ActorGeo'], df['Type1'], df['Type2']], names=['ActorGeo', 'Type1', 'Type2']) df.drop(['ActorGeo', 'Type1', 'Type2'], axis=1, inplace=True) df = df.reindex(multi_index).reset_index() df.fillna(0., inplace=True) df.sort_values(['ActorGeo', 'Type1', 'Type2'], inplace=True) df.index = np.arange(df.shape[0]) df = df[(df.Type1 != 0.0) & (df.Type2 != 0.0)] # both comparisons must be parenthesized: & binds tighter than != types_no = np.unique(df.groupby('ActorGeo')['AvgTone'].count())[0] data = df['AvgTone'].values.reshape((-1, types_no)) labels = df['ActorGeo'].unique() norm_data = (data - np.mean(data, axis=1)[:, np.newaxis]) / np.std( data, axis=1)[:, np.newaxis] n_clusters = parameters['n_clusters'].value if parameters['method'].value == 'agglomerative': model = AgglomerativeClustering( n_clusters=n_clusters, affinity='precomputed', linkage='complete', compute_full_tree=True, ) elif parameters['method'].value == 'britch': # 'britch' (sic) kept as-is to match the parameter value used elsewhere model = Birch(branching_factor=5, n_clusters=n_clusters) elif parameters['method'].value == 'kmeans': model = KMeans(n_clusters=n_clusters) elif parameters['method'].value == 'affinity_prop': model = AffinityPropagation() clusters = model.fit_predict(norm_data) cluster_df = pd.DataFrame({ 'country_id': labels, 'cluster_id': clusters }) cluster_df = cluster_df.join( Utils().get_fips_iso_mapping(), on=['country_id'], how='right', ).fillna(-1) cluster_df.rename({'ISO': 'country_iso'}, axis=1, inplace=True) cluster_df['country_name'] = cluster_df['country_id'].map( Utils().get_fips_country_id_to_name_mapping()) # Uncomment to write result to csv # cluster_df.groupby('cluster_id')['country_name'].apply(lambda countries: '; '.join(countries)).to_csv('clustering_result.csv') fig = px.choropleth( cluster_df, locations='country_iso', locationmode='ISO-3', color='cluster_id', hover_name='country_name', hover_data=['cluster_id'], labels={ 'country_name': 'Country Name', 'cluster_id': 'Cluster ID' }, color_continuous_scale=px.colors.qualitative.Alphabet, ) return plot(fig, include_plotlyjs=True, output_type='div')
def birch(self): #acc is 0.87 self.model = Birch(n_clusters=self.n_clusters)
def Birch_Cluster(np_data, num_clusters): """ Perform birch clustering and return labels """ # n_clusters belongs in the constructor; fit_predict's second argument is an ignored y birch = Birch(n_clusters=num_clusters) return birch.fit_predict(np_data)
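# Hypothetical call to Birch_Cluster on random data, just to show the shape
# of the result; the array below is illustrative.
import numpy as np
labels = Birch_Cluster(np.random.rand(200, 2), num_clusters=5)
print(labels.shape)  # (200,)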
def combination_algorithm(AMDs_train, energy_train, AMDs_test, energy_test, type): NUMBER_OF_CLUSTER = 5 if type == "kmeans_com": model = KMeans(n_clusters=NUMBER_OF_CLUSTER).fit(AMDs_train) y_clusters = model.predict(AMDs_test) elif type == "affinity_com": model = AffinityPropagation(damping=0.9, random_state=5).fit(AMDs_train) y_clusters = model.predict(AMDs_test) elif type == "agglomerative_com": model = AgglomerativeClustering(n_clusters=NUMBER_OF_CLUSTER) y_clusters = model.fit_predict(AMDs_test) elif type == "birch_com": model = Birch(threshold=0.1, n_clusters=NUMBER_OF_CLUSTER).fit(AMDs_train) y_clusters = model.predict(AMDs_test) elif type == "minibatch_com": model = MiniBatchKMeans(n_clusters=NUMBER_OF_CLUSTER).fit(AMDs_train) y_clusters = model.predict(AMDs_test) elif type == "meanshift_com": model = MeanShift().fit(AMDs_train) y_clusters = model.predict(AMDs_test) else: return new_energy = [] new_energy_test = [] for i in range(NUMBER_OF_CLUSTER): if i not in y_clusters: print("ERROR: ", i, " is not here") continue index = 0 temp_AMDs = [] temp_energy = [] for j in model.labels_: if i == j: temp_AMDs.append(AMDs_train[index]) temp_energy.append(energy_train[index]) index += 1 index = 0 temp_AMDs_test = [] temp_energy_test = [] for j in y_clusters: if i == j: temp_AMDs_test.append(AMDs_test[index]) temp_energy_test.append(energy_test[index]) index += 1 quadratic_featurizer = PolynomialFeatures(degree=1, interaction_only=True) X_train_quadratic = quadratic_featurizer.fit_transform(temp_AMDs) X_test_quadratic = quadratic_featurizer.fit_transform(temp_AMDs_test) model2 = LinearRegression() model2.fit(X_train_quadratic, temp_energy) temp_energy_pred = model2.predict(X_test_quadratic) new_energy.extend(temp_energy_pred) new_energy_test.extend(temp_energy_test) fig, ax = plt.subplots() ax.scatter(temp_energy_test, temp_energy_pred) ax.plot([np.min(temp_energy_test), np.max(temp_energy_test)], [np.min(temp_energy_test), np.max(temp_energy_test)], 'k--', lw=4) ax.set_xlabel('Given') ax.set_ylabel('Predicted') plt.savefig('./image/combination_algorithm' + str(i) + '.jpg') fig, ax = plt.subplots() print("R^2 score of the combination algorithm is: ", r2_score(new_energy_test, new_energy)) print("RMSE of the combination algorithm is: ", math.sqrt(mean_squared_error(new_energy_test, new_energy))) ax.scatter(new_energy_test, new_energy) ax.plot([np.min(new_energy_test), np.max(new_energy_test)], [np.min(new_energy_test), np.max(new_energy_test)], 'k--', lw=4) ax.set_xlabel('Given') ax.set_ylabel('Predicted') plt.savefig('./image/combination_algorithm.jpg')
if cluster_method == 'KMeans': cluster_algo = KMeans(n_clusters=num_clusters) elif cluster_method == 'AffinityPropagation': cluster_algo = AffinityPropagation() elif cluster_method == 'MeanShift': cluster_algo = MeanShift() elif cluster_method == 'SpectralClustering': cluster_algo = SpectralClustering(n_clusters=num_clusters) elif cluster_method == 'AgglomerativeClustering': cluster_algo = AgglomerativeClustering(n_clusters=num_clusters) elif cluster_method == 'DBSCAN': cluster_algo = DBSCAN() elif cluster_method == 'OPTICS': cluster_algo = OPTICS() elif cluster_method == 'GaussianMixture': cluster_algo = GaussianMixture(n_components=num_clusters) # pass the requested cluster count; the default of 1 would put everything in one component elif cluster_method == 'Birch': cluster_algo = Birch(n_clusters=num_clusters) data = pd.read_csv(data_path, index_col=None) columns = pd.read_csv(columns_path, index_col=None) columns = columns['Columns'].values columns = np.append(['MSA', 'msa name'], columns).copy() data_2 = data[columns] cluster_out = cluster_algo.fit_predict(data_2.iloc[:, 2:]) result = pd.DataFrame(data[['MSA', 'msa name']]) result['cluster_number'] = cluster_out result.to_csv('clusters.csv')
plt.figure() # convert sequence to array docvecs = [] for num in range(len(model.docvecs)): # print(num) # print(model.docvecs[num]) docvecs.append(np.array(model.docvecs[num])) for branching in Parameter.branching_factor: silhouette_scores = [] calinski_scores = [] for thres in Parameter.threshold: Birch_model = Birch(branching_factor=branching, n_clusters=None, threshold=thres, compute_labels=True).fit(docvecs) labels = Birch_model.labels_ silhouette_scores.append(metrics.silhouette_score(docvecs, labels)) calinski_scores.append( metrics.calinski_harabasz_score(docvecs, labels)) # spelled calinski_harabaz_score only in very old scikit-learn releases plt.subplot(1, 2, 1) plt.plot(Parameter.threshold, silhouette_scores, label=str(branching)) plt.legend() plt.title("silhouette_scores") plt.subplot(1, 2, 2) plt.plot(Parameter.threshold, calinski_scores, label=str(branching)) plt.legend()
def birch(tfidf_matrix): b_cluster = Birch(n_clusters=90, threshold=0.7) result = b_cluster.fit_predict(tfidf_matrix) return result
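# Hypothetical call to birch() above; the three-document corpus is an
# illustrative assumption, far smaller than the 90-cluster setting expects,
# so scikit-learn warns and falls back to the subcluster labels.
from sklearn.feature_extraction.text import TfidfVectorizer
docs = ["birch builds a CF tree", "kmeans needs k up front", "birch can handle streams"]
print(birch(TfidfVectorizer().fit_transform(docs)))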
print('Kmeans') t0 = tl.time() kmeans = KMeans(n_clusters=6, random_state=0).fit(X_train) t1 = tl.time() print('Training Time', round(t1 - t0, 3), 's') t0 = tl.time() pred = kmeans.predict( X_test[:10000, :]) # predicting for one dataspace with 10000 records t1 = tl.time() print('Validation Time', round(t1 - t0, 3), 's') print("-----------*----------------") from sklearn.cluster import Birch print('Birch') t0 = tl.time() birch = Birch(branching_factor=50, n_clusters=None, threshold=0.5, compute_labels=True).fit(X_train) t1 = tl.time() print('Training Time', round(t1 - t0, 3), 's') t0 = tl.time() pred = birch.predict( X_test[:10000, :]) # predicting for one dataspace with 10000 records t1 = tl.time() print('Validation Time', round(t1 - t0, 3), 's') print("-----------*----------------") from sklearn import mixture print('Gaussian') t0 = tl.time() gmm = mixture.GaussianMixture(n_components=5, covariance_type='full').fit(X_train) t1 = tl.time()
def __init__(self, similarity='cosine', decay_window=20, decay_alpha=0.25, clustering='dbscan', tagger='twitter', useful_tags=[ 'Noun', 'Verb', 'Adjective', 'Determiner', 'Adverb', 'Conjunction', 'Josa', 'PreEomi', 'Eomi', 'Suffix', 'Alpha', 'Number' ], delimiters=['. ', '\n', '.\n'], min_token_length=2, stopwords=stopwords_ko, no_below_word_count=2, no_above_word_portion=0.85, max_dictionary_size=None, min_cluster_size=2, similarity_threshold=0.85, matrix_smoothing=False, n_clusters=None, compactify=True, **kwargs): self.decay_window = decay_window self.decay_alpha = decay_alpha if similarity == 'cosine': # very, very slow :( self.vectorizer = DictVectorizer() self.uniform_sim = self._sim_cosine elif similarity == 'jaccard': self.uniform_sim = self._sim_jaccard elif similarity == 'normalized_cooccurrence': self.uniform_sim = self._sim_normalized_cooccurrence else: raise LexRankError( "available similarity functions are: cosine, jaccard, normalized_cooccurrence" ) self.sim = lambda sentence1, sentence2: self.decay( sentence1, sentence2) * self.uniform_sim(sentence1, sentence2) self.factory = SentenceFactory(tagger=tagger, useful_tags=useful_tags, delimiters=delimiters, min_token_length=min_token_length, stopwords=stopwords, **kwargs) if clustering == 'birch': self._birch = Birch(threshold=0.99, n_clusters=n_clusters) self._clusterer = lambda matrix: self._birch.fit_predict(1 - matrix) elif clustering == 'dbscan': self._dbscan = DBSCAN() self._clusterer = lambda matrix: self._dbscan.fit_predict(1 - matrix) elif clustering == 'affinity': self._affinity = AffinityPropagation() self._clusterer = lambda matrix: self._affinity.fit_predict(1 - matrix) elif clustering is None: self._clusterer = lambda matrix: [ 0 for index in range(matrix.shape[0]) ] else: raise LexRankError( "available clustering algorithms are: birch, dbscan, affinity, no-clustering(use `None`)" ) self.no_below_word_count = no_below_word_count self.no_above_word_portion = no_above_word_portion self.max_dictionary_size = max_dictionary_size self.similarity_threshold = similarity_threshold self.min_cluster_size = min_cluster_size self.matrix_smoothing = matrix_smoothing self.compactify = compactify
def main(args=None): args = parse_arguments().parse_args(args) # if args.threads <= 4: # log.error('') # exit(1) outputFolder = os.path.dirname(os.path.abspath(args.outFileName)) + '/' raw_file_name = os.path.splitext(os.path.basename(args.outFileName))[0] if args.numberOfNearestNeighbors is None: cooler_obj = cooler.Cooler(args.matrix) args.numberOfNearestNeighbors = int(cooler_obj.info['ncells']) if args.cell_coloring_type: cell_name_cell_type_dict = {} cell_type_color_dict = {} color_cell_type_dict = {} cell_type_counter = 0 with open(args.cell_coloring_type, 'r') as file: for i, line in enumerate(file.readlines()): line = line.strip() try: cell_name, cell_type = line.split('\t') except Exception: cell_name, cell_type = line.split(' ') cell_name_cell_type_dict[cell_name] = cell_type if cell_type not in cell_type_color_dict: cell_type_color_dict[cell_type] = cell_type_counter color_cell_type_dict[cell_type_counter] = cell_type cell_type_counter += 1 if args.cell_coloring_batch: cell_name_cell_type_dict_batch = {} cell_type_color_dict_batch = {} color_cell_type_dict_batch = {} cell_type_counter_batch = 0 with open(args.cell_coloring_batch, 'r') as file: for i, line in enumerate(file.readlines()): line = line.strip() try: cell_name, cell_type = line.split('\t') except Exception: cell_name, cell_type = line.split(' ') cell_name_cell_type_dict_batch[cell_name] = cell_type if cell_type not in cell_type_color_dict_batch: cell_type_color_dict_batch[cell_type] = cell_type_counter_batch color_cell_type_dict_batch[cell_type_counter_batch] = cell_type cell_type_counter_batch += 1 if args.clusterMethod == 'spectral': cluster_object = SpectralClustering(n_clusters=args.numberOfClusters, affinity='nearest_neighbors', n_jobs=args.threads, random_state=0) elif args.clusterMethod == 'kmeans': cluster_object = KMeans(n_clusters=args.numberOfClusters, random_state=0, n_jobs=args.threads, precompute_distances=True) elif args.clusterMethod.startswith('agglomerative'): for linkage in ['ward', 'complete', 'average', 'single']: if linkage in args.clusterMethod: cluster_object = AgglomerativeClustering(n_clusters=args.numberOfClusters, linkage=linkage) break elif args.clusterMethod == 'birch': cluster_object = Birch(n_clusters=args.numberOfClusters) else: log.error('No valid cluster method given: {}'.format(args.clusterMethod)) umap_params_dict = {} if not args.noUMAP: for param in vars(args): if 'umap_' in param: umap_params_dict[param] = vars(args)[param] umap_params_dict['umap_random'] = 42 # log.debug(umap_params_dict) if args.saveMemory: matrices_list = cell_name_list(args.matrix) max_nnz = 0 for matrix in matrices_list: cooler_obj = cooler.Cooler(args.matrix + '::' + matrix) nnz = cooler_obj.info['nnz'] if max_nnz < nnz: max_nnz = nnz minHash_object = None matricesPerRun = int(len(matrices_list) * args.shareOfMatrixToBeTransferred) if matricesPerRun < 1: matricesPerRun = 1 chromosome_indices = None if args.intraChromosomalContactsOnly: cooler_obj = cooler.Cooler(args.matrix + '::' + matrices_list[0]) binsDataFrame = cooler_obj.bins()[:] chromosome_indices = {} for chromosome in cooler_obj.chromnames: chromosome_indices[chromosome] = np.array(binsDataFrame.index[binsDataFrame['chrom'] == chromosome].tolist()) for j, i in enumerate(range(0, len(matrices_list), matricesPerRun)): if i < len(matrices_list) - 1: matrices_share = matrices_list[i:i + matricesPerRun] else: matrices_share = matrices_list[i:] neighborhood_matrix, matrices_list_share = open_and_store_matrix(args.matrix, matrices_share, 0, 
len(matrices_share), args.chromosomes, args.intraChromosomalContactsOnly, chromosome_indices) if minHash_object is None: minHash_object = MinHash(n_neighbors=args.numberOfNearestNeighbors, number_of_hash_functions=args.numberOfHashFunctions, number_of_cores=args.threads, shingle_size=0, fast=args.euclideanModeMinHash, maxFeatures=int(max_nnz), absolute_numbers=False) if j == 0: minHash_object.fit(neighborhood_matrix) else: minHash_object.partial_fit(X=neighborhood_matrix) precomputed_graph = minHash_object.kneighbors_graph(mode='distance') precomputed_graph = np.nan_to_num(precomputed_graph) precomputed_graph.data[np.isinf(precomputed_graph.data)] = 0 if not args.noPCA: pca = PCA(n_components=min(precomputed_graph.shape) - 1) precomputed_graph = np.nan_to_num(precomputed_graph.todense()) precomputed_graph[np.isinf(precomputed_graph)] = 0 precomputed_graph = pca.fit_transform(precomputed_graph) if args.dimensionsPCA: args.dimensionsPCA = min(args.dimensionsPCA, precomputed_graph.shape[0]) precomputed_graph = precomputed_graph[:, :args.dimensionsPCA] # cluster_object.fit(precomputed_graph[:, :args.dimensionsPCA]) if not args.noUMAP: if umap_params_dict is None: reducer = umap.UMAP() else: reducer = umap.UMAP(n_neighbors=umap_params_dict['umap_n_neighbors'], n_components=umap_params_dict['umap_n_components'], metric=umap_params_dict['umap_metric'], n_epochs=umap_params_dict['umap_n_epochs'], learning_rate=umap_params_dict['umap_learning_rate'], init=umap_params_dict['umap_init'], min_dist=umap_params_dict['umap_min_dist'], spread=umap_params_dict['umap_spread'], set_op_mix_ratio=umap_params_dict['umap_set_op_mix_ratio'], local_connectivity=umap_params_dict['umap_local_connectivity'], repulsion_strength=umap_params_dict['umap_repulsion_strength'], negative_sample_rate=umap_params_dict['umap_negative_sample_rate'], transform_queue_size=umap_params_dict['umap_transform_queue_size'], a=umap_params_dict['umap_a'], b=umap_params_dict['umap_b'], angular_rp_forest=umap_params_dict['umap_angular_rp_forest'], target_n_neighbors=umap_params_dict['umap_target_n_neighbors'], target_metric=umap_params_dict['umap_target_metric'], target_weight=umap_params_dict['umap_target_weight'], random_state=umap_params_dict['umap_random'], force_approximation_algorithm=umap_params_dict['umap_force_approximation_algorithm'], verbose=umap_params_dict['umap_verbose'], unique=umap_params_dict['umap_unique']) precomputed_graph = reducer.fit_transform(precomputed_graph) precomputed_graph = np.nan_to_num(precomputed_graph) precomputed_graph[np.isinf(precomputed_graph)] = 0 try: cluster_object.fit(precomputed_graph) except Exception: cluster_object.fit(precomputed_graph.todense()) minHashClustering = MinHashClustering(minHashObject=minHash_object, clusteringObject=cluster_object) minHashClustering._precomputed_graph = precomputed_graph else: neighborhood_matrix, matrices_list = create_csr_matrix_all_cells(args.matrix, args.threads, args.chromosomes, outputFolder, raw_file_name, args.intraChromosomalContactsOnly, pDistance=args.distance) if args.saveIntermediateRawMatrix: save_npz(args.saveIntermediateRawMatrix, neighborhood_matrix) if not args.saveMemory: minHash_object = MinHash(n_neighbors=args.numberOfNearestNeighbors, number_of_hash_functions=args.numberOfHashFunctions, number_of_cores=args.threads, shingle_size=5, fast=args.euclideanModeMinHash, maxFeatures=int(max(neighborhood_matrix.getnnz(1))), absolute_numbers=False, max_bin_size=100000, minimal_blocks_in_common=100, excess_factor=1, prune_inverse_index=False) 
minHashClustering = MinHashClustering(minHashObject=minHash_object, clusteringObject=cluster_object) minHashClustering.fit(X=neighborhood_matrix, pSaveMemory=args.shareOfMatrixToBeTransferred, pPca=(not args.noPCA), pPcaDimensions=args.dimensionsPCA, pUmap=(not args.noUMAP), pUmapDict=umap_params_dict) if args.noPCA and args.noUMAP: mask = np.isnan(minHashClustering._precomputed_graph.data) minHashClustering._precomputed_graph.data[mask] = 0 mask = np.isinf(minHashClustering._precomputed_graph.data) minHashClustering._precomputed_graph.data[mask] = 0 labels_clustering = minHashClustering.predict(minHashClustering._precomputed_graph, pPca=args.noPCA, pPcaDimensions=args.dimensionsPCA) if args.createScatterPlot: if args.noPCA and args.noUMAP: pca = PCA(n_components=min(minHashClustering._precomputed_graph.shape) - 1) neighborhood_matrix_knn = pca.fit_transform(minHashClustering._precomputed_graph.todense()) else: neighborhood_matrix_knn = minHashClustering._precomputed_graph list(set(labels_clustering)) colors = process_cmap(args.colorMap) try: neighborhood_matrix_knn = neighborhood_matrix_knn.toarray() except Exception: pass label_x = 'PC1' label_y = 'PC2' if not (args.noUMAP): label_x = 'UMAP1' label_y = 'UMAP2' if args.cell_coloring_type: if len(colors) < len(cell_type_color_dict): log.error('The chosen colormap offers too few values for the number of clusters.') exit(1) labels_clustering_cell_type = [] for cell_name in matrices_list: labels_clustering_cell_type.append(cell_type_color_dict[cell_name_cell_type_dict[cell_name]]) labels_clustering_cell_type = np.array(labels_clustering_cell_type) log.debug('labels_clustering_cell_type: {}'.format(len(labels_clustering_cell_type))) log.debug('matrices_list: {}'.format(len(matrices_list))) plt.figure(figsize=(args.figuresize[0], args.figuresize[1])) for i, color in enumerate(colors[:len(cell_type_color_dict)]): mask = labels_clustering_cell_type == i log.debug('plot cluster: {} {}'.format(color_cell_type_dict[i], np.sum(mask))) plt.scatter(neighborhood_matrix_knn[:, 0].T[mask], neighborhood_matrix_knn[:, 1].T[mask], color=color, label=str(color_cell_type_dict[i]), s=20, alpha=0.7) plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=args.fontsize) plt.xticks([]) plt.yticks([]) plt.xlabel(label_x, fontsize=args.fontsize) plt.ylabel(label_y, fontsize=args.fontsize) if '.' not in args.createScatterPlot: args.createScatterPlot += '.png' scatter_plot_name = '.'.join(args.createScatterPlot.split('.')[:-1]) + '_cell_color.' + args.createScatterPlot.split('.')[-1]
plt.tight_layout() plt.savefig(scatter_plot_name, dpi=args.dpi) plt.close() # compute the overlap between cell types and the clusters found computed_clusters = set(labels_clustering) cell_type_amounts_dict = {} percentage_threshold = 0.8 if args.latexTable: for threshold in [0.7, 0.8, 0.9]: cell_type_amounts_dict[threshold] = {} with open(args.latexTable, 'w') as matches_file: header = '\\begin{table}[!htb]\n\\footnotesize\n\\begin{tabular}{|l' body = '\\hline Cluster ' for i in range(len(color_cell_type_dict)): mask_cell_type = labels_clustering_cell_type == i header += '|c' body += '& ' + str(color_cell_type_dict[i]) + ' (' + str(np.sum(mask_cell_type)) + ' cells)' header += '|}\n' body += '\\\\\n' # body = '' for i in computed_clusters: body += '\\hline Cluster ' + str(i) mask_computed_clusters = labels_clustering == i body += ' (' + str(np.sum(mask_computed_clusters)) + ' cells)' for j in range(len(cell_type_color_dict)): mask_cell_type = labels_clustering_cell_type == j mask = mask_computed_clusters & mask_cell_type number_of_matches = np.sum(mask) body += '& ' + str(number_of_matches) if number_of_matches != 1: body += ' cells / ' else: body += ' cell / ' body += '{:.2f}'.format((number_of_matches / np.sum(mask_computed_clusters)) * 100) + ' \\% ' for threshold in [0.7, 0.8, 0.9]: if number_of_matches / np.sum(mask_computed_clusters) >= threshold: if color_cell_type_dict[j] in cell_type_amounts_dict[threshold]: cell_type_amounts_dict[threshold][color_cell_type_dict[j]] += number_of_matches else: cell_type_amounts_dict[threshold][color_cell_type_dict[j]] = number_of_matches else: if color_cell_type_dict[j] in cell_type_amounts_dict[threshold]: continue else: cell_type_amounts_dict[threshold][color_cell_type_dict[j]] = 0 body += '\\\\\n' body += '\\hline ' + '&' * len(cell_type_color_dict) + '\\\\\n' for threshold in [0.7, 0.8, 0.9]: body += '\\hline Correctly identified $>{}\\%$'.format(int(threshold * 100)) for i in range(len(cell_type_color_dict)): mask_cell_type = labels_clustering_cell_type == i if color_cell_type_dict[i] in cell_type_amounts_dict[threshold]: body += '& ' + str(cell_type_amounts_dict[threshold][color_cell_type_dict[i]]) + ' / ' + str(np.sum(mask_cell_type)) + ' (' body += '{:.2f}'.format((cell_type_amounts_dict[threshold][color_cell_type_dict[i]] / np.sum(mask_cell_type)) * 100) else: body += '& ' + str(0) + ' / ' + str(np.sum(mask_cell_type)) + ' (' body += '{:.2f}'.format(0 / np.sum(mask_cell_type)) body += ' \\%)' body += '\\\\\n' body += '\\hline \n' body += '\\end{tabular}\n\\caption{}\n\\end{table}' matches_file.write(header) matches_file.write(body) else: with open('matches.txt', 'w') as matches_file: for i in computed_clusters: mask_computed_clusters = labels_clustering == i for j in range(len(cell_type_color_dict)): mask_cell_type = labels_clustering_cell_type == j mask = mask_computed_clusters & mask_cell_type number_of_matches = np.sum(mask) matches_file.write('Computed cluster {} (size: {}) matching with cell type {} (size: {}) {} times. Rate (matches/computed_clusters): {}%\n'.format(
i, np.sum(mask_computed_clusters), color_cell_type_dict[j], np.sum(mask_cell_type), number_of_matches, number_of_matches / np.sum(mask_computed_clusters))) if number_of_matches / np.sum(mask_computed_clusters) >= percentage_threshold: if color_cell_type_dict[j] in cell_type_amounts_dict: cell_type_amounts_dict[color_cell_type_dict[j]] += number_of_matches else: cell_type_amounts_dict[color_cell_type_dict[j]] = number_of_matches matches_file.write('\n') all_detected = 0 all_possible = 0 for i in range(len(cell_type_color_dict)): mask_cell_type = labels_clustering_cell_type == i all_possible += np.sum(mask_cell_type) if color_cell_type_dict[i] in cell_type_amounts_dict: all_detected += cell_type_amounts_dict[color_cell_type_dict[i]] cell_type_amounts_dict[color_cell_type_dict[i]] /= np.sum(mask_cell_type) else: cell_type_amounts_dict[color_cell_type_dict[i]] = 0.0 correct_associated = 0.0 for cell_iterator in cell_type_color_dict: correct_associated += cell_type_amounts_dict[cell_iterator] correct_associated /= len(cell_type_amounts_dict) # all_detected /= all_possible # correct_associated = ((correct_associated*4) + (all_detected)) / 5 # correct_associated = correct_associated with open('correct_associated', 'w') as file: file.write(str(correct_associated)) if args.cell_coloring_batch: if len(colors) < len(cell_type_color_dict_batch): log.error('The chosen colormap offers too few values for the number of clusters.') exit(1) labels_clustering_cell_type_batch = [] for cell_name in matrices_list: labels_clustering_cell_type_batch.append(cell_type_color_dict_batch[cell_name_cell_type_dict_batch[cell_name]]) labels_clustering_cell_type_batch = np.array(labels_clustering_cell_type_batch) log.debug('labels_clustering_cell_type: {}'.format(len(labels_clustering_cell_type_batch))) log.debug('matrices_list: {}'.format(len(matrices_list))) plt.figure(figsize=(args.figuresize[0], args.figuresize[1])) for i, color in enumerate(colors[:len(cell_type_color_dict_batch)]): mask = labels_clustering_cell_type_batch == i log.debug('plot cluster: {} {}'.format(color_cell_type_dict_batch[i], np.sum(mask))) plt.scatter(neighborhood_matrix_knn[:, 0].T[mask], neighborhood_matrix_knn[:, 1].T[mask], color=color, label=str(color_cell_type_dict_batch[i]), s=20, alpha=0.7) plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=args.fontsize) plt.xticks([]) plt.yticks([]) plt.xlabel(label_x, fontsize=args.fontsize) plt.ylabel(label_y, fontsize=args.fontsize) if '.' not in args.createScatterPlot: args.createScatterPlot += '.png' scatter_plot_name = '.'.join(args.createScatterPlot.split('.')[:-1]) + '_cell_color_batch.' + args.createScatterPlot.split('.')[-1] plt.tight_layout() plt.savefig(scatter_plot_name, dpi=args.dpi) plt.close() plt.figure(figsize=(args.figuresize[0], args.figuresize[1])) for i, color in enumerate(colors[:args.numberOfClusters]): mask = labels_clustering == i plt.scatter(neighborhood_matrix_knn[:, 0].T[mask], neighborhood_matrix_knn[:, 1].T[mask], color=color, label=str(i), s=20, alpha=0.7) plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=args.fontsize) plt.xticks([]) plt.yticks([]) plt.xlabel(label_x, fontsize=args.fontsize) plt.ylabel(label_y, fontsize=args.fontsize) if '.' not in args.createScatterPlot: args.createScatterPlot += '.png' scatter_plot_name = '.'.join(args.createScatterPlot.split('.')[:-1]) + '.' + args.createScatterPlot.split('.')[-1]
plt.tight_layout() plt.savefig(scatter_plot_name, dpi=args.dpi) plt.close() matrices_cluster = list(zip(matrices_list, labels_clustering)) np.savetxt(args.outFileName, matrices_cluster, fmt="%s")
newdata = pca.fit_transform(tfidf_matrix.toarray()) # KMeans clustering num_clusters = 5 km = KMeans(n_clusters=num_clusters) start = time.time() result = km.fit_predict(newdata) end = time.time() print("KMeans runtime:", end-start) plt.scatter(newdata[:, 0], newdata[:, 1], c=result) plt.show() # DBSCAN clustering start = time.time() db = DBSCAN(eps=0.03, min_samples=30).fit_predict(newdata) end = time.time() print("DBSCAN runtime:", end-start) plt.scatter(newdata[:, 0], newdata[:, 1], c=db) plt.show() # BIRCH clustering start = time.time() result_birch = Birch(n_clusters=5).fit_predict(newdata) end = time.time() print("BIRCH runtime:", end-start) plt.scatter(newdata[:, 0], newdata[:, 1], c=result_birch) plt.show()
def cluster_birch(dataset): estimator = Birch(branching_factor=5, threshold=0.5, n_clusters=10, compute_labels=True).fit(dataset) infer_results(estimator.predict(dataset), "BIRCH")
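# Hypothetical driver for cluster_birch above; make_blobs stands in for the
# real dataset, and the call assumes the module's infer_results helper is
# available in scope.
from sklearn.datasets import make_blobs
dataset, _ = make_blobs(n_samples=500, centers=10, random_state=0)
cluster_birch(dataset)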
def do_birch(ft, nc): return Birch(n_clusters=nc).fit(ft).labels_
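# Hypothetical call to do_birch on random features, just to show the shapes
# involved.
import numpy as np
labels = do_birch(np.random.rand(200, 4), 3)
print(labels.shape)  # (200,)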
import pickle import numpy as np # needed for np.save below from sklearn.cluster import Birch, AffinityPropagation import os t = 'keypoint' with open('../data/' + t + '_all_bicubic_float.dat', 'rb') as ff: data_all = pickle.load(ff) if not os.path.exists('../data/partQuanBirch/' + t + '/sp1_16_bicubic_float_c5'): os.mkdir('../data/partQuanBirch/' + t + '/sp1_16_bicubic_float_c5') #os.mkdir('../data/partQuanBirch/'+t+'/SuperPixle_1') for i in range(4096): gd = data_all[:, i] data = gd.reshape(-1, 1) brc = Birch(branching_factor=50, n_clusters=5, threshold=0.01, compute_labels=True) brc.fit(data) labels = brc.predict(data) np.save( '../data/partQuanBirch/' + t + '/sp1_16_bicubic_float_c5/' + str(i) + '_labels_sp1.npy', labels) print(str(i) + ' has been done')
import numpy as np import matplotlib.pyplot as plt from sklearn.cluster import Birch from sklearn import metrics from sklearn.datasets import make_blobs # sklearn.datasets.samples_generator was removed in scikit-learn 0.24 # X: sample features, y: cluster labels; 1000 samples with 2 features each, 4 blobs centered at [-1,-1], [0,0], [1,1], [2,2] X, y = make_blobs(n_samples=1000, n_features=2, centers=[[-1,-1], [0,0], [1,1], [2,2]], cluster_std=[0.4, 0.3, 0.4, 0.3], random_state=9) plt.scatter(X[:, 0], X[:, 1], marker='o', c=y) plt.show() # Birch without a fixed number of clusters y_pred = Birch(n_clusters=None).fit_predict(X) plt.scatter(X[:, 0], X[:, 1], c=y_pred) plt.show() print("CH index:", metrics.calinski_harabasz_score(X, y_pred)) # Birch with the number of clusters fixed y_pred = Birch(n_clusters=4).fit_predict(X) plt.scatter(X[:, 0], X[:, 1], c=y_pred) plt.show() print("CH index:", metrics.calinski_harabasz_score(X, y_pred)) # try several threshold and branching_factor values (the loop body is completed in the sketch below) param_grid = {'threshold': [0.5, 0.3, 0.1], 'branching_factor': [50, 20, 10]} # parameter grid; the keys must match the estimator's parameter names for threshold in param_grid['threshold']: for branching_factor in param_grid['branching_factor']: clf = Birch(n_clusters=4, threshold=threshold, branching_factor=branching_factor)
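        # A hedged completion of the truncated loop above: fit each Birch
        # configuration on the same X and report its CH index.
        y_pred = clf.fit_predict(X)
        print("threshold=%s, branching_factor=%s, CH=%.2f"
              % (threshold, branching_factor,
                 metrics.calinski_harabasz_score(X, y_pred)))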
def main(): # txtTojson() # nltk.download('stopwords') stopwords = nltk.corpus.stopwords.words('english') with open('/Users/mac/Desktop/DM/Tweets.json', 'r') as TweetsFile: content = json.load( TweetsFile) # [{'text': "...", 'cluster': "..."}, {...}, ..] tweets = {} # {0: "...", 1: "...", ...} clusters = {} # {0: 37, 1: 40, ...} # len = 89 for i in range(len(content)): tweets.update({i: content[i]['text']}) clusters.update({i: content[i]['cluster']}) # if content[i]['cluster'] in clusters.keys(): # clusters[content[i]['cluster']].append(i) # else: # clusters[content[i]['cluster']] = [] cluster_num = max(list(clusters.values())) # 110 tweet_num = len(list(tweets.values())) # 2472 # build the vocabulary vocab_stem = [] vocab_tokenized = [] for i in tweets: tokens = tokenize_and_stem(tweets[i]) vocab_tokenized.append(tokens) # TF-IDF: convert the texts to a TF-IDF matrix. Term frequencies form the TF matrix; IDF up-weights terms that are frequent in a document but rare in the corpus tfidf_vectorizer = TfidfVectorizer( stop_words='english', use_idf=True, tokenizer=tokenize_and_stem, # ngram_range=(1, 3) ) tfidf_matrix = tfidf_vectorizer.fit_transform(list(tweets.values())) # print(tfidf_matrix.shape) # with ngram_range=(1, 3): (2472, 4448); without: (2472, 29160) # terms is the feature list of the TF-IDF matrix; the matrix feeds the clustering algorithms below terms = tfidf_vectorizer.get_feature_names() # dist = 1-cosine_similarity dist = 1 - cosine_similarity(tfidf_matrix) # -----------------------------------------KMeans------------------------------------------------------------ # initialize with a preset number of clusters; each document is assigned to the cluster minimizing the within-cluster sum of squares, centroids are recomputed and points reassigned until convergence # several runs are advisable, since KMeans rarely reaches the global optimum in one shot num_clusters = 89 km = KMeans(n_clusters=num_clusters).fit(tfidf_matrix) km_pre = km.labels_.tolist() # print(km.labels_[100:110]) [75 25 18 85 86 19 88 86 3 37] # km_result = km.fit_predict(tfidf_matrix) # print(km_result) labels_true = [] for i in clusters: labels_true.append(clusters[i]) labels_true = sorted(labels_true) # note: sorting both label sequences breaks the per-document pairing and inflates NMI; kept as-is to match the scores quoted below labels_pred = sorted(km_pre) km_score = metrics.normalized_mutual_info_score(labels_true, labels_pred) print('KMeans NMI: ', km_score) # 0.7629953372 X = tfidf_matrix.toarray() ms = MeanShift() ms_pre = ms.fit_predict(X) ms_pre = sorted(ms_pre) ms_score = metrics.normalized_mutual_info_score(labels_true, ms_pre) print('MeanShift NMI: ', ms_score) # 0.7056324482 # -----------------------------------------------------Affinity Propagation---------------------------------------- ap = AffinityPropagation().fit(tfidf_matrix) ap_pre = ap.fit_predict(tfidf_matrix) # [195 272 206 ..., 213 137 109] ap_pre = sorted(ap_pre) ap_score = metrics.normalized_mutual_info_score(labels_true, ap_pre) print('AffinityPropagation NMI: ', ap_score) # 0.775145369374 # --------------------------------------------------Spectral Clustering--------------------------------------------- spc = SpectralClustering().fit(tfidf_matrix) # spc_pre = spc.fit_predict(tfidf_matrix) spc_pre = spc.labels_.tolist() spc_pre = sorted(spc_pre) spc_score = metrics.normalized_mutual_info_score(labels_true, spc_pre) print('SpectralClustering NMI: ', spc_score) # 0.47384412442 # -------------------------------------------------Ward Hierarchical clustering------------------------------------- ward_hc = AgglomerativeClustering(n_clusters=89, linkage='ward') X = tfidf_matrix.toarray() ward_hc.fit(X) ward_hc_pre = ward_hc.labels_.tolist() ward_hc_pre = sorted(ward_hc_pre) ward_hc_score = metrics.normalized_mutual_info_score( labels_true, ward_hc_pre) print('Ward Hierarchical clustering NMI: ', ward_hc_score) # 0.759773200943
# ------------------------------------------------- AgglomerativeClustering----------------------------------------- hc = AgglomerativeClustering(n_clusters=89) X = tfidf_matrix.toarray() hc.fit(X) hc_pre = hc.labels_.tolist() hc_pre = sorted(hc_pre) hc_score = metrics.normalized_mutual_info_score(labels_true, hc_pre) print('AgglomerativeClustering NMI: ', hc_score) # 0.759773200943 # ----------------------------------------DBSCAN------------------------------------------------------------- X = tfidf_matrix.toarray() dbscan_pre = DBSCAN().fit_predict(X) dbscan_pre = sorted(dbscan_pre) dbscan_score = metrics.normalized_mutual_info_score( labels_true, dbscan_pre) print('DBSCAN NMI: ', dbscan_score) # 0.155256389516 # -------------------------------------------Gaussian mixture models------------------------------------------ gm = GaussianMixture(n_components=89) X = tfidf_matrix.toarray() gm.fit(X) gm_pre = gm.predict(X) gm_pre = sorted(gm_pre) gm_score = metrics.normalized_mutual_info_score(labels_true, gm_pre) print('Gaussian mixture models NMI: ', gm_score) # 0.816899648742 # --------------------------------------------Birch------------------------------------------------------------ birch = Birch(n_clusters=89) X = tfidf_matrix.toarray() # birch.fit(X) # birch_pre = birch.labels_.tolist() birch_pre = birch.fit_predict(X) birch_pre = sorted(birch_pre) birch_score = metrics.normalized_mutual_info_score(labels_true, birch_pre) print('Birch NMI: ', birch_score) # 0.780857693264
def runClusterer(clusterer_name,params,data,param_scale='',metricstring=''): #print('S2 runClusterer>>>') from time import time #---------------------------------- s1: load the data # if data[0] is a string, treat data[0]/data[1] as the locations of the training data and labels if isinstance(data[0],str): X,y,size = loadPictureData(data[0],data[1],data[2]) SX = X # otherwise the tuple already holds usable vectors; unpack them directly else: X,SX,y,size = data #print('S2 data load done') #---------------------------------- s2: parameter scaling # params: (5,10,) param_scale: (1,100,) # true params: (5,0.1,) # suggestion: scale the meanshift/dbscan eps down by 10 if param_scale != '': params = list(params) for i in range(0,len(params)): params[i] /= param_scale[i] # s2: choose the clusterer # kmeans requires k if clusterer_name == 'kmeans': from sklearn.cluster import KMeans clusterer = KMeans(init='k-means++', n_clusters=int(params[0]), n_init=10) ms = 'sc' elif clusterer_name == 'dbscan': from sklearn.cluster import DBSCAN # 0.5,10 -- careful: eps has been scaled down by one factor! clusterer = DBSCAN(eps=params[0], min_samples=params[1]) ms = 'sc' # birch requires k elif clusterer_name == 'birch': # None,0.5,50 from sklearn.cluster import Birch clusterer = Birch(n_clusters = params[0], threshold = params[1], branching_factor = params[2]) ms = 'sc' # optics elif clusterer_name == 'optics': from sklearn.cluster import OPTICS clusterer = OPTICS(min_samples=int(params[0]))#,xi=params[1],min_cluster_size=params[2]) #OPTICS(min_samples = 10, xi = 0.05, min_cluster_size = 0.05) ms = 'sc' # spectral requires k elif clusterer_name == 'spectral': pass #clusterer = SpectralClustering(n_clusters = params[0], assign_labels = params[1], random_state = params[2]) elif clusterer_name == 'hierarch': from sklearn.cluster import AgglomerativeClustering #clusterer = AgglomerativeClustering(n_clusters=params[0],affinity=params[1],linkage=params[2])#'canberra',linkage='complete') clusterer = AgglomerativeClustering(n_clusters=int(params[0]), affinity='euclidean', memory=None, connectivity=None, compute_full_tree='auto', linkage='average')#, distance_threshold=None) ms = 'sc' elif clusterer_name == 'meanshift': from sklearn.cluster import MeanShift,estimate_bandwidth #0.2,500 bandwidth = estimate_bandwidth(X, quantile=params[0], n_samples=params[1]) clusterer = MeanShift(bandwidth=bandwidth, bin_seeding=True) ms = 'sc' else: print('no clusterer name specified') import sys sys.exit(1) if metricstring == '': metricstring = ms # s3: run the clustering t0 = time() clusterer.fit(X) t1 = time() infoDict = {'clusterer':clusterer,'clusterer_name':clusterer_name,'params':params,'metricstring':metricstring} # the clusterer, its name and params, and the metric string dataDict = {'X':X,'SX':SX,'y':y,'size':size} # dict holding all the data pieces performanceDict = {'time':t1-t0,'clusters_num':max(clusterer.labels_)+1} # performance dict: runtime and number of clusters found clusterer_container = {'info':infoDict ,'data':dataDict,'performance':performanceDict} #print('S4 done.<<<') return clusterer_container
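# Hypothetical invocation of runClusterer; X/y are random stand-ins and the
# birch params follow the (n_clusters, threshold, branching_factor) order
# expected above.
import numpy as np
X = np.random.rand(300, 2)
y = np.zeros(300)
container = runClusterer('birch', (None, 0.5, 50), (X, X, y, 300))
print(container['performance'])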
ax.legend() plt.show() # Perform a K-Means clustering km = KMeans(n_clusters=nb_clusters, random_state=1000) Y_pred_km = km.fit_predict(X) print('Adjusted Rand score: {}'.format(adjusted_rand_score(Y, Y_pred_km))) # Perform the online clustering mbkm = MiniBatchKMeans(n_clusters=nb_clusters, batch_size=batch_size, reassignment_ratio=0.001, random_state=1000) birch = Birch(n_clusters=nb_clusters, threshold=0.2, branching_factor=350) scores_mbkm = [] scores_birch = [] for i in range(0, nb_samples, batch_size): X_batch, Y_batch = X[i:i + batch_size], Y[i:i + batch_size] mbkm.partial_fit(X_batch) birch.partial_fit(X_batch) scores_mbkm.append( adjusted_rand_score(Y[:i + batch_size], mbkm.predict(X[:i + batch_size]))) scores_birch.append( adjusted_rand_score(Y[:i + batch_size], birch.predict(X[:i + batch_size]))) # completed to mirror the MiniBatchKMeans line above
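# A hedged sketch comparing the two incremental learners' running scores
# after the loop above (assumes matplotlib; labels are illustrative).
import matplotlib.pyplot as plt
plt.plot(scores_mbkm, label='MiniBatchKMeans')
plt.plot(scores_birch, label='Birch')
plt.xlabel('Batch')
plt.ylabel('Adjusted Rand score')
plt.legend()
plt.show()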
def test_birch_n_clusters_long_int(): # Check that birch supports n_clusters with np.int64 dtype, for instance # coming from np.arange. #16484 X, _ = make_blobs(random_state=0) n_clusters = np.int64(5) Birch(n_clusters=n_clusters).fit(X)
def test_subcluster_dtype(global_dtype): X = make_blobs(n_samples=80, n_features=4, random_state=0)[0].astype(global_dtype, copy=False) brc = Birch(n_clusters=4) assert brc.fit(X).subcluster_centers_.dtype == global_dtype
for row in csv_reader: if line_count == 0: print(f'Column names are {", ".join(row)}') line_count += 1 else: print(row[1], row[2]) words.append(row[2]) etichette.append(row[1]) line_count += 1 print(f'Processed {line_count} lines.') # Create word embeddings word_embeddings = c2v_model.vectorize_words(words) print(word_embeddings) brc = Birch(n_clusters=n_cluster) brc.fit(word_embeddings) labels = brc.predict(word_embeddings) print(labels) generateCSV(labels, etichette) #pca(labels) sys.argv = ['./readClusterDataCopia.py', n_cluster, embedding, 'BIRCH'] exec(open("./readClusterDataCopia.py").read()) print("executed")
## generate synthetic data xx = np.linspace(-22, 22, 10) yy = np.linspace(-22, 22, 10) xx, yy = np.meshgrid(xx, yy) n_centres = np.hstack((np.ravel(xx)[:, np.newaxis], np.ravel(yy)[:, np.newaxis])) # generate 100,000 Gaussian samples with 2 features and 100 centers X, y = make_blobs(n_samples=100000, n_features=2, centers=n_centres, random_state=28) # build Birch models with different parameters (cluster diameters) birch_models = [ Birch(threshold=1.7, n_clusters=None), Birch(threshold=0.5, n_clusters=None), Birch(threshold=1.7, n_clusters=100) ] # threshold: cluster-diameter threshold; branching_factor: maximum number of CF subclusters per node # other parameters, e.g. branching_factor, can be varied the same way to compare clustering results ## plotting final_step = [ 'diameter=1.7; n_clusters=None', 'diameter=0.5; n_clusters=None', 'diameter=1.7; n_clusters=100' ] plt.figure(figsize=(12, 8), facecolor='w') plt.subplots_adjust(left=0.02, right=0.98, bottom=0.1, top=0.9) colors_ = cycle(colors.cnames.keys())
labels = ms.labels_ X["Cluster3"] = labels print(pd.crosstab(X["Cluster3"], X["Target"])) from sklearn.cluster import MeanShift, estimate_bandwidth bandwidth = estimate_bandwidth(_X, quantile=0.05, n_samples=300, n_jobs=-1) ms = MeanShift(bandwidth=bandwidth, bin_seeding=True) ms.fit(_X) labels = ms.labels_ X["Cluster4"] = labels print(pd.crosstab(X["Cluster4"], X["Target"])) from sklearn.cluster import Birch birch = Birch(n_clusters=40) labels = birch.fit_predict(_X) X["Cluster5"] = labels print(pd.crosstab(X["Cluster5"], X["Target"])) __X = X[[ "Starid", "Cluster1", "Cluster2", "Cluster3", "Cluster4", "Cluster5", "Target" ]] __X["K"] = [ f"{k1}-{k2}-{k3}-{k4}-{k5}" for k1, k2, k3, k4, k5 in zip( __X["Cluster1"], __X["Cluster2"], __X["Cluster3"], __X["Cluster4"], __X["Cluster5"]) ] rule = pd.crosstab(__X["K"], __X["Target"]) print(rule.head(10))
from itertools import cycle from time import time import numpy as np import matplotlib as mpl import matplotlib.pyplot as plt import matplotlib.colors as colors from sklearn.preprocessing import StandardScaler from sklearn.cluster import Birch from sklearn.datasets import make_blobs # sklearn.datasets.samples_generator was removed in scikit-learn 0.24 mpl.rcParams['font.sans-serif'] = [u'SimHei'] mpl.rcParams['axes.unicode_minus'] = False xx = np.linspace(-22, 22, 10) yy = np.linspace(-22, 22, 10) xx, yy = np.meshgrid(xx, yy) n_centers = np.hstack((np.ravel(xx)[:, np.newaxis], np.ravel(yy)[:, np.newaxis])) X, y = make_blobs(n_samples=1000, n_features=2, centers=n_centers, random_state=28) birch_models = [Birch(threshold=1.7, n_clusters=None), Birch(threshold=0.5, n_clusters=None), Birch(threshold=1.7, n_clusters=100)] final_step = ['diameter=1.7; n_clusters=None', 'diameter=0.5; n_clusters=None', 'diameter=1.7; n_clusters=100'] plt.figure(figsize=(12, 8), facecolor='w') plt.subplots_adjust(left=0.02, right=0.98, bottom=0.1, top=0.9) colors_ = cycle(colors.cnames.keys()) cm = mpl.colors.ListedColormap(list(colors.cnames.keys())) for ind, (birch_model, info) in enumerate(zip(birch_models, final_step)): t = time() birch_model.fit(X) time_ = time() - t labels = birch_model.labels_ centroids = birch_model.subcluster_centers_
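    # A hedged continuation of the truncated loop above; the 1x3 subplot
    # layout and styling are assumptions, not the original author's code.
    ax = plt.subplot(1, 3, ind + 1)
    for k, col in zip(range(int(labels.max()) + 1), colors_):
        mask = labels == k
        ax.scatter(X[mask, 0], X[mask, 1], color=col, s=2)
    ax.set_title('%s (%.2fs)' % (info, time_))
plt.show()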