def outlier_clusters_ward(x, y, skill=None, memory=None):
    """Cluster 2-D points (x, y) with Ward agglomerative clustering,
    adding clusters until no cluster's maximum intra-cluster pairwise
    distance exceeds a fixed threshold.

    Parameters
    ----------
    x, y : array-like
        Point coordinates (same length).
    skill : optional
        Currently unused (TODO: incorporate skill).
    memory : str or joblib.Memory, optional
        Cache argument forwarded to AgglomerativeClustering.

    Returns
    -------
    cluster_centers : ndarray
        One (x, y) center per cluster ([[-1, -1]] sentinel when empty).
    cluster_labels : sequence of cluster indices.
    labels : per-point cluster assignment.
    n_clusters : int
    dist_within_final : ndarray
        Per-cluster maximum pairwise distance for the final clustering.
    """
    # TODO: incorporate skill
    data = np.vstack((x, y)).T

    if len(data) == 0:
        # Degenerate case: nothing to cluster.
        print('clustering: NO cluster members!')
        cluster_centers = np.array([[-1, -1]])
        cluster_labels = []
        labels = []
        n_clusters = 0
        # BUGFIX: assign the name the return statement actually uses;
        # previously this branch raised NameError on dist_within_final.
        dist_within_final = np.array([])
    elif len(data) == 1:
        # A single point is its own cluster with zero spread.
        print('clustering: only 1 data point!')
        cluster_centers = data
        cluster_labels = [0]
        labels = np.array([0])
        n_clusters = 1
        dist_within_final = np.array([0])
    else:
        dist_within = 1000      # sentinel > dist_max so the loop runs once
        dist_max = 75           # stop once every cluster is tighter than this
        n_clusters = 0
        n_clusters_max = 10
        clusterer = AgglomerativeClustering(n_clusters=n_clusters,
                                            memory=memory)

        # While the widest cluster is still too spread out, add clusters.
        while dist_within > dist_max and n_clusters < n_clusters_max:
            n_clusters += 1
            clusterer.set_params(n_clusters=n_clusters)
            labels = clusterer.fit_predict(data)

            cluster_labels = range(n_clusters)
            cluster_centers = np.array(
                [np.mean(data[labels == i], axis=0) for i in cluster_labels])

            # dist_within: the maximum pairwise distance inside any cluster.
            dist_within = np.max([
                np.max(pairwise_distances(data[labels == i]))
                for i in cluster_labels
            ])

        # Per-cluster maximum pairwise distance for the final clustering.
        dist_within_final = np.array([
            np.max(pairwise_distances(data[labels == i]))
            for i in cluster_labels
        ])

    return cluster_centers, cluster_labels, labels, n_clusters, dist_within_final
def outlier_clusters_ward(x, y, skill=None, memory=None):
    """Cluster 2-D points (x, y) by Ward agglomerative clustering.

    The number of clusters grows (up to 10) until the widest cluster —
    measured as its maximum intra-cluster pairwise distance — is below
    a fixed threshold of 75.

    Parameters
    ----------
    x, y : array-like
        Point coordinates (same length).
    skill : optional
        Currently unused (TODO: incorporate skill).
    memory : str or joblib.Memory, optional
        Cache argument forwarded to AgglomerativeClustering.

    Returns
    -------
    tuple
        (cluster_centers, cluster_labels, labels, n_clusters,
        dist_within_final), where dist_within_final holds the per-cluster
        maximum pairwise distances of the final clustering.
    """
    # TODO: incorporate skill
    data = np.vstack((x, y)).T

    if len(data) == 0:
        # Nothing to cluster: sentinel center, empty labels.
        print('clustering: NO cluster members!')
        cluster_centers = np.array([[-1, -1]])
        cluster_labels = []
        labels = []
        n_clusters = 0
        # BUGFIX: the return statement uses dist_within_final, which this
        # branch previously never assigned (NameError at return).
        dist_within_final = np.array([])
    elif len(data) == 1:
        # One point: trivially its own cluster.
        print('clustering: only 1 data point!')
        cluster_centers = data
        cluster_labels = [0]
        labels = np.array([0])
        n_clusters = 1
        dist_within_final = np.array([0])
    else:
        dist_within = 1000      # sentinel so the loop body runs at least once
        dist_max = 75           # acceptable maximum intra-cluster spread
        n_clusters = 0
        n_clusters_max = 10
        clusterer = AgglomerativeClustering(n_clusters=n_clusters,
                                            memory=memory)

        # Keep splitting while some cluster is wider than dist_max.
        while dist_within > dist_max and n_clusters < n_clusters_max:
            n_clusters += 1
            clusterer.set_params(n_clusters=n_clusters)
            labels = clusterer.fit_predict(data)

            cluster_labels = range(n_clusters)
            cluster_centers = np.array(
                [np.mean(data[labels == i], axis=0) for i in cluster_labels])

            # Widest intra-cluster pairwise distance across all clusters.
            dist_within = np.max([
                np.max(pairwise_distances(data[labels == i]))
                for i in cluster_labels
            ])

        # Per-cluster spread for the clustering we settled on.
        dist_within_final = np.array([
            np.max(pairwise_distances(data[labels == i]))
            for i in cluster_labels
        ])

    return cluster_centers, cluster_labels, labels, n_clusters, dist_within_final
# Agglomerative clustering of physical DOFs on a precomputed cosine-style
# distance matrix, selecting one representative DOF per cluster for each
# candidate cluster count K.
# NOTE(review): numAS, nT, maxK, E, idx, trace, caseName, glob_iter are
# defined earlier in the file (outside this view) — presumably numAS/nT are
# DOF counts, E a mode/energy matrix, trace a per-DOF importance score;
# confirm against the surrounding code.
print ' '+str(numAS) + '/' + str(nT) + ' physical DOFs selected after thresholding.'
# Cannot ask for more clusters than surviving DOFs.
maxK = min(maxK,numAS)
X = sp.array(E)[idx,:]
# Distance matrix: 1 - |X . X^T|, i.e. dissimilarity of (normalized?) rows.
# NOTE(review): assumes rows of X are unit-norm for this to be a proper
# cosine distance — confirm upstream normalization.
D = 1.-sp.absolute(sp.dot(X,X.T))
print ' agglomerative clustering:'
print ' - initializing clustering tree...',
# Complete linkage on the precomputed distance matrix; the full tree is
# cached on disk so refitting with a different n_clusters is cheap.
est = AgglomerativeClustering(linkage="complete",affinity='precomputed',compute_full_tree=True,memory='./outputs/'+caseName+'/cache')
print ' done.'
# S[k-1] will hold the selected representative DOFs for K = k clusters.
S = []
print ' - agglomerative clustering for K = 1 to '+str(maxK)+'...',
for nK in range(1,maxK+1):
    #print nK
    est.set_params(n_clusters = nK)
    est.fit(D)
    labels = est.labels_
    selec = []
    for k in range(nK):
        # Members of cluster k (indices into the thresholded subset).
        idk = sp.nonzero(labels==k)[0]
        # Representative: the member with the largest trace value,
        # mapped back to the original DOF index via idx.
        best_k = idx[idk[sp.argmax(trace[idx[idk]])]]
        selec.append(best_k)
    S.append(selec)
print ' done.'
# Persist the per-K selections to disk.
selDir = './outputs/'+caseName+'/DOFSelection'
os.system('mkdir -pv '+selDir)
print ' write list of clusters...',
# NOTE(review): the file handle is not closed in this visible span —
# presumably closed further down; verify.
f = open(selDir+'/selList_'+str(glob_iter)+'.txt', 'w')
def get_subclustering(X, score_threshold, max_clusters=50, min_input_size=10,
                      silhouette_threshold=0.2, regularization_factor=0.01,
                      clusteringcachedir='clusteringcachedir/'):
    """Sub-cluster the rows of X with average-linkage cosine agglomerative
    clustering, picking the cluster count that maximizes silhouette score.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Feature vectors to cluster.
    score_threshold : float
        Minimum best silhouette score required to accept a multi-cluster
        solution; below it, all samples get label 0.
    max_clusters : int
        Upper bound (exclusive) on the cluster counts tried
        (Default value = 50).
    min_input_size : int
        Inputs with fewer rows than this are not clustered
        (Default value = 10).
    silhouette_threshold : float
        Currently unused; kept for interface compatibility
        (Default value = 0.2).
    regularization_factor : float
        Currently unused — score regularization is disabled below
        (Default value = 0.01).
    clusteringcachedir : str
        joblib cache directory for the clustering tree
        (Default value = 'clusteringcachedir/').

    Returns
    -------
    ndarray
        Integer cluster label per row of X; all zeros when the input is
        too small or no clustering reaches score_threshold.
    """
    # Cheap guard first, so tiny inputs don't pay the sklearn/tqdm
    # import cost at all.
    if X.shape[0] < min_input_size:
        return np.array([0] * X.shape[0])

    from sklearn.metrics import silhouette_score
    from sklearn.cluster import AgglomerativeClustering
    from tqdm import tqdm

    clustering = AgglomerativeClustering(n_clusters=2,
                                         memory=clusteringcachedir,
                                         affinity='cosine',
                                         compute_full_tree=True,
                                         linkage='average')
    minnk = 2
    scores = []
    # Try every candidate cluster count and record its silhouette score.
    for nk in tqdm(range(minnk, min(max_clusters, X.shape[0]))):
        clustering.set_params(n_clusters=nk)
        clustering.fit(X)
        # sample_size=None scores on all points
        # (alternative: sample_size=np.min([5000, X.shape[0]])).
        score = silhouette_score(X, clustering.labels_,
                                 metric='cosine', sample_size=None)
        scores.append(score)

    print("scores", np.array(scores))
    print("ignoring regularization factor")
    # scores = scores - np.arange(len(scores))*regularization_factor
    # print("corrected scores",np.array(scores))

    # BUGFIX: with very small inputs the candidate range can be empty;
    # np.max([]) would raise. Treat that as "no acceptable clustering".
    if not scores:
        return np.array([0] * X.shape[0])

    if np.max(scores) >= score_threshold:
        best_k = int(np.argmax(scores)) + minnk
        print("Number of clusters:", best_k)
        clustering.set_params(n_clusters=best_k)
        clustering.fit(X)
        return clustering.labels_
    return np.array([0] * X.shape[0])
# Compare the four agglomerative linkage strategies on the first 8 feature
# columns, reporting the average silhouette score for each and setting up a
# silhouette plot per linkage.
# NOTE(review): data, plt, silhouette_score and silhouette_samples come from
# earlier in the file (outside this view) — presumably a pandas DataFrame
# and the usual sklearn.metrics / matplotlib imports; verify.
from sklearn.cluster import AgglomerativeClustering
X = np.array(data.iloc[:, 0:8])
# Initial fit with ward/euclidean and 3 clusters; the same estimator is
# re-parameterized and refit inside the loop below.
clustering = AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto', connectivity=None, linkage='ward', memory=None, n_clusters=3, pooling_func='deprecated').fit(X)
Linkage = ["ward", "complete", "average", "single"]
j = 0
for K in Linkage:
    j = j + 1
    # Refit with the current linkage; labels y drive the silhouette plot.
    clustering.set_params(linkage=K)
    y = clustering.fit_predict(X)
    # Left axis (ax1): silhouette plot; right axis (ax2) filled in below
    # (outside this visible span).
    fig = plt.figure(figsize=(12, 4), dpi=100)
    ax1 = fig.add_subplot(1, 2, 1)
    ax2 = fig.add_subplot(1, 2, 2)
    # Silhouette coefficients lie in [-1, 1]; this data stays above -0.1.
    ax1.set_xlim([-0.1, 1.0])
    vertical_spacing = (j + 1) * 10  # extra spacing to separate the silhouettes
    ax1.set_ylim([0, len(X) + vertical_spacing])
    silhouette_avg = silhouette_score(X, y)
    print("For K = {}, the average silhouette score is {:.3f}".format(
        K, silhouette_avg))
    # Per-sample silhouette values, grouped by cluster further down.
    sample_silhouette_values = silhouette_samples(X, y)
    # Start drawing the first cluster's silhouette 10 px above the x-axis.
    y_lower = 10
'svd__n_components': np.arange(80, 125, 10), 'n_clusters': np.arange(12, 15) } texf_svd = Pipeline([('vect', text.TfidfVectorizer()), ('svd', TruncatedSVD())]) result = [] Aggl = AgglomerativeClustering(compute_full_tree=True, affinity='cosine', linkage='average') for g in list(model_selection.ParameterGrid(params)): print() print(g) n_clusters = g.pop('n_clusters') texf_svd.set_params(**g) svd_result = texf_svd.fit_transform(data) Aggl.set_params(n_clusters=n_clusters, memory=os.getcwd() + "\\tree") labels_pred = Aggl.fit_predict(mark_allzeros(svd_result)) print(labels_pred) count_table = score_data.count_table(text_data.init_num_by_cls, labels_pred, n_clusters) print( "homogeneity score, completeness score, v score:", metrics.homogeneity_completeness_v_measure(text_data.labels_true(), labels_pred)) print( "Adjusted Mutual Information:", metrics.adjusted_mutual_info_score(text_data.labels_true(), labels_pred)) result.append( (g, metrics.adjusted_mutual_info_score(text_data.labels_true(),