def cluster_fps(self):
    # Coarse pass: average-linkage clustering on the precomputed distances.
    clkg = hcluster.linkage(self.dm, method='average')
    coarse_r = hcluster.fcluster(clkg, 0.3, criterion='distance')
    self.coarse_r = coarse_r
    bcount = np.bincount(coarse_r)
    knum = len(np.nonzero(bcount > 1)[0])
    s = self.density_matrix.shape
    # The SVD refinement branch is disabled by the leading `False`.
    if False and len(s) > 1 and s[0] > 10 and s[1] > 10 and knum < min(s) / 2:
        (u, s, vt) = la.svds(self.sps_matrixs, k=knum)
        self.u = u
        print('============')
    else:
        self.result = self.coarse_r
        return (clkg, clkg)
    # Refinement pass: cluster the samples in the reduced SVD space.
    a = np.matrix(np.diag(s)) * np.matrix(vt)
    pd = dist.pdist(np.array(a.T), 'cosine')
    pd[np.abs(pd) < 1e-11] = 0
    lkg = hcluster.linkage(pd, method='average')
    self.lkg = lkg
    self.result = hcluster.fcluster(lkg, self.svd_cluster_thr, criterion='distance')
    return (lkg, clkg)
def elbow(self, no_plot=False):
    """Plot within-groups variance vs. number of clusters.

    The elbow criterion can be used to determine the number of clusters.
    """
    from scipy.cluster.hierarchy import fcluster
    import matplotlib.pyplot as plt
    idx = fcluster(self.Z, len(self.data), criterion='maxclust')
    nclust = list(np.arange(1, np.sqrt(idx.max() / 2) + 1, dtype=int))
    within_grp_var = []
    mean_var = []
    for n in nclust:
        idx = fcluster(self.Z, n, criterion='maxclust')
        grp = [np.flatnonzero(idx == c) for c in np.unique(idx)]
        # between_grp_var = Group([self.data[ix].R.uv for ix in grp]).var
        var = [100 * self.data[ix].var for ix in grp]
        within_grp_var.append(var)
        mean_var.append(np.mean(var))
    if not no_plot:
        plt.boxplot(within_grp_var, positions=nclust)
        plt.plot(nclust, mean_var, 'k')
        plt.xlabel('Number of clusters')
        plt.ylabel('Variance')
        plt.title('Within-groups variance vs. number of clusters')
        plt.show()
    else:
        return nclust, within_grp_var
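# Hedged usage sketch of the elbow criterion above (not from the original
# source): build a linkage once, then sweep `maxclust` values with fcluster
# and watch the within-group variance level off.
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster

X = np.random.rand(40, 3)        # toy data, stands in for self.data
Z = linkage(X, method='ward')    # stands in for self.Z
for n in range(1, 8):
    labels = fcluster(Z, n, criterion='maxclust')
    # within-group variance, summed over clusters
    wgv = sum(X[labels == c].var() for c in np.unique(labels))
    print(n, wgv)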
def hcluster_cols(self, thresh):
    try:
        link = linkage(self.X.T, method='complete', metric='cosine')
        assignments = fcluster(link, thresh, 'distance')
    except ValueError:
        # cosine distances can fail (e.g. on zero vectors); fall back to euclidean
        link = linkage(self.X.T, method='complete', metric='euclidean')
        assignments = fcluster(link, thresh, 'distance')
    col_ind = np.arange(len(self.crimes))
    d = pd.DataFrame(list(zip(col_ind, assignments))).groupby(1)[0].aggregate(lambda x: tuple(x))
    df_new = pd.DataFrame(index=np.arange(len(self.names)))
    for i in d:
        cols = [w for w in i]
        if len(cols) > 1:
            df_new[str(self.crimes[cols])] = np.mean(self.X[:, cols], axis=1)
        else:
            df_new[str(self.crimes[cols[0]])] = self.X[:, cols[0]]
    # Optional dendrogram plot:
    # dendro = dendrogram(link, color_threshold=thresh, leaf_font_size=13,
    #                     labels=self.crimes, orientation='left')
    self.df = df_new
    self.crimes = df_new.columns.values
def refineEnsemble(ens, lower=.5, upper=10.):
    """Refine a PDB ensemble based on RMSD criteria."""
    from scipy.cluster.hierarchy import linkage, fcluster
    from scipy.spatial.distance import squareform
    from collections import Counter

    ### calculate pairwise RMSDs ###
    RMSD = ens.getRMSDs(pairwise=True)
    # convert the RMSD table to the condensed form
    v = squareform(RMSD)

    ### apply upper threshold ###
    Z_upper = linkage(v, method='complete')
    labels = fcluster(Z_upper, upper, criterion='distance')
    most_common_label = Counter(labels).most_common(1)[0][0]
    I = np.where(labels == most_common_label)[0]

    ### apply lower threshold ###
    Z_lower = linkage(v, method='single')
    labels = fcluster(Z_lower, lower, criterion='distance')
    uniq_labels = np.unique(labels)
    clusters = []
    for label in uniq_labels:
        indices = np.where(labels == label)[0]
        clusters.append(indices)

    J = np.ones(len(clusters), dtype=int) * -1
    rmsd = None
    for i, cluster in enumerate(clusters):
        if len(cluster) > 0:
            # find the conformations with the largest coverage
            # (the weight of the ref should be 1)
            weights = np.array([ens[j].getWeights().sum() for j in cluster])
            js = np.where(weights == np.max(weights))[0]
            # if multiple structures have the same weight, the one with the
            # smallest rmsd w.r.t. ens._coords is selected
            if len(js) > 1:
                # rmsd is not calculated unless necessary, for efficiency
                rmsd = ens.getRMSDs() if rmsd is None else rmsd
                j = js[np.argmin(rmsd[js])]
            else:
                j = js[0]
            J[i] = cluster[j]
        else:
            J[i] = cluster[0]

    ### refine ensemble ###
    K = np.intersect1d(I, J)
    reens = ens[K]
    return reens
def cutTree(z, threshold, crit):
    try:
        z = np.clip(z, 0, 9999999)
        tree = hac.fcluster(z, threshold, criterion=crit)
        return tree
    except ValueError as e:
        print("cutTree: %s" % str(e))
        # fall back to a fixed number of clusters; the original passed the
        # invalid criterion "euclidean", which fcluster itself would reject
        tree = hac.fcluster(z, 50, criterion="maxclust")
        print("negative values in matrix")
        return tree
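# Hedged aside (not from the original source): fcluster only accepts the
# criteria 'inconsistent', 'distance', 'maxclust', 'monocrit' and
# 'maxclust_monocrit'; a metric name such as "euclidean" raises ValueError.
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster

Z = linkage(np.random.rand(20, 2), method='average')
print(fcluster(Z, 0.7, criterion='distance'))  # cut by merge height
print(fcluster(Z, 3, criterion='maxclust'))    # cut into at most 3 clusters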
def process_stay(imei, traj):
    r = 20
    interval = 60 * 8
    if len(traj.shape) < 1 or traj.shape[0] < 2:
        return
    x = traj['x']
    y = traj['y']
    in_sample = False
    if sample_range is not None:
        for (cx, cy, cr) in sample_range:
            crange = math.sqrt(math.pow(cx - x[0], 2) + math.pow(cy - y[0], 2))
            if crange < cr:
                in_sample = True
                break
    if not in_sample:
        return
    ids = G.get_gridids_with_align(np.median(x), np.median(y))
    dm = get_pdist(traj, 100, convert_sig=True)
    dm[np.abs(dm) < 1e-3] = 0
    lkg = hcluster.linkage(dm, method='average')
    rst = hcluster.fcluster(lkg, 0.7, criterion='distance')        # coarse cut
    rst_merge = hcluster.fcluster(lkg, 0.2, criterion='distance')  # fine cut
    # split the trajectory into segments wherever the cluster label changes
    seg = []
    for i in range(len(rst) + 1):
        if i == 0 or i == len(rst) or rst[i] != rst[i - 1]:
            seg.append(i)
    for (s, e) in zip(seg[:-1], seg[1:]):
        seg_traj = traj[s:e]
        seg_id = rst_merge[s:e]
        itl = seg_traj[-1]['t'] - seg_traj[0]['t']
        if itl > interval:
            print_merge_fp(ids, imei, seg_traj, seg_id, itl)
def clusterTrajectories(trajectories, fname, path, metric_func=trajectoryDissimilarityL2,
                        user_distance_matrix=None, criterion="distance"):
    """
    trajectories: the trajectories need to be in XY coordinates
    """
    plot_path = utils.queryPath(path + "/plots")
    if user_distance_matrix is None:
        distance_matrix = getTrajectoryDistanceMatrix(trajectories, metric_func)
        writeToCSV.saveData(distance_matrix, path + "/" + fname)  # save the distance_matrix
    else:
        distance_matrix = user_distance_matrix
    assert len(distance_matrix) == len(trajectories), \
        "distance_matrix (n, n) and trajectories (n) should have the same number of samples"
    print("distance_matrix:\n", distance_matrix)

    v = DIST.squareform(distance_matrix)
    cluster_result = HAC.linkage(v, method="average")
    dg = HAC.dendrogram(cluster_result)
    plt.xlabel("cluster_dendrogram_{fname}".format(fname=fname))
    plt.savefig("{path}/cluster_dendrogram_{fname}.png".format(fname=fname, path=plot_path))
    plt.clf()

    if criterion == "distance":
        if metric_func == trajectoryDissimilarityL2:
            # distance threshold for the L2 measure
            this_cluster_label = HAC.fcluster(Z=cluster_result, t=1 * 1000, criterion="distance")
        elif metric_func == trajectoryDissimilarityCenterMass:
            # distance threshold for the center-of-mass measure
            this_cluster_label = HAC.fcluster(Z=cluster_result, t=1.5, criterion="distance")
    elif criterion == "inconsistent":
        this_cluster_label = HAC.fcluster(Z=cluster_result, t=0.8, criterion="inconsistent")
    print("this_cluster_label:", this_cluster_label,
          "number of clusters:", len(set(this_cluster_label)))

    """Plot the representative trajectories"""
    plotRepresentativeTrajectory(
        this_cluster_label, trajectories,
        fname="cluster_centroids_{n}_classes".format(n=len(set(this_cluster_label))),
        path=plot_path, show=False)
    return this_cluster_label, [this_cluster_label], []
def hclustering(data, t):
    # NOTE: linkage() treats a 2-D input as an observation matrix, so the
    # square correlation matrix below is clustered row-by-row rather than
    # used as a precomputed distance matrix. Alternative kept from the
    # original:
    # row_dist = pd.DataFrame(squareform(pdist(data, metric='euclidean')))
    row_dist = np.corrcoef(data)
    row_clusters = linkage(row_dist, method='ward')
    ind = fcluster(row_clusters, t, criterion='maxclust')
    return ind
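# Hedged usage sketch (not from the original source): if a precomputed
# correlation *distance* is intended, pass the condensed form to linkage
# instead of a square matrix.
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform

data = np.random.rand(10, 5)
dist = 1 - np.corrcoef(data)  # correlation distance, square form
Z = linkage(squareform(dist, checks=False), method='average')
print(fcluster(Z, 3, criterion='maxclust'))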
def plot_transition_clustermap(data_array, gene_names, pseudotimes, n_clusters=10, gradient=False):
    if gradient:
        data_to_plot = zscore(np.gradient(data_array)[1].T, axis=0)
        scale = None
        metric = 'seuclidean'
        row_linkage = linkage(pdist(abs(data_to_plot), metric=metric), method='complete')
    else:
        data_to_plot = data_array.T
        scale = 0
        metric = 'correlation'
        row_linkage = linkage(pdist(data_to_plot, metric=metric), method='complete')
    assignments = fcluster(row_linkage, n_clusters, criterion='maxclust')
    cm = sns.clustermap(data_to_plot, col_cluster=False, standard_scale=scale,
                        yticklabels=gene_names, row_linkage=row_linkage,
                        row_colors=[settings.STATE_COLORS[i] for i in assignments])
    # integer step so the tick positions can index pseudotimes
    r = np.arange(10, data_array.shape[0], data_array.shape[0] // 10)
    plt.setp(cm.ax_heatmap.get_yticklabels(), fontsize=5)
    cm.ax_heatmap.set_xticks(r)
    cm.ax_heatmap.set_xticklabels(['%.1f' % x for x in pseudotimes[r]])
    cm.ax_heatmap.set_xlabel('Pseudotime')
    cm.ax_heatmap.set_ylabel('Gene')
    gene_clusters = defaultdict(list)
    for i, cl in enumerate(assignments):
        gene_clusters[settings.STATE_COLORS[cl]].append(gene_names[i])
    return gene_clusters
def get_ROIs(df_sequence, x, limit_meters):
    # find the origin transaction points
    X, locations, pi_locations = get_latlong_points(df_sequence)
    if len(locations) == 1:
        return [[{"lat": X[0, 0], "long": X[0, 1]}], 1.0]
    elif len(locations) < 1:
        return None
    # build the dendrogram using geodesic (Vincenty) distances in meters
    Z = linkage(X, 'weighted', lambda x, y: vincenty(x, y).meters)
    clusters = fcluster(Z, limit_meters, criterion='distance')
    centroids = []
    nums_by_clusters = []
    pi_sums = []
    the_clusters = []
    # join the pi_sums of locations that are in the same cluster
    for i in range(len(clusters)):
        indice = buscar_locacion(the_clusters, clusters[i])
        if indice < 0:
            the_clusters.append(clusters[i])
            indice = len(the_clusters) - 1
            pi_sums.append(0)
            nums_by_clusters.append(0)
            centroids.append({"lat": 0, "long": 0})
        pi_sums[indice] += pi_locations[i]
        centroids[indice]["lat"] += X[i, 0]
        centroids[indice]["long"] += X[i, 1]
        nums_by_clusters[indice] += 1
    the_indexs, the_sum = get_upToX_pi_locations(np.asarray(pi_sums), x)
    the_centroids = []
    for i in the_indexs:
        the_centroids.append({"lat": centroids[i]["lat"] / nums_by_clusters[i],
                              "long": centroids[i]["long"] / nums_by_clusters[i]})
    return [the_centroids, the_sum]
def clusterize_hierarchical(peakels, matrix_dist, cut, clip=False):
    """
    :param peakels: peakel objects to cluster
    :param matrix_dist: condensed distance matrix
    :param cut: distance threshold for fcluster
    :param clip: clip in order to prevent negative values in the distance
        matrix (they lead to a ValueError in linkage)
    """
    if clip:
        np.clip(matrix_dist, 0, 1, matrix_dist)

    k = linkage(matrix_dist, method='complete')
    k2 = fcluster(k, cut, criterion='distance')
    clust_by_id = ddict(list)
    for i, v in enumerate(k2):
        clust_by_id[v].append(peakels[i])
    return list(clust_by_id.values())
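# Hedged usage sketch (not from the original source): build the condensed
# distance matrix expected above with scipy.spatial.distance.pdist.
import numpy as np
from scipy.spatial.distance import pdist

peakels = list(range(12))         # stand-ins for peakel objects
features = np.random.rand(12, 4)  # one feature row per peakel
matrix_dist = pdist(features, metric='correlation')
groups = clusterize_hierarchical(peakels, matrix_dist, cut=0.5, clip=True)
print(groups)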
def hist_per_stagione(start=1992, end=2012):
    # relies on pylab-style star imports (ix_, array, scatter, yticks, ...)
    stagione = (all_labels > start) & (all_labels < end)
    dist_selected = dist[ix_(stagione, stagione)]
    Z = linkage(squareform(dist_selected), method='complete')
    n = choose_p(Z)
    c = fcluster(Z, n, criterion='maxclust') - 1
    label_anni = all_labels[stagione]
    # order clusters by first appearance!
    first_appearance = []
    for i in range(0, n):
        first_appearance.append(min(label_anni[c == i]))
    order1 = [index for key, index in sorted(zip(first_appearance, range(0, n)))]
    order2 = [index for key, index in sorted(zip(order1, range(0, n)))]
    order = array(order2)
    c = order[c]
    # draw scatter plot
    scatter(label_anni, c, s=100, c=c)
    yticks(range(0, n + 1))
    xlim((min(label_anni) - 0.5, max(label_anni) + 0.5))
    ax = gca()
    for i in range(1993, 2011 + 1):
        ax.add_line(Line2D([i + 7. / 12, i + 7. / 12], [0, n + 1], linestyle='--'))
    show()
def main():
    # clustering and write output
    if len(pep_array) > 1:
        matrix = []
        for i in range(0, len(pep_array)):
            matrix.append(pep_array[i][4].replace('\"', "").split(','))
        dataMatrix = numpy.array(matrix, dtype=float)
        d = sch.distance.pdist(dataMatrix, metric)  # vector of pairwise distances
        if metric == "correlation":
            # with correlation, all distances should lie in the range [0, 2]
            D = numpy.clip(d, 0, 2)
        else:
            D = d
        try:
            cutoff = float(t)
        except ValueError:
            print("please provide a numeric value for --t")
            sys.exit()
        L = sch.linkage(D, method, metric)
        ind = sch.fcluster(L, cutoff, 'distance')  # distance is dissimilarity (1 - correlation)
        p = numpy.array(pep_array)
        p = numpy.column_stack([p, ind])
        formatoutput(p)
    else:
        p = numpy.array(pep_array)
        p = numpy.column_stack([p, [0]])
        formatoutput(p)
def order(self, method='complete', metric='euclidean', inplace=False):
    """
    Rearrange the order of rows and columns after clustering.

    :param method: any scipy method (e.g., single, average, centroid,
        median, ward). See scipy.cluster.hierarchy.linkage
    :param metric: any scipy distance (euclidean, hamming, jaccard).
        See scipy.spatial.distance or scipy.cluster.hierarchy
    :param bool inplace: if set to True, the dataframe is replaced

    You probably do not need to use that method. Use :meth:`plot` and the
    two parameters order_metric and order_method instead.
    """
    from scipy.cluster.hierarchy import fcluster, dendrogram
    Y = self.linkage(self.df, method=method, metric=metric)
    ind1 = fcluster(Y, 0.7 * max(Y[:, 2]), 'distance')
    Z = dendrogram(Y, no_plot=True)
    idx1 = Z['leaves']
    cor2 = self.df.iloc[idx1, idx1]
    # store the intermediate results before returning (in the original these
    # assignments sat unreachable after the return statement)
    self.Y = Y
    self.Z = Z
    self.idx1 = idx1
    self.ind1 = ind1
    if inplace is True:
        self.df = cor2
    else:
        return cor2
def get_cluster(self, cluster_data):
    cluster_value = []
    for index in range(len(cluster_data)):
        if cluster_data[index]:
            cluster_value.append(index)
    dimension = len(cluster_value)
    # pairwise absolute index differences as the distance matrix
    distance_matrix = [[0 for row in range(dimension)] for col in range(dimension)]
    for row in range(dimension):
        for col in range(dimension):
            distance_matrix[row][col] = abs(cluster_value[row] - cluster_value[col])
    distance_array = distance.squareform(distance_matrix)
    clusters = hierarchy.linkage(distance_array, method='weighted', metric='euclidean')
    T = hierarchy.fcluster(clusters, self.cluster_distance, criterion='distance')
    temp_holder = {}
    for item in range(max(T)):
        temp_holder[item + 1] = []
    for index in range(dimension):
        temp_holder[T[index]].append(cluster_value[index])
    return temp_holder
def optimal_cutoff(Y, dist_mat, min_size):
    # silhouette score for every merge height used as a cutoff
    labels = np.array([sch.fcluster(Y, c, criterion='distance') for c in Y[:, 2]])
    score = np.array([metrics.silhouette_score(dist_mat, l) for l in labels[:-min_size]])
    c = Y[:-min_size, 2]
    # interpolate the (negated) score and minimize it over the cutoff
    f = interp(c, -score, kind='linear')
    opt_c = opt.fmin(f, x0=c[2 * min_size])
    return opt_c
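# Hedged usage sketch (not from the original source), assuming `interp` is
# scipy.interpolate.interp1d and `opt` is scipy.optimize: the same idea
# without interpolation, picking the merge height whose flat clustering
# maximizes the silhouette score on a precomputed distance matrix.
import numpy as np
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist, squareform
from sklearn import metrics

X = np.random.rand(30, 4)
Y = sch.linkage(pdist(X), method='average')
dist_mat = squareform(pdist(X))
best = max((c for c in Y[:-2, 2] if c > 0),
           key=lambda c: metrics.silhouette_score(
               dist_mat, sch.fcluster(Y, c, criterion='distance'),
               metric='precomputed'))
print("best cutoff:", best)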
def _run_hier_clust_on_centroids(self, method='average'):
    '''
    runs hierarchical clustering based on the centroids of the data
    per scipy's methods
    '''
    uniqueLabels = np.sort(np.unique(self.templateLabels))
    centroids = np.array([self.templateMat[np.where(self.templateLabels == i)[0], :].mean(axis=0)
                          for i in uniqueLabels])
    self.y = pdist(centroids)
    self.z = hierarchy.linkage(self.y, method)
    r2 = hierarchy.inconsistent(self.z, 2)

    ## rank the averages of the linkage heights by standard deviation,
    ## then report the averages
    meanHeights = r2[:, 0]
    stdHeights = r2[:, 1]
    rankedInds = np.argsort(stdHeights)[::-1]
    bestCutPoints = meanHeights[rankedInds]

    ## save centroid labels for all cuts of the dendrogram
    allCentroidLabels = {}
    rankedK = []
    for cp in bestCutPoints:
        centroidLabels = hierarchy.fcluster(self.z, t=cp, criterion='distance')
        k = len(np.unique(centroidLabels))
        if str(k) in allCentroidLabels:
            continue
        allCentroidLabels[str(k)] = centroidLabels
        rankedK.append(k)
    centroidLabels = allCentroidLabels[str(rankedK[0])]

    ## save the top modes
    self.bestModeLabels = []
    print('doing ranking...')
    for rk in rankedK[:25]:
        centroidLabels = allCentroidLabels[str(rk)]
        modeLabels = self._get_mode_labels(self.templateLabels, centroidLabels, uniqueLabels)
        self.bestModeLabels.append(modeLabels)

    ## provide silhouette-value ranks in case we wish to reorder the top
    ## modes by silhouette value
    self.modeSilValues = []
    self.modeSizes = []
    allEvents = [self.templateData]
    for count in range(len(self.bestModeLabels)):
        numClusters = np.unique(self.bestModeLabels[count]).size
        silValues = get_silhouette_values(allEvents, [self.bestModeLabels[count]],
                                          subsample=self.noiseSample,
                                          minNumEvents=5000, resultsType='raw')
        silMean = silValues['0'].mean()
        self.modeSilValues.append(silMean)
        self.modeSizes.append(numClusters)

    silValues = get_silhouette_values(allEvents, [self.templateLabels],
                                      subsample=self.noiseSample,
                                      minNumEvents=5000, resultsType='raw')
    self.clusterSilValues = silValues['0'].mean()
    self.modeSilValues = np.array(self.modeSilValues)
    self.modeSizes = np.array(self.modeSizes)
def user_fp_group(data, key, user, filter='mid', merge=False, thr=0.2):
    if len(data.shape) == 0 or data.shape[0] == 1:
        print('\t'.join([key, user, '%s' % data['wf_list'],
                         str(data['x']), str(data['y']), '1']))
        return
    dists = get_pdist(data, 100)
    clusters = hcluster.linkage(dists, method='average')
    r = hcluster.fcluster(clusters, thr, 'distance')
    ids = np.unique(r)
    sz = []
    for id in ids:
        sz.append(data[r == id].shape[0])
    mid_size = max(1.1, max(sz) / 2.0)
    for id in ids:
        d = data[r == id]
        # keep only clusters at least half the size of the largest one
        if filter == 'mid' and d.shape[0] < mid_size:
            continue
        if merge:
            print('\t'.join([key, user, wf_to_str(get_mean_wf(d)),
                             str(np.median(d['x'])), str(np.median(d['y'])),
                             str(get_largest_dur(d)), str(d.shape[0])]))
            continue
        for od in d:
            print('\t'.join([key, user, od['wf_list'], str(od['x']),
                             str(od['y']), str(od['t']), str(id)]))
def process(tag, infos, wf_lists, count):
    if wf_lists is None or infos is None:
        return
    x = infos['x']
    y = infos['y']
    imeis = infos['imei']
    std_x = np.std(x)
    std_y = np.std(y)
    users_num = len(np.unique(imeis))
    if users_num < 3:
        return
    if len(wf_lists.shape) < 2 or wf_lists.shape[1] < 2:
        return
    dists = sci_dist.pdist(wf_lists, 'cosine')
    dists[(dists < 1e-10)] = 0
    clusters = hierarchy.linkage(dists, method='average')
    r = hierarchy.fcluster(clusters, 0.3, 'distance')
    for c in np.unique(r):
        idx = (r == c)
        c_x = np.median(x[idx])
        c_y = np.median(y[idx])
        c_std_x = np.std(x[idx])
        c_std_y = np.std(y[idx])
        c_user = len(np.unique(imeis[idx]))
        wfs = wf_lists[idx]
        wf = np.sum(wfs, axis=0) / len(wfs)
        wf = ['%d' % sig for sig in wf]
        print('%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d' %
              (tag, '\t'.join(wf), c_x, c_y, c_user, std_x, std_y,
               c_std_x, c_std_y, count))
def dunnindex_clusternumber(linkage, df_zscores, low=1, high=5, output_dir="."):
    index_list = []
    for n_clusters in range(low, high):
        assignments = hierarchy.fcluster(linkage, n_clusters, criterion="maxclust")
        df_assign_id = pd.DataFrame()
        df_assign_id['cluster_id'] = assignments
        clusters = np.unique(assignments)
        cluster_list = []  # for the Dunn index calculation
        for i in clusters:
            ids = np.nonzero(assignments == i)[0]  # zero-based indices
            df_zscore_cluster = df_zscores.iloc[ids]
            cluster_list.append(df_zscore_cluster.values)
        dunn_index = dunn(cluster_list)
        print(n_clusters, ":", dunn_index)
        index_list.append(dunn_index)
    pl.figure()
    pl.plot(range(low, high), index_list, "*-")
    pl.xlabel("cluster number")
    pl.ylabel("dunn index")
    pl.savefig(output_dir + '/dunnindex_clusternumber.png')
    return
def main():
    # clustering and write output
    matrix = []
    for i in range(0, len(proteinarray)):
        if calculate_ratio == "True":
            ratio_array = convert2ratio(proteinarray[i][2:], ref)
            matrix.append(ratio_array)
        else:
            matrix.append(proteinarray[i][2:])
    dataMatrix = np.array(matrix, dtype=float)
    if log_transform == "True":
        dataMatrix = np.log2(dataMatrix)
    if len(proteinarray) > 1:
        d = sch.distance.pdist(dataMatrix, metric)  # vector of pairwise distances
        if metric == "correlation":
            # with correlation, all distances should lie in the range [0, 2]
            D = np.clip(d, 0, 2)
        else:
            D = d
        try:
            cutoff = float(t)
        except ValueError:
            print("please provide a numeric value for --t")
            sys.exit()
        L = sch.linkage(D, method, metric)
        ind = sch.fcluster(L, cutoff, 'distance')  # distance is dissimilarity (1 - correlation)
        p = np.array(proteinarray)[:, [0, 1]]  # slice the first two columns of the original data
        p = np.concatenate((p, dataMatrix), axis=1)  # append the transformed data
        p = np.column_stack([p, ind])  # add the cluster result as the last column
        formatoutput(p)
    else:
        p = np.array(proteinarray)[:, [0, 1]]
        p = np.concatenate((p, dataMatrix), axis=1)
        p = np.column_stack([p, [0]])
        formatoutput(p)
def create_hc(G, t=1.0):
    """
    Creates a hierarchical clustering of graph G from its distance matrix.

    Maksim Tsvetovat: generalized HC pre- and post-processing to work on
    labelled graphs and return labelled clusters. The threshold value is now
    parameterized; a useful range should be determined experimentally with
    each dataset.

    Modified from code by Drew Conway.
    """
    ## Create a shortest-path distance matrix, while preserving node labels
    labels = list(G.nodes())
    # networkx 2.x returns a generator here, so materialize it as a dict
    path_length = dict(nx.all_pairs_shortest_path_length(G))
    distances = numpy.zeros((len(G), len(G)))
    i = 0
    for u, p in path_length.items():
        j = 0
        for v, d in p.items():
            distances[i][j] = d
            distances[j][i] = d
            if i == j:
                distances[i][j] = 0
            j += 1
        i += 1
    # Create hierarchical cluster
    Y = distance.squareform(distances)
    Z = hierarchy.complete(Y)  # creates HC using farthest-point linkage
    # This partition selection is arbitrary, for illustrative purposes
    membership = list(hierarchy.fcluster(Z, t=t))
    # Create a collection of lists for the blockmodel
    partition = defaultdict(list)
    for n, p in zip(list(range(len(G))), membership):
        partition[p].append(labels[n])
    return list(partition.values())
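# Hedged usage sketch (not from the original source): apply create_hc to a
# small connected networkx graph, assuming the snippet's own imports.
import networkx as nx

G = nx.karate_club_graph()
for cluster in create_hc(G, t=1.5):
    print(cluster)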
def ward_cluster(df_all, feature_names, max_cluster_num, output_dir, swc_path=None,
                 RemoveOutliers=0, datasetType='ivscc', plot_heatmap=1):
    print("\n\n\n *************** ward computation, max_cluster = %d *************:" % max_cluster_num)
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    ##### z-scored feature plots
    df_zscores, df_all_outlier_removed, df_outliers = get_zscore_features(
        df_all, feature_names, output_dir + '/zscore.csv', RemoveOutliers)
    if df_outliers.shape[0] > 0:
        output_single_cluster_results(df_outliers, output_dir, "outliers", swc_path)

    if plot_heatmap:
        if datasetType == 'ivscc':
            link = heatmap_plot_zscore_ivscc(df_zscores, df_all_outlier_removed,
                                             output_dir, "feature zscores")
        if datasetType == 'bbp':
            link = heatmap_plot_zscore_bbp(df_zscores, df_all_outlier_removed,
                                           output_dir, "feature zscores")
        if datasetType == 'bigneuron':
            link = heatmap_plot_zscore_bigneuron(df_zscores, df_all_outlier_removed,
                                                 output_dir, "feature zscores")
    else:
        link = hierarchy.linkage(df_zscores, method='ward', metric='euclidean')

    assignments = hierarchy.fcluster(link, max_cluster_num, criterion="maxclust")
    output_clusters(assignments, df_zscores, df_all_outlier_removed, feature_names,
                    output_dir, swc_path)
    truncate_dendrogram(link, max_cluster_num, output_dir, 0)
    return link, df_zscores
def flatcluster(dRow, runLogs, interClusterDistance="complete",
                plotDendrogram=True, cMethod="inconsistent", cValue=2.5):
    z = linkage(dRow, interClusterDistance)
    inc = inconsistent(z)
    if plotDendrogram:
        plotdendrogram(z)
    clusters = fcluster(z, cValue, cMethod)
    noClusters = max(clusters)
    print("Total number of clusters:", noClusters)
    for i in range(noClusters):
        counter = 0
        for j in range(len(clusters)):
            if clusters[j] == (i + 1):
                counter += 1
        print("Cluster", str(i + 1), ":", str(counter))
    global clusterCount
    clusterCount = noClusters
    print(len(clusters))
    print(len(runLogs))
    for i, log in enumerate(runLogs):
        log[0]["Cluster"] = str(clusters[i])
    return z, clusters, runLogs
def run_ngram_model(cdev, cprc):
    print('____________________________________________________')
    print('running n-gram model')
    wcorp = []
    for i in cprc:
        wcorp.append(' '.join(cprc[i]['words']))
    vectorizer = CountVectorizer(analyzer='word', binary=True,
                                 min_df=max(int(len(wcorp) * 0.0005), 5),
                                 ngram_range=(2, 3))
    X = vectorizer.fit_transform(wcorp)
    Xclean, mapping = filter_rare(X)
    Xdense = np.matrix(Xclean).astype('float')
    X_scaled = preprocessing.scale(Xdense)
    X_normalized = preprocessing.normalize(X_scaled, norm='l2')
    textMatrix = pairwise_distances(X_normalized, metric='cosine')
    L = fastcluster.linkage(textMatrix, method='average')
    flat_textclust = hierarchy.fcluster(L, 0.5, 'distance')
    ttc = organize_clusters(flat_textclust)
    ncf = []
    for cl in ttc:
        ncf.append([mapping[t] for t in cl])
    print('detected', len(ncf), 'n-gram clusters')
    return ncf
def cluster_words(k):
    ts = os.listdir('types')
    ts.sort(key=alphanum_key)
    ts = np.array(ts)
    T = fcluster(Z, k, criterion='maxclust')

    def words(i):
        cluster = ts[T == i]
        print(len(cluster))
        allwords = []
        for t in cluster:
            fname = 'types/{}'.format(t)
            with open(fname) as file:
                data = json.loads(file.read())
            desc = data['description']
            words = re.findall(r'\w+', desc.lower())
            allwords.extend(words)
        allwords = [word for word in allwords if word not in stop_words]
        counts = Counter(allwords)
        return counts

    return [words(i + 1) for i in range(k)]
def run_entity_model(cdev, cprc):
    print('____________________________________________________')
    print('running entity model')
    hdev, hprc, hmapping, entcorp, er = process_entities(cdev, cprc)
    print('removed', len(cdev) - len(hdev), 'documents,', len(hdev), 'left')
    voc = build_voc(entcorp, 2)
    ent_vectorizer = CountVectorizer(vocabulary=voc)
    E = ent_vectorizer.fit_transform(hdev)
    Eclean, emapping = filter_rare(E, 0)
    E_dense = np.matrix(Eclean).astype('float')
    E_scaled = preprocessing.scale(E_dense)
    E_normalized = preprocessing.normalize(E_scaled, norm='l2')
    EMatrix = pairwise_distances(E_normalized, metric='cosine')
    EL = fastcluster.linkage(EMatrix, method='average')
    flat_eclust = hierarchy.fcluster(EL, 0.5, 'distance')
    ec = organize_clusters(flat_eclust, th=3)
    ecf = []
    for cl in ec:
        ecf.append([hmapping[emapping[t]] for t in cl])
    print('detected', len(ecf), 'entity clusters')
    return ecf, voc
def main():
    linkage = loadLinkage()
    print(len(linkage))
    k = 1.5
    # 18 -> 54
    # 19 -> 46
    R = dendrogram(linkage, color_threshold=6.8, show_contracted=True)
    pylab.savefig("/home/rojosewe/Dropbox/MAI90/tesis/images/wordClustering/dgram446.8.png")
    T = sch.fcluster(linkage, k, 'distance')
    n = len(T)
    # calculate labels
    labels = np.zeros((n, 1))
    print(str(k) + ": " + str(max(T)))
    for i in range(n):
        labels[i, 0] = int(T[i])
    # text mode with newline='' is the Python 3 replacement for 'wb' here
    with open(datafolder + 'labels.csv', 'w', newline='') as csvfile:
        csvw = csv.writer(csvfile)
        for i in range(n):
            csvw.writerow(labels[i, :])
    print('done writing')
def clustering_scipy_dendrogram(features, n_clust, metric='euclidean', method='complete'):
    """Cluster `features` hierarchically; return flat labels and centroids."""
    z = hac.linkage(features, method=method)
    # optional: inspect the tree first, e.g.
    # d = hac.dendrogram(z, truncate_mode=None, orientation='top')
    clusters = hac.fcluster(z, n_clust, criterion='maxclust')
    num_elem = Counter(clusters)
    print(num_elem)
    centroids = to_codebook(features, clusters)
    np.save('centroids', np.array(centroids))
    return clusters, centroids
def run_hierarchical(self, dm, nclusters, linkage_method):
    if dm.metric == 'rf':
        matrix = dm.add_noise(dm.matrix)
    else:
        matrix = dm.matrix

    linkmat = linkage(matrix, linkage_method)
    linkmat_size = len(linkmat)
    if nclusters <= 1:
        br_top = linkmat[linkmat_size - nclusters][2]
    else:
        br_top = linkmat[linkmat_size - nclusters + 1][2]

    if nclusters >= len(linkmat):
        br_bottom = 0
    else:
        br_bottom = linkmat[linkmat_size - nclusters][2]

    # cut midway between the two merge heights that bracket nclusters groups
    threshold = 0.5 * (br_top + br_bottom)
    T = fcluster(linkmat, threshold, criterion='distance')
    T = self.order(T)
    return T
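# Hedged aside (not from the original source): for a monotone linkage with
# distinct merge heights, cutting midway between the bracketing heights is
# equivalent to asking fcluster for a fixed number of clusters directly.
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster

Z = linkage(np.random.rand(15, 3), method='average')
k = 4
mid = 0.5 * (Z[-k, 2] + Z[-(k - 1), 2])
a = fcluster(Z, mid, criterion='distance')
b = fcluster(Z, k, criterion='maxclust')
print(len(np.unique(a)) == len(np.unique(b)) == k)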
def retrieve_cluster(self, number):
    """Cut the stored linkage into `number` flat clusters."""
    self.clusters = sch.fcluster(self.linkage, number, criterion='maxclust')
    return
def linkage(dmat):
    square = squareform(dmat)  # condensed form needed for linkage methods
    linkmat = sch.single(square)
    return sch.fcluster(linkmat, 0.0001)
def addZone(poi, max_d):
    """Assign a zone to each poi by clustering its coordinates."""
    Z = linkage(poi[['x', 'y']], 'ward')
    zoneL = fcluster(Z, max_d, criterion='distance')
    return zoneL
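# Hedged usage sketch (not from the original source): zone assignment for a
# small DataFrame of points, assuming the snippet's linkage/fcluster imports.
import pandas as pd

poi = pd.DataFrame({'x': [0, 1, 0, 10, 11], 'y': [0, 0, 1, 10, 10]})
poi['zone'] = addZone(poi, max_d=5.0)
print(poi)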
def perform_inference(model, points, inferenceMethods, threshDict, metricsForEval, scaleDist):
    """
    :param model:
    :param points: Dictionary with key as pid and value as (pointVector, clusterId)
    :param inferenceMethods:
    :param threshDict:
    :param metricsForEval:
    :param scaleDist:
    :return:
    """
    torch.cuda.empty_cache()
    start = time.time()

    pidList = sorted(list(points.keys()))
    pointList = [points[pid][0] for pid in pidList]
    pidToGtClust = {pid: points[pid][1] for pid in pidList}
    gtList = {points[pid][1]: 0 for pid in pidList}
    for pid in pidList:
        gtList[points[pid][1]] += 1

    numPoints = len(pointList)
    results = {}
    transformedPointList = None
    numComponents = 0

    if isinstance(model, MahalanobisDist):
        # Transform points if using Mahalanobis distance
        transformedPointList = model.transformPoints(pointList)
        linkMetric = "euclidean"
    else:
        raise Exception("Can not perform inference in this function with model type={}".format(type(model)))

    dendPurity = 0
    torchDistMat = model.batchForwardWithin(pointList)
    distMat_NP = torchDistMat.cpu().data.numpy()

    y_true = []
    for idx, pid in enumerate(pidList):
        y_true.append(pidToGtClust[pid])

    for method in inferenceMethods:
        mStart = time.time()
        if method == "connComp":
            t1 = time.time()
            connCompThresh = threshDict["connComp"]
            sparseMatrix = comp_sparse_adj_mat(model, pointList, connCompThresh)
            t2 = time.time()
            print("Time taken for computing sparseMatrix:{:.3f}".format(t2 - t1))
            x = connected_components(sparseMatrix)
            numComponents = x[0]
            connectedComponents = x[1]
            y_pred = []
            for idx, pid in enumerate(pidList):
                y_pred.append(connectedComponents[idx])
        elif method == "recSparsest":
            labels = np.array([points[pid][1] for pid in pidList])
            new_dist_mat_NP = np.max(distMat_NP) - distMat_NP
            linkTree = run_sparsest_cut(new_dist_mat_NP, labels)
            y_pred = y_true
            if "dendPurity" in metricsForEval:
                dendPurity = calc_dend_purity(linkTree=linkTree, pidList=pidList, y_true=y_true)
        elif method == "random":
            y_pred, dendPurity = run_random_split(pidToCluster=pidToGtClust, k=len(gtList))
        elif method.startswith("linkage"):
            if method == "linkage_min" or method == "linkage_max":
                linkageAlpha = method[-3:]
                flatClusters, dendPurity = runHAC(
                    origDistMat=distMat_NP, k=len(gtList), linkAlpha=linkageAlpha,
                    numPoints=numPoints, pidToCluster=pidToGtClust,
                    threshold=None, scaleDist=scaleDist)
                y_pred = flatClusters
            elif method == "linkage_min@t" or method == "linkage_max@t":
                linkageAlpha = method[-5:-2]
                threshold = threshDict[method]
                flatClusters, dendPurity = runHAC(
                    origDistMat=distMat_NP, k=None, linkAlpha=linkageAlpha,
                    numPoints=numPoints, pidToCluster=None,
                    threshold=threshold, scaleDist=scaleDist)
                y_pred = flatClusters
            else:
                if method.startswith("linkage_auto"):
                    if hasattr(model, "linkAlpha"):
                        linkageAlpha = float(model.linkAlpha.data.cpu().numpy()[0])
                    else:
                        print("Not evaluating for method = {}".format(method, str(model)))
                        continue
                else:
                    try:
                        if method.endswith("@t"):
                            linkageAlpha = float(method[:-2].split("_")[-1])
                        else:
                            linkageAlpha = float(method.split("_")[-1])
                    except:
                        raise Exception("Invalid value of linkageAlpha = {}. "
                                        "Eg use method=linkage_1.0".format(method))

                if method.endswith("@t"):
                    # Use a threshold to get flat clusters
                    threshold = threshDict[method]
                    flatClusters, dendPurity = runHAC_allEdges(
                        origDistMat=distMat_NP, k=None, linkAlpha=linkageAlpha,
                        numPoints=numPoints, pidToCluster=None,
                        threshold=threshold, scaleDist=scaleDist)
                else:
                    # Use the number of gt clusters to get flat clusters
                    if "dendPurity" in metricsForEval:
                        flatClusters, dendPurity = runHAC_allEdges(
                            origDistMat=distMat_NP, k=len(gtList), linkAlpha=linkageAlpha,
                            numPoints=numPoints, pidToCluster=pidToGtClust,
                            threshold=None, scaleDist=scaleDist)
                    else:
                        # No need to pass pidToCluster as we don't compute dendPurity
                        flatClusters, dendPurity = runHAC_allEdges(
                            origDistMat=distMat_NP, k=len(gtList), linkAlpha=linkageAlpha,
                            numPoints=numPoints, pidToCluster=None,
                            threshold=None, scaleDist=scaleDist)
                y_pred = flatClusters
        else:
            if method.startswith("singleLink"):
                threshold = threshDict["singleLink@t"] if "singleLink@t" in threshDict else None
                linkTree = linkage(transformedPointList, "single", metric=linkMetric)
            elif method.startswith("avgLink"):
                threshold = threshDict["avgLink@t"] if "avgLink@t" in threshDict else None
                linkTree = linkage(transformedPointList, "average", metric=linkMetric)
            elif method.startswith("compLink"):
                threshold = threshDict["compLink@t"] if "compLink@t" in threshDict else None
                linkTree = linkage(transformedPointList, "complete", metric=linkMetric)
            else:
                linkTree = None
                print("Invalid inference method:{}".format(method))
                raise Exception("Invalid inference method:{}".format(method))

            if method.endswith("@t"):
                flatClusters = fcluster(Z=linkTree, t=threshold, criterion="distance")
            else:
                flatClusters = fcluster(Z=linkTree, t=len(gtList), criterion="maxclust")
            y_pred = flatClusters
            if "dendPurity" in metricsForEval:
                dendPurity = calc_dend_purity(linkTree=linkTree, pidList=pidList, y_true=y_true)

        mEnd = time.time()
        print("Time taken by inference method:{} = {:.3f}".format(method, mEnd - mStart))

        if "f1" in metricsForEval:
            tempResult = comp_prec_rec_f1(y_true, y_pred)
            for metric in tempResult:
                results[method + "_" + metric] = tempResult[metric]
        if "randIndex" in metricsForEval:
            results[method + "_randIndex"] = adjusted_rand_score(y_true, y_pred)
        if "nmi" in metricsForEval:
            results[method + "_nmi"] = adjusted_mutual_info_score(
                y_true, y_pred, average_method="arithmetic")
        if "dendPurity" in metricsForEval:
            results[method + "_dendPurity"] = 0 if method == "connComp" else dendPurity

    print("Inference Time:{:.3f} on {} points".format(time.time() - start, numPoints))
    return results
def _cluster_by_monocrit(linkage_table: numpy.ndarray, cutoff: float,
                         inconsistent: pandas.DataFrame) -> numpy.ndarray:
    MR = hierarchy.maxRstat(linkage_table, inconsistent.values, 1)
    clusters = hierarchy.fcluster(linkage_table, t=cutoff,
                                  criterion='monocrit', monocrit=MR)
    return clusters
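# Hedged usage sketch (not from the original source): the 'monocrit'
# criterion cuts the tree wherever the supplied per-link statistic exceeds
# the threshold; here that statistic is the subtree maximum of column 1
# (the standard deviation) of the inconsistency matrix.
import numpy
import pandas
from scipy.cluster import hierarchy

Z = hierarchy.linkage(numpy.random.rand(25, 4), method='average')
R = pandas.DataFrame(hierarchy.inconsistent(Z))
print(_cluster_by_monocrit(Z, cutoff=0.8, inconsistent=R))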
def gen_data(n_per_cat):
    cov = np.eye(2) * 0.2
    X0 = np.random.multivariate_normal([-2.0, 0.0], cov, n_per_cat)
    X1 = np.random.multivariate_normal([2.0, 0.0], cov, n_per_cat)
    X2 = np.random.multivariate_normal([0.0, 1.8], cov, n_per_cat)
    data = np.vstack((X0, X1, X2))
    return data


hypers = {
    'mu_0': np.zeros(2),
    'nu_0': 3.0,
    'lambda_0': 1.0,
    'psi_0': np.eye(2),
}
data_model = NormalInverseWishart(**hypers)

# Sanity check: grab the assignment that has three components and do a
# visual verification.
data = gen_data(15)
linkage_matrix = bhc(data, data_model)
print(linkage_matrix)

dn = dendrogram(linkage_matrix)
plt.show()

z = fcluster(linkage_matrix, 3, 'maxclust')
plt.figure(tight_layout=True, facecolor='white')
plt.scatter(data[:, 0], data[:, 1], c=z, cmap='Set1', s=225)
plt.show()
def make_figure(df, pa):
    """Generates figure.

    Args:
        df (pandas.core.frame.DataFrame): Pandas DataFrame containing the input data.
        pa (dict): A dictionary of the style { "argument":"value"} as outputted by `figure_defaults`.

    Returns:
        A Plotly figure.
        A Pandas DataFrame with column clusters.
        A Pandas DataFrame with row clusters.
        A Pandas DataFrame as displayed in the figure.
    """
    tmp = df.copy()
    tmp.index = tmp[pa["xvals"]].tolist()
    tmp = tmp[pa["yvals"]]

    if pa["add_constant"] != "":
        tmp = tmp + float(pa["add_constant"])

    if pa["log_transform_value"] == "log2":
        tmp = np.log2(tmp)
    elif pa["log_transform_value"] == "log10":
        tmp = np.log10(tmp)

    pa_ = {}
    checkboxes = ["row_cluster", "col_cluster", "xticklabels", "yticklabels",
                  "row_dendogram_dist", "col_dendogram_dist", "reverse_color_scale"]
    for c in checkboxes:
        pa_[c] = (pa[c] == "on") | (pa[c] == ".on")

    for v in ["col_color_threshold", "row_color_threshold",
              "upper_value", "center_value", "lower_value"]:
        pa_[v] = None if pa[v] == "" else float(pa[v])

    if pa_["reverse_color_scale"]:
        pa_["colorscale_value"] = pa["colorscale_value"] + "_r"
    else:
        pa_["colorscale_value"] = pa["colorscale_value"]

    selfdefined_cmap = True
    for value in ["lower_value", "center_value", "upper_value",
                  "lower_color", "center_color", "upper_color"]:
        if pa[value] == "":
            selfdefined_cmap = False
            break
    if selfdefined_cmap:
        range_diff = float(pa["upper_value"]) - float(pa["lower_value"])
        center = float(pa["center_value"]) - float(pa["lower_value"])
        center = center / range_diff
        color_continuous_scale = [[0, pa["lower_color"]],
                                  [center, pa["center_color"]],
                                  [1, pa["upper_color"]]]
        pa_["colorscale_value"] = color_continuous_scale

    if pa["zscore_value"] == "row":
        tmp = pd.DataFrame(stats.zscore(tmp, axis=1, ddof=1),
                           columns=tmp.columns.tolist(), index=tmp.index.tolist())
    elif pa["zscore_value"] == "columns":
        tmp = pd.DataFrame(stats.zscore(tmp, axis=0, ddof=1),
                           columns=tmp.columns.tolist(), index=tmp.index.tolist())

    if len(pa["findrow"]) > 0:
        rows_to_find = pa["findrow"]
        possible_rows = tmp.index.tolist()
        not_found = [s for s in rows_to_find if s not in possible_rows]
        if len(not_found) > 0:
            message = ("The following rows could not be found: %s. "
                       "Please check your entries for typos." % (", ".join(not_found)))
            flash(message, 'error')

        rows_to_plot = [] + rows_to_find

        if (pa["findrowup"] != "") | (pa["findrowdown"] != ""):
            d = scs.distance.pdist(tmp, metric=pa["distance_value"])
            d = squareform(d)
            d = pd.DataFrame(d, columns=tmp.index.tolist(), index=tmp.index.tolist())
            d = d[rows_to_find]
            for r in rows_to_find:
                dfrow = d[[r]]
                if pa["findrowtype_value"] == "percentile":
                    row_values = dfrow[r].tolist()
                    if pa["findrowup"] != "":
                        upperc = np.percentile(row_values, float(pa["findrowup"]))
                        upperc = dfrow[dfrow[r] >= upperc]
                        rows_to_plot = rows_to_plot + upperc.index.tolist()
                    if pa["findrowdown"] != "":
                        downperc = np.percentile(row_values, float(pa["findrowdown"]))
                        downperc = dfrow[dfrow[r] <= downperc]
                        rows_to_plot = rows_to_plot + downperc.index.tolist()
                if pa["findrowtype_value"] == "n rows":
                    dfrow = dfrow.sort_values(by=[r], ascending=True)
                    row_values = dfrow.index.tolist()
                    if pa["findrowdown"] != "":
                        rows_to_plot = rows_to_plot + row_values[:int(pa["findrowdown"])]
                    if pa["findrowup"] != "":
                        rows_to_plot = rows_to_plot + row_values[-int(pa["findrowup"]):]
                if pa["findrowtype_value"] == "absolute":
                    if pa["findrowup"] != "":
                        upperc = dfrow[dfrow[r] >= float(pa["findrowup"])]
                        rows_to_plot = rows_to_plot + upperc.index.tolist()
                    if pa["findrowdown"] != "":
                        downperc = dfrow[dfrow[r] <= float(pa["findrowdown"])]
                        rows_to_plot = rows_to_plot + downperc.index.tolist()
            rows_to_plot = list(set(rows_to_plot))
        tmp = tmp[tmp.index.isin(rows_to_plot)]

    data_array = tmp.values
    data_array_ = tmp.transpose().values
    labels = tmp.columns.tolist()
    rows = tmp.index.tolist()

    # Initialize figure by creating the upper dendrogram
    if pa_["col_cluster"]:
        fig = ff.create_dendrogram(
            data_array_, orientation='bottom', labels=labels,
            color_threshold=pa_["col_color_threshold"],
            distfun=lambda x: scs.distance.pdist(x, metric=pa["distance_value"]),
            linkagefun=lambda x: sch.linkage(x, pa["method_value"]))
        for i in range(len(fig['data'])):
            fig['data'][i]['yaxis'] = 'y2'
        dendro_leaves_y_labels = fig['layout']['xaxis']['ticktext']
        if pa_["col_color_threshold"]:
            d = scs.distance.pdist(data_array_, metric=pa["distance_value"])
            Z = sch.linkage(d, pa["method_value"])
            max_d = pa_["col_color_threshold"]
            clusters_cols = fcluster(Z, max_d, criterion='distance')
            clusters_cols = pd.DataFrame({"col": tmp.columns.tolist(),
                                          "cluster": list(clusters_cols)})
        else:
            clusters_cols = pd.DataFrame({"col": tmp.columns.tolist()})
    else:
        fig = go.Figure()
        dendro_leaves_y_labels = tmp.columns.tolist()
    # restored from the commented-out original; needed below to reorder the heatmap
    dendro_leaves_y = [labels.index(i) for i in dendro_leaves_y_labels]

    # Create the side dendrogram
    if pa_["row_cluster"]:
        dendro_side = ff.create_dendrogram(
            data_array, orientation='right', labels=rows,
            color_threshold=pa_["row_color_threshold"],
            distfun=lambda x: scs.distance.pdist(x, metric=pa["distance_value"]),
            linkagefun=lambda x: sch.linkage(x, pa["method_value"]))
        for i in range(len(dendro_side['data'])):
            dendro_side['data'][i]['xaxis'] = 'x2'
        dendro_leaves_x_labels = dendro_side['layout']['yaxis']['ticktext']
        if pa_["row_color_threshold"]:
            d = scs.distance.pdist(data_array, metric=pa["distance_value"])
            Z = sch.linkage(d, pa["method_value"])
            max_d = pa_["row_color_threshold"]
            clusters_rows = fcluster(Z, max_d, criterion='distance')
            clusters_rows = pd.DataFrame({"col": tmp.index.tolist(),
                                          "cluster": list(clusters_rows)})
        else:
            clusters_rows = pd.DataFrame({"col": tmp.index.tolist()})
        # Add the side dendrogram data to the figure
        for data in dendro_side['data']:
            fig.add_trace(data)
    else:
        dendro_leaves_x_labels = tmp.index.tolist()
    dendro_leaves_x = [rows.index(i) for i in dendro_leaves_x_labels]

    if pa["robust"] != "":
        vals = tmp.values.flatten()
        up = np.percentile(vals, 100 - float(pa["robust"]))
        down = np.percentile(vals, float(pa["robust"]))
        tmp[tmp > up] = up
        tmp[tmp < down] = down
        data_array = tmp.values

    # Create the heatmap
    heat_data = data_array
    heat_data = heat_data[dendro_leaves_x, :]
    heat_data = heat_data[:, dendro_leaves_y]

    heatmap = [go.Heatmap(
        x=dendro_leaves_x_labels,
        y=dendro_leaves_y_labels,
        z=heat_data,
        zmax=pa_["upper_value"],
        zmid=pa_["center_value"],
        zmin=pa_["lower_value"],
        colorscale=pa_['colorscale_value'],
        colorbar={
            "title": {"text": pa["color_bar_label"],
                      "font": {"size": float(pa["color_bar_font_size"])}},
            "lenmode": "pixels",
            "len": float(pa["fig_height"]) / 4,
            "xpad": float(pa["color_bar_horizontal_padding"]),
            "tickfont": {"size": float(pa["color_bar_ticks_font_size"])},
        })]

    if pa_["col_cluster"]:
        heatmap[0]['x'] = fig['layout']['xaxis']['tickvals']
    else:
        heatmap[0]['x'] = dendro_leaves_y_labels

    if pa_["row_cluster"]:
        heatmap[0]['y'] = dendro_side['layout']['yaxis']['tickvals']
    else:
        heatmap[0]['y'] = tuple(range(len(dendro_leaves_x_labels)))

    # Add the heatmap data to the figure
    for data in heatmap:
        fig.add_trace(data)

    # Edit layout
    fig.update_layout({
        'width': float(pa["fig_width"]),
        'height': float(pa["fig_height"]),
        'showlegend': False,
        'hovermode': 'closest',
        "yaxis": {"mirror": "allticks", 'side': 'right',
                  'showticklabels': pa_["xticklabels"],
                  'ticktext': dendro_leaves_x_labels},
        "xaxis": {"mirror": "allticks", 'side': 'right',
                  'showticklabels': pa_["yticklabels"],
                  'ticktext': dendro_leaves_y_labels},
    })

    # Edit xaxis
    fig.update_layout(xaxis={
        'domain': [float(pa["row_dendogram_ratio"]), 1],
        'mirror': False, 'showgrid': False, 'showline': False, 'zeroline': False,
        'showticklabels': pa_["yticklabels"],
        "tickfont": {"size": float(pa["yaxis_font_size"])},
        'ticks': "",
        'ticktext': dendro_leaves_y_labels})

    # Edit xaxis2
    if pa_["row_cluster"]:
        fig.update_layout(xaxis2={
            'domain': [0, float(pa["row_dendogram_ratio"])],
            'mirror': False, 'showgrid': False, 'showline': False, 'zeroline': False,
            'showticklabels': pa_["row_dendogram_dist"],
            'ticks': ""})

    # Edit yaxis
    fig.update_layout(yaxis={
        'domain': [0, 1 - float(pa["col_dendogram_ratio"])],
        'mirror': False, 'showgrid': False, 'showline': False, 'zeroline': False,
        'showticklabels': pa_["xticklabels"],
        "tickfont": {"size": float(pa["xaxis_font_size"])},
        'ticks': "",
        'tickvals': heatmap[0]['y'],
        'ticktext': dendro_leaves_x_labels})

    # Edit yaxis2
    if pa_["col_cluster"]:
        fig.update_layout(yaxis2={
            'domain': [1 - float(pa["col_dendogram_ratio"]), 1],
            'mirror': False, 'showgrid': False, 'showline': False, 'zeroline': False,
            'showticklabels': pa_["col_dendogram_dist"],
            'ticks': ""})

    fig.update_layout(template='plotly_white')
    fig.update_layout(title={
        "text": pa["title"],
        "yanchor": "top",
        "font": {"size": float(pa["title_size_value"])}})

    cols = list(fig['layout']['xaxis']['ticktext'])
    rows = list(fig['layout']['yaxis']['ticktext'])

    df_ = pd.DataFrame({"i": range(len(rows))}, index=rows)
    df_ = df_.sort_values(by=["i"], ascending=False)
    df_ = df_.drop(["i"], axis=1)
    df_ = pd.merge(df_, tmp, how="left", left_index=True, right_index=True)
    df_ = df_[cols]

    clusters_cols_ = pd.DataFrame({"col": cols})
    if pa_["col_cluster"]:
        clusters_cols = pd.merge(clusters_cols_, clusters_cols, on=["col"], how="left")
    else:
        clusters_cols = clusters_cols_

    clusters_rows_ = pd.DataFrame({"col": df_.index.tolist()})
    if pa_["row_cluster"]:
        clusters_rows = pd.merge(clusters_rows_, clusters_rows, on=["col"], how="left")
    else:
        clusters_rows = clusters_rows_

    df_.reset_index(inplace=True, drop=False)
    cols = df_.columns.tolist()
    cols[0] = "rows"
    df_.columns = cols

    return fig, clusters_cols, clusters_rows, df_
# print the dendrogram
dend = dendrogram(linkage(distanceMatrix, method='complete'),
                  color_threshold=1, leaf_font_size=10,
                  labels=df.teamID.tolist())
# This gives us 7 clusters;
# let's set the cutoff at 2 for 4 clusters
dend = dendrogram(linkage(distanceMatrix, method='complete'),
                  color_threshold=2, leaf_font_size=10,
                  labels=df.teamID.tolist())
# get cluster assignments
assignments = fcluster(linkage(distanceMatrix, method='complete'), 2, 'distance')
cluster_output = pandas.DataFrame({'team': df.teamID.tolist(), 'cluster': assignments})
# map the cluster ids to colors for the plot
# (cast to object so string color codes can replace the integer ids)
colors = cluster_output.cluster.astype(object)
colors[colors == 1] = 'b'
colors[colors == 2] = 'g'
colors[colors == 3] = 'r'
colors[colors == 4] = 'y'
# Plot
plt.scatter(df.total_salaries, df.total_wins, s=100, c=colors, lw=0)
def hierarchical(data=None, k=0, linkage='average', metric='euclidean', metric_args=None):
    """Perform clustering using hierarchical agglomerative algorithms.

    Parameters
    ----------
    data : array
        An m by n array of m data samples in an n-dimensional space.
    k : int, optional
        Number of clusters to extract; if 0 uses the life-time criterion.
    linkage : str, optional
        Linkage criterion; one of 'average', 'centroid', 'complete',
        'median', 'single', 'ward', or 'weighted'.
    metric : str, optional
        Distance metric (see 'biosppy.metrics').
    metric_args : dict, optional
        Additional keyword arguments to pass to the distance function.

    Returns
    -------
    clusters : dict
        Dictionary with the sample indices (rows from 'data') for each
        found cluster; outliers have key -1; clusters are assigned integer
        keys starting at 0.

    Raises
    ------
    TypeError
        If 'metric' is not a string.
    ValueError
        When the 'linkage' is unknown.
    ValueError
        When 'metric' is not 'euclidean' when using 'centroid', 'median',
        or 'ward' linkage.
    ValueError
        When 'k' is larger than the number of data samples.

    """

    # check inputs
    if data is None:
        raise TypeError("Please specify input data.")

    if linkage not in ['average', 'centroid', 'complete', 'median',
                       'single', 'ward', 'weighted']:
        raise ValueError("Unknown linkage criterion '%r'." % linkage)

    if not isinstance(metric, six.string_types):
        raise TypeError("Please specify the distance metric as a string.")

    N = len(data)
    if k > N:
        raise ValueError("Number of clusters 'k' is higher than the number"
                         " of input samples.")

    if metric_args is None:
        metric_args = {}

    if linkage in ['centroid', 'median', 'ward']:
        if metric != 'euclidean':
            raise TypeError("Linkage '{}' requires the distance metric to be"
                            " 'euclidean'.".format(linkage))
        Z = sch.linkage(data, method=linkage)
    else:
        # compute distances
        D = metrics.pdist(data, metric=metric, **metric_args)
        # build linkage
        Z = sch.linkage(D, method=linkage)

    if k < 0:
        k = 0

    # extract clusters
    if k == 0:
        # life-time criterion
        labels = _life_time(Z, N)
    else:
        labels = sch.fcluster(Z, k, 'maxclust')

    # get cluster indices
    clusters = _extract_clusters(labels)

    return utils.ReturnTuple((clusters,), ('clusters',))
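# Hedged usage sketch (not from the original source), assuming the
# surrounding module's imports (sch, metrics, utils, six, _life_time,
# _extract_clusters).
import numpy as np

data = np.random.rand(50, 3)
res = hierarchical(data=data, k=4, linkage='ward')
clusters = res[0]  # ReturnTuple behaves like a tuple
print(sorted(clusters.keys()))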
def step5(max_d):
    global eventL, notCombineRDDL, resultEventL, resultRDDL, outputPath, specialNum
    # vectorize the text
    vectorizer = CountVectorizer(analyzer="word", tokenizer=my_tokenizer,
                                 preprocessor=None, stop_words=['*'],
                                 max_features=10000)
    train_data_features = vectorizer.fit_transform(eventL)
    train_data_features = train_data_features.toarray()

    # hierarchical clustering
    Z = linkage(train_data_features, 'complete', 'cityblock')
    clusters = fcluster(Z, max_d, criterion='distance')

    # initialize the RDD list and event list
    resultEventLL = []
    resultRDDLL = []
    numCombinedEvents = max(clusters)
    for i in range(numCombinedEvents):
        resultRDDLL.append([])
        resultEventLL.append([])

    # put events/RDDs that belong to the same cluster into the same list
    currentEventNum = 0
    for clusterNum in clusters:
        resultRDDLL[clusterNum - 1].append(notCombineRDDL[currentEventNum])
        resultEventLL[clusterNum - 1].append(eventL[currentEventNum])
        currentEventNum += 1

    # merge the events in the same list
    for sameEventL in resultEventLL:
        if len(sameEventL) == 1:
            resultEventL.append(sameEventL[0])
        else:
            combinedEvent = sameEventL[0].strip().split()
            count = 0
            for currentEvent in sameEventL:
                if count == 0:
                    count += 1
                    continue
                combinedEvent = LCS(combinedEvent, currentEvent.strip().split())
                count += 1
            resultEventL.append(' '.join(combinedEvent))

    # merge the RDDs in the same list and save each one
    for sameRDDL in resultRDDLL:
        if len(sameRDDL) == 1:
            resultRDDL.append(sameRDDL[0])
        else:
            resultRDDL.append(sc.union(sameRDDL))
        # tuple-parameter lambdas are Python 2 only; index the pair instead
        resultRDDL[-1].map(lambda pair: pair[0]).saveAsTextFile(
            outputPath + str(len(resultRDDL) + specialNum))
def cluster_ssh(sla, lat, lon, nclusters, distthres=3000, returnall=False):
    # Remove all-NaN points
    ntime, nlat, nlon = sla.shape
    slars = sla.reshape(ntime, nlat * nlon)
    okdata, knan, okpts = proc.find_nan(slars, 0)
    npts = okdata.shape[1]

    # ---------------------------------------------
    # Calculate Correlation and Covariance Matrices
    # ---------------------------------------------
    srho = np.corrcoef(okdata.T, okdata.T)
    scov = np.cov(okdata.T, okdata.T)
    srho = srho[:npts, :npts]
    scov = scov[:npts, :npts]

    # --------------------------
    # Calculate Distance Matrix
    # --------------------------
    lonmesh, latmesh = np.meshgrid(lon, lat)
    coords = np.vstack([lonmesh.flatten(), latmesh.flatten()]).T
    coords = coords[okpts, :]
    coords1 = coords.copy()
    coords2 = np.zeros(coords1.shape)
    coords2[:, 0] = np.radians(coords1[:, 1])  # first column is latitude
    coords2[:, 1] = np.radians(coords1[:, 0])  # second column is longitude
    sdist = haversine_distances(coords2, coords2) * 6371

    # --------------------------
    # Combine the Matrices
    # --------------------------
    a_fac = np.sqrt(-distthres / (2 * np.log(0.5)))  # so the exp term is 0.5 at distthres (3000 km)
    expterm = np.exp(-sdist / (2 * a_fac ** 2))
    distance_matrix = 1 - expterm * srho

    # --------------------------
    # Do Clustering (scipy)
    # --------------------------
    cdist = squareform(distance_matrix, checks=False)
    linked = linkage(cdist, 'weighted')
    clusterout = fcluster(linked, nclusters, criterion='maxclust')

    # -------------------------
    # Calculate the uncertainty
    # -------------------------
    uncertout = np.zeros(clusterout.shape)
    for i in range(len(clusterout)):
        covpt = scov[i, :]
        cid = clusterout[i]
        covin = covpt[np.where(clusterout == cid)]
        covout = covpt[np.where(clusterout != cid)]
        uncertout[i] = np.mean(covin) / np.mean(covout)

    # Apply rules from Thompson and Merrifield (do this later):
    # if uncert > 2, set to 2; if uncert < 0.5, set to 0
    # uncertout[uncertout > 2] = 2
    # uncertout[uncertout < 0.5] = 0

    # -----------------------
    # Replace into full array
    # -----------------------
    clustered = np.zeros(nlat * nlon) * np.nan
    clustered[okpts] = clusterout
    clustered = clustered.reshape(nlat, nlon)
    cluster_count = []
    for i in range(nclusters):
        cid = i + 1
        cnt = (clustered == cid).sum()
        cluster_count.append(cnt)
        print("Found %i points in cluster %i" % (cnt, cid))
    uncert = np.zeros(nlat * nlon) * np.nan
    uncert[okpts] = uncertout
    uncert = uncert.reshape(nlat, nlon)

    if returnall:
        return clustered, uncert, cluster_count, srho, scov, sdist, distance_matrix
    return clustered, uncert, cluster_count
# Hierarchical Clustering
Y = sch.linkage(matrix, method=m)

# Cut-off
n_ = []  # number of clusters obtained at each threshold
cluster_size = 4  # number of clusters we want to obtain
# We try different thresholds
cutoff_range = np.linspace(Y[:, 2].max() / 2., Y[:, 2].min(), 50)
is_csize_reached = False
for t in cutoff_range:
    # We cut the dendrogram at threshold t, obtaining labels
    cl = sch.fcluster(Y, t, 'distance')
    # No. of clusters
    n_cl = np.unique(cl)[-1]
    # If our cluster number is reached we save the labels
    if (n_cl >= cluster_size) and (is_csize_reached == False):
        is_csize_reached = True
        t_color = t
        cluster_labels.append(cl)
    # n_ maintains the no. of clusters for each threshold
    n_.append(n_cl)
# if cluster_size is not reached we save the last clustering
if is_csize_reached == False:
    t_color = t
def _hclust(linkmat, nclusters):
    threshold = _get_threshold(linkmat, nclusters)
    t = fcluster(linkmat, threshold, criterion='distance')
    return Partition(t)
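# Hedged sketch of a threshold helper like the `_get_threshold` assumed
# above (its real implementation is not shown in the original): midpoint
# between the merge heights that bracket `nclusters` groups.
def _get_threshold_sketch(linkmat, nclusters):
    heights = linkmat[:, 2]
    if nclusters <= 1:
        return heights[-1] + 1.0        # everything in one cluster
    hi = heights[-(nclusters - 1)]      # merge that would drop below nclusters
    lo = heights[-nclusters] if nclusters <= len(heights) else 0.0
    return 0.5 * (lo + hi)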
def visualize_heatmap(topic_model,
                      topics: List[int] = None,
                      top_n_topics: int = None,
                      n_clusters: int = None,
                      width: int = 800,
                      height: int = 800) -> go.Figure:
    """ Visualize a heatmap of the topic's similarity matrix

    Based on the cosine similarity matrix between topic embeddings,
    a heatmap is created showing the similarity between topics.

    Arguments:
        topic_model: A fitted BERTopic instance.
        topics: A selection of topics to visualize.
        top_n_topics: Only select the top n most frequent topics.
        n_clusters: Create n clusters and order the similarity
                    matrix by those clusters.
        width: The width of the figure.
        height: The height of the figure.

    Returns:
        fig: A plotly figure

    Usage:

    To visualize the similarity matrix of topics simply run:

    ```python
    topic_model.visualize_heatmap()
    ```

    Or if you want to save the resulting figure:

    ```python
    fig = topic_model.visualize_heatmap()
    fig.write_html("path/to/file.html")
    ```
    """
    # Select topic embeddings
    if topic_model.topic_embeddings is not None:
        embeddings = np.array(topic_model.topic_embeddings)
    else:
        embeddings = topic_model.c_tf_idf

    # Select topics based on top_n and topics args
    if topics is not None:
        topics = list(topics)
    elif top_n_topics is not None:
        topics = sorted(topic_model.get_topic_freq().Topic.to_list()[1:top_n_topics + 1])
    else:
        topics = sorted(list(topic_model.get_topics().keys()))

    # Order heatmap by similar clusters of topics
    if n_clusters:
        if n_clusters >= len(set(topics)):
            raise ValueError("Make sure to set `n_clusters` lower than "
                             "the total number of unique topics.")

        embeddings = embeddings[[topic + 1 for topic in topics]]
        distance_matrix = cosine_similarity(embeddings)
        Z = linkage(distance_matrix, 'ward')
        clusters = fcluster(Z, t=n_clusters, criterion='maxclust')

        # Extract new order of topics
        mapping = {cluster: [] for cluster in clusters}
        for topic, cluster in zip(topics, clusters):
            mapping[cluster].append(topic)
        mapping = [cluster for cluster in mapping.values()]
        sorted_topics = [topic for cluster in mapping for topic in cluster]
    else:
        sorted_topics = topics

    # Select embeddings
    indices = np.array([topics.index(topic) for topic in sorted_topics])
    embeddings = embeddings[indices]
    distance_matrix = cosine_similarity(embeddings)

    # Create nicer labels
    new_labels = [[[str(topic), None]] + topic_model.get_topic(topic) for topic in sorted_topics]
    new_labels = ["_".join([label[0] for label in labels[:4]]) for labels in new_labels]
    new_labels = [label if len(label) < 30 else label[:27] + "..." for label in new_labels]

    fig = px.imshow(distance_matrix,
                    labels=dict(color="Similarity Score"),
                    x=new_labels,
                    y=new_labels,
                    color_continuous_scale='GnBu')

    fig.update_layout(
        title={
            'text': "<b>Similarity Matrix",
            'y': .95,
            'x': 0.55,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': dict(size=22, color="Black")
        },
        width=width,
        height=height,
        hoverlabel=dict(bgcolor="white", font_size=16, font_family="Rockwell"),
    )
    fig.update_layout(showlegend=True)
    fig.update_layout(legend_title_text='Trend')

    return fig
def evaluate_distance_matrix(distanceMatrix, trueClusters, clusteringType, **kwargs):
    # TODO: 1. clear blackList dependency
    #       2. clusteringType is an unlucky name for betaCV and the like.
    trueClusterNum = len(np.unique(trueClusters))
    # distanceMatrixCopy = np.copy(distanceMatrix)
    if clusteringType == 'all' or 'betaCV' in clusteringType:
        res = beta_cv(distanceMatrix, trueClusters, blackList=None, ranks=False)
        print("Beta-CV = %f" % (res,))
    if clusteringType == 'all' or 'cIndex' in clusteringType:
        res = c_index(distanceMatrix, trueClusters, blackList=None)
        print("C-Index = %f" % (res,))
    if clusteringType == 'all' or 'silhouette' in clusteringType:
        print("Silhouette = %f" % (metrics.silhouette_score(
            distanceMatrix, trueClusters, metric='precomputed'),))
    if clusteringType == 'all' or 'hierarchical' in clusteringType:
        print("\nEvaluating **Hierarchical Clustering**")
        distArray = ssd.squareform(distanceMatrix)
        try:
            linkageFunction = kwargs['linkage']
        except KeyError:
            linkageFunction = "complete"
        print("Linkage = " + linkageFunction)
        Z = hierarchy.linkage(distArray, method=linkageFunction)
        T = hierarchy.fcluster(Z, trueClusterNum, criterion="maxclust")
        if len(np.unique(T)) != trueClusterNum:
            print("!Clusters found: " + str(len(np.unique(T))))
        res = evaluate_unsup_clustering(trueClusters, T, None, verbose=True)
    if clusteringType == 'all' or 'affinity' in clusteringType:
        print("\nEvaluating **Affinity Propagation**")
        affinities = np.exp(-(distanceMatrix**2) /
                            (2 * (np.median(distanceMatrix)**2)))
        cluster_centers_indices, labels = sklearn_cluster.affinity_propagation(
            affinities, copy=False, verbose=True)
        res = evaluate_unsup_clustering(trueClusters, labels,
                                        len(cluster_centers_indices),
                                        verbose=True)
    if clusteringType == 'all' or "dbscan" in clusteringType:
        print("\nEvaluating **DBScan Clustering**")
        # TODO maybe adapt eps
        eps = np.percentile(distanceMatrix, 5)
        predictedLabels = sklearn_cluster.DBSCAN(
            eps, metric='precomputed').fit_predict(distanceMatrix)
        print("Predicted as Noise: " + str(np.sum(predictedLabels == -1)))
        res = evaluate_unsup_clustering(trueClusters, predictedLabels,
                                        len(np.unique(predictedLabels)),
                                        verbose=True)
    if clusteringType == 'all' or "spectral" in clusteringType:
        print("\nEvaluating **Spectral (with Normalized Laplacian) Clustering**")
        affinities = np.exp(-(distanceMatrix**2) /
                            (2 * (np.median(distanceMatrix)**2)))
        # arpack was chosen for stability reasons.
        classifier = sklearn_cluster.SpectralClustering(
            n_clusters=trueClusterNum, affinity='precomputed',
            assign_labels='kmeans', eigen_solver='arpack')
        classifier.fit(affinities)
        res = evaluate_unsup_clustering(trueClusters, classifier.labels_,
                                        None, verbose=True)
    # assert(np.all(distanceMatrixCopy == distanceMatrix))
    return res
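# A small self-contained sketch of the distance-to-affinity conversion used
# twice above (a Gaussian kernel whose bandwidth is the median distance);
# the function name is illustrative, not part of the original module.
import numpy as np

def gaussian_affinity(distance_matrix):
    sigma = np.median(distance_matrix)
    return np.exp(-(distance_matrix ** 2) / (2 * sigma ** 2))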
def compare_methods(corr_matrix=None,
                    beta=None,
                    Q=None,
                    p=None,
                    n=500,
                    q=0.25,
                    num_data_samples=10,
                    link_methods=['average'],
                    S_methods=None,
                    split=True,
                    sample_kwargs={'coeff_size': 10},
                    feature_fns={'LCD': lasso_statistic},
                    feature_fn_kwargs={},
                    S_kwargs={'objective': 'norm', 'norm_type': 'fro'},
                    copies=1,
                    seed=110,
                    reduction=None,
                    time0=None,
                    scache_only=False,
                    num_processes=8,
                    compute_split_oracles=True,
                    noSDPcalc=False,
                    onlyoracles=False):
    """
    S_methods arg optionally allows you to add extra kwargs (e.g. ASDP
    instead of SDP) for each link method. Should be a list of tuples of the
    form [(methodname, method_kwargs)], and it should be the same length
    as link_methods.
    scache_only: If True, only compute the S_group matrices, then stop.
    noSDPcalc: If True, don't compute any SDP formulations.
    """
    # Timing
    if time0 is None:
        time0 = time.time()

    # Get p, Q, reduction
    if corr_matrix is not None:
        p = corr_matrix.shape[0]
    if Q is None and corr_matrix is not None:
        Q = knockadapt.utilities.chol2inv(corr_matrix)
    if reduction is None:
        reduction = 10

    # Sample data for the first time, create links.
    # We set the seed here for two reasons:
    # (1) This X and y are not actually used for anything
    # (2) In the case where we are generating the corr_matrix,
    #     we want this to be reproducible.
    if seed is not None:
        np.random.seed(seed)
    X, y, beta2, Q2, corr_matrix2 = knockadapt.graphs.sample_data(
        n=n, p=p, corr_matrix=corr_matrix, Q=Q, beta=beta, **sample_kwargs)

    # Make sure we aren't changing the DGP
    if corr_matrix is None:
        corr_matrix = corr_matrix2
    if beta is None:
        beta = beta2
    if Q is None:
        Q = Q2
    test_DGP_consistency(beta, beta2, corr_matrix, corr_matrix2, Q, Q2)

    # Sometimes the link methods are the same because we're also comparing
    # S generation methods (e.g. ASDP vs SDP), so we might have to rename them
    link_method_dict = {}
    if S_methods is not None:
        for i in range(len(link_methods)):
            methodname = S_methods[i][0]
            oldname = link_methods[i]
            new_name = methodname + "_" + oldname
            link_methods[i] = new_name
            link_method_dict[new_name] = oldname

    # Create links, groups, cutoffs
    # (fall back to the link method itself when it was not renamed above)
    links = {
        link_method: knockadapt.graphs.create_correlation_tree(
            corr_matrix, method=link_method_dict.get(link_method, link_method))
        for link_method in link_methods
    }

    # Dictionary storing cutoff lists for each link method
    all_cutoffs = {}
    for link_method in link_methods:
        link = links[link_method]
        # Max size refers to maximum group size
        cutoffs = knockadapt.adaptive.create_cutoffs(link=link,
                                                     reduction=reduction,
                                                     max_size=100)
        all_cutoffs[link_method] = cutoffs

    # Dictionary of dictionaries (link by cutoff) which stores group sizes
    all_Ms = {}
    # Dictionary of dictionaries (link by cutoff) which stores groupings
    all_groups = {}
    for link_method in link_methods:
        # Graph cutoffs, links
        cutoffs = all_cutoffs[link_method]
        link = links[link_method]
        # Create groups for each cutoff
        link_groups = {}
        Ms = {}
        for cutoff in cutoffs:
            groups = hierarchy.fcluster(link, cutoff, criterion="distance")
            link_groups[cutoff] = groups
            Ms[cutoff] = np.unique(groups).shape[0]
        # Add smaller dictionaries to parent dictionaries
        all_groups[link_method] = link_groups
        all_Ms[link_method] = Ms

    # Create S matrices: dictionary of dictionaries (link by cutoff).
    # This is a bit hacky, but we can associate a different S_method
    # with each link method if we want.
    if S_methods is None:
        S_methods = [{} for _ in link_methods]
    S_matrixes = {link_method: {} for link_method in link_methods}

    # Assemble the list of parameters to pass to the multiprocessing module
    all_arguments = []
    for link_method, S_method in zip(link_methods, S_methods):
        # Retrieve groups/cutoffs for this link method
        link_method_groups = all_groups[link_method]
        cutoffs = all_cutoffs[link_method].copy()

        # Progress report
        sys.stdout.write(
            f'Generating/retrieving S matrices for {link_method} now, time is {time.time() - time0}\n'
        )

        # Add S matrixes
        for cutoff in cutoffs:
            groups = link_method_groups[cutoff]
            # Possibly load from text file
            S_group = load_S_matrix(p, seed, cutoff, link_method, sample_kwargs)
            if S_group is not None:
                sys.stdout.write(
                    f'S for {link_method} {np.around(cutoff, 3)} is preloaded, time is {time.time() - time0}\n'
                )
                S_matrixes[link_method][cutoff] = S_group
            else:
                # If noSDPcalc, don't bother to compute;
                # just get rid of the particular cutoff
                if noSDPcalc:
                    # Only remove SDP operations (expensive)
                    if S_method[0] == 'SDP':
                        remove_flag = True
                    else:
                        remove_flag = False
                    # Then remove
                    if remove_flag:
                        # Delete cutoff from cutoffs, start by making a report
                        time1 = time.time() - time0
                        sys.stdout.write(
                            f'Cutoff {np.around(cutoff, 3)} for {link_method} is being removed since noSDPcalc = True, time is {time1}\n'
                        )
                        # Now actually delete
                        which_to_delete = np.where(
                            all_cutoffs[link_method] == cutoff)
                        all_cutoffs[link_method] = np.delete(
                            arr=all_cutoffs[link_method],
                            obj=which_to_delete,
                            axis=0)
                        # And don't add arguments
                        continue
                # If it hasn't been removed, add this to the
                # list of arguments to pass to the pool
                all_arguments.append(
                    (S_group, link_method, cutoff, time0, X, corr_matrix, Q,
                     groups, S_kwargs, S_method, p, seed, sample_kwargs))

    # Pass to multiprocessor
    if num_processes == 1:
        all_S_outputs = []
        for arguments in all_arguments:
            all_S_outputs.append(compute_S_matrix(*arguments))
    else:
        with Pool(num_processes) as thepool:
            all_S_outputs = thepool.starmap(compute_S_matrix, all_arguments)
    for (S_group, link_method, cutoff) in all_S_outputs:
        S_matrixes[link_method][cutoff] = S_group

    if scache_only:
        sys.stdout.write(
            f'Terminating early because scache_only is true, time is {time.time() - time0} \n'
        )
        return None

    # Construct oracle (curse of dimensionality applies here)
    feature_methods = [fname for fname in feature_fns]
    for fname in feature_methods:
        if fname not in feature_fn_kwargs:
            feature_fn_kwargs[fname] = {}
    oracle_results = pd.DataFrame(columns=ORACLE_COLUMNS)

    # Helper function which will be used for multiprocessing ----------------
    partial_eval_oracles = partial(eval_oracles,
                                   n=n, p=p, q=q, X=X, y=y,
                                   corr_matrix=corr_matrix, Q=Q, beta=beta,
                                   sample_kwargs=sample_kwargs,
                                   link_methods=link_methods,
                                   feature_fns=feature_fns,
                                   feature_fn_kwargs=feature_fn_kwargs,
                                   all_cutoffs=all_cutoffs,
                                   all_groups=all_groups,
                                   S_matrixes=S_matrixes,
                                   time0=time0,
                                   copies=copies,
                                   compute_split_oracles=compute_split_oracles)
    # End helper function ---------------------------------------------------

    sys.stdout.write("Picking the best oracles!\n")
    # Don't use the pool object if num_processes is 1
    if num_processes == 1:
        all_outputs_to_add = []
        for j in range(num_data_samples):
            all_outputs_to_add.append(partial_eval_oracles(j))
    else:
        with Pool(num_processes) as thepool:
            all_outputs_to_add = thepool.map(partial_eval_oracles,
                                             list(range(num_data_samples)))

    # Put it all together
    for process_output in all_outputs_to_add:
        for to_add in process_output:
            oracle_results = oracle_results.append(to_add)

    # Pick best cutoffs based on mean power for each oracle
    all_oracle_cutoffs = {}
    for oracle_type in oracle_results['oracle_type'].unique():
        # Create subset, calculate means
        subset_results = oracle_results.loc[oracle_results['oracle_type'] ==
                                            oracle_type]
        mean_powers = subset_results.groupby(
            ['feature_fn', 'link_method', 'cutoff'])['power'].mean()
        # Take max and save
        oracle_cutoffs = mean_powers.unstack().idxmax(1).unstack()
        all_oracle_cutoffs[oracle_type] = oracle_cutoffs

    sys.stdout.write(
        f'Finished creating oracles: comparing methods, time is {time.time() - time0}\n'
    )
    if onlyoracles:
        sys.stdout.write(
            'Returning early because onlyoracles is true (not doing more computation)\n'
        )
        return None, oracle_results, S_matrixes

    # Initialize output to actually compare methods
    output_df = pd.DataFrame(columns=FINAL_COLUMNS)

    # Create helper function for multiprocessing
    partial_one_sample_comparison = partial(
        one_sample_comparison,
        n=n, p=p, q=q, X=X, y=y,
        corr_matrix=corr_matrix, Q=Q, beta=beta,
        sample_kwargs=sample_kwargs,
        links=links,
        all_oracle_cutoffs=all_oracle_cutoffs,
        link_methods=link_methods,
        feature_fns=feature_fns,
        feature_fn_kwargs=feature_fn_kwargs,
        all_cutoffs=all_cutoffs,
        all_groups=all_groups,
        S_matrixes=S_matrixes,
        time0=time0,
        copies=copies,
        reduction=reduction)

    # Don't use pool object if num_processes == 1
    if num_processes == 1:
        comparisons_to_add = []
        # loop over data samples (mirrors the multiprocessing branch below)
        for j in range(num_data_samples):
            comparisons_to_add.append(partial_one_sample_comparison(j))
    else:
        with Pool(num_processes) as thepool:
            comparisons_to_add = thepool.map(partial_one_sample_comparison,
                                             list(range(num_data_samples)))

    sys.stdout.write(
        f'Finished: now just combining outputs, time is {time.time() - time0}\n'
    )

    # Combine outputs
    for list_to_add in comparisons_to_add:
        for to_add in list_to_add:
            output_df = output_df.append(to_add, ignore_index=True)

    return output_df, oracle_results, S_matrixes
return (agree_same + disagree_same) / float(count)

# Code Sample
import scipy.cluster.hierarchy as sch
import numpy as np
import pylab as pl

# Plot dendrogram and cut the tree to find resulting clusters
fig = pl.figure()
data = np.array([[1, 2, 3], [1, 1, 1], [5, 5, 5]])
datalable = ['first', 'second', 'third']
hClsMat = sch.linkage(data, method='complete')  # Complete clustering
sch.dendrogram(hClsMat, labels=datalable, leaf_rotation=45)
fig.savefig("thing.pdf")
resultingClusters = sch.fcluster(hClsMat, t=3, criterion='distance')
print(resultingClusters)

# Your code starts from here ....

# 1.
# Scaling min max
# STUDENT CODE TODO

# 2.
# K-means http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
# STUDENT CODE TODO

# 3.
# Compute Rand Index
# STUDENT CODE TODO
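# A minimal sketch (not the official solution) of how the three TODOs might
# be completed with scikit-learn; `data` and `resultingClusters` come from
# the sample above. adjusted_rand_score is the chance-corrected variant of
# the plain Rand index.
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

# 1. Min-max scaling to [0, 1]
scaled = MinMaxScaler().fit_transform(data)

# 2. K-means; k=2 is an arbitrary choice for this toy data
kmeans_labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(scaled)

# 3. (Adjusted) Rand index between the two clusterings
print(adjusted_rand_score(resultingClusters, kmeans_labels))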
'''We are going to continue the investigation into the sightings of
legendary Pokémon from the previous exercise. Remember that in the scatter
plot of the previous exercise, you identified two areas where Pokémon
sightings were dense; the points seem to separate into two clusters. In
this exercise, you will form two clusters of the sightings using
hierarchical clustering. 'x' and 'y' are columns of X and Y coordinates of
the locations of sightings, stored in a pandas DataFrame, df. The following
are available for use: matplotlib.pyplot as plt, seaborn as sns, and
pandas as pd.'''

import pandas as pd

x = [9, 6, 2, 3, 1, 7, 1, 6, 1, 7, 23, 26, 25, 23, 21, 23, 23, 20, 30, 23]
y = [8, 4, 10, 6, 0, 4, 10, 10, 6, 1, 29, 25, 30, 29, 29, 30, 25, 27, 26, 30]
df = pd.DataFrame({'x': x, 'y': y})

# Import linkage and fcluster functions
from scipy.cluster.hierarchy import linkage, fcluster

# Use the linkage() function to compute distances
Z = linkage(df, 'ward')

# Generate cluster labels
df['cluster_labels'] = fcluster(Z, 2, criterion='maxclust')

# Plot the points with seaborn
import matplotlib.pyplot as plt
import seaborn as sns
sns.scatterplot(x='x', y='y', hue='cluster_labels', data=df)
plt.show()
plt.rcParams['font.size'] = 14  # set the font size
dendrogram(result, labels=namelist)
plt.ylabel("distance")
#plt.show()
#plt.savefig("/home/kei/document/experiments/Master/UJ_result/elder.png")
plt.cla()

NUM_CLUSTERS_RANGE = range(2, 24)
silhouette_coefficient = []
davies_bouldin_index = []
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Coefficient')
plt.rcParams["ytick.direction"] = "in"
plt.rcParams["xtick.direction"] = "in"
for num in NUM_CLUSTERS_RANGE:
    labels = fcluster(result, t=num, criterion='maxclust')
    silhouette_coefficient.append(
        silhouette_score(Distance, labels, metric='precomputed'))
    davies_bouldin_index.append(davies_bouldin_score(Distance, labels))
p0, = plt.plot(NUM_CLUSTERS_RANGE, silhouette_coefficient, 'bo-',
               label='Silhouette Coefficient')
#p2, = par2.plot(NUM_CLUSTERS_RANGE, davies_bouldin_index, 'gs-', label='Davies Bouldin Index')
#par2.set_ylabel('Davies Bouldin Index')
lines = [p0]
"""
plt.legend(lines, [l.get_label() for l in lines], fontsize=10,
def fastlinkage(dmat):
    return sch.fcluster(fc.linkage(squareform(dmat), method='single'), 0.01)
# In[66]:

den.keys()  # dict_keys(['icoord', 'dcoord', 'ivl', 'leaves', 'color_list'])
len(den['ivl'])
#den['ivl']
#den['leaves']

# In[67]:

from scipy.cluster.hierarchy import fcluster
assignments = fcluster(linked, max_d, 'distance')
print(assignments)
assignments_series = pd.Series(assignments)
assignments_series.value_counts()

# ## Hierarchical Clustering - dendrogram - Cluster

# In[68]:

leaves_dataframe = pd.DataFrame({"leaves": den['leaves']})
assignments_dataframe = pd.DataFrame({"assignments": assignments})
def scipyLinkage(dmat):
    return sch.fcluster(sch.single(dmat), 0.01)
samples = order.sort_values('x')['Sample ID'].tolist()
ax1.yaxis.set_visible(False)
ax1.xaxis.set_visible(False)
ax1.tick_params(left=False, bottom=False)
ax1.spines['left'].set_visible(False)
ax1.spines['bottom'].set_visible(False)

# =============================================================================
# Plot clusters
# =============================================================================

cmap = plt.get_cmap('tab10')
k = len(list(set(dn_data.get('color_list'))))
T = fcluster(ln, k, 'maxclust')

# calculate labels
for index, row in order.iterrows():
    order.at[index, 'cluster'] = T[row['i']]
    order.at[index, 'cluster_color'] = matplotlib.colors.to_hex(cmap(T[row['i']]))

order = order.merge(tc[['Sample ID', 'Final tNGS_TC', 'tc_color']], on='Sample ID')

ax2.bar(order['x'], 0.67, bottom=0.33, color=order['cluster_color'])
ax2.set_xlim(-0.5, len(samples) - 0.5)
ax2.set_yticks([0.66])
ax2.set_ylim(0, 1)
def fcLinkage(dmat):
    return sch.fcluster(fc.linkage(dmat, method='single'), 0.01)
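# Illustrative usage of the wrappers above, assuming `fc` is the fastcluster
# package, `sch` is scipy.cluster.hierarchy, and `squareform` comes from
# scipy.spatial.distance; the toy distance matrix is not from the original
# code. Note the wrappers are not interchangeable: fastlinkage condenses the
# square matrix first, while fcLinkage hands it to fastcluster unmodified.
import numpy as np
import scipy.cluster.hierarchy as sch
import fastcluster as fc
from scipy.spatial.distance import pdist, squareform

points = np.random.default_rng(0).normal(size=(6, 2))
dmat = squareform(pdist(points))  # square, symmetric, zero-diagonal
print(fastlinkage(dmat))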
def hierarchical_clustering(filters, threshold=1.0):
    Dist = distance_matrix(filters)
    Z = hc.linkage(Dist, method='complete')
    clusters = hc.fcluster(Z, t=threshold, criterion='distance')
    return clusters, Dist
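# A self-contained usage sketch. The `distance_matrix` helper and the `hc`
# alias are not shown above; here we assume `hc` is scipy.cluster.hierarchy
# and that `distance_matrix` returns the condensed distance vector that
# hc.linkage expects for precomputed distances.
import numpy as np
import scipy.cluster.hierarchy as hc
from scipy.spatial.distance import pdist

def distance_matrix(filters):
    # flatten each filter and compute condensed pairwise euclidean distances
    return pdist(filters.reshape(len(filters), -1), metric='euclidean')

filters = np.random.default_rng(0).normal(size=(8, 3, 3))  # e.g. 8 conv filters
clusters, Dist = hierarchical_clustering(filters, threshold=1.0)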
            consommation.iloc[:, i], consommation.iloc[:, j], k)
DM_GCC = pd.DataFrame(DM_GCC, index=consommation.columns,
                      columns=consommation.columns)

# sns.clustermap(consommation, col_linkage=hcl.linkage(squareform(DM_GCC)))
plt.figure()
hcl.dendrogram(hcl.linkage(squareform(DM_GCC), method="average"))

plt.figure()
plt.plot(
    np.arange(.1, 1.1, .1),
    np.array([
        np.unique(
            hcl.fcluster(hcl.linkage(squareform(DM_GCC), method="average"),
                         t=t, criterion="distance")).shape[0]
        for t in np.arange(0.1, 1.1, 0.1)
    ]))

hcl.fcluster(hcl.linkage(squareform(DM_GCC), method="average"),
             t=0.4, criterion="distance")

n_clusters = 5
clusters = hcl.fcluster(hcl.linkage(squareform(DM_GCC), method="average"),
                        t=n_clusters, criterion="maxclust")

from sklearn.decomposition import PCA
pca = PCA(n_components=4)
def _cluster_by_distance(linkage_table: numpy.ndarray, cutoff: float) -> numpy.ndarray:
    """ Cluster the linkage table into flat clusters using a fixed distance cutoff. """
    clusters = hierarchy.fcluster(linkage_table, t=cutoff, criterion='distance')
    return clusters
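# The original docstring for the function above spoke of inferring a cutoff
# from the first changepoint in the sorted distances, which the function does
# not actually do. A hedged sketch of that idea, under the assumption that
# "changepoint" means the largest jump between successive merge heights; the
# function name is illustrative:
import numpy

def infer_distance_cutoff(linkage_table: numpy.ndarray) -> float:
    distances = numpy.sort(linkage_table[:, 2])
    jumps = numpy.diff(distances)
    i = int(numpy.argmax(jumps))
    # cut midway across the largest gap between successive merge heights
    return float((distances[i] + distances[i + 1]) / 2)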
def plot_arr_dendrogram(abs_corr_array, names, max_dist_cluster, measures=None):
    """ Compute dendrogram and create a plot showing the dendrogram and abs_corr_array

    Parameters:
    -----------
    abs_corr_array : ndarray
        array containing the correlation matrix
    names : list
        list of strings containing the names of the operations in
        abs_corr_array in the corresponding order.
    max_dist_cluster : float
        maximum distance in the clusters
    measures : ndarray (n_measures x abs_corr_array.shape[0])
        array containing measures to be plotted on top of the matrix.
        Positions correspond to positions of operations in abs_corr_array.

    Returns:
    --------
    index : list
        list of indices used to reorder the correlation matrix
    """
    figsize = (18, 12)  # figsize=(46.81, 33.11)
    rect_measures = [0.25, 0.8075, 0.5, 0.15]
    rect_dendro = [0.755, 0.05, 0.15, 0.75]
    rect_matrix = [0.25, 0.05, 0.5, 0.75]
    rect_color = [0.92, 0.05, 0.02, 0.75]

    # Compute and plot dendrogram.
    fig = plt.figure(figsize=figsize)
    axdendro = fig.add_axes(rect_dendro)
    corr_linkage = idtop.calc_linkage(abs_corr_array)[0]
    corr_dendrogram = hierarchy.dendrogram(corr_linkage,
                                           orientation='left',
                                           color_threshold=max_dist_cluster)
    #axdendro.set_xticks([])
    axdendro.set_yticks([])
    axdendro.axvline(max_dist_cluster, ls='--', c='k')
    axdendro.set_xlabel('correlation distance')

    # Plot distance matrix.
    axmatrix = fig.add_axes(rect_matrix)
    index = corr_dendrogram['leaves']
    abs_corr_array = abs_corr_array[index, :]
    abs_corr_array = abs_corr_array[:, index]

    # -- plot the correlation matrix
    vmin = round(np.min(abs_corr_array), 1)
    vmax = 1
    numSteps = (vmax - vmin) * 20  # steps of 0.05 in correlation
    im = axmatrix.matshow(abs_corr_array,
                          aspect='auto',
                          origin='lower',
                          vmin=vmin,
                          vmax=vmax,
                          cmap=mpl.pyplot.cm.get_cmap('jet', numSteps))
    axmatrix.set_xticks([])
    axmatrix.set_yticks(range(len(index)))
    #axmatrix.set_yticklabels(np.array(names)[index], fontsize=5)
    axmatrix.set_yticklabels(np.array(names)[index])

    # Plot colorbar.
    axcolor = fig.add_axes(rect_color)
    cbar = plt.colorbar(im, cax=axcolor)
    cbar.set_label('Pearson correlation')

    # Plot the quality measures
    axmeasure = fig.add_axes(rect_measures)
    axmeasure.xaxis.set_ticklabels([])
    axmeasure.scatter(
        np.arange(0, measures.shape[-1]) + 0.5, measures[0, index])
    axmeasure.set_xlim([0, measures.shape[-1]])
    axmeasure.set_ylabel('problems calculated')
    axmeasure.yaxis.label.set_color('b')
    [label.set_color('b') for label in axmeasure.get_yticklabels()]
    axmeasure2 = axmeasure.twinx()
    axmeasure2.plot(np.arange(0, measures.shape[-1]) + 0.5,
                    measures[1, index],
                    color='r')
    axmeasure2.set_xlim([0, measures.shape[-1]])
    [label.set_color('r') for label in axmeasure2.get_yticklabels()]
    axmeasure2.set_ylabel('z-scored avg classification error')
    axmeasure2.yaxis.label.set_color('r')

    # -----------------------------------------------------------------
    # -- calculate and plot clusters ----------------------------------
    # -----------------------------------------------------------------
    #cluster_ind = hierarchy.fcluster(link_arr, t=cluster_t, criterion=cluster_criterion)
    cluster_ind = hierarchy.fcluster(corr_linkage,
                                     t=max_dist_cluster,
                                     criterion='distance')

    # -- plot delimiters for measures
    cluster_bounds = np.hstack((-1, np.nonzero(np.diff(
        cluster_ind[index]))[0], abs_corr_array.shape[0] - 1)) + 1
    for bound in cluster_bounds:
        axmeasure.axvline(bound, linestyle='--', color='k')

    # -- calculate the locations for the cluster squares
    patch_bounds = cluster_bounds - .5
    patch_sizes = np.diff(patch_bounds)
    cluster_square_params = tuple(
        ((patch_bounds[i], patch_bounds[i]), patch_sizes[i], patch_sizes[i])
        for i in range(len(patch_sizes)))
    for cluster_square_param in cluster_square_params:
        axmatrix.add_patch(
            mpl.patches.Rectangle(cluster_square_param[0],
                                  cluster_square_param[1],
                                  cluster_square_param[2],
                                  fill=0,
                                  ec='w',
                                  lw=2))

    # -----------------------------------------------------------------
    # -- calculate and plot best features -----------------------------
    # -----------------------------------------------------------------
    best_features_marker = []
    for (i, j) in zip(cluster_bounds[:-1], cluster_bounds[1:]):
        measures_dendr = measures[1, index]
        best_features_marker.append(i + np.argmin(measures_dendr[i:j]))
    axmatrix.scatter(best_features_marker, best_features_marker, color='w')
    axmatrix.set_xlim([-0.5, abs_corr_array.shape[0] - 0.5])
    axmatrix.set_ylim([-0.5, abs_corr_array.shape[0] - 0.5])
    [(text.set_color('k'), text.set_weight('bold'))
     for i, text in enumerate(axmatrix.get_yticklabels())
     if i in best_features_marker]

    return index
def heatmap(adata, pathway_genes, num_clust, name, norm=False,
            leg_axes=(1.3, 1.3), leg_cols=1):
    '''We group the leiden clusters based on similarity of expression of
    specific genes in a pathway.

    Arguments:
        adata : the AnnData gene expression matrix
        pathway_genes : a list of the genes in the pathway
        num_clust : the optimal number of clusters based on silhouette
            score on cosine distance
        name : the name with which we want to label the clusters of this pathway
        norm : whether to use normalized gene expression
        leg_axes : we can change the coordinates of the legend
        leg_cols : number of columns in the legend

    Returns:
        fig, df : the clustermap figure (which you can later save) and a
            dataframe of all the gene expression values; the AnnData object
            is labeled with the pathway clusters in place.
    '''
    if norm:
        df = gene_expression_norm(adata, pathway_genes)
    else:
        df = gene_expression(adata, pathway_genes)
    d = sch.distance.pdist(df.transpose(), metric='cosine')
    L = sch.linkage(d)
    linkage = sch.fcluster(L, num_clust, 'maxclust')
    str_linkage = []
    for i in linkage:
        str_linkage.append(str(i))
    new_dict = dict(
        zip([str(i) for i in range(0, len(str_linkage))], str_linkage))
    adata.obs[name] = adata.obs['leiden'].replace(new_dict)
    if norm:
        df = gene_expression_norm(adata, pathway_genes)
    else:
        df = gene_expression(adata, pathway_genes)
    cols = {}
    for j in list(df.columns):
        cols[j] = str(adata[adata.obs['leiden'] == j].obs[name][0])
    cols = pd.Series(data=cols, name='Clusters')
    labels = adata.obs[name].unique()
    labels = list(map(str, labels))
    cmap = plt.get_cmap('Paired')
    colors = cmap(np.linspace(0, 1, len(labels)))
    lut1 = dict(zip(labels, colors))
    cols_to_return = []
    keys_for_colors = list(lut1.keys())
    keys_for_colors.sort()
    for k in keys_for_colors:
        cols_to_return.append(lut1[k])
    adata.uns[name + '_colors'] = cols_to_return
    row_colors1 = cols.map(lut1)
    g = sns.clustermap(df,
                       metric='cosine',
                       row_cluster=False,
                       cmap='viridis',
                       col_linkage=L,
                       col_colors=row_colors1,
                       figsize=(6, 6))
    ax = g.ax_heatmap
    legend_elements = []
    keys = list(lut1.keys())
    keys.sort()
    for j in keys:
        legend_elements.append(
            Line2D([0], [0], marker='s', label=j, color=lut1[j]))
    #ax.legend(handles=legend_elements, title='Clusters', fontsize='small',
    #          loc='upper right', bbox_to_anchor=(1.4, 1.3), ncol=leg_cols)
    g.fig.suptitle((name + ' with ' + str(num_clust) + ' clusters'),
                   y=1.0, x=0.5, fontsize='large')
    ax.set_xlabel('Leiden clustering', x=0.5)
    return g.fig, df
def _cluster_by_inconsistent(linkage_table: numpy.ndarray, cutoff: float,
                             inconsistent: pandas.DataFrame) -> numpy.ndarray:
    """ Cluster the linkage table using the inconsistency criterion with the given cutoff. """
    clusters = hierarchy.fcluster(linkage_table, t=cutoff,
                                  criterion='inconsistent',
                                  R=inconsistent.values)
    return clusters
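# Illustrative usage: the R matrix expected by _cluster_by_inconsistent can
# be computed with scipy's `inconsistent`; the DataFrame wrapping mirrors
# the function's signature. The toy data and the cutoff value here are
# assumptions, not from the original module.
import numpy
import pandas
from scipy.cluster import hierarchy

points = numpy.random.default_rng(1).normal(size=(12, 2))
Z = hierarchy.linkage(points, method='average')
R = pandas.DataFrame(hierarchy.inconsistent(Z, d=2))
labels = _cluster_by_inconsistent(Z, cutoff=1.15, inconsistent=R)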