def snpclust(snp_fn, dist_fn, snpout_fn, clust_fn, thr=0.01):
    # read SNP and distance matrix files
    snp = pd.read_csv(snp_fn, sep=',', index_col=0)
    dist = pd.read_csv(dist_fn, sep=',', index_col=0)

    # hierarchical clustering
    Z = hierarchy.complete(squareform(dist))
    clust_ids = hierarchy.fcluster(Z, t=thr, criterion='distance')

    # compute the cluster representatives and build the cluster dictionary
    clust = dict()
    for ci in np.unique(clust_ids):
        idx = np.where(ci == clust_ids)[0]
        if idx.shape[0] == 1:
            r = dist.index[idx[0]]
            clust[r] = [r]
        else:
            dist_sum = dist.iloc[idx, idx].sum(axis=0)
            clust[dist_sum.idxmin()] = dist.index[idx].tolist()

    # write the SNP output file containing the medoids only
    snp_out = pd.concat((snp["Ref"], snp[list(clust.keys())]), axis=1)
    snp_out.to_csv(snpout_fn, index_label=snp.index.name)

    # write the cluster file (text mode with newline='' for csv under Python 3)
    with open(clust_fn, 'w', newline='') as clust_handle:
        clust_writer = csv.writer(clust_handle, delimiter='\t',
                                  lineterminator='\n')
        for r, ms in clust.items():
            for m in ms:
                clust_writer.writerow([m, r, "{:.6f}".format(dist.loc[m, r])])
def create_hc2(G, t=1.15):
    """Creates a hierarchical cluster of graph G from its distance matrix.

    The return value of this function is an argument for creating a blockmodel
    with nx.quotient_graph, because nx.blockmodel is not supported by
    networkx v2.0.
    ----------------------------------------------
    INPUT:
        G, an instantiated networkx graph
        t, the threshold for partition selection, arbitrarily set to
           t=1.15 by default.
    OUTPUT:
        returns a list of partition values split on the hierarchical cluster"""
    path_length = dict(nx.all_pairs_shortest_path_length(G))
    dist_matrix = np.zeros((len(G), len(G)))
    for u, p in path_length.items():
        for v, d in p.items():
            dist_matrix[u][v] = d

    # Create hierarchical cluster
    Y = distance.squareform(dist_matrix)
    # Creates HC using farthest point (complete) linkage
    Z = hierarchy.complete(Y)
    # This partition selection
    membership = list(hierarchy.fcluster(Z, t=t))
    # Create collection of lists for the blockmodel
    partition = defaultdict(list)
    for n, p in zip(list(range(len(G))), membership):
        partition[p].append(n)
    return list(partition.values())
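A possible follow-on use of create_hc2, sketched here as an assumption rather than taken from the original source: feeding the returned partition into nx.quotient_graph, the networkx 2.x replacement for nx.blockmodel that the docstring mentions. Zachary's karate club graph is chosen only because its nodes are the integers 0..33, matching the positional indexing used above.

import networkx as nx

G = nx.karate_club_graph()     # nodes are ints 0..33, as the indexing assumes
parts = create_hc2(G, t=1.15)
BM = nx.quotient_graph(G, parts, relabel=True)   # one block node per partition
print(len(parts), "blocks")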
def plotSimpleDendogram(df, linkage):
    """
    Plot a simple dendrogram

    Parameters:
        df (DataFrame): transposed and normalized DataFrame that contains sensor data
        linkage (str): type of linkage, e.g. 'ward', 'average', 'complete', 'single'
    """
    plt.figure(figsize=(18, 10))
    if linkage == 'average':
        linkage_matrix = average(df)
    elif linkage == 'ward':
        linkage_matrix = ward(df)
    elif linkage == 'single':
        linkage_matrix = single(df)
    elif linkage == 'complete':
        linkage_matrix = complete(df)
    else:
        raise ValueError("Unknown linkage type: {}".format(linkage))

    dendrogram(linkage_matrix,
               labels=df.index,
               orientation='top',
               distance_sort='descending',
               leaf_rotation=90,
               leaf_font_size=12)
    plt.show()
def create_correlation_tree(corr_matrix, method="average"):
    """
    Creates hierarchical clustering (correlation tree) from a correlation matrix
    :param corr_matrix: the correlation matrix
    :param method: 'single', 'average', 'fro', or 'complete'
    returns: 'link' of the correlation tree, as in scipy"""
    # Distance matrix for tree method
    if method == "fro":
        dist_matrix = np.around(1 - np.power(corr_matrix, 2), decimals=7)
    else:
        dist_matrix = np.around(1 - np.abs(corr_matrix), decimals=7)
    dist_matrix -= np.diagflat(np.diag(dist_matrix))
    condensed_dist_matrix = ssd.squareform(dist_matrix)

    # Create linkage
    if method == "single":
        link = hierarchy.single(condensed_dist_matrix)
    elif method == "average" or method == "fro":
        link = hierarchy.average(condensed_dist_matrix)
    elif method == "complete":
        link = hierarchy.complete(condensed_dist_matrix)
    else:
        raise ValueError(
            f'Only "single", "complete", "average", "fro" are valid methods, not {method}'
        )
    return link
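A minimal usage sketch (not from the original project): build a correlation matrix with numpy and order the variables by the resulting tree. The data and names here are invented; the snippet assumes numpy and scipy.cluster.hierarchy are imported as the function above expects.

import numpy as np
from scipy.cluster import hierarchy

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 6))               # 200 observations of 6 variables
corr_matrix = np.corrcoef(X, rowvar=False)  # 6 x 6 correlation matrix
link = create_correlation_tree(corr_matrix, method="complete")
print(hierarchy.leaves_list(link))          # variable order implied by the tree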
def create_hc(G, t=1.0):
    """
    Creates hierarchical cluster of graph G from distance matrix

    Maksim Tsvetovat ->> Generalized HC pre- and post-processing to work on
    labelled graphs and return labelled clusters.
    The threshold value is now parameterized; useful range should be
    determined experimentally with each dataset
    """
    """Modified from code by Drew Conway"""

    ## Create a shortest-path distance matrix, while preserving node labels
    labels = list(G.nodes())
    path_length = dict(nx.all_pairs_shortest_path_length(G))
    distances = numpy.zeros((len(G), len(G)))
    i = 0
    for u, p in path_length.items():
        j = 0
        for v, d in p.items():
            distances[i][j] = d
            distances[j][i] = d
            if i == j:
                distances[i][j] = 0
            j += 1
        i += 1

    # Create hierarchical cluster
    Y = distance.squareform(distances)
    Z = hierarchy.complete(Y)  # Creates HC using farthest point linkage
    # This partition selection is arbitrary, for illustrative purposes
    membership = list(hierarchy.fcluster(Z, t=t))
    # Create collection of lists for blockmodel
    partition = defaultdict(list)
    for n, p in zip(list(range(len(G))), membership):
        partition[p].append(labels[n])
    return list(partition.values())
def _cluster_hierarchically(self, designs, verbose=False):
    import scipy.spatial.distance as sp_dist
    import scipy.cluster.hierarchy as sp_clust

    num_designs = len(designs)
    if num_designs < 2:
        return

    dist_matrix = self._get_pairwise_distance_matrix(designs)
    dist_vector = sp_dist.squareform(dist_matrix)
    mean_dist = np.mean(dist_vector)
    hierarchy = sp_clust.complete(dist_vector)
    clusters = sp_clust.fcluster(hierarchy, mean_dist, criterion='distance')

    for cluster, design in zip(clusters, designs):
        design.sequence_cluster = cluster

    if verbose:
        import pylab
        print("Made {} clusters.".format(len(set(clusters))))
        pylab.hist(dist_vector, bins=100)
        pylab.axvline(mean_dist)
        pylab.show()
def hierarchical_clustering_average(similarity_matrix, linkage_type):
    """
    Hierarchical clustering with a custom distance
    :param similarity_matrix: matrix with the weight function between classes
    :param linkage_type: string representing the type of linkage to be applied
    :return: dendrogram from the hierarchical clustering
    """
    if linkage_type == 'average':
        hierarc = hierarchy.average(similarity_matrix)
    elif linkage_type == 'single':
        hierarc = hierarchy.single(similarity_matrix)
    elif linkage_type == 'complete':
        hierarc = hierarchy.complete(similarity_matrix)
    else:
        raise ValueError('Unknown linkage type: {}'.format(linkage_type))

    hierarchy.dendrogram(hierarc,
                         labels=sorted(list(get_all_controller_classes())),
                         distance_sort='descending')

    # Uncomment to see the image instead of saving
    # plt.show()

    ###################################################
    # Saves the .png file of the dendrogram
    plab.savefig("dendrogram_" + linkage_type + "_2.png",
                 format="png", bbox_inches='tight')

    # Closes the open pyplot windows so the dendrograms can be redrawn
    plt.close('all')

    return hierarc, linkage_type
def make_modules(dist, min_dist, obs_ids):
    # create linkage matrix using complete linkage
    z = complete(dist)
    # make tree from linkage matrix with names from dist
    tree = TreeNode.from_linkage_matrix(z, obs_ids)
    # get all tips so in the end we can check if we are done
    all_tips = len([i for i in tree.postorder() if i.is_tip()])
    modules = set()
    seen = set()
    dist = pd.DataFrame(squareform(dist), index=obs_ids, columns=obs_ids)
    for node in tree.levelorder():
        if node.is_tip():
            seen.add(node.name)
        else:
            tip_names = frozenset(
                (i.name for i in node.postorder() if i.is_tip()))
            if tip_names.issubset(seen):
                continue
            dists = (dist.loc[tip1, tip2] > min_dist
                     for tip1, tip2 in combinations(tip_names, 2))
            if any(dists):
                continue
            else:
                modules.add(tip_names)
                seen.update(tip_names)
        if len(seen) == all_tips:
            modules = sorted(modules, key=len, reverse=True)
            return modules
    raise ValueError("Well, how did I get here?")
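An assumed call pattern for make_modules (the OTU table and threshold are fabricated, and the function itself additionally relies on scikit-bio's TreeNode plus the scipy/pandas imports implied above): pass a condensed distance vector aligned with obs_ids.

import numpy as np
from scipy.spatial.distance import pdist

abundances = np.random.rand(8, 30)          # 8 OTUs x 30 samples
obs_ids = ["OTU%d" % i for i in range(8)]
modules = make_modules(pdist(abundances, metric="braycurtis"),
                       min_dist=0.8, obs_ids=obs_ids)
print([sorted(m) for m in modules])         # e.g. one module covering all OTUs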
def _get_cluster(components, my_inds=None):
    if my_inds is None:
        my_inds = list(components.keys())
    dist = distance.pdist([components[ind] for ind in my_inds])
    hcomp = hierarchy.complete(dist)
    ll = hierarchy.leaves_list(hcomp)
    return ll
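Hypothetical driver code for _get_cluster: components maps item ids to feature vectors, and the returned leaves_list ordering places similar items adjacently (handy, for instance, when ordering heatmap rows). Names and data are illustrative only.

import numpy as np

components = {name: np.random.rand(16) for name in ["a", "b", "c", "d", "e"]}
order = _get_cluster(components)        # indices into list(components.keys())
print([list(components.keys())[i] for i in order])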
def find_clusters(conn_df, max_dist):
    distances = pdist(conn_df[["z", "y", "x"]], "chebyshev")
    linkage = hierarchy.complete(distances)
    fclusters = hierarchy.fcluster(linkage, max_dist, criterion="distance")
    clustered_df = conn_df.copy()
    clustered_df["cluster_id"] = fclusters
    return clustered_df
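An illustrative call (the coordinates and threshold are made up): points whose pairwise Chebyshev distance stays within max_dist under complete linkage end up sharing a cluster_id.

import pandas as pd

conn_df = pd.DataFrame({"z": [0, 1, 40], "y": [0, 2, 41], "x": [0, 1, 40]})
clustered = find_clusters(conn_df, max_dist=3)
print(clustered["cluster_id"].tolist())   # e.g. [1, 1, 2]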
def process_hierarchy(inf, h, method):
    df = pd.read_csv(inf, header=0, index_col=0)
    df = df.fillna(0)
    strains = df.index
    df = 1 - (df / 100)
    df_v = ssd.squareform(
        df, force='tovector',
        checks=False)  # flatten matrix to condensed distance vector
    if method == 'single':
        li = sch.single(df_v)
    elif method == 'complete':
        li = sch.complete(df_v)
    elif method == 'average':
        li = sch.average(df_v)
    elif method == 'weighted':
        li = sch.weighted(df_v)
    else:
        print('\nERROR: Please enter a valid clustering method\n')
        sys.exit()
    hclus = cut_tree(
        li, height=h
    )  # using the height (percent ID as decimal, for example), cluster OFUs from the dendrogram
    hclus = pd.DataFrame(hclus, index=strains)
    hclus.iloc[:, 0] += 1  # cut_tree defaults to the first 'cluster' being named "0"; this just bumps all IDs +1
    return hclus
def complete_dendogram(similarity_matrix, book_names):
    # Define the linkage matrix using complete linkage on pre-computed distances
    linkage_matrix = complete(similarity_matrix)
    assignments = fcluster(linkage_matrix, 3, depth=5)
    clusters = get_clusters_with_hierarchy(to_tree(linkage_matrix))
    return [assignments, clusters]
def hierarchical_cluster(trainx):
    """
    See the scipy.cluster.hierarchy documentation for the meanings of
    entries in T. The result can be plotted by calling hac.dendrogram(T).
    """
    T = hac.complete(pdist(trainx.T) + .1)
    return T
def __init__(self, distance_matrix, labels_out):
    '''
    Constructor
    '''
    z = hac.complete(distance_matrix)
    hac.dendrogram(z, labels=labels_out)
    tree = hac.to_tree(z, False)
    self.nwk = self.getNewick(tree, "", tree.dist, labels_out)
def custom_dendrogram(label_type='titles', linkage_method='ward'):
    """
    Plots a dendrogram using cosine similarity
    :param label_type: {'titles', 'ids'}
    :param linkage_method: {'ward', 'average', 'complete'}
    :return: None
    """
    # Read data
    books = collection_reader.read_books_from_mongo()
    documents = collection_reader.extract_corpus(books)

    # Labels
    if label_type == 'titles':
        labels = [
            "(" + book["book_id3"] + ") " + book["title"][:25] +
            ("..." if len(book["title"]) > 25 else "") for book in books
        ]
    else:
        labels = ["(" + book["book_id3"] + ")" for book in books]

    # Create term-document representation
    vectorizer = TfidfVectorizer(min_df=0.1, max_df=0.7, use_idf=True)
    X = vectorizer.fit_transform(documents)

    # Cosine similarity matrix
    dist = 1 - cosine_similarity(X)

    # Define the linkage_matrix using the chosen clustering on pre-computed distances
    if linkage_method == 'ward':
        linkage_matrix = ward(dist)
    elif linkage_method == 'average':
        linkage_matrix = average(dist)
    elif linkage_method == 'complete':
        linkage_matrix = complete(dist)
    else:
        raise Exception("Parameter linkage_method is not recognized!")

    # Plot dendrogram
    plt.subplots(figsize=(5, 5))  # set size
    ax = dendrogram(linkage_matrix, orientation="right", labels=labels)
    plt.tick_params(
        axis='x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelbottom='off')
    print(ax["leaves"])
    print(ax["ivl"])

    # plt.tight_layout()  # show plot with tight layout
    plt.show()
def heirarchy_cluster(matrix, threshold_RMSD=2.0):
    import scipy.cluster.hierarchy as sp_clust

    # Complete linkage clustering
    link_matrix = sp_clust.complete(matrix)

    # Make flat clusters at the tree point where the distance threshold is
    # met: default RMSD < 2.0
    clusters = sp_clust.fcluster(link_matrix, threshold_RMSD,
                                 criterion='distance')
    return clusters
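A sketch of how this might be invoked (the RMSD values are fabricated): passing the condensed form of a square RMSD matrix avoids scipy silently treating the square array as raw observations.

import numpy as np
from scipy.spatial.distance import squareform

rmsd = np.array([[0.0, 1.2, 3.5],
                 [1.2, 0.0, 3.0],
                 [3.5, 3.0, 0.0]])
clusters = heirarchy_cluster(squareform(rmsd), threshold_RMSD=2.0)
print(clusters)   # e.g. [1 1 2]: the two structures within 2.0 RMSD pair up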
def order_contigs_by_hc(self):
    from scipy.cluster.hierarchy import complete, dendrogram
    from scipy.spatial.distance import squareform

    g = self.create_contig_graph()
    inverse_edge_weights(g)
    D = squareform(nx.adjacency_matrix(g).todense())
    Z = complete(D)
    return dendrogram(Z)['leaves']
def __get_linkage_method(self, method=LinkageMethod.SINGLE):
    if method == LinkageMethod.SINGLE:
        return single(self.data)
    elif method == LinkageMethod.COMPLETE:
        return complete(self.data)
    elif method == LinkageMethod.AVERAGE:
        return average(self.data)
    else:
        return ward(self.data)
def group_tuples(items=None, val_ind=None, dist_thresh=0.1,
                 distance_matrix=None, metric='jaccard', linkage='complete',
                 sp_areas=None):
    '''
    items: a dict or list of tuples
    val_ind: the index of the item of interest within each tuple
    '''
    assert items is not None, 'items must be provided'

    if isinstance(items, dict):
        keys = list(items.keys())
        values = list(items.values())
    elif isinstance(items, list):
        keys = list(range(len(items)))
        if isinstance(items[0], tuple):
            values = [item[val_ind] for item in items]
        else:
            values = items
    else:
        raise Exception('items is not the right type')

    if distance_matrix is None:
        distance_matrix = compute_pairwise_distances(values, metric,
                                                     sp_areas=sp_areas)

    if linkage == 'complete':
        lk = complete(squareform(distance_matrix))
    elif linkage == 'average':
        lk = average(squareform(distance_matrix))
    elif linkage == 'single':
        lk = single(squareform(distance_matrix))

    # T = fcluster(lk, 1.15, criterion='inconsistent')
    T = fcluster(lk, dist_thresh, criterion='distance')

    n_groups = len(set(T))
    groups = [None] * n_groups
    for group_id in range(n_groups):
        groups[group_id] = np.where(T == group_id + 1)[0]

    index_groups = [[keys[i] for i in g] for g in groups if len(g) > 0]
    item_groups = [[items[i] for i in g] for g in groups if len(g) > 0]

    return index_groups, item_groups, distance_matrix
def klustering():
    from sklearn.metrics.pairwise import cosine_similarity
    from scipy.cluster.hierarchy import ward, dendrogram, single, complete
    import matplotlib.pyplot as plt

    savepkl = get_id('UPLOAD_VEKTOR', session["nama_vektor"], ".pkl")
    with open(savepkl, 'rb') as f:
        tfidf_matrix = pickle.load(f)
    isifile = session["tmp"]
    df = pd.read_csv(isifile, names=['ID', 'Pertanyaan'], sep=';',
                     lineterminator='\r')
    dist = 1 - cosine_similarity(tfidf_matrix)

    if session["linkage_method"] == 'ward':
        linkage_matrix = ward(dist)
    elif session["linkage_method"] == 'single':
        linkage_matrix = single(dist)
    else:
        linkage_matrix = complete(dist)

    fig, ax = plt.subplots(figsize=(10, 10))
    ax = dendrogram(linkage_matrix, orientation="right",
                    labels=df['Pertanyaan'].values.astype('U'))
    plt.tick_params(axis='y', which='both', bottom='off', top='off',
                    labelbottom='off')
    plt.tight_layout()

    if 'jarak' in request.form:
        plt.axvline(float(request.form['jarak']), color='black')
        t = datetime.datetime.now().time().strftime('%y%m%d%H%M%S')
        fname = ''.join([session['nama_vektor'], "_rev_", t])
        pathfile = get_id('UPLOAD_IMAGE_HIRARKI', fname, '.png')
        plt.savefig(pathfile, dpi=400)
        return jsonify({"link": fname})

    pathfile = get_id('UPLOAD_IMAGE_HIRARKI', session['nama_vektor'], '.png')
    plt.savefig(pathfile, dpi=400)
    return jsonify({'vektor': session['nama_vektor']})
def plotDendogramsLineCharts(df_list, n_rows, n_cols, figsize, linkage):
    """
    Plot a simple dendrogram and a line chart side by side

    Parameters:
        df_list (list of DataFrames): transposed and normalized DataFrames that contain sensor data
        n_rows (integer): the number of days you want to plot
        n_cols (integer): the number of columns in the subplot grid (the code expects 3: dendrogram plus two line charts)
        figsize (tuple): figure size
        linkage (str): type of linkage, e.g. 'ward', 'average', 'complete', 'single'
    """
    # PLOTTING Dendrogram and Line Chart
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=figsize)
    fig.subplots_adjust(wspace=.4)

    for ax, df in zip(axes, df_list):
        # PLOTTING DENDROGRAM
        # Compute linkage matrix
        if linkage == 'average':
            linkage_matrix = average(df)
        elif linkage == 'ward':
            linkage_matrix = ward(df)
        elif linkage == 'single':
            linkage_matrix = single(df)
        elif linkage == 'complete':
            linkage_matrix = complete(df)

        # Get dendrogram
        dendrogram(linkage_matrix, ax=ax[0], labels=df.index,
                   orientation='left')
        ax[0].set_title('Dendrogram', fontsize='14')

        # PLOTTING LINE CHART
        # To plot we need to make the columns the sensors and the rows the data points
        df_plot = np.transpose(df)
        # Reset index timestamp to change its data type
        df_plot.reset_index(inplace=True)
        # Change its data type to datetime
        df_plot['Timestamp'] = pd.to_datetime(df_plot['Timestamp'])
        # Get start date and end date
        start_date = str(df_plot['Timestamp'].min())
        end_date = str(df_plot['Timestamp'].max())
        # Make it the index again
        df_plot.set_index('Timestamp', inplace=True)

        ax[1].plot(df_plot)
        ax[1].set_title('All Sensors\n {} - {}'.format(start_date, end_date),
                        fontsize='14')
        ax[2].plot(df_plot.iloc[:, 2:])
        ax[2].set_title('Temperature & Vibration Sensors\n {} - {}'.format(
            start_date, end_date), fontsize='14')
def part2(computedTFIDF, showDendograms=False):
    startTime = time.time()
    runningTotalTime = 0
    print("Executing code for Part 2...\n")

    print("Creating and cutting single link clusters...")
    singleCluster = single(computedTFIDF.similarityMatrix)
    singleClusterCut = cut_tree(
        singleCluster,
        n_clusters=[i for i in range(0, computedTFIDF.docCount - 1)])
    singleClusterTime = round(time.time() - startTime, 3)
    runningTotalTime += singleClusterTime
    print("Time: " + str(singleClusterTime) + " seconds")

    print("Creating list of single link clusters each document is contained in...")
    finalSingleClustering = singleClusterCut[len(singleClusterCut) - 1]
    documentClusters = createDocumentCluster(finalSingleClustering, computedTFIDF)
    singleTrackingTime = round(time.time() - startTime - runningTotalTime, 3)
    runningTotalTime += singleTrackingTime
    print("Time: " + str(singleTrackingTime) + " seconds")

    print("Writing single link clusters to file...")
    writeToFile(documentClusters, 'single.txt')
    singleWritingTime = round(time.time() - startTime - runningTotalTime, 3)
    runningTotalTime += singleWritingTime
    print("Time: " + str(singleWritingTime) + " seconds")

    print("Creating and cutting complete link clusters...")
    completeCluster = complete(computedTFIDF.similarityMatrix)
    completeClusterCut = cut_tree(
        completeCluster,
        n_clusters=[i for i in range(0, computedTFIDF.docCount - 1)])
    completeClusterTime = round(time.time() - startTime - runningTotalTime, 3)
    runningTotalTime += completeClusterTime
    print("Time: " + str(completeClusterTime) + " seconds")

    print("Creating list of complete link clusters each document is contained in...")
    finalCompleteClustering = completeClusterCut[len(completeClusterCut) - 1]
    completeDocumentClusters = createDocumentCluster(finalCompleteClustering, computedTFIDF)
    completeTrackingTime = round(time.time() - startTime - runningTotalTime, 3)
    runningTotalTime += completeTrackingTime
    print("Time: " + str(completeTrackingTime) + " seconds")

    print("Writing complete link clusters to file...")
    writeToFile(completeDocumentClusters, 'complete.txt')
    completeWritingTime = round(time.time() - startTime - runningTotalTime, 3)
    runningTotalTime += completeWritingTime
    print("Time: " + str(completeWritingTime) + " seconds")

    if showDendograms:
        displayDendogram(singleCluster, 'Single')
        displayDendogram(completeCluster, 'Complete')

    print('\nPart 2 Complete')
    print("Execution Time: " + str(round(time.time() - startTime, 3)) + " seconds\n")
    return documentClusters, completeDocumentClusters
def group_clusters(clusters=None, dist_thresh=0.1, distance_matrix=None,
                   metric='jaccard', linkage='complete'):
    if distance_matrix is not None:
        keys = list(range(len(distance_matrix)))
        if clusters is not None:
            values = clusters
        else:
            values = list(range(len(distance_matrix)))
    else:
        if isinstance(clusters, dict):
            keys = list(clusters.keys())
            values = list(clusters.values())
        elif isinstance(clusters, list):
            if isinstance(clusters[0], tuple):
                keys = [i for i, j in clusters]
                values = [j for i, j in clusters]
            else:
                keys = list(range(len(clusters)))
                values = clusters
        else:
            raise Exception('clusters is not the right type')

    if clusters is None:
        assert distance_matrix is not None, 'distance_matrix must be provided.'
    if distance_matrix is None:
        assert clusters is not None, 'clusters must be provided'
        distance_matrix = compute_pairwise_distances(values, metric)

    if linkage == 'complete':
        lk = complete(squareform(distance_matrix))
    elif linkage == 'average':
        lk = average(squareform(distance_matrix))
    elif linkage == 'single':
        lk = single(squareform(distance_matrix))

    # T = fcluster(lk, 1.15, criterion='inconsistent')
    T = fcluster(lk, dist_thresh, criterion='distance')

    n_groups = len(set(T))
    groups = [None] * n_groups
    for group_id in range(n_groups):
        groups[group_id] = np.where(T == group_id + 1)[0]

    index_groups = [[keys[i] for i in g] for g in groups if len(g) > 0]
    res = [[values[i] for i in g] for g in groups if len(g) > 0]

    return index_groups, res, distance_matrix
def __apply_cluster_alg(cluster_data=[], alg="kmean", prior_cluster_num=2,
                        t=0.155):
    """clustering"""
    if alg == "kmean":
        from scipy.cluster.vq import whiten
        cluster_data = whiten(cluster_data)
        from scipy.cluster.vq import kmeans, vq
        centroids, _ = kmeans(cluster_data, prior_cluster_num, iter=250)
        idx, dist = vq(cluster_data, centroids)
        return idx, prior_cluster_num
    elif alg == "spec":
        from sklearn import cluster
        from sklearn.preprocessing import StandardScaler
        X = cluster_data
        X = StandardScaler().fit_transform(X)
        spectral = cluster.SpectralClustering(n_clusters=prior_cluster_num,
                                              eigen_solver="arpack")
        spectral.fit(X)
        idx = spectral.labels_.astype(int)
        return idx, prior_cluster_num
    else:
        """hierarchical clustering
        http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html"""
        import scipy.cluster.hierarchy as hcluster
        """needs distance matrix:
        http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.distance.pdist.html"""
        import scipy.spatial.distance as dist
        distmat = dist.pdist(cluster_data, "minkowski")  # or 'euclidean'
        if alg == "hflat":
            link = hcluster.linkage(distmat)
        elif alg == "hcomp":
            link = hcluster.complete(distmat)
        elif alg == "hweight":
            link = hcluster.weighted(distmat)
        elif alg == "havg":
            link = hcluster.average(distmat)
        idx = hcluster.fcluster(link, t=t, criterion="distance")
        import numpy as N
        post_cluster_num = len(N.unique(idx))
        print("# of channels established:", post_cluster_num)
        assert post_cluster_num < 64, \
            "number of clusters too large to be biologically meaningful"
        return idx, post_cluster_num
def diffCluster(matDist, threshold, labels, clusteringType):
    if clusteringType == 1:
        linkage_matrix = ward(matDist)
    elif clusteringType == 2:
        linkage_matrix = single(matDist)
    elif clusteringType == 3:
        linkage_matrix = complete(matDist)
    elif clusteringType == 4:
        linkage_matrix = average(matDist)
    else:
        return {}

    cluster_labels = fcluster(linkage_matrix, threshold)

    clusters_dict = defaultdict(list)
    for cluster_id, sent in zip(cluster_labels, labels):
        clusters_dict[cluster_id].append(sent)
    return clusters_dict
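One way diffCluster could be exercised (the embeddings and labels are invented): matDist is a condensed distance vector over the sentences in labels, and clusteringType=3 selects complete linkage.

import numpy as np
from scipy.spatial.distance import pdist

embeddings = np.random.rand(6, 4)
sentences = ["s%d" % i for i in range(6)]
groups = diffCluster(pdist(embeddings), threshold=1.0,
                     labels=sentences, clusteringType=3)
print(dict(groups))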
def mock_random_tree(self):
    np.random.seed(0)
    x = np.random.rand(10)
    dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x - y))
    lm = complete(dm.condensed_form())
    ids = np.arange(len(x)).astype(str)
    tree = TreeNode.from_linkage_matrix(lm, ids)

    # initialize tree with branch length and named internal nodes
    for i, n in enumerate(tree.postorder(include_self=True)):
        n.length = 1
        if not n.is_tip():
            n.name = "y%d" % i
    return tree
def hierarchical_clustering(dist_matrix, method='complete'):
    if method == 'complete':
        Z = complete(dist_matrix)
    elif method == 'single':
        Z = single(dist_matrix)
    elif method == 'average':
        Z = average(dist_matrix)
    elif method == 'ward':
        Z = ward(dist_matrix)

    fig = plt.figure(figsize=(20, 20))
    dn = dendrogram(Z)
    plt.title(f"Dendrogram for {method}-linkage with correlation distance")
    plt.show()

    return Z
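Example invocation (data and metric are illustrative, and the matplotlib imports the function relies on are assumed): scipy's linkage routines expect a condensed distance vector, e.g. the output of pdist, rather than a square matrix.

import numpy as np
from scipy.spatial.distance import pdist

X = np.random.rand(15, 4)                     # 15 samples, 4 features
dist_matrix = pdist(X, metric="correlation")  # condensed, length 15*14/2
Z = hierarchical_clustering(dist_matrix, method="complete")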
def hierarchical_clustering(distance_matrix, method):
    if method == 'complete':
        Z = complete(distance_matrix)
    elif method == 'single':
        Z = single(distance_matrix)
    elif method == 'average':
        Z = average(distance_matrix)
    elif method == 'ward':
        Z = ward(distance_matrix)

    # fig = plt.figure(figsize=(16, 8))
    # dn = dendrogram(Z)
    # plt.title(f"Dendrogram for {method}-linkage with dtw distance")
    # plt.show()

    return Z
def pc():
    import random
    Y = [random.randint(1, 30) for x in range(10)]
    print(distance.squareform(Y))
    Z = hierarchy.complete(Y)  # Creates HC using farthest point linkage
    # This partition selection is arbitrary, for illustrative purposes
    print(Z)
    membership = list(hierarchy.fcluster(Z, 1))
    # Create collection of lists for blockmodel
    print('membership', membership)
    partition = defaultdict(list)
    for n, p in zip(list(range(10)), membership):
        partition[p].append(n)
    return list(partition.values())
def group(cls, project, results, threshold=0.0,
          use_single_linkage_by_default=False):
    """
    Returns dict with groups, where keys are group indices and values are
    lists of result indices

    :param project: Project object used for getting matching score function parameters
    :param results: list of results
    :param threshold: the comparison between 2 results should be above this
        threshold to join them in one group
    :param use_single_linkage_by_default:
    :return: {0: [1,2,3], 1: [4,5], 2: [6]}
    """
    num_results = len(results)
    if num_results == 0:
        raise ValueError(f'Can\'t group empty results for project {project}')
    if num_results == 1:
        return {0: [0]}

    result_sim = np.zeros((num_results, num_results), dtype=np.float64)
    for i in range(num_results):
        for j in range(i + 1, num_results):
            result_sim[i, j] = Metrics.apply(project, results[i], results[j],
                                             symmetric=True)
    result_sim = result_sim + result_sim.T

    margin = 1.01 * np.max(result_sim)
    dists = margin - squareform(result_sim)

    if project.agreement_method == project.SINGLE or not project.agreement_method:
        linkage_matrix = single(dists)
    elif project.agreement_method == project.COMPLETE:
        linkage_matrix = complete(dists)
    else:
        if use_single_linkage_by_default:
            linkage_matrix = single(dists)
        else:
            raise ValueError(
                f'Unknown agreement method {project.agreement_method}')

    clusters = fcluster(linkage_matrix, t=margin - threshold,
                        criterion='distance')
    groups = defaultdict(list)
    for i, cluster_idx in enumerate(clusters):
        groups[cluster_idx].append(i)
    return groups
def create_hc(G):
    """Creates hierarchical cluster of graph G from distance matrix"""
    path_length = dict(nx.all_pairs_shortest_path_length(G))
    distances = numpy.zeros((len(G), len(G)))
    for u, p in path_length.items():
        for v, d in p.items():
            distances[u][v] = d

    # Create hierarchical cluster
    Y = distance.squareform(distances)
    Z = hierarchy.complete(Y)  # Creates HC using farthest point linkage
    # This partition selection is arbitrary, for illustrative purposes
    membership = list(hierarchy.fcluster(Z, t=1.15))
    # Create collection of lists for blockmodel
    partition = defaultdict(list)
    for n, p in zip(list(range(len(G))), membership):
        partition[p].append(n)
    return list(partition.values())
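A plausible driver for create_hc (an assumption, not taken from the source): Zachary's karate club graph has integer node labels 0..33, matching the positional indexing used in the distance matrix above.

import networkx as nx

G = nx.karate_club_graph()
partitions = create_hc(G)
print(len(partitions), "clusters")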
def consensus_clustering(consensus, n_components=5):
    """
    :param consensus: cells x cells consensus matrix
    :param n_components: number of clusters
    :return: cells x 1 labels
    """
    print('SC3 agglomerative hierarchical clustering.')
    # condensed distance matrix
    cdm = dist.pdist(consensus)
    # hierarchical clustering (SC3: complete agglomeration + cutree)
    hclust = spc.complete(cdm)
    cutree = spc.cut_tree(hclust, n_clusters=n_components)
    labels = cutree.reshape(consensus.shape[0])
    # Below is the hclust code for the older version, fyi
    # hclust = spc.linkage(cdm)
    # labels = spc.fcluster(hclust, n_components, criterion='maxclust')
    return labels, dist.squareform(cdm)
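Sketched usage (the consensus matrix is simulated): in SC3-style pipelines, consensus holds co-clustering frequencies in [0, 1] for every pair of cells.

import numpy as np

co = np.random.rand(20, 20)
consensus = (co + co.T) / 2          # symmetric cells x cells matrix
labels, dm = consensus_clustering(consensus, n_components=3)
print(np.bincount(labels))           # cluster sizes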
def consensus_clustering_here(consensus, n_components=5):
    """
    :param consensus: cells x cells consensus matrix
    :param n_components: number of clusters
    :return: cells x 1 labels
    """
    # print('SC3 agglomerative hierarchical clustering.')
    # condensed distance matrix
    cdm = dist.pdist(consensus)
    # hierarchical clustering (SC3: complete agglomeration + cutree)
    hclust = spc.complete(cdm)
    cutree = spc.cut_tree(hclust, n_clusters=n_components)
    labels = cutree.reshape(consensus.shape[0])
    # Below is the hclust code for the older version, fyi
    # hclust = spc.linkage(cdm)
    # labels = spc.fcluster(hclust, n_components, criterion='maxclust')
    return labels
def create_hc(G):
    """Creates hierarchical cluster of graph G from distance matrix"""
    path_length = dict(nx.all_pairs_shortest_path_length(G))
    distances = numpy.zeros((len(G), len(G)))
    # l1 = sorted(path_length.items(), key=lambda x: x[0])
    # for u, p in l1:
    #     l2 = sorted(p.items(), key=lambda x: x[0])
    #     for v, d in l2:
    #         x = getIndexOfTuple(l1, 0, u)
    #         y = getIndexOfTuple(l2, 0, v)
    #         distances[x][y] = d
    for u, p in path_length.items():
        for v, d in p.items():
            distances[u][v] = d

    # Create hierarchical cluster
    Y = distance.squareform(distances)
    Z = hierarchy.complete(Y)  # Creates HC using farthest point linkage

    plt.figure(figsize=(25, 10))
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('sample index')
    plt.ylabel('distance')
    hierarchy.dendrogram(
        Z,
        leaf_rotation=90.,  # rotates the x axis labels
        leaf_font_size=8.,  # font size for the x axis labels
    )
    plt.show()

    # This partition selection is arbitrary, for illustrative purposes
    membership = list(hierarchy.fcluster(Z, t=1.15))
    # Create collection of lists for blockmodel
    partition = defaultdict(list)
    for n, p in zip(list(range(len(G))), membership):
        partition[p].append(n)
    # [0, 179, 305]
    # print("Clustering [0, 179, 305]")
    # print(l1[0][0], l1[179][0], l1[305][0])
    return list(partition.values())
def blockmodel_output(G, t=1.15):
    # Makes life easier to have consecutively labeled integer nodes
    H = nx.convert_node_labels_to_integers(G, label_attribute='label')

    """Creates hierarchical cluster of graph G from distance matrix"""
    # Create distance matrix
    path_length = dict(nx.all_pairs_shortest_path_length(H))
    distances = np.zeros((len(H), len(H)))
    for u, p in path_length.items():
        for v, d in p.items():
            distances[u][v] = d

    # Create hierarchical cluster
    Y = distance.squareform(distances)
    Z = hierarchy.complete(Y)  # Creates HC using farthest point linkage
    # This partition selection is arbitrary, for illustrative purposes
    membership = list(hierarchy.fcluster(Z, t=t))
    # Create collection of lists for blockmodel
    partitions = defaultdict(list)
    for n, p in zip(list(range(len(G))), membership):
        partitions[p].append(n)

    # Build blockmodel graph
    # BM = nx.blockmodel(H, partitions)  # changed in nx 2.0
    p_values = list(partitions.values())
    BM = nx.quotient_graph(H, p_values, relabel=True)

    label_dict = dict([(n, H.nodes[n]['label']) for n in H])
    order = [label_dict[item] for sublist in p_values for item in sublist]
    nm = nx.to_pandas_adjacency(G)
    nm = nm.reindex(index=order)
    nm.columns = nm.index
    ho = homophily(G, 'type')

    output = {
        'G': G,
        'H': H,
        'partitions': partitions,
        'BM': BM,
        'nm': nm,
        'label_dict': label_dict,
        'order': order,
        'distances': distances
    }
    output.update(ho)
    return output
def cluster(distances):
    names = list(set(itertools.chain.from_iterable(distances)))
    mat = numpy.zeros((len(names), len(names)))
    for i, name_i in enumerate(names):
        for j, name_j in enumerate(names):
            mat[i][j] = (distances.get((name_i, name_j)) or
                         distances.get((name_j, name_i)) or 0)
    condensed = distance.squareform(mat)
    linkage_matrix = hierarchy.complete(condensed)

    leaves_dict = {}
    traverse_tree(hierarchy.to_tree(linkage_matrix), leaves_dict)
    print(leaves_dict)

    with contextlib.closing(sqlite3.connect('voynich.db')) as connection:
        cursor = connection.cursor()
        cursor.execute('CREATE TABLE IF NOT EXISTS clusters(name, clusterid)')
        cursor.execute('DELETE FROM clusters')
        for key, values in leaves_dict.items():
            for clusterid in values:
                cursor.execute(
                    'INSERT INTO clusters(name, clusterid) VALUES (?, ?)',
                    (names[key], clusterid))
        connection.commit()
def do_clustering(types, max_clust):
    """
    Helper method for clustering that takes a list of all of the things being
    clustered (which are assumed to be binary numbers represented as strings),
    and an int representing the maximum number of clusters that are allowed.

    Returns: A dictionary mapping cluster ids to lists of numbers that are
    part of that cluster.
    """
    # Fill in leading zeros to make all numbers the same length.
    ls = [list(t[t.find("b") + 1:]) for t in types]
    prepend_zeros_to_lists(ls)

    dist_matrix = pdist(ls, weighted_hamming)
    clusters = hierarchicalcluster.complete(dist_matrix)
    clusters = hierarchicalcluster.fcluster(clusters, max_clust,
                                            criterion="maxclust")

    # Group members of each cluster together
    cluster_dict = dict((c, []) for c in set(clusters))
    for i in range(len(types)):
        cluster_dict[clusters[i]].append(types[i])

    return cluster_dict
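A hypothetical call (assuming the module's own weighted_hamming metric and prepend_zeros_to_lists helper, neither of which is shown here): the inputs mimic Python's bin() output.

genotypes = ["0b101", "0b1101", "0b100", "0b11"]
clusters = do_clustering(genotypes, max_clust=2)
print(clusters)   # e.g. {1: ['0b101', '0b100', '0b11'], 2: ['0b1101']}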
def CalculateClusterTree(self):
    fullMatrix = self.GenerateFullMatrix(self.results)
    dissMatrix = []
    labels = list(fullMatrix.keys())
    for i in range(0, len(labels)):
        sampleNameI = labels[i]
        for j in range(i + 1, len(labels)):
            sampleNameJ = labels[j]
            dissMatrix.append(fullMatrix[sampleNameI][sampleNameJ])

    # calculate hierarchical cluster tree
    if self.radioSingleLinkage.GetValue():
        linkageMatrix = single(dissMatrix)
    elif self.radioUPGMA.GetValue():
        linkageMatrix = average(dissMatrix)
    elif self.radioCompleteLinkage.GetValue():
        linkageMatrix = complete(dissMatrix)
    elif self.radioWeighted.GetValue():
        linkageMatrix = weighted(dissMatrix)

    root = to_tree(linkageMatrix)

    # create Newick string
    return self.CreateNewickString(root, labels) + ';'
# order = np.argsort(height, kind='mergesort')
# a = a[order]
# b = b[order]
# height = height[order]

if 1:
    import pylab as pl
    children = np.c_[a, b].astype(int)
    from sklearn.cluster.hierarchical import _hc_cut, ward_tree
    labels = _hc_cut(n_clusters=4, children=children, n_leaves=N)
    pl.figure(1)
    pl.clf()
    pl.scatter(X[:, 0], X[:, 1], c=labels, cmap=pl.cm.spectral)
    pl.title('Complete linkage')

if 1:
    from scipy.cluster import hierarchy
    children_s = hierarchy.complete(X)[:, :2].astype(int)
    labels_s = _hc_cut(n_clusters=4, children=children_s, n_leaves=N)
    import pylab as pl
    pl.figure(0)
    pl.clf()
    pl.scatter(X[:, 0], X[:, 1], c=labels_s, cmap=pl.cm.spectral)
    pl.title('Complete linkage (scipy)')

if 0:
    pl.figure(2)
    pl.clf()
    children_w, _, _ = ward_tree(X)
    labels_w = _hc_cut(n_clusters=4, children=children_w, n_leaves=N)
    pl.scatter(X[:, 0], X[:, 1], c=labels_w, cmap=pl.cm.spectral)
    pl.title('Ward')

pl.show()
hclust_model = cluster.AgglomerativeClustering(n_clusters=2, linkage='average')
hclust_model.fit(X)
print('Cluster labels: {}\n'.format(hclust_model.labels_))

hclust_model = cluster.AgglomerativeClustering(n_clusters=2, linkage='complete')
hclust_model.fit(X)
print('Cluster labels: {}\n'.format(hclust_model.labels_))

print('''
*********************************************************************************************************************
scipy: dendrogram
*********************************************************************************************************************
''')

# from: https://github.com/JWarmenhoven/ISLR-python/blob/master/Notebooks/Chapter%2010.ipynb
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(15, 18))

for linkage_matrix, ax in zip([hierarchy.complete(X), hierarchy.average(X),
                               hierarchy.single(X)], [ax1, ax2, ax3]):
    hierarchy.dendrogram(linkage_matrix, ax=ax, color_threshold=0)

ax1.set_title('Complete Linkage')
ax2.set_title('Average Linkage')
ax3.set_title('Single Linkage')
plt.show()
# entries of distance matrix
AB = ds.cdtw(query, zchild[100 + index], window, True)
AC = np.array(dist).min()
BC = ds.cdtw(zchild[100 + best], zchild[100 + index], window, True)
print(index, best, AC, AB)

# distance matrix
M = np.array([[0, AB, AC],
              [AB, 0, BC],
              [AC, BC, 0]])

# label function
L = lambda x: {0: "P", 1: "C", 2: "L"}[int(x)]

# render dendrogram
D = h.dendrogram(h.complete(M), orientation="left", leaf_label_func=L,
                 link_color_func=lambda k: "b", leaf_font_size=40)

# adjust clipping
pl.axis((-2**10 - 200, np.max(D["dcoord"]) * 1.2, 0, 30))

# colors for time signals
C = {"P": "b", "C": "b", "L": "r"}

# list of signals
signals = {"P": query, "C": zchild[100 + index], "L": zchild[100 + best]}

# plot signals
for offset, label in enumerate(D["ivl"]):
    pl.plot(range(-2**10 - 100, -100), signals[label] + offset * 10 + 5,
            c=C[label])
# data preparation
niks, cols, data, rec = model.get_data("%s%s.csv" % (worker.CSV_PATH, options.filename))
logging.info("Prepared %d players and %d columns" % (len(niks), len(cols)))

logging.info("\nFirstStep")
logging.info("Distance euclidean")
start = time.time()
euclid_data = pdist(data, 'euclidean')
logging.info("Time: %s" % (time.time() - start))

logging.info("Clustering start")
start = time.time()
Z = hierarchy.complete(euclid_data)
worker.hierarchy_draw(Z, niks, 'study_complete_euclid', 0.4)
logging.info("Time complete: %s" % (time.time() - start))

start = time.time()
Z = hierarchy.average(euclid_data)
worker.hierarchy_draw(Z, niks, 'study_average_euclid', 0.25)
logging.info("Time average: %s" % (time.time() - start))

start = time.time()
Z = hierarchy.weighted(euclid_data)
worker.hierarchy_draw(Z, niks, 'study_weighted_euclid', 0.25)
logging.info("Time weighted: %s" % (time.time() - start))
def compute_stability_fold(samples, train, test, method='ward',
                           max_k=None, stack=False,
                           stability=True, cv_likelihood=False,
                           corr_score=None,
                           ground_truth=None, n_neighbors=1, **kwargs):
    """
    General function to compute the stability on a cross-validation fold.

    Parameters:
    -----------
        samples : list of arrays
            List of arrays containing the samples to cluster, each
            array has shape (n_samples, n_features) in PyMVPA terminology.
            We are clustering the features, i.e., the nodes.
        train : list or array
            Indices for the training set.
        test : list or array
            Indices for the test set.
        method : {'complete', 'gmm', 'kmeans', 'ward'}
            Clustering method to use. Default is 'ward'.
        max_k : int or None
            Maximum k to compute the stability testing, starting from 2. By
            default it will compute up to the maximum possible k, i.e.,
            the number of points.
        stack : bool
            Whether to stack or average the datasets. Default is False,
            meaning that the datasets are averaged by default.
        stability : bool
            Whether to compute the stability measure described in Lange et
            al., 2004. Default is True.
        cv_likelihood : bool
            Whether to compute the cross-validated likelihood for mixture
            model; only valid if 'gmm' method is used. Default is False.
        corr_score : {'pearson', 'spearman'} or None
            Whether to compute the specified type of correlation score.
            Default is None.
        ground_truth : array or None
            Array containing the ground truth of the clustering of the data,
            useful to compare stability against ground truth for simulations.
        n_neighbors : int
            Number of neighbors to use to predict clustering solution on
            the test set using K-nearest neighbors. Currently used only for
            methods `complete` and `ward`. Default is 1.
        kwargs : optional
            Keyword arguments being passed to the clustering method (only for
            'ward' and 'gmm').

    Returns:
    --------
        ks : array
            A (max_k-1,) array, where ks[i] is the `k` of the clustering
            solution for iteration `i`.
        ari : array
            A (max_k-1,) array, where ari[i] is the Adjusted Rand Index of the
            predicted clustering solution on the test set and the actual
            clustering solution of the test set for `k` of ks[i].
        ami : array
            A (max_k-1,) array, where ami[i] is the Adjusted Mutual
            Information of the predicted clustering solution on the test set
            and the actual clustering solution of the test set for `k` of
            ks[i].
        stab : array or None
            A (max_k-1,) array, where stab[i] is the stability measure
            described in Lange et al., 2004 for `k` of ks[i]. Note that this
            measure is the un-normalized one. It will be normalized later in
            the process.
        likelihood : array or None
            If method is 'gmm' and cv_likelihood is True, a (max_k-1,) array,
            where likelihood[i] is the cross-validated likelihood of the GMM
            clustering solution for `k` of ks[i]. Otherwise returns None.
        ari_gt : array or None
            If ground_truth is not None, a (max_k-1,) array, where ari_gt[i]
            is the Adjusted Rand Index of the predicted clustering solution
            on the test set for `k` of ks[i] and the ground truth clusters of
            the data. Otherwise returns None.
        ami_gt : array or None
            If ground_truth is not None, a (max_k-1,) array, where ami_gt[i]
            is the Adjusted Mutual Information of the predicted clustering
            solution on the test set for `k` of ks[i] and the ground truth
            clusters of the data. Otherwise returns None.
        stab_gt : array or None
            If ground_truth is not None, a (max_k-1,) array, where stab_gt[i]
            is the stability measure of the predicted clustering solution on
            the test set for `k` of ks[i] and the ground truth clusters of
            the data. Otherwise returns None.
        corr : array or None
            Average correlation for each fold. TODO
        corr_gt : array or None
            Average correlation against the ground truth. TODO
    """
    if method not in AVAILABLE_METHODS:
        raise ValueError('Method {0} not implemented'.format(method))

    if cv_likelihood and method != 'gmm':
        raise ValueError(
            "Cross-validated likelihood is only available for 'gmm' method")

    # if max_k is None, set max_k to maximum value
    if not max_k:
        max_k = samples[0].shape[1]

    # preallocate arrays for results
    ks = np.zeros(max_k - 1, dtype=int)
    ari = np.zeros(max_k - 1)
    ami = np.zeros(max_k - 1)

    if stability:
        stab = np.zeros(max_k - 1)
    if cv_likelihood:
        likelihood = np.zeros(max_k - 1)
    if corr_score is not None:
        corr = np.zeros(max_k - 1)
    if ground_truth is not None:
        ari_gt = np.zeros(max_k - 1)
        ami_gt = np.zeros(max_k - 1)
        if stability:
            stab_gt = np.zeros(max_k - 1)
        if corr_score is not None:
            corr_gt = np.zeros(max_k - 1)

    # get training and test sets
    train_set = [samples[x] for x in train]
    test_set = [samples[x] for x in test]

    if stack:
        train_ds = np.vstack(train_set)
        test_ds = np.vstack(test_set)
    else:
        train_ds = np.mean(np.dstack(train_set), axis=2)
        test_ds = np.mean(np.dstack(test_set), axis=2)

    # compute clustering on training set
    if method == 'complete':
        train_ds_dist = pdist(train_ds.T, metric='correlation')
        test_ds_dist = pdist(test_ds.T, metric='correlation')
        # I'm computing the full tree and then cutting
        # afterwards to speed computation
        Y_train = complete(train_ds_dist)
        # same on testing set
        Y_test = complete(test_ds_dist)
    elif method == 'ward':
        (children_train, n_comp_train,
         n_leaves_train, parents_train) = ward_tree(train_ds.T, **kwargs)
        # same on testing set
        (children_test, n_comp_test,
         n_leaves_test, parents_test) = ward_tree(test_ds.T, **kwargs)
    elif method == 'gmm' or method == 'kmeans':
        pass  # we'll have to run it for each k
    else:
        raise ValueError("We shouldn't get here")

    for i_k, k in enumerate(range(2, max_k + 1)):
        if method == 'complete':
            # cut the tree with the right k for both train and test
            train_label = cut_tree_scipy(Y_train, k)
            test_label = cut_tree_scipy(Y_test, k)
            # train a classifier on this clustering
            knn = KNeighborsClassifier(  # algorithm='brute',
                                         # metric='correlation',
                                        n_neighbors=n_neighbors)
            knn.fit(train_ds.T, train_label)
            # predict the clusters in the test set
            prediction_label = knn.predict(test_ds.T)
        elif method == 'ward':
            # cut the tree with the right k for both train and test
            train_label = _hc_cut(k, children_train, n_leaves_train)
            test_label = _hc_cut(k, children_test, n_leaves_test)
            # train a classifier on this clustering
            knn = KNeighborsClassifier(n_neighbors=n_neighbors)
            knn.fit(train_ds.T, train_label)
            # predict the clusters in the test set
            prediction_label = knn.predict(test_ds.T)
        elif method == 'gmm':
            gmm = GMM(n_components=k, **kwargs)
            # fit on train and predict test
            gmm.fit(train_ds.T)
            prediction_label = gmm.predict(test_ds.T)
            if cv_likelihood:
                log_prob = np.sum(gmm.score(test_ds.T))
            # fit on test and get labels
            gmm.fit(test_ds.T)
            test_label = gmm.predict(test_ds.T)
        elif method == 'kmeans':
            kmeans = KMeans(n_clusters=k)
            # fit on train and predict test
            kmeans.fit(train_ds.T)
            prediction_label = kmeans.predict(test_ds.T)
            # fit on test and get labels
            kmeans.fit(test_ds.T)
            test_label = kmeans.predict(test_ds.T)
        else:
            raise ValueError("We shouldn't get here")

        # append results
        ks[i_k] = k
        ari[i_k] = adjusted_rand_score(prediction_label, test_label)
        ami[i_k] = adjusted_mutual_info_score(prediction_label, test_label)
        if stability:
            stab[i_k] = stability_score(prediction_label, test_label, k)
        if cv_likelihood:
            likelihood[i_k] = log_prob
        if corr_score is not None:
            corr[i_k] = correlation_score(prediction_label, test_label,
                                          test_ds, corr_score)
        if ground_truth is not None:
            ari_gt[i_k] = adjusted_rand_score(prediction_label, ground_truth)
            ami_gt[i_k] = adjusted_mutual_info_score(prediction_label,
                                                     ground_truth)
            if stability:
                stab_gt[i_k] = stability_score(prediction_label,
                                               ground_truth, k)
            if corr_score is not None:
                corr_gt[i_k] = correlation_score(prediction_label,
                                                 ground_truth,
                                                 test_ds, corr_score)

    results = [ks, ari, ami]
    if stability:
        results.append(stab)
    else:
        results.append(None)
    if cv_likelihood:
        results.append(likelihood)
    else:
        results.append(None)
    if ground_truth is not None:
        results += [ari_gt, ami_gt]
    else:
        results += [None, None]
    if stability and ground_truth is not None:
        results.append(stab_gt)
    else:
        results.append(None)
    if corr_score is not None:
        results.append(corr)
    else:
        results.append(None)
    if corr_score is not None and ground_truth is not None:
        results.append(corr_gt)
    else:
        results.append(None)

    return results
def _run_complete(data, metric="correlation"):
    """Just to allow caching"""
    return complete(pdist(data, metric=metric))
def finalize_learning(self, grouping_method='AHC', spatial_pooler=None):
    """Finalize learning in the following steps:
        1. Remove rare coincidences (done in SpatialPooler)
        2. Compute coincidence priors
        3. Make T symmetric
        4. Normalize T by rows
        5. Temporal grouping
        6. Compute PCG
    """

    def add_to_temporal_group(c_id, g_id=None):
        """Add coincidence to a new or to an existing temporal group.

        Args:
            c_id: coincidence index
            g_id: existing temporal group index

        Returns:
            group id if creating a new one
        """
        if c_id not in nonassigned_coincidences:
            return
        nonassigned_coincidences.remove(c_id)
        if g_id is None:
            self.temporal_groups[len(self.temporal_groups)] = [c_id]
            return len(self.temporal_groups) - 1
        else:
            if len(self.temporal_groups[g_id]) < self.group_max_size:
                self.temporal_groups[g_id].append(c_id)

    # 2. Compute coincidence priors
    self.conincidence_prior = dict()
    count_sum = float(sum(self.coincidences_stats))
    for c_id, count in enumerate(self.coincidences_stats):
        self.conincidence_prior[c_id] = count / count_sum
    # visualize.show_image(np.asarray(self.conincidence_prior.values()).reshape(10, 20))

    # 3. Make T symmetric
    if self.symmetrizeTAM:
        self.TAM = utils.symmetrize(self.TAM.T)

    # zero-out the diagonal
    for i in range(self.TAM.shape[0]):
        self.TAM[i, i] = 0

    # 4. Normalize T by rows
    # for i in range(self.TAM.shape[0]):
    #     for j in range(self.TAM.shape[1]):
    #         if self.TAM[i].sum() > 0:
    #             self.TAM[i, j] /= float(self.TAM[i].sum())

    # normalize by rows and columns
    row_max = self.TAM.max(axis=1).reshape((self.TAM.shape[1], 1))
    col_max = self.TAM.max(axis=0).reshape((self.TAM.shape[0], 1))
    self.TAM = np.nan_to_num(
        np.divide(self.TAM, np.sqrt(np.dot(row_max, col_max.T))))
    # visualize.show_matrix(self.TAM)

    # 5. Temporal grouping
    if grouping_method == "AHC":
        # AHC algorithm
        # http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html
        # http://math.stanford.edu/~muellner/fastcluster.html
        import scipy.cluster.hierarchy as hier

        # AHC needs a distance matrix
        TAM_invs = 1 - self.TAM
        # Z = hier.average(TAM_invs)
        Z = hier.complete(TAM_invs)
        # Z = hier.weighted(TAM_invs)
        # Z = hier.centroid(TAM_invs)
        t = self.requested_group_count
        T = hier.fcluster(Z, t, criterion='maxclust')

        # T is a list of indices to groups for each of the coincidences;
        # create temporal groups based on T
        for c_id, g_id in enumerate(T):
            g_id = g_id - 1
            if g_id not in self.temporal_groups.keys():
                self.temporal_groups[g_id] = [c_id]
            else:
                self.temporal_groups[g_id].append(c_id)

    elif grouping_method == "Numenta":
        # greedy algorithm
        # ids of non-assigned coincidences
        nonassigned_coincidences = list(range(len(self.coincidences_stats)))

        while len(nonassigned_coincidences) > 0:
            # 5.1 Select the non-assigned coincidence c_i with the highest
            # temporal connection TC and add it to a new temporal group g_k.
            htc = -1       # highest temporal connection value
            htc_id = None  # id of the coincidence
            for i in nonassigned_coincidences:
                if self.TAM[i].max() > htc:
                    htc = self.TAM[i].max()
                    htc_id = i
            assert(htc_id is not None)

            # add selected coincidence to a new temporal group
            g_id = add_to_temporal_group(htc_id)

            # 5.2 Pick at most topNeighbors non-assigned coincidences with the
            # highest temporal connection and pool them to the same group g_k
            j = 0
            tmp = dict()
            while len(self.temporal_groups[g_id]) < self.group_max_size and \
                    len(nonassigned_coincidences) > 0:
                if not len(self.temporal_groups[g_id]) - 1 >= j:
                    break
                htc_id = self.temporal_groups[g_id][j]
                tmp.clear()
                for k in range(self.TAM.shape[1]):
                    tmp[k] = self.TAM[htc_id, k]  # dict(c_id => temporal connection value)
                del tmp[htc_id]  # remove previously selected c_id
                sorted_tmp = sorted(tmp.items(), key=itemgetter(1),
                                    reverse=True)[0:self.top_neighbors]
                for c_id, tc in sorted_tmp:
                    add_to_temporal_group(c_id, g_id)
                j += 1

    # 5.1 purge garbage group
    # the garbage group is the largest one, TODO use a better metric
    if self.purge_garbage and spatial_pooler is not None:
        # find the largest temporal group
        garbage_id = 0
        max_len = 0
        for g_id in self.temporal_groups.keys():
            if len(self.temporal_groups[g_id]) > max_len:
                max_len = len(self.temporal_groups[g_id])
                garbage_id = g_id

        # delete all the coincidences in that group
        spatial_pooler.coincidences = utils.multi_delete(
            spatial_pooler.coincidences, self.temporal_groups[garbage_id])
        self.coincidences_count = len(spatial_pooler.coincidences)
        spatial_pooler.coincidences_matrix = np.vstack(
            list(spatial_pooler.coincidences.values()))

        # delete the temporal group and change the indices so that they are
        # continuous
        del self.temporal_groups[garbage_id]
        for i in range(garbage_id, len(self.temporal_groups)):
            self.temporal_groups[i] = self.temporal_groups[i + 1]
        del self.temporal_groups[len(self.temporal_groups) - 1]  # delete the last

        count = 0
        for g in self.temporal_groups.values():
            count += len(g)
        assert(count == self.coincidences_count)

    # 6. Compute PCG
    # 6.1
    self.PCG = np.zeros((self.coincidences_count, len(self.temporal_groups)))
    # for i in self.coincidences_stats.keys():
    for i in range(self.PCG.shape[0]):
        for j in range(self.PCG.shape[1]):
            if i in self.temporal_groups[j]:  # if c_i is in g_j
                self.PCG[i, j] = self.conincidence_prior[i]  # assign P(c_i)

    # 6.2 each column in PCG should sum up to 1
    self.PCG = self.PCG.T
    for i in range(self.PCG.shape[0]):
        tsum = float(self.PCG[i].sum())
        if tsum > 0:
            self.PCG[i] /= tsum
def test_cut_tree_scipy():
    y = pdist(data, metric='euclidean')
    z = complete(y)
    assert_array_equal(np.sort(cut_tree_scipy(z, 2)),
                       np.hstack((np.zeros(10), np.ones(10))))
    assert_equal(len(np.unique(cut_tree_scipy(z, 10))), 10)
def new_heatmap(figure_number, expr_values, study, platform, sample_ids,
                symbols, combined):
    """
    Create a heatmap with row and column dendrograms for the array of
    expression values. The expression values are normalized before clustering
    is performed. The Euclidean distance method is used. The complete
    clustering method is used.

    This code is based on the iPython notebook located here:
    nbviewer.ipython.org/github/ucsd-scientific-python/user-group/blob/master/presentations/20131016/hierarchical_clustering_heatmaps_gridspec.ipynb

    :param expr_values: a numpy array of expression values
    :param study:
    :param platform:
    :param sample_ids:
    :param symbols:
    :param combined: Boolean, is this a combined heatmap?
    """
    symbols = np.array(symbols)
    sample_ids = np.array(sample_ids)

    # Normalize the expression values
    expr_values /= np.max(np.abs(expr_values), axis=0)

    # Get the transpose of the expression array to be used in row
    # distance measurements and hierarchical clustering
    expr_values_transposed = np.transpose(expr_values)

    # Calculate the pairwise distances for the rows and columns
    # The default method is 'euclidean'
    column_distances = pdist(expr_values)
    row_distances = pdist(expr_values_transposed)

    # Create a Figure to hold all of the graphical elements
    # Set its background to white
    # Lay out a GridSpec dividing the figure into 3 rows and 2 columns
    # Area [1,1] = column dendrogram
    # Area [2,0] = row dendrogram
    # Area [2,1] = the heatmap
    # Area [1,0] = the colorbar legend
    fig = plt.figure()
    # fig = Figure()
    fig.set_tight_layout(True)
    fig.patch.set_facecolor('white')
    heatmap_GS = gridspec.GridSpec(3, 2, wspace=0.0, hspace=0.0,
                                   width_ratios=[0.25, 1],
                                   height_ratios=[0.05, 0.25, 1])

    # Perform a cluster analysis on column distances using the complete method.
    # Create and draw a dendrogram.
    # Save the reordering values ('leaves') for the columns
    column_cluster = sch.complete(column_distances)
    column_dendrogram_axis = fig.add_subplot(heatmap_GS[1, 1])
    column_dendrogram = sch.dendrogram(column_cluster, orientation='top')
    column_indexes = column_dendrogram['leaves']
    clean_axis(column_dendrogram_axis)

    # Perform a cluster analysis on row distances using the complete method.
    # Create and draw a dendrogram.
    # Save the reordering values ('leaves') for the rows
    row_cluster = sch.complete(row_distances)
    row_dendrogram_axis = fig.add_subplot(heatmap_GS[2, 0])
    row_dendrogram = sch.dendrogram(row_cluster, orientation='right')
    row_indexes = row_dendrogram['leaves']
    clean_axis(row_dendrogram_axis)

    # Reorder the normalized expression value array based on the clustering
    # indexes. Create and draw the heatmap.
    # The image itself is used to create the colorbar (below)
    expr_values_transposed = expr_values_transposed[:, column_indexes]
    expr_values_transposed = expr_values_transposed[row_indexes, :]
    heat_map_axis = fig.add_subplot(heatmap_GS[2, 1])
    image = heat_map_axis.matshow(expr_values_transposed, aspect='auto',
                                  origin='lower',
                                  # cmap=RedBlackGreen())
                                  cmap=cm.BrBG)
    clean_axis(heat_map_axis)

    # Prepare the heatmap row labels based on the gene symbols
    gene_symbols = []
    for symbol in symbols:
        match_result = re.match(r"(.+)(_\d+)", symbol)
        if match_result:
            gene_symbols.append(match_result.group(1))
        else:
            gene_symbols.append(symbol)
    genes = np.array(gene_symbols)
    heat_map_axis.set_yticks(np.arange(len(genes)))
    heat_map_axis.yaxis.set_ticks_position('right')
    heat_map_axis.set_yticklabels(genes[row_indexes])

    # Prepare the heatmap column labels based on the sample ids
    # Rotate them 90 degrees
    heat_map_axis.set_xticks(np.arange(sample_ids.shape[0]))
    heat_map_axis.xaxis.set_ticks_position('bottom')
    xlabels = heat_map_axis.set_xticklabels(sample_ids[column_indexes])
    for label in xlabels:
        label.set_rotation(90)

    # Create and draw a scale colorbar. It is based on the values used in the
    # heatmap image.
    scale_cbGSSS = gridspec.GridSpecFromSubplotSpec(
        1, 2, subplot_spec=heatmap_GS[1, 0], wspace=0.0, hspace=0.0)
    scale_cb_axis = fig.add_subplot(scale_cbGSSS[0, 0])
    colorbar = fig.colorbar(image, scale_cb_axis)
    colorbar.ax.yaxis.set_ticks_position('left')
    colorbar.ax.yaxis.set_label_position('left')
    colorbar.outline.set_linewidth(0)
    tick_labels = colorbar.ax.yaxis.get_ticklabels()
    for tick_label in tick_labels:
        tick_label.set_fontsize(tick_label.get_fontsize() - 4)

    # "Tighten" up the whole figure, separating the sub plots by horizontal
    # and vertical spaces. Add a title - placed at the very top
    heatmap_GS.tight_layout(fig, h_pad=0.1, w_pad=0.5)
    title = "Study: %s Platform(s): %s" % (study, platform)
    fig.suptitle(title)
    fig.set_size_inches(12.0, 8.0)
    canvas = FigureCanvas(fig)

    # plot_file_name = 'heatmap%s.png' % figure_number
    # plot_file = os.path.join(settings.MEDIA_ROOT, plot_file_name)
    # canvas.print_figure(plot_file)
    #
    # return '/media/' + plot_file_name

    plot_file = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
    file_name = plot_file.name.split('/')[-1]
    canvas.print_figure(plot_file)
    return settings.MEDIA_URL + file_name
def main():
    usage = """
    ./helix_orienation_divergences.py

    Analyze how much the helix-helix orientations diverge between two data sets.
    """
    num_args = 0
    parser = OptionParser()

    parser.add_option('-r', '--resolution', dest='resolution', default=10,
                      help="The resolution of the resulting plot", type='int')
    parser.add_option('-a', '--angle', dest='angle', default=0,
                      help="The angle of the camera", type='float')
    parser.add_option('-f', '--fig-name', dest='fig_name', default='',
                      help="The name of the file to save the figure to. If it "
                           "is not specified, the figure will not be saved",
                      type='str')
    parser.add_option('-i', '--interior_loops', dest='interior_loops',
                      default=False, help='Cluster only the interior loops',
                      action='store_true')
    parser.add_option('-m', '--multi_loops', dest='multi_loops',
                      default=False, help='Cluster only the multi loops',
                      action='store_true')
    # parser.add_option('-u', '--useless', dest='uselesss', default=False,
    #                   action='store_true', help='Another useless option')

    (options, args) = parser.parse_args()

    if len(args) < num_args:
        parser.print_help()
        sys.exit(1)

    column_names = ['type', 'pdb', 's1', 's2', 'u', 'v', 't', 'r', 'u1', 'v1',
                    'atype', 'something1', 'something2', 'sth3', 'sth4']

    real_stats = ftms.ConformationStats('fess/stats/real.stats').angle_stats
    sampled_stats = ftms.ConformationStats('fess/stats/temp.stats').angle_stats

    # count how many statistics we have for each statistic type
    stat_counts = c.defaultdict(int)
    for sc in real_stats.keys():
        stat_counts[sc] += len(real_stats[sc])

    histograms = dict()
    for b in stat_counts.keys():
        if b[2] != 2.:
            # only look at type 2 angles
            continue
        if options.interior_loops:
            if b[0] == 1000 or b[1] == 1000:
                continue
        if options.multi_loops:
            if b[0] != 1000 and b[1] != 1000:
                continue

        (selected_sizes, count) = get_nearest_dimension_sizes(b, stat_counts, 1)
        if count < 3:
            continue

        fud.pv('b, selected_sizes')

        combined_real = []
        # get the statistics that correspond to the selected sampled sizes
        for ss in selected_sizes:
            # ss_r = get_certain_angle_stats(real_stats, ss)
            ss_r = real_stats[ss]
            combined_real += list(ss_r[['u', 'v']].as_matrix())

        num_points = len(combined_real)
        combined_real = np.array(combined_real)
        # histograms[b] = (np.histogram2d(combined_real[:, 0], combined_real[:, 1],
        #                                 range=[[0, m.pi], [-m.pi, m.pi]])[0] + 0.5) / float(num_points)
        histograms[b] = combined_real

    dists = []
    named_dists = dict()
    pp_dists = dict()
    for k1, k2 in it.combinations(histograms.keys(), 2):
        per_point_distances = []
        for p1 in histograms[k1]:
            point_distances = []
            for p2 in histograms[k2]:
                point_distances += [ftuv.magnitude(p1 - p2)]
            per_point_distances += [min(point_distances)]

        for p2 in histograms[k2]:
            point_distances = []
            for p1 in histograms[k1]:
                point_distances += [ftuv.magnitude(p1 - p2)]
            per_point_distances += [min(point_distances)]

        dists += [max(per_point_distances)]
        named_dists[(k1, k2)] = max(per_point_distances)
        pp_dists[(k1, k2)] = per_point_distances

        '''
        kl = histograms[k1] * (histograms[k1] / histograms[k2])
        kl = sum(map(sum, kl))
        dists += [kl]
        '''

    fud.pv('dists')

    Z = sch.complete(dists)
    fud.pv('Z')
    sch.dendrogram(Z, labels=list(histograms.keys()), leaf_rotation=90)
    plt.subplots_adjust(bottom=0.25)
    plt.show()

    k1 = (6, 7, 2)
    k2 = (5, 6, 2)

    rs = get_certain_angle_stats(real_stats, k1)
    ss = get_certain_angle_stats(real_stats, k2)

    fud.pv('named_dists[(k1,k2)]')
    fud.pv('pp_dists[(k1,k2)]')

    real_us = rs[['u', 'v']].as_matrix()
    sampled_us = ss[['u', 'v']].as_matrix()

    U_r = real_us[:, 0]
    V_r = real_us[:, 1]

    U_s = sampled_us[:, 0]
    V_s = sampled_us[:, 1]

    total_r = len(U_r)
    total_s = len(U_s)

    hr = np.histogram2d(U_r, V_r)
    hs = np.histogram2d(U_s, V_s)

    pseudo_r = (hr[0] + 1) / total_r
    pseudo_s = (hs[0] + 1) / total_r

    kl = pseudo_r * (pseudo_r / pseudo_s)
    fud.pv('kl')
    fud.pv('sum(map(sum, kl))')

    X_r = np.sin(U_r) * np.cos(V_r)
    Y_r = np.sin(U_r) * np.sin(V_r)
    Z_r = np.cos(U_r)

    r = 1.
    X_s = r * np.sin(U_s) * np.cos(V_s)
    Y_s = r * np.sin(U_s) * np.sin(V_s)
    Z_s = r * np.cos(U_s)

    fud.pv('real_us')

    real_us_orig = np.copy(real_us)
    sampled_us_orig = np.copy(sampled_us)

    print(len(real_us), len(sampled_us))

    fig = plt.figure(figsize=(10, 10))
    ax = Axes3D(fig)
    a = Arrow3D([-1.3, 1.3], [0, 0], [0, 0], mutation_scale=20, lw=5,
                arrowstyle="-|>", color="g")
    ax.add_artist(a)

    ax.plot(X_r, Y_r, Z_r, 'bo', alpha=0.3)
    ax.plot(X_s, Y_s, Z_s, 'ro', alpha=0.3)

    u, v = np.mgrid[0:2 * np.pi:20j, 0:np.pi:10j]
    x = np.cos(u) * np.sin(v)
    y = np.sin(u) * np.sin(v)
    z = np.cos(v)
    ax.plot_wireframe(x, y, z, color="y")
    # surf = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, facecolors=colors,
    #                        linewidth=0, antialiased=False)

    ax._axis3don = False
    ax.set_zlim3d(-1, 1)
    ax.w_zaxis.set_major_locator(LinearLocator(6))
    ax.view_init(0, options.angle)

    '''
    plt.subplots_adjust(left=0.4, right=0.9, top=0.9, bottom=0.1)
    for i in range(0, 360, 40):
        savefig("fig%d.png" % i)
    '''

    '''
    sm = cm.ScalarMappable(cmap=cm.jet)
    sm.set_array(W)
    fig.colorbar(sm)
    '''

    if options.fig_name != "":
        plt.savefig(options.fig_name, bbox_inches='tight')
    else:
        plt.show()