def __init__(self, dm, link):
    self._nodes = list(dm.index)
    self._newick = None
    # Condense the square distance matrix and build the linkage with fastcluster
    if link == "single":
        self._linkage = fastcluster.single(squareform(dm.distance))
    elif link == "average":
        self._linkage = fastcluster.average(squareform(dm.distance))
    else:
        raise AttributeError("Invalid value {} for link in Dendrogram.".format(link))
    self._tree = hierarchy.to_tree(self._linkage, False)
def make_tree(self, profile_file, names=None):
    profiles = pd.read_csv(profile_file, sep="\t", index_col=0)
    # Optionally rename columns: strip trailing sample/lane-style suffixes and
    # replace dashes with dots so the labels are tree-safe
    new_names = {}
    if names:
        for i in names:
            new_names[i] = re.sub(r"_S\d+_L\d{3}[\d\w_-]+", "", names[i]).replace("-", ".")
        profiles.columns = list(map(lambda x: new_names[x], profiles.columns))
    self._nodes = list(profiles.columns)
    distances = distance_matrix(profiles)
    self._linkage = fastcluster.average(squareform(distances))
    self._tree = hierarchy.to_tree(self._linkage, False)
def set_threshold(arr, CLUSTERING='single'):
    print("starting clustering")
    arr = arr.reshape(-1)
    arr = arr[arr > settings.MIN_TH]
    N_CLUSTER = 2
    target_cluster = 1
    print("max, min: ", arr.max(), arr.min())
    # keep only the values selected by iqr() (assumed here to return a boolean
    # mask or index array covering the inter-quartile range)
    arr = arr[iqr(arr)]

    if CLUSTERING == 'kmeans':
        from sklearn.cluster import KMeans
        kmeans = KMeans(n_clusters=N_CLUSTER,
                        init=np.array([settings.MIN_TH, arr.max()]).reshape(-1, 1))
        labels = kmeans.fit_predict(arr.reshape(-1, 1))
    else:
        import fastcluster
        from scipy.cluster.hierarchy import fcluster
        from scipy.spatial.distance import pdist
        Z = pdist(arr.reshape(-1, 1))
        if CLUSTERING == 'single':
            X = fastcluster.single(Z)
        elif CLUSTERING == 'average':
            X = fastcluster.average(Z)
        elif CLUSTERING == 'centroid':
            X = fastcluster.centroid(Z)
        else:
            return settings.THRESHOLD
        labels = N_CLUSTER - fcluster(X, N_CLUSTER, 'maxclust')  # setting 0 for the minimum cluster

    # np.ma.masked_array returns only values where the mask is 0
    # pick one representative value for each label
    index = {}
    for i, l in enumerate(labels):
        index[l] = arr[i]
        if len(index.keys()) == N_CLUSTER:
            break
    index = sorted(index.items(), key=lambda kv: kv[1])  # list of tuples sorted by values
    target_label = index[target_cluster - 1][0]  # the label of the desired cluster
    th = np.max(arr[np.flatnonzero(labels == target_label)])  # max of the down cluster
    print("found threshold: " + str(th))
    # print(str(np.ma.masked_array(arr, 1 - labels).min()))
    return th
def average_linkage_clustering(pairwise_estimates):
    """
    Perform average linkage clustering using ``fastcluster``. The first two
    columns of the output contain the node indices which are joined in each
    step. The input nodes are labeled 0, ..., N - 1, and the newly generated
    nodes have the labels N, ..., 2N - 2. The third column contains the
    distance between the two nodes at each step, i.e. the current minimal
    distance at the time of the merge. The fourth column counts the number of
    points which comprise each new node.

    :param pairwise_estimates: dictionary with data frames with pairwise
        estimates of Ks, Ka and Ka/Ks (or at least Ks), as returned by
        :py:func:`analyse_family`.
    :return: average linkage clustering as performed with
        ``fastcluster.average``.
    """
    clustering = fastcluster.average(pairwise_estimates)
    return clustering
def average_linkage_clustering(pairwise_estimates):
    """
    Perform average linkage clustering using ``fastcluster``. The first two
    columns of the output contain the node indices which are joined in each
    step. The input nodes are labeled 0, ..., N - 1, and the newly generated
    nodes have the labels N, ..., 2N - 2. The third column contains the
    distance between the two nodes at each step, i.e. the current minimal
    distance at the time of the merge. The fourth column counts the number of
    points which comprise each new node.

    :param pairwise_estimates: dictionary with data frames with pairwise
        estimates of Ks, Ka and Ka/Ks (or at least Ks), as returned by
        :py:func:`analyse_family`.
    :return: average linkage clustering as performed with
        ``fastcluster.average``.
    """
    # Fill NaN values with something larger than all the rest; not a foolproof
    # approach, but it should be reasonable in most cases.
    if np.any(np.isnan(pairwise_estimates)):
        logging.warning("Ks matrix contains NaN values, replaced with 1000")
        pairwise_estimates.fillna(1000, inplace=True)

    clustering = fastcluster.average(pairwise_estimates)
    return clustering
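# The docstring above describes the SciPy-compatible linkage matrix returned by
# fastcluster.average. A minimal sketch of how to read that matrix, assuming a
# small made-up data set (the points and variable names below are illustrative
# only, not taken from the snippets in this section):
import numpy as np
import fastcluster
from scipy.spatial.distance import pdist

# Four hypothetical 1-D observations; pdist yields the condensed distance vector.
points = np.array([[0.0], [0.1], [1.0], [1.2]])
Z = fastcluster.average(pdist(points))

# Each row of Z describes one merge: [node_i, node_j, distance, size of new node].
# The original points are nodes 0..3; the merges create nodes 4, 5 and 6.
for step, (i, j, dist, size) in enumerate(Z):
    print("step {}: join {:.0f} and {:.0f} at distance {:.3f} ({:.0f} points)".format(
        step, i, j, dist, size))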
def cluster(dataset, new_matrix=None):
    genes, conditions, matrix = dataset

    print('shape of original matrix:')
    print(matrix.shape)

    # Get the indices of NaNs in the matrix
    nan_inds = np.isnan(matrix)
    num_nans = np.sum(nan_inds)

    # Replace the NaN values with extremely small noise values
    # (noise has standard deviation of 1 million times less than the data)
    np.random.seed(96857463)
    data_sd = np.nanstd(matrix)
    noise = np.random.randn(num_nans) * data_sd / 1e6
    matrix[nan_inds] = noise

    ## Remove rows and columns that do not have any values - they kill the clustering process!
    ## print 'row_nansum: {}'.format(np.nansum(matrix, axis = 1))
    ## print 'col_nansum: {}'.format(np.nansum(matrix, axis = 0))
    #good_rows = np.nansum(matrix, axis = 1).astype(np.bool)
    #good_cols = np.nansum(matrix, axis = 0).astype(np.bool)
    #print 'number of good rows: {}'.format(np.nansum(good_rows))
    #print 'number of good cols: {}'.format(np.nansum(good_cols))
    #genes = np.array(genes)[good_rows]
    #conditions = np.array(conditions)[good_cols]
    #matrix = matrix[np.ix_(good_rows, good_cols)]
    #print 'shape of good matrix:'
    #print matrix.shape

    num_genes = len(genes)
    num_conds = len(conditions)

    # Compute distance matrices
    cols_dist = pdist(matrix.transpose(), 'cosine')
    rows_dist = pdist(matrix, 'cosine')

    ## Get the names of rows and columns that have NaN dissimilarity values
    #rows_dist_nan_inds_1, rows_dist_nan_inds_2 = [x[np.isnan(rows_dist)] for x in np.triu_indices(matrix.shape[0], 1)]
    #cols_dist_nan_inds_1, cols_dist_nan_inds_2 = [x[np.isnan(cols_dist)] for x in np.triu_indices(matrix.shape[1], 1)]
    #row_names_nan_dist_1, row_names_nan_dist_2 = genes[rows_dist_nan_inds_1], genes[rows_dist_nan_inds_2]
    #col_names_nan_dist_1, col_names_nan_dist_2 = conditions[cols_dist_nan_inds_1], conditions[cols_dist_nan_inds_2]
    ## And print out the rows (strains) and columns (conditions) with NaN dissimilarity values
    #print "Strain pairs with NaN dissimilarity values:"
    #for i, row_name_1 in enumerate(row_names_nan_dist_1):
    #    print row_name_1, row_names_nan_dist_2[i]
    #print ""
    #
    #print "Condition pairs with NaN dissimilarity values:"
    #for i, col_name_1 in enumerate(col_names_nan_dist_1):
    #    print col_name_1, col_names_nan_dist_2[i]
    #print ""

    # Cluster the matrix using fastcluster!
    print('clustering columns...')
    cols_clust_mat = fastcluster.average(cols_dist)
    print('clustering rows...')
    rows_clust_mat = fastcluster.average(rows_dist)

    # Transform the values in the clustering matrices so they can be used with Bio.Cluster
    # (Bio.Cluster refers to internal nodes with negative indices)
    for i in range(num_genes - 1):
        if rows_clust_mat[i, 0] > (num_genes - 1):
            rows_clust_mat[i, 0] = -(rows_clust_mat[i, 0] - (num_genes - 1))
        if rows_clust_mat[i, 1] > (num_genes - 1):
            rows_clust_mat[i, 1] = -(rows_clust_mat[i, 1] - (num_genes - 1))
    for i in range(num_conds - 1):
        if cols_clust_mat[i, 0] > (num_conds - 1):
            cols_clust_mat[i, 0] = -(cols_clust_mat[i, 0] - (num_conds - 1))
        if cols_clust_mat[i, 1] > (num_conds - 1):
            cols_clust_mat[i, 1] = -(cols_clust_mat[i, 1] - (num_conds - 1))

    # Turn into lists of nodes
    cols_nodes_list = [Node(int(cols_clust_mat[i, 0]), int(cols_clust_mat[i, 1]), cols_clust_mat[i, 2])
                       for i in range(cols_clust_mat.shape[0])]
    rows_nodes_list = [Node(int(rows_clust_mat[i, 0]), int(rows_clust_mat[i, 1]), rows_clust_mat[i, 2])
                       for i in range(rows_clust_mat.shape[0])]

    # Create trees
    cols_tree = Tree(cols_nodes_list)
    rows_tree = Tree(rows_nodes_list)

    # Add the NaNs back into the matrix, so it can be visualized properly
    matrix[nan_inds] = np.nan

    # If a "new_matrix" was specified, that means that we wanted to use the original dataset to
    # get the clustering but then actually use a different matrix for the data. So, at this point
    # we set the variable "matrix" to be the values of "new_matrix"
    if new_matrix is not None:
        matrix = new_matrix

    # Create a giant text string so that the input data can be turned into a "record" object
    row1 = 'ORF\tNAME\tGWEIGHT\t' + '\t'.join(conditions)
    row2 = 'EWEIGHT\t\t\t' + '\t'.join(['1' for i in range(len(conditions))])
    rows_rest = [['' for i in range(len(conditions) + 3)] for j in range(len(genes))]
    for i in range(len(genes)):
        rows_rest[i][0:2] = [genes[i] for j in range(2)]
        rows_rest[i][2] = '1'
        for j in range(len(conditions)):
            rows_rest[i][j + 3] = str(matrix[i, j])
    rows_rest_intermed = ['\t'.join(x) for x in rows_rest]
    rows_rest_final = '\n'.join(rows_rest_intermed)
    final_string = '%s\n%s\n%s' % (row1, row2, rows_rest_final)

    # Read in as a "record" object
    handle = io.StringIO(final_string)
    record = Bio.Cluster.read(handle)

    return record, rows_tree, cols_tree
import scipy.cluster.hierarchy as hcluster

random.seed(42)
np.random.seed(42)

regions = dgw.data.parsers.read_bed('encode_regions_around_tss.bed')
random_regions = regions.ix[random.sample(regions.index, 1000)]

data = dgw.read_bam('/Users/saulius/dev/coursework/proj/data/interesting/broad/K562/wgEncodeBroadHistoneK562H3k4me3StdAlnRep1.bam',
                    random_regions)
data = data.to_log_scale()

# Pairwise DTW distances, computed in parallel
dm = dgw.dtw.parallel.parallel_pdist(data)

# Hierarchical clusterings with three different linkage criteria
single = fastcluster.single(dm)
complete = fastcluster.complete(dm)
average = fastcluster.average(dm)

hcluster.dendrogram(single, no_labels=True, color_threshold=0)
plt.title('Single linkage')
# plt.savefig('single.pdf')
# plt.close('all')
#
# hcluster.dendrogram(complete, no_labels=True, color_threshold=0)
# plt.title('Complete linkage')
# plt.savefig('complete.pdf')
# plt.close('all')
#
# hcluster.dendrogram(average, no_labels=True, color_threshold=0)
# plt.title('Average linkage')
# plt.savefig('average.pdf')
# plt.close('all')
def cluster_weights(
    X: "npt.ArrayLike",
    y: "npt.ArrayLike",
    grouping: "Optional[npt.ArrayLike]" = None,
) -> np.ndarray:
    """
    Compute clusters on the X values based on Manhattan distance, then weight
    by cluster size. This function ignores information in the y-values.

    Examples:

    >>> import numpy as np
    >>> from selectml.sk.weighting import cluster_weights
    >>> from selectml.data import basic
    >>> X, y, indivs = basic()
    >>> cluster_weights(X, y)
    array([4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.,
           4., 4., 4., 4., 4., 4., 4., 4.])
    """
    from fastcluster import average
    from scipy.cluster.hierarchy import cut_tree
    from scipy.cluster.hierarchy import cophenet
    from scipy.spatial.distance import pdist, squareform

    X_ = np.array(X)
    x = pd.DataFrame({
        "index": np.arange(X_.shape[0]),
        "genotypes": np.apply_along_axis(
            lambda z: "".join(str(z_i) for z_i in z), 1, X_)
    })

    # Collapse identical genotype rows and remember which samples belong to each
    firsts = pd.DataFrame(X_).groupby(x["genotypes"]).first()
    groups = (
        x
        .groupby("genotypes")["index"]
        .unique()
        .apply(pd.Series)
        .unstack()
        .reset_index(level=0, drop=True)
        .reset_index()
        .rename(columns={0: "index"})
    )

    # Average-linkage clustering on Manhattan (cityblock) distances
    dist = pdist(firsts.values, "cityblock")
    hier = average(dist)
    coph = squareform(cophenet(hier))
    height = np.percentile(coph[coph > 0], 0.5)

    clusters = pd.DataFrame({
        "genotypes": firsts.index.values,
        "clusters": cut_tree(hier, height=height)[:, 0]
    })
    clusters = (
        pd.merge(groups, clusters, left_on="genotypes", right_on="genotypes")
        .drop(columns="genotypes")
    )

    # Weight each sample inversely to the size of its cluster
    cluster_counts = (
        clusters.groupby("clusters").count()["index"]
        .apply(lambda x: (clusters.shape[0] - x) / x)
        .reset_index()
        .rename(columns={"index": "weight"})
    )
    clusters = pd.merge(
        clusters,
        cluster_counts,
        on="clusters"
    ).set_index("index")

    clusters = clusters.loc[np.arange(X_.shape[0]), "weight"]
    return clusters.values
            kmerdist[total:total + position] = temp[r][remaining:]
            total += position
            position -= 1
            remaining += 1
        del kmerdist
        gc.collect()
        kmerdist = np.memmap(filename, dtype='float32', mode='r')
    else:
        print("Writing distance matrix to disk. {}".format(time.asctime()))
        kmerdist[:] = pdist(final, 'hamming')
else:
    # if neither memory-saving strategy is selected
    kmerdist = pdist(final, 'hamming')

print("Building kmer tree using average linkage with an average number of allowed bases of: {} {}".format(degen_base_num, time.asctime()))
Z = fastcluster.average(kmerdist)
kmer_length = final.shape[1]
maxdist = round((degen_base_num / kmer_length), 2)
clusters = fcluster(Z, maxdist, criterion='distance')

myclusters = {key: [] for key in set(clusters)}
for index, clust in enumerate(clusters):
    myclusters[clust].append(index)

clustergroups = []
for amp in Counter(clusters).keys():
    clustergroups.append(final.iloc[myclusters[amp]])

print("Building alignments for kmer motifs. {}".format(time.asctime()))
# group resulting clusters into de facto alignment objects
alignments = []
for c in clustergroups: