def __init__(self, dm, link):
    """Build the hierarchical clustering for a distance matrix.

    Args:
        dm: Object exposing ``index`` (stored as the list of node names) and
            ``distance`` (fed through ``squareform`` before clustering).
        link: Linkage method, either ``"single"`` or ``"average"``.

    Raises:
        AttributeError: If ``link`` is neither ``"single"`` nor ``"average"``.
    """
    self._nodes = list(dm.index)
    self._newick = None

    # Reject unsupported methods before doing any distance-matrix work.
    if link not in ("single", "average"):
        raise AttributeError("Invalid value {} for link in Dendrogram.".format(link))

    condensed = squareform(dm.distance)
    if link == "single":
        self._linkage = fastcluster.single(condensed)
    else:
        self._linkage = fastcluster.average(condensed)

    self._tree = hierarchy.to_tree(self._linkage, False)
def _rsl_large_kdtree_fastcluster(X, cut, k=5, alpha=1.4142135623730951, gamma=5,
                                  metric='minkowski', p=2):
    """Cluster ``X`` via a single-linkage tree over mutual reachability distances.

    Args:
        X: Input data handed to ``kdtree_pdist_mutual_reachability``.
        cut: Cut value forwarded to ``SingleLinkageTree.get_clusters``.
        k, alpha: Forwarded to ``kdtree_pdist_mutual_reachability``.
        gamma: Forwarded to ``SingleLinkageTree.get_clusters``.
        metric: Metric name forwarded to ``kdtree_pdist_mutual_reachability``.
        p: Metric parameter; ``None`` is replaced by 2.

    Returns:
        Tuple ``(labels, single_linkage_tree)``.
    """
    # A missing exponent falls back to 2.
    exponent = 2 if p is None else p

    reachability = kdtree_pdist_mutual_reachability(X, metric, exponent, k, alpha)
    linkage_tree = SingleLinkageTree(single(reachability))
    cluster_labels = linkage_tree.get_clusters(cut, gamma)
    return cluster_labels, linkage_tree
def set_threshold(arr, CLUSTERING='single'):
    """Derive a threshold by splitting the values of ``arr`` into two clusters.

    Values above ``settings.MIN_TH`` are kept, filtered through ``iqr``, and
    split into two clusters. The largest value of the lower cluster is the
    threshold.

    Args:
        arr: NumPy array of values; it is flattened before use.
        CLUSTERING: ``'kmeans'``, ``'single'``, ``'average'`` or ``'centroid'``.

    Returns:
        The maximum value of the lower cluster, or ``settings.THRESHOLD`` when
        ``CLUSTERING`` is not recognised or fewer than two values are left to
        cluster.
    """
    print("starting clustering")
    arr = arr.reshape(-1)
    arr = arr[arr > settings.MIN_TH]
    N_CLUSTER = 2
    target_cluster = 1

    # Nothing survived the MIN_TH filter: max()/min() would raise on an empty array.
    if arr.size == 0:
        return settings.THRESHOLD

    print("max, min: ", arr.max(), arr.min())
    # NOTE(review): iqr() is assumed to return a mask/index selecting inliers — confirm.
    arr = arr[iqr(arr)]

    # Two clusters need at least two values; fall back to the configured default.
    if arr.size < N_CLUSTER:
        return settings.THRESHOLD

    if CLUSTERING == 'kmeans':
        from sklearn.cluster import KMeans
        init_centers = np.array([settings.MIN_TH, arr.max()]).reshape(-1, 1)
        # Explicit initial centres imply a single run; say so to avoid the n_init warning.
        kmeans = KMeans(n_clusters=N_CLUSTER, init=init_centers, n_init=1)
        labels = kmeans.fit_predict(arr.reshape(-1, 1))
    else:
        import fastcluster
        from scipy.cluster.hierarchy import fcluster
        from scipy.spatial.distance import pdist

        linkage_methods = {
            'single': fastcluster.single,
            'average': fastcluster.average,
            'centroid': fastcluster.centroid,
        }
        linkage = linkage_methods.get(CLUSTERING)
        # Reject unknown methods before paying for the O(n^2) pairwise distances.
        if linkage is None:
            return settings.THRESHOLD

        Z = pdist(arr.reshape(-1, 1))
        X = linkage(Z)
        # fcluster labels are 1-based; map {1, 2} onto {1, 0}.
        labels = N_CLUSTER - fcluster(X, N_CLUSTER, 'maxclust')

    # Collect one representative value per label, stopping as soon as every
    # label has been seen, then order the labels by that value.
    representative = {}
    for value, label in zip(arr, labels):
        representative[label] = value
        if len(representative) == N_CLUSTER:
            break
    ordered = sorted(representative.items(), key=lambda kv: kv[1])
    target_label = ordered[target_cluster - 1][0]  # label of the lower cluster

    th = np.max(arr[labels == target_label])  # max of the lower cluster
    print("found threshold: " + str(th))
    return th
def _hdbscan_large_kdtree_fastcluster(X, min_cluster_size=5, min_samples=None,
                                      metric='minkowski', p=2):
    """Run the HDBSCAN pipeline on ``X`` using fastcluster single linkage.

    Args:
        X: Input data; ``X.shape[0]`` determines the number of labels.
        min_cluster_size: Forwarded to ``condense_tree``.
        min_samples: Forwarded to ``kdtree_pdist_mutual_reachability``.
        metric: Metric name forwarded to ``kdtree_pdist_mutual_reachability``.
        p: Metric parameter; ``None`` is replaced by 2.

    Returns:
        Tuple ``(labels, condensed_tree, single_linkage_tree, None)``. Points
        that belong to no selected cluster keep the label ``-1``.
    """
    p = 2 if p is None else p

    reachability = kdtree_pdist_mutual_reachability(X, metric, p, min_samples)
    single_linkage_tree = single(reachability)
    condensed_tree = condense_tree(single_linkage_tree, min_cluster_size)
    stability = compute_stability(condensed_tree)
    selected_clusters = get_clusters(condensed_tree, stability)

    # Start with every point unassigned (-1), then stamp each cluster's id.
    labels = np.full(X.shape[0], -1, dtype=int)
    for cluster_id, members in enumerate(selected_clusters):
        labels[members] = cluster_id

    return labels, condensed_tree, single_linkage_tree, None
def _rsl_large_kdtree_fastcluster(X, cut, k=5, alpha=1.4142135623730951, gamma=5,
                                  metric='minkowski', p=2):
    """Produce flat cluster labels from a single-linkage tree of ``X``.

    The mutual reachability distances of ``X`` are linked with ``single``,
    wrapped in a ``SingleLinkageTree`` and cut with ``get_clusters``.

    Args:
        X: Input data handed to ``kdtree_pdist_mutual_reachability``.
        cut: Cut value passed to ``get_clusters``.
        k, alpha: Passed to ``kdtree_pdist_mutual_reachability``.
        gamma: Passed to ``get_clusters``.
        metric: Metric name passed to ``kdtree_pdist_mutual_reachability``.
        p: Metric parameter; defaults to 2 when given as ``None``.

    Returns:
        Tuple ``(labels, single_linkage_tree)``.
    """
    metric_p = p if p is not None else 2
    tree = SingleLinkageTree(
        single(kdtree_pdist_mutual_reachability(X, metric, metric_p, k, alpha))
    )
    flat_labels = tree.get_clusters(cut, gamma)
    return flat_labels, tree
def _hdbscan_large_kdtree_fastcluster(X, min_cluster_size=5, min_samples=None, alpha=1.0,
                                      metric='minkowski', p=2, gen_min_span_tree=False):
    """Run the HDBSCAN pipeline on ``X`` and report per-point probabilities.

    Args:
        X: Input data; ``X.shape[0]`` determines the length of the outputs.
        min_cluster_size: Forwarded to ``condense_tree``.
        min_samples, alpha: Forwarded to ``kdtree_pdist_mutual_reachability``.
        metric: Metric name forwarded to ``kdtree_pdist_mutual_reachability``.
        p: Metric parameter; ``None`` is replaced by 2.
        gen_min_span_tree: Accepted but not used in this function.

    Returns:
        Tuple ``(labels, probabilities, condensed_tree, single_linkage_tree,
        None)``. Points in no selected cluster keep label ``-1`` and
        probability ``0.0``.
    """
    p = 2 if p is None else p

    reachability = kdtree_pdist_mutual_reachability(X, metric, p, min_samples, alpha)
    single_linkage_tree = single(reachability)
    condensed_tree = condense_tree(single_linkage_tree, min_cluster_size)
    stability = compute_stability(condensed_tree)
    selected_clusters = get_clusters(condensed_tree, stability)

    n_points = X.shape[0]
    labels = np.full(n_points, -1, dtype=int)
    probabilities = np.zeros(n_points, dtype=float)

    # Each entry pairs a cluster's member selection with its probability values.
    for cluster_id, (members, member_prob) in enumerate(selected_clusters):
        labels[members] = cluster_id
        probabilities[members] = member_prob

    return labels, probabilities, condensed_tree, single_linkage_tree, None
def _hdbscan_large_kdtree_fastcluster(X, min_cluster_size=5, min_samples=None,
                                      metric='minkowski', p=2):
    """Label the points of ``X`` with HDBSCAN built on fastcluster single linkage.

    Args:
        X: Input data; one label is produced per row (``X.shape[0]``).
        min_cluster_size: Passed to ``condense_tree``.
        min_samples: Passed to ``kdtree_pdist_mutual_reachability``.
        metric: Metric name passed to ``kdtree_pdist_mutual_reachability``.
        p: Metric parameter; defaults to 2 when given as ``None``.

    Returns:
        Tuple ``(labels, condensed_tree, single_linkage_tree, None)`` where
        ``labels`` holds ``-1`` for points outside every selected cluster.
    """
    metric_p = p if p is not None else 2

    single_linkage_tree = single(
        kdtree_pdist_mutual_reachability(X, metric, metric_p, min_samples)
    )
    condensed_tree = condense_tree(single_linkage_tree, min_cluster_size)
    chosen = get_clusters(condensed_tree, compute_stability(condensed_tree))

    # Everything is noise (-1) until a selected cluster claims it.
    labels = np.full(X.shape[0], -1, dtype=int)
    for label, point_selection in enumerate(chosen):
        labels[point_selection] = label

    return labels, condensed_tree, single_linkage_tree, None
def main():
    """Cluster the points of a CSV file with single linkage and save the result.

    Command-line arguments:
        ``sys.argv[1]``: path of the comma-separated input file.
        ``sys.argv[2]``: path of the output file.

    The output file gets one comma-separated line per row of the linkage
    result. It is created empty when the input contains no data.
    """
    infile = sys.argv[1]
    outfile = sys.argv[2]

    data = np.genfromtxt(infile, delimiter=',')
    # NOTE(review): a one-row or one-column file is loaded as a 1-D array, which
    # the clustering call may interpret differently from a 2-D point matrix — confirm.
    print('Received {} points, clustering...'.format(data.shape[0]))

    if data.size > 0:
        clusters = fastcluster.single(data)
        print('Finished clustering')
    else:
        clusters = []
        print('Insufficient data to cluster')

    # Form of the output: an (N-1)*4 matrix where each row is the 2 joined
    # indices along with the distance and number of points.
    with open(outfile, 'w') as f:
        f.writelines(','.join(map(str, row)) + '\n' for row in clusters)
def _hdbscan_large_kdtree_fastcluster(X, min_cluster_size=5, min_samples=None, alpha=1.0,
                                      metric='minkowski', p=2, gen_min_span_tree=False):
    """Label the points of ``X`` with HDBSCAN and attach cluster probabilities.

    Args:
        X: Input data; outputs have ``X.shape[0]`` entries.
        min_cluster_size: Passed to ``condense_tree``.
        min_samples, alpha: Passed to ``kdtree_pdist_mutual_reachability``.
        metric: Metric name passed to ``kdtree_pdist_mutual_reachability``.
        p: Metric parameter; defaults to 2 when given as ``None``.
        gen_min_span_tree: Accepted but not used in this function.

    Returns:
        Tuple ``(labels, probabilities, condensed_tree, single_linkage_tree,
        None)``. Unclustered points have label ``-1`` and probability ``0.0``.
    """
    metric_p = p if p is not None else 2

    single_linkage_tree = single(
        kdtree_pdist_mutual_reachability(X, metric, metric_p, min_samples, alpha)
    )
    condensed_tree = condense_tree(single_linkage_tree, min_cluster_size)
    chosen = get_clusters(condensed_tree, compute_stability(condensed_tree))

    size = X.shape[0]
    labels = np.full(size, -1, dtype=int)
    probabilities = np.zeros(size, dtype=float)

    for label, pair in enumerate(chosen):
        point_selection, prob = pair
        labels[point_selection] = label
        probabilities[point_selection] = prob

    return labels, probabilities, condensed_tree, single_linkage_tree, None
# id_series = pd.Series(url_list)
# cluster_series = pd.Series(km.labels_)
# results = (pd.concat([id_series,cluster_series], axis=1))
# results.columns = ['id', 'cluster']
# results.to_csv("clustering_f.txt", sep=',', columns=['id', 'cluster'], header=False, index=False, encoding='utf-8')
# print("Time taken for storing results of flat clustering: ", time.time() - start_time)

# Apply Hierarchical Clustering (Single link)
dist = 1 - cosine_similarity(X)
print("Time taken for computing cosine similarity: ", time.time() - start_time)
# NOTE(review): `dist` is a square matrix, and a 2-D input to fastcluster.linkage
# is treated as observation vectors. If single linkage on the cosine distances
# themselves is intended, pass the condensed form instead — confirm.
agg_d = fastcluster.linkage(dist, method='single', metric='euclidean')
print("Time taken for single linkage: ", time.time() - start_time)
fig, ax = plt.subplots()
# `agg_d` already is the linkage matrix, so plot it directly. Running single
# linkage on it again would cluster the rows of the linkage matrix and yield a
# tree whose leaf count no longer matches `url_list`.
ax = dendrogram(agg_d, orientation="right", labels=url_list)
print("Time taken for applying hierarchical clustering: ", time.time() - start_time)

# Get labels
# "ivl" holds the leaf labels; "color_list" holds one colour name per link.
ward_key = ax["ivl"]
# Number the distinct colours 1..k in sorted order.
ward_dict = {color: rank + 1 for rank, color in enumerate(sorted(set(ax["color_list"])))}
ward_value = [ward_dict[color] for color in ax["color_list"]]
print("Time taken for getting labels: ", time.time() - start_time)

# Store hierarchical clustering results in a file
# NOTE(review): "color_list" has one entry per link, i.e. one fewer than "ivl",
# so the id and cluster columns below are not aligned leaf-by-leaf — confirm intent.
ward_cluster_series = pd.Series(ward_value)
ward_id_series = pd.Series(ward_key)
ward_results = (pd.concat([ward_id_series,ward_cluster_series], axis=1))
import fastcluster
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as hcluster

# Seed both random number generators so the sampled regions are reproducible.
random.seed(42)
np.random.seed(42)

regions = dgw.data.parsers.read_bed('encode_regions_around_tss.bed')
# random.sample requires a real sequence on Python 3, so materialise the index
# as a list first; the sampled labels are unchanged.
# NOTE(review): if `regions` is a plain pandas object, `.ix` no longer exists in
# pandas >= 1.0 and `.loc` would be the replacement — confirm the type first.
random_regions = regions.ix[random.sample(list(regions.index), 1000)]
data = dgw.read_bam('/Users/saulius/dev/coursework/proj/data/interesting/broad/K562/wgEncodeBroadHistoneK562H3k4me3StdAlnRep1.bam', random_regions)
data = data.to_log_scale()

# Pairwise distances between the sampled regions, then three linkage variants.
dm = dgw.dtw.parallel.parallel_pdist(data)
single = fastcluster.single(dm)
complete = fastcluster.complete(dm)
average = fastcluster.average(dm)

hcluster.dendrogram(single, no_labels=True, color_threshold=0)
plt.title('Single linkage')
# plt.savefig('single.pdf')
# plt.close('all')
#
# hcluster.dendrogram(complete, no_labels=True, color_threshold=0)
# plt.title('Complete linkage')
# plt.savefig('complete.pdf')
# plt.close('all')
#
# hcluster.dendrogram(average, no_labels=True, color_threshold=0)
# plt.title('Average linkage')