def flat_clusters(self, n=8, init=1, criterion='maxclust'):
    """
    Returns flat clusters from the linkage matrix :Z:
    """
    if criterion == 'distance':
        self.T = hierarchy.fcluster(self.Z, init, criterion='distance')
        a = 0
        while a < 20:
            if self.T.max() < n:
                init = init - 0.02
                a += 1
            elif self.T.max() > n:
                init = init + 0.02
                a += 1
            else:
                self.L, self.M = hierarchy.leaders(self.Z, self.T)
                return self.T
            self.T = hierarchy.fcluster(self.Z, init, criterion='distance')
        self.L, self.M = hierarchy.leaders(self.Z, self.T)
        return self.T
    elif criterion == 'inconsistent':
        # fcluster() requires a threshold; pass `init` as t
        self.T = hierarchy.fcluster(self.Z, init, criterion='inconsistent')
        self.L, self.M = hierarchy.leaders(self.Z, self.T)
        return self.T
    elif criterion == 'maxclust':
        self.T = hierarchy.fcluster(self.Z, t=n, criterion='maxclust')
        self.L, self.M = hierarchy.leaders(self.Z, self.T)
        return self.T
    else:
        print('Criterion not implemented')
        return 0
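# A minimal, self-contained sketch (toy data, assumed shapes) of the
# fcluster()/leaders() round trip the method above relies on: fcluster()
# produces flat labels T, and leaders() returns (L, M), where L holds the
# node ids that root each flat cluster and M the matching labels from T.
import numpy as np
from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist

X = np.array([[0.0], [0.1], [5.0], [5.1], [10.0]])
Z = hierarchy.linkage(pdist(X), method='single')
T = hierarchy.fcluster(Z, t=3, criterion='maxclust')  # flat labels in 1..3
L, M = hierarchy.leaders(Z, T)
print(T, L, M)  # M[i] is the flat-cluster label of the subtree rooted at L[i]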
def optMDL(df):
    Z = getDist(df)
    tree = sc.to_tree(Z, rd=True)[1]
    minMDL = 1000000
    optK = 0
    desLength = 0
    DList = []
    for n_cluster in range(1, 11, 1):  # range(df.shape[0]+1)
        N = fcluster(Z, n_cluster, criterion='maxclust')
        L, M = sc.leaders(Z, N)
        leaders = list(L)
        print(leaders)
        leafDict = {}
        for node in tree:
            if node.get_id() in leaders:
                key = node.get_id()
                if node.get_count() > 1:
                    dist = getleafdict(node)
                else:
                    dist = {key: 0}
                leafDict[key] = dist
        desLength = binning(leafDict) + n_cluster * np.log2(df.shape[0])
        DList.append(desLength)
        if desLength < minMDL:
            minMDL = desLength
            optK = n_cluster
    return optK, minMDL, DList
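# Hedged sketch of the same two-part MDL selection loop with a stand-in
# data cost: encoding_cost() below is hypothetical (optMDL() uses binning()
# over the leaders' leaf-distance dicts); the k * log2(N) model-cost term
# mirrors the function above.
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist

X = np.random.RandomState(0).rand(20, 3)
Z = linkage(pdist(X), method='average')
N = X.shape[0]

def encoding_cost(labels):
    # hypothetical data cost: within-cluster sum of squares
    return sum(((X[labels == c] - X[labels == c].mean(axis=0)) ** 2).sum()
               for c in np.unique(labels))

scores = {k: encoding_cost(fcluster(Z, k, criterion='maxclust')) + k * np.log2(N)
          for k in range(1, 11)}
optK = min(scores, key=scores.get)
print(optK, scores[optK])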
def test_leaders_single(self):
    # Tests leaders using a flat clustering generated by single linkage.
    X = hierarchy_test_data.Q_X
    Y = pdist(X)
    Z = linkage(Y)
    T = fcluster(Z, criterion='maxclust', t=3)
    Lright = (np.array([53, 55, 56]), np.array([2, 3, 1]))
    L = leaders(Z, T)
    assert_equal(L, Lright)
def _hierarchical_clustering_post(table, model, num_clusters,
                                  cluster_col='prediction'):
    Z = model['model']
    mode = model['input_mode']
    if mode == 'matrix':
        distance_matrix = model['dist_matrix']
    out_table = model['linkage_matrix']
    predict = fcluster(Z, t=num_clusters, criterion='maxclust')
    if mode == 'original':
        prediction_table = table.copy()
    elif mode == 'matrix':
        prediction_table = distance_matrix
    prediction_table[cluster_col] = predict

    L, M = leaders(Z, predict)
    which_cluster = []
    for leader in L:
        if leader in Z[:, 0]:
            select_indices = np.where(Z[:, 0] == leader)[0][0]
            which_cluster.append(out_table['joined_column1'][select_indices])
        elif leader in Z[:, 1]:
            select_indices = np.where(Z[:, 1] == leader)[0][0]
            which_cluster.append(out_table['joined_column2'][select_indices])

    clusters_info_table = pd.DataFrame([])
    clusters_info_table[cluster_col] = M
    clusters_info_table['name_of_clusters'] = which_cluster
    clusters_info_table = clusters_info_table.sort_values(cluster_col)
    cluster_count = np.bincount(prediction_table[cluster_col])
    cluster_count = cluster_count[cluster_count != 0]
    clusters_info_table['num_of_entities'] = list(cluster_count)

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""### Hierarchical Clustering Post Process Result"""))
    rb.addMD(
        strip_margin("""
        |### Parameters
        |
        |{display_params}
        |
        |## Clusters Information
        |
        |{clusters_info_table}
        |
        """.format(display_params=dict2MD(model['parameters']),
                   clusters_info_table=pandasDF2MD(clusters_info_table))))

    model = _model_dict('hierarchical_clustering_post')
    model['clusters_info'] = clusters_info_table
    model['_repr_brtc_'] = rb.get()

    return {'out_table': prediction_table, 'model': model}
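# Sketch of the leader-to-linkage-row lookup used above: every leader node
# (except the overall root) appears exactly once as a child in a later row
# of Z, so np.where() on the first two columns recovers the merge step that
# names the cluster. Toy data only; no BRTC model dict is assumed.
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster, leaders
from scipy.spatial.distance import pdist

X = np.random.RandomState(0).rand(10, 2)
Z = linkage(pdist(X), method='average')
T = fcluster(Z, t=3, criterion='maxclust')
L, M = leaders(Z, T)
for leader, label in zip(L, M):
    row = np.where((Z[:, 0] == leader) | (Z[:, 1] == leader))[0]
    print('cluster', label, 'rooted at node', leader, 'joined at row', row)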
def _hierarchical_clustering_post(table, model, num_clusters,
                                  cluster_col='prediction'):
    Z = model['model']
    input_cols = model['input_cols']
    params = model['parameters']
    out_table = model['outtable']
    predict = fcluster(Z, t=num_clusters, criterion='maxclust')
    out_table2 = table.copy()
    out_table2[cluster_col] = predict

    L, M = leaders(Z, predict)
    which_cluster = []
    for leader in L:
        if leader in Z[:, 0]:
            select_indices = np.where(Z[:, 0] == leader)[0][0]
            which_cluster.append(out_table['joined_column1'][select_indices])
        elif leader in Z[:, 1]:
            select_indices = np.where(Z[:, 1] == leader)[0][0]
            which_cluster.append(out_table['joined_column2'][select_indices])

    out_table3 = pd.DataFrame([])
    out_table3[cluster_col] = M
    out_table3['name_of_clusters'] = which_cluster
    out_table3 = out_table3.sort_values(cluster_col)
    cluster_count = np.bincount(out_table2[cluster_col])
    cluster_count = cluster_count[cluster_count != 0]
    # data = {'cluster_name': ['prediction' + str(i) for i in range(1, num_clusters + 1)]}
    out_table3['num_of_entities'] = list(cluster_count)

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""### Hierarchical Clustering Post Process Result"""))
    rb.addMD(
        strip_margin("""
        |### Parameters
        |
        |{display_params}
        |
        |## Clusters Information
        |
        |{out_table3}
        |
        """.format(display_params=dict2MD(params),
                   out_table3=pandasDF2MD(out_table3))))

    model = _model_dict('hierarchical_clustering_post')
    model['report'] = rb.get()

    return {'out_table2': out_table2, 'model': model}
def _find_leaf_indxs_and_fig_posns(Z, ddata, ax):
    """
    Plot a dendrogram into axes `ax` and return for each leaf cluster the
    item indices that belong to that cluster
    """
    # find the lowest link
    y_merges = np.array(ddata["dcoord"])
    d_max = np.min(y_merges[y_merges > 0.0])
    d2 = Z[:, 2][np.argwhere(Z[:, 2] == d_max)[0][0] - 1]

    T = hc.fcluster(Z, t=d2, criterion="distance")
    L, M = hc.leaders(Z, T)
    assert set(L) == set(ddata["leaves"])

    # get the actual leaf from the indices (these were set by providing the
    # `leaf_label_func` above)
    leaf_indices_from_labels = np.array(
        [int(lab.get_text()) for lab in ax.get_xticklabels()]).tolist()
    ax.set_xticklabels(np.arange(len(leaf_indices_from_labels)))

    # work out which leaf each item (image) belongs to
    mapping = dict(zip(M, L))
    leaf_mapping = np.array(list(map(lambda i: mapping[i], T)))

    # counts per leaf
    # [(n, sum(leaf_mapping == n)) for n in L]

    tile_idxs_per_cluster = {}
    for tile_id, leaf_id in enumerate(leaf_mapping):
        cluster_id = leaf_indices_from_labels.index(leaf_id)
        cluster_tile_idxs = tile_idxs_per_cluster.setdefault(cluster_id, [])
        cluster_tile_idxs.append(tile_id)

    return tile_idxs_per_cluster
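# The dict(zip(M, L)) trick above inverts leaders(): it maps each flat
# label back to its leader node id, so every observation can be assigned
# to a dendrogram leaf. A self-contained sketch with toy data:
import numpy as np
import scipy.cluster.hierarchy as hc
from scipy.spatial.distance import pdist

X = np.random.RandomState(1).rand(8, 2)
Z = hc.linkage(pdist(X), method='ward')
T = hc.fcluster(Z, t=3, criterion='maxclust')
L, M = hc.leaders(Z, T)
mapping = dict(zip(M, L))                         # flat label -> leader node id
leaf_mapping = np.array([mapping[t] for t in T])  # leader node id per item
print(leaf_mapping)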
def hierarchial_sentences(X, **kwargs):
    '''Perform hierarchical clustering on a vector of sentences.'''
    matrix = tfidf_matrix(X, **kwargs)

    # hierarchical clustering
    linkage = sch.linkage(matrix, method='complete')
    cutoff = kwargs.get('cutoff_coef', 0.45) * max(linkage[:, 2])

    # create the plot
    fig = pylab.figure()
    axdendro = fig.add_axes([0.09, 0.1, 0.2, 0.8])
    dendrogram = sch.dendrogram(linkage, orientation='right',
                                color_threshold=cutoff)
    axdendro.set_xticks([])
    axdendro.set_yticks([])

    # extract the indices
    indices = dendrogram['leaves']
    matrix = matrix[indices, :]
    matrix = matrix[:, indices]
    axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.8])
    im = axmatrix.matshow(matrix, aspect='auto', origin='lower')
    axmatrix.set_xticks([])
    axmatrix.set_yticks([])
    axcolor = fig.add_axes([0.91, 0.1, 0.02, 0.8])
    pylab.colorbar(im, cax=axcolor)

    # flatten the clusters
    flat_clusters = sch.fcluster(linkage, cutoff, 'distance')
    leaders = sch.leaders(linkage, flat_clusters)

    return {'fig': fig, 'flat': flat_clusters, 'leaders': leaders[1]}
def dendrogram(da_embeddings, n_clusters_max=14, debug=False, ax=None,
               n_samples=10, show_legend=False, label_clusters=False,
               return_clusters=False, color="black",
               sampling_method="random", linkage_method="ward", **kwargs):
    """
    Additional kwargs will be passed to scipy.cluster.hierarchy.dendrogram
    """
    tile_dataset = ImageSingletDataset(
        data_dir=da_embeddings.data_dir,
        tile_type=da_embeddings.tile_type,
        stage=da_embeddings.stage,
    )

    if ax is None:
        fig, ax = plt.subplots(figsize=(14, 3))
    else:
        fig = ax.figure

    Z = hc.linkage(
        y=da_embeddings,
        method=linkage_method,
    )

    if color is not None:
        kwargs["link_color_func"] = lambda k: color

    # we want to label the leaf by the index of the leaf node, at least
    # initially. Below we will change the labels to have the count in each
    # leaf, but we don't know that number yet
    leaf_label_func = lambda i: str(i)
    kwargs["leaf_label_func"] = leaf_label_func

    ddata = hc.dendrogram(Z=Z, truncate_mode="lastp", p=n_clusters_max,
                          get_leaves=True, **kwargs)

    if debug:
        for ii in range(len(ddata["icoord"])):
            # second and third are top left and right corners
            bl, br = list(zip(ddata["icoord"][ii], ddata["dcoord"][ii]))[0::3]
            ax.scatter(*bl, marker="s", label=ii, s=100)
            ax.scatter(*br, marker="s", label=ii, s=100)

    # find the lowest link
    y_merges = np.array(ddata["dcoord"])
    d_max = np.min(y_merges[y_merges > 0.0])
    d2 = Z[:, 2][np.argwhere(Z[:, 2] == d_max)[0][0] - 1]

    if debug:
        plt.axhline(d_max, linestyle="--", color="grey")
        ax.legend()

    T = hc.fcluster(Z, t=d2, criterion="distance")
    L, M = hc.leaders(Z, T)
    assert set(L) == set(ddata["leaves"])

    # getting leaf locations
    # the order in `L` (leaders) above unfortunately is *not* the same as the
    # order of points in icoord so instead we pick up the order from the
    # actual labels used
    bl_pts = np.array([
        np.asarray(ddata["icoord"])[:, 0],  # x at bottom-left corner
        np.asarray(ddata["dcoord"])[:, 0],  # y at bottom-left corner
    ])
    br_pts = np.array([
        np.asarray(ddata["icoord"])[:, -1],  # x at bottom-right corner
        np.asarray(ddata["dcoord"])[:, -1],  # y at bottom-right corner
    ])
    leaf_pts = np.append(bl_pts, br_pts, axis=1)

    # remove pts where y != 0 as these mark joins within the diagram and don't
    # connect to the edge
    leaf_pts = leaf_pts[:, ~(leaf_pts[1] > 0)]

    # sort by x-coordinate for leaf labels, so that the positions are in the
    # same order as the axis labels
    leaf_pts = leaf_pts[:, leaf_pts[0, :].argsort()]

    # get the actual leaf from the indices (these were set by providing the
    # `leaf_label_func` above)
    leaf_indices_from_labels = np.array(
        [int(lab.get_text()) for lab in ax.get_xticklabels()])

    # create mapping from the leaf indices to the (x,y)-points in the
    # dendrogram where these leaves terminate
    leaf_pts_mapping = dict(zip(leaf_indices_from_labels, leaf_pts.T))

    # work out which leaf each item (image) belongs to
    mapping = dict(zip(M, L))
    leaf_mapping = np.array(list(map(lambda i: mapping[i], T)))
    N_leaves = len(np.unique(leaf_mapping))

    # counts per leaf
    # [(n, sum(leaf_mapping == n)) for n in L]

    w_pad = 0.02
    size = (3.6 - (n_clusters_max - 1.0) * w_pad) / float(n_clusters_max)
    y_offset = 1.4
    if label_clusters:
        y_offset += 0.2

    for lid, leaf_id in enumerate(ddata["leaves"]):
        img_idxs_in_cluster = da_embeddings.tile_id.values[
            leaf_mapping == leaf_id].astype(int)
        if sampling_method == "random":
            try:
                img_idxs = np.random.choice(img_idxs_in_cluster,
                                            size=n_samples, replace=False)
            except ValueError:
                img_idxs = img_idxs_in_cluster
        elif sampling_method == "center_dist":
            emb_in_cluster = da_embeddings.sel(tile_id=img_idxs_in_cluster)
            d_emb = emb_in_cluster.mean(dim="tile_id") - emb_in_cluster
            center_dist = np.sqrt(d_emb ** 2.0).sum(dim="emb_dim")
            emb_in_cluster["dist_to_center"] = center_dist
            img_idxs = emb_in_cluster.sortby(
                "dist_to_center").tile_id.values[:n_samples]
        else:
            raise NotImplementedError(sampling_method)

        def transform(coord):
            axis_to_data = fig.transFigure + ax.transData.inverted()
            data_to_axis = axis_to_data.inverted()
            return data_to_axis.transform(coord)

        leaf_xy = leaf_pts_mapping[leaf_id]
        xp, yh = transform(leaf_xy)

        if show_legend:
            ax.scatter(*leaf_xy, marker="s", label=lid, s=100)

        for n, img_idx in enumerate(img_idxs):
            img = tile_dataset.get_image(index=img_idx)
            ax1 = fig.add_axes([
                xp - 0.5 * size, yh - size * 1.1 * (n + y_offset), size, size
            ])
            ax1.set_aspect(1)
            ax1.axison = False
            ax1.imshow(img)

    ax.set_xticklabels(
        _fix_labels(ax=ax, leaf_mapping=leaf_mapping,
                    label_clusters=label_clusters))

    if show_legend:
        ax.legend()

    if return_clusters:
        # instead of returning the actual indices of the leaves here (as were
        # used above) we remap so that they run from 0...N_leaves
        leaf_idxs_remapped = np.array(
            [list(leaf_indices_from_labels).index(i) for i in leaf_mapping])
        if not label_clusters:
            return ax, leaf_idxs_remapped
        else:
            return ax, _make_letter_labels(N_leaves)[leaf_idxs_remapped]
    else:
        return ax
def cut_tree(self, t=None, criterion='inconsistent', depth=None,
             cluster_min=3):
    """
    Groups data into clusters based on the linkage matrix.

    --Input--
    t: float
        Threshold to be used by the criterion.
    criterion: str
        Criterion for grouping (cutting) the dendrogram.
        Can be either 'distance' or 'inconsistent'.
    depth: int
        Depth used when calculating the inconsistency coefficient of a
        branch. Uses all branches by default.
    cluster_min: int
        Minimum cluster size. Data in clusters below this size will be
        assigned as outliers.

    --Output--
    labels: list
        List of cluster labels (integers) for each data point.
    clusters: list
        Atoms objects grouped by cluster label. The first group (list)
        is all the outliers.
    branches: list
        The branch index of each cluster.
    centroids: list
        The centroids of all clusters calculated as averaged features.
    cluster_energies: list
        The average energy of the structures in each cluster. The first
        energy is the average energy of the outliers (empty if there are
        no outliers). This is an empty list if data has no
        .get_potential_energy() attribute.
    avg_width: float
        Average cluster width. Can be used as an outlier threshold in
        the assign_to_cluster function.
    """
    if t is None:
        if criterion == 'distance':
            t = 0.7 * max(self.linkage_matrix[:, 2])
        elif criterion == 'inconsistent':
            t = 4.0
    if depth is None:
        depth = self.n_data
    labels = fcluster(self.linkage_matrix, t, criterion, depth)
    branches = leaders(self.linkage_matrix, labels)

    # Data in clusters smaller than cluster_min are outliers with label = 0
    for label in sorted(set(labels)):
        cluster_size = sum([i == label for i in labels])
        if cluster_size < cluster_min:
            labels = np.where(labels == label, 0, labels)
            x = np.delete(branches[0], np.where(branches[1] == label))
            y = np.delete(branches[1], np.where(branches[1] == label))
            branches = (x, y)

    # Rearrange labels to fill gaps from assignment of outliers
    for label in sorted(set(labels)):
        if label < 2:
            continue
        while len(np.where(labels == label - 1)[0]) == 0:
            y = np.where(branches[1] == label, label - 1, branches[1])
            branches = (branches[0], y)
            labels = np.where(labels == label, label - 1, labels)
            label -= 1

    n_clusters = max(labels)
    print('Number of clusters: {}'.format(n_clusters))

    # Group data of equal cluster label and calculate centroid
    clusters = [[] for i in range(n_clusters + 1)]
    centroids = [[] for i in range(n_clusters)]
    i = 0
    for label, sample in zip(labels, self.data):
        clusters[label].append(sample)
        if label > 0:
            centroids[label - 1].append(self.feature_matrix[i])
        i += 1
    print('Number of outliers: {}'.format(len(clusters[0])))

    for i, c in enumerate(centroids):
        c_mean = dict()
        for key in sorted(self.feature_matrix[0].keys()):
            c_mean[key] = np.mean([x[key] for x in c], axis=0)
        centroids[i] = c_mean

    # Determine average width of clusters
    cluster_widths = []
    for branch in branches[0]:
        width = self.linkage_matrix[branch - self.n_data][-2]
        cluster_widths.append(width)
    print('Cluster widths: {}'.format(cluster_widths))
    avg_width = np.mean(cluster_widths)

    # Calculate average cluster energies
    cluster_energies = []
    for cluster in clusters:
        try:
            # Check if data has a potential energy
            self.data[0].get_potential_energy()
        except (IndexError, AttributeError):
            break
        if len(cluster) == 0:
            mean_energy = []
        else:
            mean_energy = np.mean(
                [x.get_potential_energy() for x in cluster])
        cluster_energies.append(mean_energy)

    # return the outputs documented in the docstring
    return labels, clusters, branches, centroids, cluster_energies, avg_width
def _hierarchical_clustering_post(model, num_clusters, cluster_col='cluster'):
    if 'linkage_matrix' not in model:
        model_table = model['table_1']
        length = len(model_table) + 1
        tmp_table = model_table[[
            'clusters_joined1', 'clusters_joined2', 'height', 'frequency'
        ]]
        tmp = [
            i for i in tmp_table[['clusters_joined1',
                                  'clusters_joined2']].values.flatten()
            if i.split("_")[0] != 'CL'
        ]
        label_encoder = preprocessing.LabelEncoder().fit(tmp)
        tmp_table['clusters_joined2'] = tmp_table['clusters_joined2'].apply(
            _change_name, length=length, encoder=label_encoder)
        tmp_table['clusters_joined1'] = tmp_table['clusters_joined1'].apply(
            _change_name, length=length, encoder=label_encoder)
        Z = tmp_table.values
        predict = fcluster(Z, t=num_clusters, criterion='maxclust')
        data_names = ['pt_' + str(i) for i in range(length)]
        prediction_table = pd.DataFrame()
        prediction_table['name'] = data_names
    else:
        Z = model['model']
        mode = model['input_mode']
        out_table = model['linkage_matrix']
        predict = fcluster(Z, t=num_clusters, criterion='maxclust')
        if mode == 'original':
            prediction_table = model['table']
        elif mode == 'matrix':
            prediction_table = model['dist_matrix'][['name']]

    if num_clusters == 1:
        prediction_table[cluster_col] = [
            1 for _ in range(len(prediction_table.index))
        ]
    else:
        prediction_table[cluster_col] = predict

    L, M = leaders(Z, predict)
    which_cluster = []
    if 'linkage_matrix' not in model:
        for leader in L:
            which_cluster.append('CL_' + str(2 * length - 1 - leader))
    else:
        for leader in L:
            if leader in Z[:, 0]:
                select_indices = np.where(Z[:, 0] == leader)[0][0]
                which_cluster.append(
                    out_table['joined column1'][select_indices])
            elif leader in Z[:, 1]:
                select_indices = np.where(Z[:, 1] == leader)[0][0]
                which_cluster.append(
                    out_table['joined column2'][select_indices])

    clusters_info_table = pd.DataFrame([])
    if num_clusters == 1 and 'linkage_matrix' in model:
        clusters_info_table[cluster_col] = [1]
        clusters_info_table['name of clusters'] = [
            out_table['name of clusters'][len(Z) - 1]
        ]
        clusters_info_table['number of entities'] = [
            out_table['number of original'][len(Z) - 1]
        ]
    else:
        clusters_info_table[cluster_col] = M
        clusters_info_table['name of clusters'] = which_cluster
        clusters_info_table = clusters_info_table.sort_values(cluster_col)
        cluster_count = np.bincount(prediction_table[cluster_col])
        cluster_count = cluster_count[cluster_count != 0]
        clusters_info_table['number of entities'] = list(cluster_count)

    if 'linkage_matrix' in model:
        rb = BrtcReprBuilder()
        rb.addMD(
            strip_margin("""# Hierarchical Clustering Post Process Result"""))
        rb.addMD(
            strip_margin("""
            |### Parameters
            |
            |{display_params}
            |
            |### Clusters Information
            |
            |{clusters_info_table}
            |
            """.format(display_params=dict2MD(model['parameters']),
                       clusters_info_table=pandasDF2MD(
                           clusters_info_table,
                           num_rows=len(clusters_info_table.index) + 1))))
    else:
        rb = BrtcReprBuilder()
        rb.addMD(
            strip_margin("""# Hierarchical Clustering Post Process Result"""))
        rb.addMD(
            strip_margin("""
            |
            |### Clusters Information
            |
            |{clusters_info_table}
            |
            """.format(clusters_info_table=pandasDF2MD(
                clusters_info_table,
                num_rows=len(clusters_info_table.index) + 1))))

    model = _model_dict('hierarchical_clustering_post_process')
    model['clusters_info'] = clusters_info_table
    model['_repr_brtc_'] = rb.get()

    return {'out_table': prediction_table, 'model': model}
from numpy import *
import scipy
import scipy.cluster.hierarchy as sch
import matplotlib.pylab as plt

A = [6, 12, 18, 24, 30, 42, 48]
X = array([[6], [12], [18], [24], [30], [42], [48]])
# print(type(X))

d = sch.distance.pdist(X)
Z = sch.linkage(d, method='single')
# print(Z)
P = sch.dendrogram(Z)
# print(P)
plt.savefig('plot_dendrogram.png')

T = sch.fcluster(Z, 0.5 * d.max(), 'distance')
sch.leaders(Z, T)
from numpy import *
import scipy
import scipy.cluster.hierarchy as sch
import matplotlib.pylab as plt

X = loadtxt('cluster_test.txt')

# N = len(X)
# d = zeros((N, N))
#
# for i in range(N):
#     for j in range(i+1, N):
#         d[j, i] = d[i, j] = (sum((X[i, :]-X[j, :])**2))**0.5

d = sch.distance.pdist(X)
print(d.shape, X.shape)

Z = sch.linkage(d, method='complete')
P = sch.dendrogram(Z, orientation='right')
plt.show()
# plt.savefig('plot_dendrogram.png')

T = sch.fcluster(Z, 0.5 * d.max(), 'distance')
sch.leaders(Z, T)
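# Neither script above captures the return value of leaders(); a short
# self-contained sketch of unpacking it, on the same toy data as the
# first script:
import numpy as np
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist

X = np.array([[6], [12], [18], [24], [30], [42], [48]], dtype=float)
d = pdist(X)
Z = sch.linkage(d, method='single')
T = sch.fcluster(Z, 0.5 * d.max(), 'distance')
L, M = sch.leaders(Z, T)
for node_id, label in zip(L, M):
    print('cluster', label, 'is rooted at node', node_id)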
def adaptive_heartbeat_modelling(signal=None, sampling_rate=1000.,
                                 initial_length=0.6,
                                 residual_threshold=0.35, show=True):
    """Adaptive Heartbeat Modelling.

    Follows the approach by Paalasmaa et al. [Paal14]_. Only suitable here
    for 15s-long BCG.

    Parameters
    ----------
    signal : array
        Input unfiltered BCG signal.
    sampling_rate : int, float, optional
        Sampling frequency (Hz).
    initial_length : float, optional
        Initial length of the template.
    residual_threshold : float, optional
        Threshold for heartbeat intervals selection.

    Returns
    -------
    template : array
        Heartbeat model.
    peaks : array
        Heartbeats location indices.

    References
    ----------
    .. [Paal14] J. Paalasmaa, H. Toivonen, M. Partinen, "Adaptive heartbeat
       modeling for beat-to-beat heart rate measurement in
       ballistocardiograms", IEEE journal of biomedical and health
       informatics, 2015

    """
    # check inputs
    if signal is None:
        raise TypeError("Please specify an input signal.")

    # ensure numpy
    signal = np.array(signal)
    sampling_rate = float(sampling_rate)

    # preprocessing signal
    signal -= np.mean(signal)
    filtered, _, _ = st.filter_signal(signal=signal,
                                      ftype='butter',
                                      band='lowpass',
                                      order=2,
                                      frequency=10,
                                      sampling_rate=sampling_rate)
    gaussian_filter_std = 0.1
    filtered -= si.gaussian_filter(filtered,
                                   gaussian_filter_std * sampling_rate)

    # D. Initial estimation of the heartbeat model
    filtered_grad = np.gradient(filtered)
    windows_center_p, _ = ss.find_peaks(filtered_grad)
    windows_center_n, _ = ss.find_peaks(-filtered_grad)
    windows_center = np.sort(
        np.concatenate((windows_center_p, windows_center_n)))
    windows, windows_center = extract_heartbeats(signal=filtered,
                                                 peaks=windows_center,
                                                 sampling_rate=sampling_rate,
                                                 before=initial_length / 2,
                                                 after=initial_length / 2)

    # clustering
    dist_matrix = ssd.pdist(windows)
    n = len(windows)
    linkage_matrix = sch.linkage(dist_matrix, method='complete')
    densest_4_cluster_indices, = np.where(linkage_matrix[:, 3] == 4)
    densest_4_cluster_index = densest_4_cluster_indices[0]
    leader_node = densest_4_cluster_index + n
    max_inconsistent_value = linkage_matrix[densest_4_cluster_index, 2]
    flat_clusters = sch.fcluster(linkage_matrix, max_inconsistent_value,
                                 criterion='distance')
    L, M = sch.leaders(linkage_matrix, flat_clusters)
    leaves, = np.where(flat_clusters == M[L == leader_node])
    windows, windows_center = extract_heartbeats(
        signal=filtered,
        peaks=windows_center[leaves],
        sampling_rate=sampling_rate,
        before=1.25,
        after=1.25)
    mu = np.mean(windows, axis=0)

    hvs_result = modified_heart_valve_signal(signal=signal,
                                             sampling_rate=sampling_rate)
    hvs = hvs_result['hvs']
    hvs_minima, _ = ss.find_peaks(-hvs)
    half_lengths = []
    for center in windows_center:
        half_lengths.append(min(center - hvs_minima[hvs_minima < center]))
        half_lengths.append(min(hvs_minima[hvs_minima > center] - center))
    half_len = min(half_lengths)
    mu = mu[int(len(mu) / 2) - half_len:int(len(mu) / 2) + half_len]
    mu_center = int(len(mu) / 2)

    # E. Detecting heartbeat position candidates
    peaks = []
    ta = []
    tb = []
    for iter in range(2):
        peaks = []
        ta = []
        tb = []
        half_len = int(initial_length * sampling_rate / 2)
        if (half_len > mu_center) | (half_len > len(mu) - mu_center):
            raise ValueError('Template is too short or badly centered')
        mu_corr = mu[mu_center - half_len:mu_center + half_len]
        corr = matchTemplate(filtered.astype('float32'),
                             mu_corr.astype('float32'),
                             TM_CCORR_NORMED)
        corr = corr.flatten()
        candidates_pos, _ = ss.find_peaks(corr)
        corr_delay = -mu_center + half_len

        # F. Detecting beat-to-beat intervals
        half_len = int(1 * sampling_rate)
        if half_len > len(mu) - mu_center:
            mu2 = np.append(mu, np.zeros(2 * half_len - len(mu)))
        else:
            mu2 = mu[:int(2 * sampling_rate)]
        candidates_pos += corr_delay
        candidates_pos = candidates_pos[candidates_pos >= 0]

        # 1) Initialize ta to the first candidate position
        ta_cand = candidates_pos[0]
        while ta_cand < candidates_pos[-1]:
            try:
                if ta_cand + int(2 * sampling_rate) > len(filtered):
                    raise Exception
                sa = filtered[ta_cand:ta_cand + int(2 * sampling_rate)]
                za = so.least_squares(
                    lambda z: np.mean(np.power(sa - z * mu2, 2)), 1).x[0]
                xa = za * mu2

                # 2) Find candidates for tb
                tb_candidates = candidates_pos[np.logical_and(
                    ta_cand + int(0.4 * sampling_rate) < candidates_pos,
                    candidates_pos < ta_cand + int(2 * sampling_rate))]

                # 3) find best tb or find another ta -> step 2)
                for tb_cand in tb_candidates:
                    if tb_cand + int(2 * sampling_rate) > len(filtered):
                        raise Exception
                    sb = filtered[tb_cand:tb_cand + int(2 * sampling_rate)]
                    zb = so.least_squares(
                        lambda z: np.mean(np.power(sb - z * mu2, 2)), 1).x[0]
                    xb = zb * mu2
                    xa_tmp = np.concatenate(
                        (xa,
                         np.zeros(
                             max([
                                 0,
                                 2 * (tb_cand - ta_cand) -
                                 int(2 * sampling_rate)
                             ]))))
                    xb_tmp = np.concatenate(
                        (np.zeros(tb_cand - ta_cand), xb))
                    x = (xa_tmp[:2 * (tb_cand - ta_cand)] +
                         xb_tmp[:2 * (tb_cand - ta_cand)])
                    s = filtered[ta_cand:ta_cand + 2 * (tb_cand - ta_cand)]
                    eps = s - x
                    if (np.mean(np.power(eps, 2)) <
                            residual_threshold * np.mean(np.power(s, 2))) & (
                                max([za, zb]) < 2 * min([za, zb])):
                        ta.append(ta_cand)
                        tb.append(tb_cand)
                        peak_a = ta_cand + mu_center
                        peak_b = tb_cand + mu_center
                        if peak_a not in peaks:
                            peaks.append(peak_a)
                        peaks.append(peak_b)
                        ta_cand = tb_cand
                        break
                    else:
                        continue
                if ta_cand != tb_cand:
                    ta_candidates = candidates_pos[np.logical_and(
                        candidates_pos > ta_cand,
                        candidates_pos < ta_cand + int(2 * sampling_rate))]
                    ta_cand = ta_candidates[np.argmax(
                        corr[ta_candidates - corr_delay])]
            except Exception:
                break

        beats = dict(peaks=np.array(peaks), ta=np.array(ta), tb=np.array(tb))

        # G. Re-estimation of the model with detected beat-to-beat intervals
        template_extraction = long_template_extraction(
            signal=filtered,
            beats=beats,
            mu_center=mu_center,
            sampling_rate=1000.)
        try:
            mu = template_extraction['long_template']
            mu_center_new = template_extraction['long_template_center']
            mu = mu[mu_center_new - mu_center:]
        except KeyError:
            mu = template_extraction['short_template']
        peaks = beats['peaks']
        print('iteration no ', iter, ': ', len(peaks), ' beats detected')

    # H. Accounting for abrupt changes of the heartbeat shape
    # to complete, with four different instances of the beat-to-beat
    # detection method

    # I. Post-processing
    # slightly different in our case: we added a smoother rather than the
    # non-linear filter explained in the paper

    if show:
        # extract templates
        templates, peaks = extract_heartbeats(signal=filtered,
                                              peaks=peaks,
                                              sampling_rate=sampling_rate,
                                              before=0.6,
                                              after=0.2)
        # compute heart rate
        hr_idx, hr = st.get_heart_rate(beats=peaks,
                                       sampling_rate=sampling_rate,
                                       smooth=True,
                                       size=3)
        # get time vectors
        length = len(signal)
        T = (length - 1) / sampling_rate
        ts = np.linspace(0, T, length, endpoint=True)
        ts_hr = ts[hr_idx]
        ts_tmpl = np.linspace(-0.4, 0.4, templates.shape[1], endpoint=False)

        plotting.plot_bcg(ts=ts,
                          raw=signal,
                          filtered=filtered,
                          jpeaks=peaks,
                          templates_ts=ts_tmpl,
                          templates=templates,
                          heart_rate_ts=ts_hr,
                          heart_rate=hr,
                          path=None,
                          show=True)

    return utils.ReturnTuple((mu, peaks), ('template', 'peaks'))
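# Hedged sketch (toy data) of the densest-cluster trick used in the
# modelling function above: take a small, low merge from the linkage
# matrix, cut the tree at its height, and use leaders() to recover which
# observations sit under that node. Unlike the original, the search below
# uses the first merge reaching *at least* 4 members so the index always
# exists.
import numpy as np
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist

windows = np.random.RandomState(2).rand(12, 5)
Z = sch.linkage(pdist(windows), method='complete')
n = len(windows)
idx = np.where(Z[:, 3] >= 4)[0][0]  # earliest (lowest) merge with >= 4 members
leader_node = idx + n               # its node id in the tree
T = sch.fcluster(Z, Z[idx, 2], criterion='distance')
L, M = sch.leaders(Z, T)
members, = np.where(T == M[L == leader_node])
print(leader_node, members)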
def run_hdbscan_hai(hs, labels, method='hai', show='save', cut=0,
                    colormap=plt.cm.gist_rainbow):
    clusterer = hdbscan.HDBSCAN(metric='precomputed',
                                match_reference_implementation=True,
                                min_samples=1)
    clusterer.fit(hs)

    # Labels of objects as extracted from HDBSCAN*
    cluster_labels = clusterer.labels_
    print("Labels: ", cluster_labels)
    print("Unique Labels: ", np.unique(cluster_labels))

    # Linkage Matrix
    Z = clusterer.single_linkage_tree_.to_numpy()
    roots = leaders(Z, np.asarray(cluster_labels).astype('i'))
    print("Roots", list(map(str, roots[0])))
    print("Roots", ', '.join(map(str, roots[0])))

    fosc_file = open(basedir + "/FOSC", "w+")
    fosc_file.write(', '.join(map(str, roots[0])))
    fosc_file.close()

    # Plot Settings
    # fig, ax1 = plt.subplots()
    # plt.title('HDBSCAN*')
    # plt.xlabel('mpts')
    # plt.ylabel('distance')

    # Extraction Method: FOSC or Threshold.
    if cut > 0:
        partitioning = fcluster(Z, cut, criterion='distance')
        # plt.axhline(y=cut, c='k')
    else:
        partitioning = cluster_labels + 1

    # Normalizes the colors according to the clusters found in the
    # partitioning.
    norm = colors.Normalize(0, partitioning.max())
    dflt_col = "#cccccc"
    link_cols = {}
    for i, i12 in enumerate(Z[:, :2].astype(int)):
        c1, c2 = (link_cols[x] if x > len(Z) else
                  dflt_col if partitioning[x] == 0 else
                  colormap(norm(partitioning[x])) for x in i12)
        link_cols[i + 1 + len(Z)] = c1 if c1 == c2 else dflt_col

    # Creates the dendrogram.
    dendrogram(
        Z,
        leaf_rotation=90.,  # rotates the x axis labels
        leaf_font_size=6.,  # font size for the x axis labels
        labels=labels,
        count_sort=True,
        link_color_func=lambda x: colors.to_hex(link_cols[x]),
        above_threshold_color='grey')

    # Saves or shows the dendrogram.
    # if show == 'save':
    #     plt.savefig(plotdir + filename + '_hdbscan_' + method + '.png', dpi=600, bbox_inches='tight')
    #     plt.savefig(plotdir + filename + '_hdbscan_' + method + '.pdf', dpi=600, bbox_inches='tight')
    # else:
    #     plt.show()

    # Clears plot.
    # plt.gcf().clear()

    clusters, _ = np.unique(partitioning, return_counts=True)
    h = np.array(hs)
    medoids = []
    for c in clusters:
        if c != 0:
            medoids.append(compute_medoid(c, partitioning, h))
    medoids.sort()

    # Linkage Matrix in a Tree Format
    T = to_tree(Z)
    d3Dendro = dict(name=T.id, y=T.dist)
    add_node(T, d3Dendro, labels, h)
    json.dump(d3Dendro,
              open(resudir + filename + '_meta-hierarchy_.json', "w"),
              sort_keys=True, indent=4)

    return medoids, labels[medoids], partitioning[medoids], clusters.max()
def fcluster_combine_leaves(Z, t, criterion="distance", depth=2, R=None,
                            monocrit=None):
    # AKA no leaf left behind
    # check if Z is a valid linkage matrix
    _ = hierarchy.is_valid_linkage(Z, throw=True)
    N = Z.shape[0] + 1

    # alternative: iteratively increase t, check for remaining leaves
    # move up the tree, merging leaf clusters until all leaves are merged
    # into clusters
    T = hierarchy.fcluster(Z, t, criterion=criterion, depth=depth, R=R,
                           monocrit=monocrit)
    L, M = hierarchy.leaders(Z, T)
    leaf_leaders = list(L[L < N])

    # no leaf clusters
    if len(leaf_leaders) == 0:
        return T

    max_cluster = T.max()

    # iterate through all links
    for n, link in enumerate(
            Z[np.logical_or(*(np.in1d(Z[:, l], leaf_leaders)
                              for l in range(2))), :2].astype("i")):
        if n % 10 == 0:
            print(
                f"After {n} iterations, {len(leaf_leaders)} leaf leaders left"
                f" with {len(np.unique(T))} total clusters"
            )

        # find linkages if link is between two leaf_leaders
        if all([l in leaf_leaders for l in link]):
            # make new cluster of leaf leaders
            max_cluster += 1
            T[link] = max_cluster
            # remove from list of leaf_leaders
            _ = [leaf_leaders.remove(l) for l in link]

        # find linkages of leaf leaders with any non-leaf node
        elif any([l in leaf_leaders for l in link]):
            # which one is the leaf leader?
            node_index = link[0] in leaf_leaders
            node, leaf = link[int(node_index)], link[int(~node_index)]

            # other node is a leader
            if node in L:
                downstream_leaders = [node]
            # node is not a leader, have to traverse down the tree until
            # leaders are found
            else:
                # get hierarchy.ClusterNode representation of the node
                tree = hierarchy.to_tree(Z, rd=True)[1][node]

                def check_node(node, nodes_to_check, downstream_leaders, L):
                    """check if a node is a leader, else append successors
                    to nodes_to_check"""
                    if node.id in L:
                        downstream_leaders.append(node.id)
                    else:
                        nodes_to_check.extend([node.left, node.right])
                    return nodes_to_check, downstream_leaders

                # initialize traversal
                downstream_leaders = []
                nodes_to_check = [tree.left, tree.right]

                while len(nodes_to_check) > 0:
                    n_ = nodes_to_check.pop(0)
                    if all([s is None for s in [n_.left, n_.right]]):
                        raise ValueError(
                            "While traversing the tree, a leaf node was"
                            f" reached, node {n_.id}. In theory this should"
                            " not occur."
                        )
                    nodes_to_check, downstream_leaders = check_node(
                        n_, nodes_to_check, downstream_leaders, L)

            # update T
            max_cluster += 1
            merge_clusters = M[np.in1d(L, downstream_leaders)]
            T[np.in1d(T, merge_clusters)] = max_cluster
            T[leaf] = max_cluster
            # remove from leaf_leaders
            _ = leaf_leaders.remove(leaf)
        else:
            continue

        # update L, M
        L, M = hierarchy.leaders(Z, T)
        if len(leaf_leaders) == 0:
            break
        leaf_leaders = list(L[L < N])

    # no leaf clusters
    if len(leaf_leaders) == 0:
        print(
            f"All leaf leaders combined, resulting in {len(np.unique(T))}"
            " total clusters"
        )
        # relabel
        unique, inverse = np.unique(T, return_inverse=True)
        return np.arange(0, unique.shape[0])[inverse]
    else:
        raise ValueError(f"Failed to merge leaf leaders {leaf_leaders}")
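# Hypothetical usage of fcluster_combine_leaves() above: cut at a distance
# that typically strands singleton leaves, then let the function merge them
# upward into multi-member clusters (when singletons were merged, labels are
# relabelled to run from 0). Toy data only.
import numpy as np
from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist

X = np.random.RandomState(3).rand(30, 4)
Z = hierarchy.linkage(pdist(X), method='average')
T = fcluster_combine_leaves(Z, t=0.4)
print(len(np.unique(T)), 'clusters, labels:', np.unique(T))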