import numpy as np
import pandas as pd
from dtaidistance import dtw

# his_df, train_df and idx are assumed to be defined earlier in the script.

step = 1
period = 60

while True:
    split = his_df[idx:idx + period]
    split = split.reset_index()
    if idx > len(his_df) or len(split) < period:
        break
    train_df = train_df.assign(
        idx=split['Close'] / split.tail(1)['Close'].values - 1)
    train_df = train_df.rename(columns={'idx': idx})
    idx += step

train_df = train_df.T  # transpose
train_np = train_df.to_numpy()

ds = dtw.distance_matrix_fast(train_np,
                              block=((0, 1), (1, len(train_np))),
                              compact=True)
ds_array = np.array(ds)
ds_array = np.delete(ds_array, range(60), axis=0)  # remove windows around the target

temp_value = []
temp_index = []
store = {}
rtn_store = {}
scale = period
bar_data = pd.DataFrame()

# get 10
for i in range(10):
    temp_value.append(min(ds_array))
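The block/compact combination above computes only the distances between the first series (the target window) and every later window. A self-contained sketch of that pattern with made-up data is shown below; `target`, `candidates` and the top-10 selection via `np.argsort` are hypothetical, not part of the original script.

import numpy as np
from dtaidistance import dtw

target = np.random.random(60)             # hypothetical target window
candidates = np.random.random((500, 60))  # hypothetical candidate windows
series = np.vstack([target, candidates]).astype(np.double)

# Restrict the matrix to the block (row 0) x (rows 1..N): one distance per candidate.
dists = dtw.distance_matrix_fast(
    series, block=((0, 1), (1, len(series))), compact=True)
dists = np.asarray(dists)

top10 = np.argsort(dists)[:10]            # indices of the 10 closest candidates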
## For storing boxplot data
median_dist = []
min_dist = []
max_dist = []
q1_dist = []
q3_dist = []

## For storing violin plot data
mean_dist_all = []

## Calculating distance matrices for t-SNE
series_data = []
for i in data_all.index:
    val = map(int, data_all.loc[i, 'values'].split("_"))
    series_data.append(np.array(list(val), dtype=np.double))

ds = dtw.distance_matrix_fast(series_data, penalty=penalty)
# Mirror the upper triangle into the lower triangle and zero the diagonal
# so the matrix is symmetric.
ds[np.tril_indices(ds.shape[0], k=-1)] = ds.T[np.tril_indices(ds.shape[0], k=-1)]
np.fill_diagonal(ds, 0)

ds = pd.DataFrame(ds)
ds.index = data_all['kmer']
ds.columns = data_all['kmer']

os.makedirs(out_folder + '/distance_matrices/')
os.makedirs(out_folder + '/raw_signal/')

for kmer_row in data_all['kmer'].unique():
    for kmer_column in data_all['kmer'].unique():
        current_ds = ds.loc[[kmer_row], [kmer_column]]
        current_ds.columns = range(0, len(current_ds.columns))
def x2p(X=np.array([]), tol=1e-5, perplexity=30.0):
    """
    Performs a binary search to get P-values in such a way that each
    conditional Gaussian has the same perplexity.
    """
    # Initialize some variables
    print("Computing pairwise distances...")
    # https://stackoverflow.com/questions/37009647/compute-pairwise-distance-in-a-batch-without-replicating-tensor-in-tensorflow
    (n, d) = X.shape
    # sum_X = np.sum(np.square(X), 1)
    # D = np.add(np.add(-2 * np.dot(X, X.T), sum_X).T, sum_X)
    D = dtw.distance_matrix_fast(X)
    print(D.shape)
    P = np.zeros((n, n))
    beta = np.ones((n, 1))
    logU = np.log(perplexity)

    # Loop over all datapoints
    for i in range(n):

        # Print progress
        if i % 500 == 0:
            print("Computing P-values for point %d of %d..." % (i, n))

        # Compute the Gaussian kernel and entropy for the current precision
        betamin = -np.inf
        betamax = np.inf
        Di = D[i, np.concatenate((np.r_[0:i], np.r_[i + 1:n]))]
        (H, thisP) = Hbeta(Di, beta[i])

        # Evaluate whether the perplexity is within tolerance
        Hdiff = H - logU
        tries = 0
        while np.abs(Hdiff) > tol and tries < 50:

            # If not, increase or decrease precision
            if Hdiff > 0:
                betamin = beta[i].copy()
                if betamax == np.inf or betamax == -np.inf:
                    beta[i] = beta[i] * 2.
                else:
                    beta[i] = (beta[i] + betamax) / 2.
            else:
                betamax = beta[i].copy()
                if betamin == np.inf or betamin == -np.inf:
                    beta[i] = beta[i] / 2.
                else:
                    beta[i] = (beta[i] + betamin) / 2.

            # Recompute the values
            (H, thisP) = Hbeta(Di, beta[i])
            Hdiff = H - logU
            tries += 1

        # Set the final row of P
        P[i, np.concatenate((np.r_[0:i], np.r_[i + 1:n]))] = thisP

    # Return final P-matrix
    print("Mean value of sigma: %f" % np.mean(np.sqrt(1 / beta)))
    return P
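x2p calls a helper Hbeta that is not part of this snippet. Assuming the code follows the reference t-SNE implementation it appears to be adapted from, Hbeta returns the Shannon entropy and the normalized kernel row for a given precision beta; a sketch under that assumption:

import numpy as np

def Hbeta(D=np.array([]), beta=1.0):
    """Assumed helper: entropy H and P-row for a distance row D and precision beta
    (conventional t-SNE formulation)."""
    P = np.exp(-D.copy() * beta)
    sumP = np.sum(P)
    H = np.log(sumP) + beta * np.sum(D * P) / sumP
    P = P / sumP
    return H, P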
import numpy as np
from numpy import inf
import pandas as pd
import matplotlib.pyplot as plt
from dtaidistance import dtw  # needed for dtw.distance_matrix_fast below
from dtaidistance import clustering
import sklearn
from sklearn import cluster

df = pd.read_csv("Scania_Data_Clustering.csv", header=0)  # header=0 is the default
head = list(df.columns.values)  # get machine names
print("head", head)  # print machine names

df = df.T  # transpose the data
df = df.values

ds = dtw.distance_matrix_fast(df)  # compute the distance matrix
ds[ds == inf] = 0  # replace all infinity values in the distance matrix with 0
pd.DataFrame(ds).to_excel("ds.xlsx")  # save the distance matrix to an .xlsx file

# Clustering starts
# Custom hierarchical clustering
model1 = clustering.Hierarchical(dtw.distance_matrix_fast, {})
# Augment the Hierarchical object to keep track of the full tree
model2 = clustering.HierarchicalTree(model1)
# SciPy linkage clustering
model3 = clustering.LinkageTree(dtw.distance_matrix_fast, {})
cluster_idx = model3.fit(df)

# plot
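The example stops at the "# plot" comment. One possible continuation, assuming dtaidistance's tree models expose fit() and plot(filename) as in the library's clustering examples (the output file names here are arbitrary):

cluster_idx2 = model2.fit(df)      # fit the tree-tracking model as well
model2.plot("hierarchy.png")       # write the dendrogram of the custom clustering
model3.plot("linkage.png")         # write the dendrogram of the SciPy linkage clustering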
def cluster_the_ts_curves(infile, outfolder, maturity, smoothing):
    series = {}
    venues = []
    indicies = [(0, 0), (0, 1), (0, 2), (0, 3), (0, 4),
                (1, 0), (1, 1), (1, 2), (1, 3), (1, 4), (1, 5)]

    for ind, line in enumerate(open(infile)):
        fields = line.strip().split('\t')
        venue = fields[0]
        ts = fields[1:]
        venues.append(venue)
        #if ind == 500: break
        if smoothing == 'smooth':
            series[venue] = savgol_filter(
                np.asarray([float(fff) for fff in ts]), 5, 3)
        elif smoothing == 'notsmooth':
            series[venue] = np.asarray([float(fff) for fff in ts])
        else:
            print('F**K OFF')

    dists = dtw.distance_matrix_fast(list(series.values()))
    model3 = clustering.LinkageTree(dtw.distance_matrix_fast, {})
    cluster_idx = model3.fit(list(series.values()))
    linkage_matrix = model3.linkage

    nnn = len(series)
    cluster_dict = {}

    if not os.path.exists(maturity):
        os.makedirs(maturity)

    for i in range(0, nnn - 1):
        new_cluster_id = nnn + i
        old_cluster_id_0 = linkage_matrix[i, 0]
        old_cluster_id_1 = linkage_matrix[i, 1]
        combined_ids = list()
        if old_cluster_id_0 in cluster_dict:
            combined_ids += cluster_dict[old_cluster_id_0]
            del cluster_dict[old_cluster_id_0]
        else:
            combined_ids += [old_cluster_id_0]
        if old_cluster_id_1 in cluster_dict:
            combined_ids += cluster_dict[old_cluster_id_1]
            del cluster_dict[old_cluster_id_1]
        else:
            combined_ids += [old_cluster_id_1]
        cluster_dict[new_cluster_id] = combined_ids

    nodes_included = []
    for v in cluster_dict.values():
        nodes_included += v

    nc = len(cluster_dict)
    nnodes = len(set(nodes_included))

    #for NNN in [6]:
    #for NNN in [3, 5, 6, 10]:
    for NNN in [10]:
        #NNN = 6 # 5 # 6 # 10
        figfolder = outfolder + '/' + maturity + '/figs_clusters_' + smoothing + '/' + str(NNN)
        curvefodler = outfolder + '/' + maturity + '/avg_curves_' + smoothing + '/' + str(NNN)
        vensfolder = outfolder + '/' + maturity + '/clusters_venues_' + smoothing + '/' + str(NNN)

        if not os.path.exists(figfolder):
            os.makedirs(figfolder)
        if not os.path.exists(curvefodler):
            os.makedirs(curvefodler)
        if not os.path.exists(vensfolder):
            os.makedirs(vensfolder)

        MINCSIZE = 100
        MAXSIZE = len(series) / 2

        cnt = [(c, len(n)) for (c, n) in cluster_dict.items()
               if len(n) > MINCSIZE and len(n) < MAXSIZE]
        num = min(len(cnt), NNN)
        cnt = sorted(cnt, key=lambda tup: tup[1], reverse=True)[0:num]
        biggest = sum([cc[1] for cc in cnt])
        top5cluster = [c[0] for c in cnt]

        if biggest > len(series) / 2:
            f, ax = plt.subplots(2, 5, figsize=(20, 8))
            ind = 0
            for ccc, nodes in cluster_dict.items():
                if ccc in top5cluster:
                    ttt = []
                    sss = []
                    cluster_vens = []
                    subseries = []
                    for n in nodes:
                        subseries.append(list(series.values())[int(n)])
                        sss += list(list(series.values())[int(n)])
                        ttt += transform_ts(
                            list(range(len(list(series.values())[int(n)]))), 11)
                    for n in nodes:
                        cluster_vens.append(list(series.keys())[int(n)])
                        linetotplot = list(series.values())[int(n)]
                        xlinetotplot = transform_ts(
                            list(range(len(list(series.values())[int(n)]))), 11)
                        ax[indicies[ind]].plot(xlinetotplot, linetotplot,
                                               linewidth=0.4, color='grey',
                                               alpha=0.15)

                    ffout = open(vensfolder + '/venues_in_' + str(ind) + '_' +
                                 str(biggest) + '_venuesnum=' +
                                 str(len(subseries)) + '.dat', 'w')
                    ffout.write('\n'.join(cluster_vens))
                    ffout.close()

                    ax[indicies[ind]].set_title('Number of venues = ' +
                                                str(len(subseries)), fontsize=15)

                    bx, by = getBinnedDistribution(ttt, sss, 8)
                    bx = (bx[1:] + bx[:-1]) / 2

                    fout = open(curvefodler + '/avg_curve_' + str(ind) + '_' +
                                str(biggest) + '_venuesnum=' +
                                str(len(subseries)) + '.dat', 'w')
                    fout.write('\t'.join([str(b) for b in bx]) + '\n')
                    fout.write('\t'.join([str(b) for b in by]) + '\n')
                    fout.close()

                    ax[indicies[ind]].plot(bx, by, linewidth=3, color='r')
                    ind += 1

            plt.savefig(figfolder + '/top_' + str(NNN) + '_clusters_' +
                        str(biggest) + '.png')
            plt.close()
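cluster_the_ts_curves relies on two helpers that are not shown, transform_ts and getBinnedDistribution. Judging from how bx and by are used above (bin edges averaged into centres, one average value per bin), getBinnedDistribution plausibly averages y within equal-width bins of x; a sketch under that assumption only, not the original helper:

import numpy as np
from scipy import stats

def getBinnedDistribution(x, y, nbins):
    """Assumed helper: mean of y within `nbins` equal-width bins of x.
    Returns the bin edges (length nbins + 1) and the per-bin means (length nbins)."""
    by, bx, _ = stats.binned_statistic(x, y, statistic='mean', bins=nbins)
    return bx, by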
# F
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from dtaidistance import dtw

# `data` is assumed to be loaded earlier; each data[k][3] holds the series of one class.
ts_d1 = np.array(data[0][3])
ts_d1 = ts_d1.reshape([ts_d1.shape[0], ts_d1.shape[1]])
ts_d2 = np.array(data[1][3])
ts_d2 = ts_d2.reshape([ts_d2.shape[0], ts_d2.shape[1]])
ts_d3 = np.array(data[2][3])
ts_d3 = ts_d3.reshape([ts_d3.shape[0], ts_d3.shape[1]])
ts_d4 = np.array(data[3][3])
ts_d4 = ts_d4.reshape([ts_d4.shape[0], ts_d4.shape[1]])

num_d1 = ts_d1.shape[0]
num_d2 = ts_d2.shape[0]
num_d3 = ts_d3.shape[0]
num_d4 = ts_d4.shape[0]

# Stack all classes and compute one pairwise DTW distance matrix
ts = np.concatenate([ts_d1, ts_d2, ts_d3, ts_d4], axis=0)
ds = dtw.distance_matrix_fast(ts)

# Within-class distances (class 1) vs. cross-class distances (class 1 vs. class 2)
ds_d1 = ds[:num_d1, :num_d1]
ds_d1_d2 = ds[:num_d1, num_d1:num_d1 + num_d2]
ds_d1 = ds_d1.flatten()
ds_d1_d2 = ds_d1_d2.flatten()

sns.distplot(ds_d1, hist=False, rug=True, color="g")
sns.distplot(ds_d1_d2, hist=False, rug=True, color="m")
plt.show()
# (This excerpt starts inside a loop over a first time-series file;
#  `series` and `tseries` are lists initialized earlier.)
    #if ind == 50: break
    series.append(
        np.asarray([float(fff) for fff in line.strip().split('\t')]))

for ind, line in enumerate(open('TIMESERIES_tims.dat')):
    #if ind == 50: break
    tseries.append(
        np.asarray([float(fff) for fff in line.strip().split('\t')]))

print(len(series))

dists = dtw.distance_matrix_fast(series)

# model1 = clustering.Hierarchical(dtw.distance_matrix_fast, {})
# Augment Hierarchical object to keep track of the full tree
# model2 = clustering.HierarchicalTree(model1)

# SciPy linkage clustering
model3 = clustering.LinkageTree(dtw.distance_matrix_fast, {})
cluster_idx = model3.fit(series)
#print(dir(model3))
#print(model3.linkage)
def distance_fast(c_series, ic, jc, subim, S, m, rmin, cmin, window=None,
                  max_dist=None, max_step=None, max_diff=None, penalty=None,
                  psi=None):
    """This function computes the spatial-temporal distance between
    two pixels using the dtw distance with C implementation.

    :param c_series: average time series of cluster.
    :type c_series: numpy.ndarray
    :param ic: X coordinate of cluster center.
    :type ic: int
    :param jc: Y coordinate of cluster center.
    :type jc: int
    :param subim: Block of image from the cluster under analysis.
    :type subim: numpy.ndarray
    :param S: Pattern spacing value.
    :type S: int
    :param m: Compactness value.
    :type m: float
    :param rmin: Minimum row.
    :type rmin: int
    :param cmin: Minimum column.
    :type cmin: int
    :param window: Only allow for maximal shifts from the two diagonals
        smaller than this number. It includes the diagonal, meaning that an
        Euclidean distance is obtained by setting window=1.
    :param max_dist: Stop if the returned values will be larger than
        this value.
    :param max_step: Do not allow steps larger than this value.
    :param max_diff: Return infinity if length of two series is larger.
    :param penalty: Penalty to add if compression or expansion is applied.
    :param psi: Psi relaxation parameter (ignore start and end of matching).
        Useful for cyclical series.
    :returns D: numpy.ndarray distance.
    """
    from dtaidistance import dtw

    # Normalizing factor
    m = m / 10

    # Initialize submatrix
    ds = numpy.zeros([subim.shape[1], subim.shape[2]])

    # Transpose the matrix to allow fast dtw computation with dtaidistance
    linear = subim.transpose(1, 2, 0).reshape(subim.shape[1] * subim.shape[2],
                                              subim.shape[0])

    merge = numpy.vstack((linear, c_series)).astype(numpy.double)

    # Compute dtw distances between the cluster series (last row) and every pixel series
    c = dtw.distance_matrix_fast(merge,
                                 block=((0, merge.shape[0]),
                                        (merge.shape[0] - 1, merge.shape[0])),
                                 compact=True, parallel=True,
                                 window=window, max_dist=max_dist,
                                 max_step=max_step, max_length_diff=max_diff,
                                 penalty=penalty, psi=psi)

    c1 = numpy.frombuffer(c)
    dc = c1.reshape(subim.shape[1], subim.shape[2])

    x = numpy.arange(subim.shape[1])
    y = numpy.arange(subim.shape[2])
    xx, yy = numpy.meshgrid(x, y, sparse=True, indexing='ij')

    # Calculate spatial distance
    ds = (((xx - ic) ** 2 + (yy - jc) ** 2) ** 0.5)

    # Calculate spatial-temporal distance
    D = (dc) / m + (ds / S)

    return D
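A small usage sketch with synthetic inputs (the shapes, coordinates, S and m below are all made up) to show that distance_fast returns one combined spatial-temporal distance per pixel of the block:

import numpy

# Hypothetical inputs: a (time, rows, cols) image block and a cluster-average series.
subim = numpy.random.random((10, 5, 5))
c_series = numpy.random.random(10)

# ic, jc: cluster-centre coordinates inside the 5x5 block; S (spacing) and m
# (compactness) are arbitrary values for this sketch.
D = distance_fast(c_series, ic=2, jc=2, subim=subim, S=5, m=10.0, rmin=0, cmin=0)
print(D.shape)  # (5, 5): one distance for every pixel relative to the cluster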
def _cluster(df):
    flow_df = df.copy()
    sites = df['Site'].to_list()
    sites_len = len(sites)
    df = df.fillna(0).drop(columns=["Site", "Flow"])
    df = df.to_numpy()

    try:
        distance = dtw.distance_matrix_fast(df, compact=True)
    except Exception as e:
        print("Distance calculation failed, shouldn't continue:", e)
        exit(99)

    distance_ssd = ssd.squareform(distance)

    # Hierarchical clustering - linkage matrix Z
    Z = linkage(distance_ssd, "average")

    # Inconsistency matrix - has the mean distance and standard deviation for each linkage
    IN = inconsistent(Z)

    # Create a temporary data frame to extract clusters from the linkage and inconsistency matrices
    cols = ['pt1', 'pt2', 'dist', 'tot_pts', 'mean_dist', 'SD_dist', 'cls_level', 'co_eff']
    temp_df = pd.DataFrame(np.hstack([Z, IN]), columns=cols)

    # Get the bins - only using the range of the first-level clustering distances.
    # Further clustering levels will increase the linkages' mean distance;
    # points that fall above the first-level mean distances are deemed outliers.
    cls_level_1_distances = temp_df.loc[temp_df['cls_level'] == 1, 'mean_dist']
    q1, q3 = np.percentile(cls_level_1_distances, [25, 75])
    IQR = q3 - q1
    # Freedman-Diaconis-style formula for the bin width - keeps the bin count minimal
    # while still representing the spread well
    bw = 2 * IQR / int(round(sites_len ** (1. / 3))) * BIN_FACTOR
    bins_ = np.arange(min(cls_level_1_distances) - 0.1,
                      max(cls_level_1_distances) + bw, bw)

    # Hierarchical clustering groups data until it reaches a single cluster holding all points.
    # Rows of the linkage matrix that represent higher-level clustering are not needed,
    # so keep only the link rows that contain leaf nodes (i.e. single-site data points).
    temp_df = temp_df[(temp_df['pt1'] < sites_len) | (temp_df['pt2'] < sites_len)]

    # Apply the bins
    temp_df['bins'] = pd.cut(temp_df['mean_dist'], bins_).astype('str')

    # Map digits to intervals, for readability
    map_dict = {str(value): counter for counter, value in enumerate(temp_df['bins'].unique())
                if value != 'nan'}
    temp_df['Cluster'] = temp_df['bins'].map(map_dict)

    # NaNs are the outliers; treat them as singleton clusters, giving a name to each NaN
    total_nans = temp_df['Cluster'].isna().sum()
    temp_df.loc[temp_df['Cluster'].isna(), 'Cluster'] = ['O' + str(i) for i in range(1, total_nans + 1)]

    # Combine the linkage matrix columns to create a single-column view of site vs. cluster mapping
    df1 = temp_df.loc[temp_df['pt1'] < sites_len, ['pt1', 'Cluster']].rename(columns={'pt1': 'Site'}).copy()
    df2 = temp_df.loc[temp_df['pt2'] < sites_len, ['pt2', 'Cluster']].rename(columns={'pt2': 'Site'}).copy()
    temp_df = pd.concat([df1, df2]).sort_values(by='Site').reset_index(drop=True)

    flow_df['Cluster'] = temp_df['Cluster']

    # # Visualizing
    # sites_n = [(str(site) + '-' + str(i)) for site, i in enumerate(sites)]
    # fig, ax = plt.subplots()
    # fig.set_size_inches(20, 40)
    # dend = dendrogram(Z, leaf_rotation=90, leaf_font_size=8, labels=sites_n, ax=ax)
    # plt.show()

    return flow_df