def _update_centroids(self, X):
    """Recompute each cluster center as the barycenter of its members.

    The members of cluster ``k`` are the entries of ``X`` selected by the
    boolean mask ``self.labels_ == k``. The averaging routine is picked
    from ``self.metric``: ``"dtw"`` uses ``dtw_barycenter_averaging``,
    ``"softdtw"`` uses ``softdtw_barycenter`` and any other value falls
    back to ``euclidean_barycenter``. Results are stored in place in
    ``self.cluster_centers_``.

    Parameters
    ----------
    X
        Dataset that can be indexed with the mask ``self.labels_ == k``.
    """
    # Work on a copy so that ``self.metric_params`` itself is never modified.
    params = {} if self.metric_params is None else self.metric_params.copy()
    # The barycenter routines receive the value under the name "gamma".
    if "gamma_sdtw" in params:
        params["gamma"] = params.pop("gamma_sdtw")

    for k in range(self.n_clusters):
        members = X[self.labels_ == k]
        metric = self.metric
        if metric == "dtw":
            # The current center seeds the averaging procedure.
            center = dtw_barycenter_averaging(
                X=members,
                barycenter_size=None,
                init_barycenter=self.cluster_centers_[k],
                metric_params=params,
                verbose=False)
        elif metric == "softdtw":
            center = softdtw_barycenter(
                X=members,
                max_iter=self.max_iter_barycenter,
                init=self.cluster_centers_[k],
                **params)
        else:
            center = euclidean_barycenter(X=members)
        self.cluster_centers_[k] = center
def template_signal(signals):
    """Return one DTW barycenter per entry of ``signals``.

    Each entry is handed to ``dtw_barycenter_averaging`` with
    ``max_iter=0`` and the results are collected in input order.

    NOTE(review): ``max_iter=0`` presumably means that no refinement
    iteration is run on the initial barycenter -- confirm against the
    tslearn version in use.
    """
    return [
        dtw_barycenter_averaging(participant, max_iter=0)
        for participant in signals
    ]
def plot_som_series_dba_center(som_x, som_y, win_map):
    """Plot every SOM cell's series together with their DBA barycenter.

    A ``som_x`` by ``som_y`` grid of subplots is created. For each grid
    position ``(x, y)`` present in ``win_map``, the series stored there are
    drawn in gray and their ``dtw_barycenter_averaging`` result is drawn in
    red. Every subplot is titled ``Cluster <n>`` where
    ``n = x * som_y + y + 1``.

    Parameters
    ----------
    som_x, som_y : int
        Number of rows and columns of the subplot grid.
    win_map : mapping
        Maps ``(x, y)`` tuples to a collection of series.
    """
    # squeeze=False keeps ``axs`` two-dimensional even for a 1 x N or N x 1
    # grid, so indexing with an (x, y) pair cannot raise IndexError.
    fig, axs = plt.subplots(som_x, som_y, figsize=(25, 25), squeeze=False)
    fig.suptitle('Clusters')
    for x in range(som_x):
        for y in range(som_y):
            cluster = (x, y)
            ax = axs[x, y]
            # Membership test on the mapping itself: no entry is created
            # for empty cells even if ``win_map`` is a defaultdict.
            if cluster in win_map:
                for series in win_map[cluster]:
                    ax.plot(series, c="gray", alpha=0.5)
                ax.plot(
                    dtw_barycenter_averaging(np.vstack(win_map[cluster])),
                    c="red")
            cluster_number = x * som_y + y + 1
            ax.set_title(f"Cluster {cluster_number}")
    plt.show()
def barycenter_imputation(missing_set, n_partitions=24, n_individuals=10):
    """Fill missing rows of each individual with a DTW barycenter.

    For every index along the first axis, rows whose first entry is NaN are
    overwritten with the ``dtw_barycenter_averaging`` of the remaining rows
    of the same individual. The filled copy is then passed through
    ``data_utils.compress_maintain_dim`` and returned with its last axis
    dropped by a reshape.

    Parameters
    ----------
    missing_set : numpy.ndarray
        Input data; not modified (a copy is filled).
        NOTE(review): presumably laid out as (individual, voxel, time, 1)
        -- confirm against the caller.
    n_partitions : int
        Forwarded to ``data_utils.compress_maintain_dim``.
    n_individuals : int
        Forwarded to ``data_utils.compress_maintain_dim``.

    Returns
    -------
    numpy.ndarray
        The compressed data reshaped to ``shape[:-1]``.
    """
    imputted = missing_set.copy()
    for ind, individual in enumerate(missing_set):
        # A row counts as missing when its first entry is NaN.
        nan_mask = np.isnan(individual[:, 0])
        missing_rows = np.argwhere(nan_mask)[:, 0]
        complete_rows = np.argwhere(~nan_mask)[:, 0]
        imputted[ind, missing_rows, :] = dtw_barycenter_averaging(
            individual[complete_rows, :])
    imputted = data_utils.compress_maintain_dim(
        imputted, n_partitions=n_partitions, n_individuals=n_individuals)
    return imputted.reshape(imputted.shape[:-1])
def ts_average(x, pid_indices):
    """Average the time series belonging to each PID with DBA.

    Parameters
    ----------
    x
        Indexable collection of time series:
        ``[[time series 1], ..., [time series n]]``.
    pid_indices
        Mapping ``{pid: [indices]}`` giving, for each PID, the positions in
        ``x`` of its time series.

    Returns
    -------
    tuple
        ``(averages, pids)`` -- the ``dtw_barycenter_averaging`` result for
        each PID and the matching PID labels, both in the iteration order
        of ``pid_indices``.
    """
    pids = list(pid_indices)
    averages = [
        dtw_barycenter_averaging([x[j] for j in pid_indices[pid]])
        for pid in pids
    ]
    return averages, pids
def _update_centroids(self, X):
    """Recompute each cluster center as the barycenter of its members.

    The members of cluster ``k`` are the entries of ``X`` selected by the
    boolean mask ``self.labels_ == k``. ``self.metric`` chooses the
    averaging routine: ``"dtw"`` uses ``dtw_barycenter_averaging``,
    ``"softdtw"`` uses ``softdtw_barycenter`` and anything else uses
    ``euclidean_barycenter``. The extra keyword arguments come from
    ``self._get_metric_params()``. Results are stored in place in
    ``self.cluster_centers_``.

    Parameters
    ----------
    X
        Dataset that can be indexed with the mask ``self.labels_ == k``.
    """
    params = self._get_metric_params()
    for k in range(self.n_clusters):
        members = X[self.labels_ == k]
        metric = self.metric
        if metric == "dtw":
            # The current center seeds the averaging procedure.
            center = dtw_barycenter_averaging(
                X=members,
                barycenter_size=None,
                init_barycenter=self.cluster_centers_[k],
                metric_params=params,
                verbose=False)
        elif metric == "softdtw":
            center = softdtw_barycenter(
                X=members,
                max_iter=self.max_iter_barycenter,
                init=self.cluster_centers_[k],
                **params)
        else:
            center = euclidean_barycenter(X=members)
        self.cluster_centers_[k] = center
def _update_centroids(self, X):
    """Recompute each cluster center as the barycenter of its members.

    The members of cluster ``k`` are the entries of ``X`` selected by the
    boolean mask ``self.labels_ == k``. ``self.metric`` chooses the
    averaging routine: ``"dtw"`` calls ``dtw_barycenter_averaging``,
    ``"softdtw"`` fits a ``SoftDTWBarycenter`` and anything else fits a
    ``EuclideanBarycenter``. Results are stored in place in
    ``self.cluster_centers_``.

    Parameters
    ----------
    X
        Dataset that can be indexed with the mask ``self.labels_ == k``.
    """
    for k in range(self.n_clusters):
        members = X[self.labels_ == k]
        if self.metric == "dtw":
            # Only the initial barycenter is forwarded on this branch;
            # ``self.max_iter_barycenter`` is not passed.
            center = dtw_barycenter_averaging(
                X=members,
                barycenter_size=None,
                init_barycenter=self.cluster_centers_[k],
                verbose=False)
        elif self.metric == "softdtw":
            averager = SoftDTWBarycenter(
                max_iter=self.max_iter_barycenter,
                gamma=self.gamma_sdtw,
                init=self.cluster_centers_[k])
            center = averager.fit(members)
        else:
            center = EuclideanBarycenter().fit(members)
        self.cluster_centers_[k] = center
dtw_barycenter_averaging, softdtw_barycenter
# NOTE(review): the line above is the tail of an import statement that begins
# outside this excerpt; it is kept verbatim.
from tslearn.datasets import CachedDatasets

# Seed NumPy's global random generator.
numpy.random.seed(0)
X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
# Keep only the training series whose label is 2.
X = X_train[y_train == 2]

plt.figure()

# Panel 1: raw series (light black) and their Euclidean barycenter (red).
plt.subplot(3, 1, 1)
for ts in X:
    plt.plot(ts.ravel(), "k-", alpha=.2)
plt.plot(euclidean_barycenter(X).ravel(), "r-", linewidth=2)
plt.title("Euclidean barycenter")

# Panel 2: same series with the DBA barycenter.
plt.subplot(3, 1, 2)
dba_bar = dtw_barycenter_averaging(X, max_iter=100, verbose=False)
for ts in X:
    plt.plot(ts.ravel(), "k-", alpha=.2)
plt.plot(dba_bar.ravel(), "r-", linewidth=2)
plt.title("DBA")

# Panel 3: same series with the soft-DTW barycenter (gamma = 1).
plt.subplot(3, 1, 3)
sdtw_bar = softdtw_barycenter(X, gamma=1., max_iter=100)
for ts in X:
    plt.plot(ts.ravel(), "k-", alpha=.2)
plt.plot(sdtw_bar.ravel(), "r-", linewidth=2)
# Raw string: "\g" is not a valid escape sequence, so a plain literal triggers
# a warning on recent Python versions. The resulting text is unchanged.
plt.title(r"Soft-DTW barycenter ($\gamma$=1.)")

plt.tight_layout()
plt.show()
# plt.savefig(fig_path+'dbscan_pairplot_full_samples_solo_L2346', dpi=600, transparent=True)  # uncomment to save the figure; the file name goes between the quotes

# Summary statistics of the remaining columns, grouped by predicted cluster
# and by the 'cycle_reset' column.
print(
    df.drop([
        'cycle_time', 'errored_cycle', 'encoded_leech_no', 'encoded_video_name'
    ], axis=1).groupby(['pred', 'cycle_reset']).describe())

#%%
# Discard rows predicted as -1.
# NOTE(review): presumably -1 is the DBSCAN noise label -- confirm.
good_mask = (df.pred != -1).values
dropped_df = df[good_mask]
dropped_lengths = transposed_lengths[good_mask]

closest_idx = []
for cluster in np.sort(dropped_df.pred.unique()):
    # DBA barycenter of the series assigned to this cluster.
    bc = barycenters.dtw_barycenter_averaging(
        dropped_lengths[(dropped_df.pred == cluster).values])
    # Index of the entry of ``binned_lengths`` with the smallest summed
    # squared difference to the barycenter.
    closest_idx.append(
        np.argmin(np.power(binned_lengths - bc.T, 2).sum(axis=(1, 2))))
    fig, ax = cvU.plotBinnedLengths(bc.T, zscore_speed=True)
    fig.suptitle('cluster {}'.format(cluster))

for idx in closest_idx:
    f, a = cvU.plotBinnedLengths(binned_lengths[idx])
    # Title the figure created on the previous line. The earlier code titled
    # ``fig``, i.e. the last figure of the loop above, leaving these figures
    # untitled.
    f.suptitle('idx: {}, cluster: {}'.format(idx, df.pred[idx]))

# Cell output: the rows closest to each cluster barycenter.
df.drop([
    'cycle_reset', 'errored_cycle', 'encoded_video_name', 'encoded_leech_no'
], axis=1).loc[closest_idx]
import argparse
import csv

import numpy as np
from tslearn.barycenters import dtw_barycenter_averaging

parser = argparse.ArgumentParser()
parser.add_argument("corpus", help="Type of corpus you're working with")
parser.add_argument("language", help="Name of the language you're using")
parser.add_argument("gram", help="Unigram or trigram")
args = parser.parse_args()
# Validate explicitly: an ``assert`` is stripped when Python runs with -O.
if args.gram not in ["unigram", "trigram"]:
    parser.error("Only accepts 'unigram' or 'trigram'")

## read in surprisals data
# The first column of every row is skipped; the rest is the series.
X = []
# newline='' is what the csv module documents for files given to csv.reader.
with open("../../ValSurprisals/" + args.corpus + '/' + args.gram + '/' + \
        args.language + "_compressed.csv", 'r', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        X.append(row[1:])

# Drop "NA" cells and convert to float once, instead of on every iteration of
# the loop below.
X = [[float(item) for item in series if item != "NA"] for series in X]

# get barycenter for each size hyperparameter value (1 to 14) as list
# NOTE(review): the resulting list is not stored anywhere; only the verbose
# output of the averaging and the size are printed.
for BARYCENTER_SIZE in range(1, 15):
    dtw_barycenter_averaging(X=X, barycenter_size=BARYCENTER_SIZE,
                             verbose=True)\
        .reshape(BARYCENTER_SIZE).tolist()
    print(BARYCENTER_SIZE)
# read in surprisals data
# Column 0 of every row is an integer weight; the remaining columns are the
# series values.
X = []
weights = []
# newline='' is what the csv module documents for files given to csv.reader.
with open("ValSurprisals/" + args.corpus + '/' + args.gram + '/' + \
        args.language + "_compressed.csv", 'r', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        X.append(row[1:])
        weights.append(int(row[0]))

# get barycenter of info-curves as list
# "NA" cells are dropped before the float conversion.
data = [[float(item) for item in series if item != "NA"] for series in X]
barycenter = dtw_barycenter_averaging(
    X=data,
    barycenter_size=BARYCENTER_SIZE,
    weights=np.array(weights)).reshape(BARYCENTER_SIZE).tolist()
# Tag the row with the language, corpus and gram it was computed for.
barycenter += [args.language, args.corpus, args.gram]

# append the barycenter row to the output file
# newline='' stops csv.writer from emitting extra blank lines on platforms
# that translate "\n" on write.
with open(OUTPUT_FILE, 'a', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(barycenter)

# X = []
# with open("ValSurprisals/" + args.corpus + '/' + args.gram + '/' + \
#         args.language + "_compressed.csv", 'r') as f:
#     reader = csv.reader(f)
#     for row in reader:
#         X.append(row)
#
# # get barycenter of info-curves as list
from scipy.cluster.hierarchy import linkage, dendrogram

# Load the stored barycenters and keep the unigram rows whose source is
# wikipedia.
barycenters = pd.read_csv("../Data/5barycenters.csv")
is_unigram = barycenters["gram"] == "unigram"
is_wikipedia = barycenters["source"] == "wikipedia"
barycenters = barycenters[is_unigram & is_wikipedia]

# Numerical part of the table: columns '1' through '5'.
centers = barycenters.loc[:, '1':'5'].to_numpy()

# Each row of the linkage matrix describes one merge: the indices of the two
# merged clusters (columns 0 and 1), their distance (column 2) and the size
# of the new cluster (column 3).
Z = linkage(centers, method="single", metric=dtw)

# For every merge, append the DBA barycenter of the two merged centers so
# that later merges can refer to it by index.
expanded_centers = centers.copy()
for merge in Z:
    first, second = int(merge[0]), int(merge[1])
    pair = expanded_centers[[first, second]]
    merged_center = dtw_barycenter_averaging(pair)
    expanded_centers = np.concatenate(
        [expanded_centers, merged_center.T], axis=0)

# Persist the results.
# TODO: np.savetxt writes the values in scientific notation; choose an
# explicit format if that is not wanted.
np.savetxt("linkage.txt", Z)
np.savetxt("expanded_centers.txt", expanded_centers)

# One language name per line, in the row order of the filtered table.
languages = barycenters["language"].tolist()
with open("languages.txt", 'w') as language_file:
    language_file.writelines("%s\n" % language for language in languages)
import csv
import numpy as np
import pandas as pd
from tslearn.barycenters import dtw_barycenter_averaging

## within gram
# Read every row of the barycenter table as a list of strings.
with open("Data/5barycenters.csv", 'r') as f:
    X = list(csv.reader(f))

## split X into unigram and trigram
# Only rows whose second-to-last field is "wikipedia" are used; the last
# field selects the gram and the first five fields are the series.
wikipedia_rows = [series for series in X if series[-2] == "wikipedia"]
unigrams = [series[:5] for series in wikipedia_rows
            if series[-1] == "unigram"]
trigrams = [series[:5] for series in wikipedia_rows
            if series[-1] == "trigram"]

# One length-5 barycenter per gram, flattened to a plain list.
unigram_barycenter = dtw_barycenter_averaging(X=unigrams, verbose=True)
unigram_barycenter = unigram_barycenter.reshape(5).tolist()
trigram_barycenter = dtw_barycenter_averaging(X=trigrams, verbose=True)
trigram_barycenter = trigram_barycenter.reshape(5).tolist()

## within
X = pd.read_csv("Data/5barycenters_fam.csv")
genealogy[row[-2]] = [] genealogy[row[-2]].append(row[-1]) # get unique genii genealogy = {family: list(set(genii)) for family, genii in genealogy.items()} barycenters = [] for gram in grams: ## create gram mean barycenter data = [row[:5] for row in X if row[-3] == gram] data = [[float(item) for item in series] for series in data] data = [[el - sum(row) / len(row) for el in row] for row in data] if data != []: barycenter = dtw_barycenter_averaging( X=data, barycenter_size=BARYCENTER_SIZE, verbose=True).reshape(BARYCENTER_SIZE).tolist() barycenter += ["mean", "mean", gram] barycenters.append(barycenter) for family in genealogy: ## create family mean barycenter # print("family", family, gram) data = [row[:5] for row in X if row[-2] == family and row[-3] == gram] data = [[float(item) for item in series] for series in data] data = [[el - sum(row) / len(row) for el in row] for row in data] if data != []: barycenter = dtw_barycenter_averaging( X=data, barycenter_size=BARYCENTER_SIZE,
# NOTE(review): ``split``, ``args`` and ``compression_costs`` are defined
# outside this excerpt.
print("split no. %d" % split)

# Training and test series for this split; the first column of every row is
# skipped.
X = []
Y = []
# newline='' is what the csv module documents for files given to csv.reader.
with open("../../ValSurprisals/" + args.corpus + '/' + args.gram + '/' + \
        args.language + "_training" + str(split) + ".csv", 'r',
        newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        X.append(row[1:])
with open("../../ValSurprisals/" + args.corpus + '/' + args.gram + '/' + \
        args.language + "_test" + str(split) + ".csv", 'r',
        newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        Y.append(row[1:])

# Drop "NA" cells and convert to float once, instead of on every iteration of
# the loop below.
X = [[float(item) for item in series if item != "NA"] for series in X]
Y = [[float(item) for item in series if item != "NA"] for series in Y]

# get barycenter for each size hyperparameter value (1 to 15) and record the
# summed DTW distance from that barycenter to every test series
for BARYCENTER_SIZE in range(1, 16):
    barycenter = dtw_barycenter_averaging(X=X,
                                          barycenter_size=BARYCENTER_SIZE)
    total_dtw_dist = 0
    for element in Y:
        total_dtw_dist += dtw(barycenter, element)
    compression_costs[split-1, BARYCENTER_SIZE-1] = total_dtw_dist

np.savetxt("costs/" + args.corpus + '/' + args.gram + '/' + \
           args.language + "_dtw_dists.csv", compression_costs, delimiter=",")
# plot all points of the data set for series in X: plt.plot(series.ravel(), "k-", alpha=.2) # plot the given barycenter of them plt.plot(barycenter.ravel(), "r-", linewidth=2) # plot the four variants with the same number of iterations and a tolerance of # 1e-3 where applicable ax1 = plt.subplot(4, 1, 1) plt.title("Euclidean barycenter") plot_helper(euclidean_barycenter(X)) plt.subplot(4, 1, 2, sharex=ax1) plt.title("DBA (vectorized version of Petitjean's EM)") plot_helper(dtw_barycenter_averaging(X, max_iter=50, tol=1e-3)) plt.subplot(4, 1, 3, sharex=ax1) plt.title("DBA (subgradient descent approach)") plot_helper(dtw_barycenter_averaging_subgradient(X, max_iter=50, tol=1e-3)) plt.subplot(4, 1, 4, sharex=ax1) plt.title("Soft-DTW barycenter ($\gamma$=1.0)") plot_helper(softdtw_barycenter(X, gamma=1., max_iter=50, tol=1e-3)) # clip the axes for better readability ax1.set_xlim([0, length_of_sequence]) # show the plot(s) plt.tight_layout() plt.show()
def dtw_avg(class_x):
    """Compute one DBA barycenter per class and stack them into an array.

    Parameters
    ----------
    class_x
        Sequence indexable by ``0 .. len(class_x) - 1``; each element is
        passed to ``dtw_barycenter_averaging`` with ``max_iter=100``.

    Returns
    -------
    numpy.ndarray
        ``np.array`` built from the per-class barycenters, in input order.
    """
    per_class = [
        dtw_barycenter_averaging(class_x[idx], max_iter=100)
        for idx in range(len(class_x))
    ]
    return np.array(per_class)