def _update_centroids(self, X):
    """Recompute every cluster center in-place from its assigned series.

    The barycenter routine is selected by ``self.metric`` ("dtw",
    "softdtw", or Euclidean otherwise).

    :param X: array of time series, indexable by the boolean mask
        ``self.labels_ == k``
    """
    # Normalize the user-supplied metric parameters: "gamma_sdtw" is the
    # public alias for soft-DTW's "gamma" keyword.
    params = dict(self.metric_params or {})
    if "gamma_sdtw" in params:
        params["gamma"] = params.pop("gamma_sdtw")
    for k in range(self.n_clusters):
        members = X[self.labels_ == k]
        current = self.cluster_centers_[k]
        if self.metric == "dtw":
            center = dtw_barycenter_averaging(X=members,
                                              barycenter_size=None,
                                              init_barycenter=current,
                                              metric_params=params,
                                              verbose=False)
        elif self.metric == "softdtw":
            center = softdtw_barycenter(X=members,
                                        max_iter=self.max_iter_barycenter,
                                        init=current,
                                        **params)
        else:
            center = euclidean_barycenter(X=members)
        self.cluster_centers_[k] = center
def create_model_number_of_people(files_path):
    """Build a per-station people-count model, or load it from the pickle cache.

    For each station CSV found under *files_path*, computes (over October
    2018) a soft-DTW barycenter of the daily 15-minute count profiles plus a
    per-slot standard deviation, then caches the whole mapping as a pickle.

    :param files_path: directory prefix (string, joined with "*.csv")
        containing one CSV file per station
    :return: dict mapping station name -> [barycenter values (flat list),
        list of per-slot standard deviations]
    """
    # NOTE(review): `endfolder` is a module-level global not visible in this
    # chunk -- presumably the cache directory; confirm it ends with a separator.
    if not os.path.exists(endfolder + 'number_of_people.pickle'):
        result = dict()  # {station: [barycenter, stdevs], ...}
        files = glob.glob(files_path + "*.csv")
        for file in files:
            # Station name = CSV file name without its extension.
            station = os.path.splitext(os.path.basename(file))[0]
            # squeeze=True collapses the single-column frame into a Series
            # (deprecated in newer pandas; would need .squeeze("columns")).
            series = pd.read_csv(file,
                                 header=0,
                                 parse_dates=[0],
                                 index_col=0,
                                 squeeze=True)
            # Regularize onto a 15-minute grid, filling gaps with 0 counts,
            # so each full day yields 96 samples.
            df_series = pd.DataFrame(series).sort_index().asfreq('15T',
                                                                 fill_value=0)
            start_date = datetime.datetime(2018, 10, 1)
            end_date = datetime.datetime(2018, 11, 1)
            final_ts = []
            # One array per day; str(date)[:10] is the "YYYY-MM-DD" key.
            # NOTE(review): the <= bound also includes Nov 1 (32 days);
            # confirm that is intended.
            while start_date <= end_date:
                values_by_day = df_series[str(start_date)[:10]]
                arr = values_by_day.to_numpy()
                final_ts.append(arr)
                start_date = start_date + datetime.timedelta(days=1)
            # Soft-DTW average of the daily profiles: array of 96 values.
            barycenter = softdtw_barycenter(final_ts, max_iter=50, tol=1e-3)
            # Standard deviation of the counts for each 15-minute slot of the
            # day (assumes the CSV has a 'count' column -- TODO confirm).
            start_time = datetime.time(0, 0, 0)
            end_time = datetime.time(23, 45, 0)
            std_devs = []
            while start_time < end_time:
                std = df_series.between_time(
                    str(start_time), str(start_time))['count'].to_numpy().std()
                std_devs.append(std)
                # datetime.time supports no arithmetic; detour via datetime.
                start_time = (datetime.datetime.combine(
                    datetime.date(1, 1, 1), start_time) +
                    datetime.timedelta(minutes=15)).time()
            # The loop stops before 23:45, so handle the final slot here.
            std = df_series.between_time(
                str(end_time), str(end_time))['count'].to_numpy().std()
            std_devs.append(std)
            result[station] = [
                list(chain.from_iterable(barycenter.tolist())), std_devs
            ]
        with open(endfolder + 'number_of_people.pickle', 'wb') as handle:
            pickle.dump(result, handle, protocol=pickle.HIGHEST_PROTOCOL)
        return result
    else:
        # Cache hit: just deserialize the previously computed model.
        with open(endfolder + 'number_of_people.pickle', 'rb') as handle:
            result = pickle.load(handle)
        return result
def plotAllRepetitions(preprocessed_data, gamma):
    """Plot every repetition in grey plus its arithmetic and soft-DTW barycentres.

    :param preprocessed_data: 2D array-like (n_repetitions, n_samples)
    :param gamma: soft-DTW regularization parameter passed to
        ``softdtw_barycenter``
    """
    plt.figure()
    # Use a plain loop, not a comprehension: plotting is a side effect and
    # the built list was discarded.
    for repetition in preprocessed_data:
        plt.plot(repetition, linewidth=0.5, color='grey')
    plt.plot(np.mean(preprocessed_data, axis=0),
             color='r', label='Arithmetic barycentre')
    plt.plot(softdtw_barycenter(preprocessed_data, gamma=gamma),
             color='b', label='Soft-DTW barycentre')
    plt.ylabel(r'')
    plt.xlabel(r'')
    plt.legend()
    # Clip the x-axis to the series length.
    plt.xlim([0, preprocessed_data.shape[1]])
def CalcCentroid(StackedDF, ClusterResults, gamma=1):
    """
    Calculates Centroid
    :param StackedDF: np.array containing cycles stacked row wise
    :param ClusterResults: dataframe with index Clusters which has cluster
        number. indexing from 1
    :param gamma: soft-DTW regularization parameter (default 1)
    :return: Centroids 2D array
    """
    print("Computing Centroids")
    # Soft-DTW barycenter of the member cycles of each cluster, labels 1..K.
    cycles = np.array(StackedDF)
    n_clusters = len(ClusterResults['Clusters'].unique())
    centroids = []
    for label in range(1, n_clusters + 1):
        members = cycles[ClusterResults['Clusters'] == label]
        centroids.append(softdtw_barycenter(members, gamma=gamma, max_iter=5))
    return centroids
def _update_centroids(self, X):
    """Refresh each cluster center from the series currently assigned to it.

    Dispatches on ``self.metric`` to the matching barycenter routine
    (DTW, soft-DTW, or Euclidean).

    :param X: array of time series, indexable by ``self.labels_ == k``
    """
    extra = self._get_metric_params()
    for idx in range(self.n_clusters):
        assigned = X[self.labels_ == idx]
        seed = self.cluster_centers_[idx]
        if self.metric == "dtw":
            updated = dtw_barycenter_averaging(X=assigned,
                                               barycenter_size=None,
                                               init_barycenter=seed,
                                               metric_params=extra,
                                               verbose=False)
        elif self.metric == "softdtw":
            updated = softdtw_barycenter(X=assigned,
                                         max_iter=self.max_iter_barycenter,
                                         init=seed,
                                         **extra)
        else:
            updated = euclidean_barycenter(X=assigned)
        self.cluster_centers_[idx] = updated
def barrycenters(xtrain, ytrain, labels, num_in_each_label):
    """Compute one soft-DTW barycenter per class label.

    :param xtrain: 2D array (n_samples, series_length) of training series
    :param ytrain: 1D array of class labels aligned with the rows of xtrain
    :param labels: 1D array of the distinct labels to average over
    :param num_in_each_label: unused; kept for backward compatibility
    :return: np.ndarray of shape (labels.size, xtrain.shape[1]), one
        barycenter row per label
    """
    centers = []
    for label in labels:
        # Boolean-mask selection replaces the manual index loop with
        # per-element comparison and list clearing of the original.
        members = np.asarray(xtrain[ytrain == label], dtype=np.float64)
        centers.append(softdtw_barycenter(members,
                                          gamma=1.0,
                                          weights=None,
                                          method='L-BFGS-B',
                                          tol=0.001,
                                          max_iter=50,
                                          init=None))
    out = np.asarray(centers, dtype=np.float64)
    # Barycenters come back as (length, 1); flatten to one row per label.
    out.shape = (labels.size, xtrain.shape[1])
    return out
import numpy
import matplotlib.pyplot as plt
from tslearn.barycenters import euclidean_barycenter, dtw_barycenter_averaging, softdtw_barycenter
from tslearn.datasets import CachedDatasets

# Fix the seed so the demo is reproducible.
numpy.random.seed(0)
X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
# Keep only the training series belonging to class 2.
X = X_train[y_train == 2]
plt.figure()
plt.rcParams['font.sans-serif'] = ['SimHei']  # display Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False  # display the minus sign correctly
# Panel 1: arithmetic (Euclidean) mean of the series, in red over the data.
plt.subplot(3, 1, 1)
for ts in X:
    plt.plot(ts.ravel(), "k-", alpha=.2)
plt.plot(euclidean_barycenter(X).ravel(), "r-", linewidth=2)
plt.title("算数平均序列求解")  # "arithmetic-mean barycenter"
# Panel 2: soft-DTW barycenter (gamma=1, 100 iterations).
plt.subplot(3, 1, 2)
sdtw_bar = softdtw_barycenter(X, gamma=1., max_iter=100)
for ts in X:
    plt.plot(ts.ravel(), "k-", alpha=.2)
plt.plot(sdtw_bar.ravel(), "r-", linewidth=2)
# NOTE(review): this title says "DBA" but the curve plotted is the soft-DTW
# barycenter -- the label looks wrong; confirm intent. Also, a 3-row grid is
# declared but only rows 1 and 2 are used.
plt.title("DBA平均序列求解")  # "DBA barycenter"
plt.tight_layout()
plt.show()
def multi_clustering(self, numbers, model, seq_train, sax_flag):
    """Run the clustering several times and persist the best run's results.

    Clusters a random 60% subsample of the training set ``numbers`` times
    with ``self.Centroid`` clusters, keeps the run with the smallest
    inertia, then writes each cluster's members and the centroid matrix
    to CSV files under ``start_point_<n>``.

    :param numbers: how many independent clustering runs to perform
    :param model: clustering variant: 'DTW', 'K-Shape', 'K-Means' or
        'Kernel-KMeans'
    :param seq_train: training-subsequence length, used to reshape centers
    :param sax_flag: if truthy, SAX-transform the training matrix first
    """
    out = {}            # best run: cluster index -> member series
    out_cent = []       # best run: centroid series
    min_inertia = float('inf')
    raw_data = pd.read_csv('start_point_' + str(self.start_point) +
                           '\\train.csv')
    df_data = pd.DataFrame(raw_data)
    sil_score = []
    print(len(raw_data))
    # Randomly subsample 60% of the rows (with replacement, fixed seed).
    df_data = df_data.sample(frac=0.6, replace=True, random_state=0, axis=0)
    data_raw = df_data.iloc[:, 1:].values
    data_new = df_data.iloc[:, 1:self.seq_train + 1].values
    if sax_flag:
        data_new = self.sax(data_new)
    # (A commented-out exploratory search for the optimal cluster count via
    # the Davies-Bouldin index was removed here; self.Centroid is used as-is.)
    # Cluster `numbers` times and remember the lowest-inertia result.
    for run in range(numbers):
        start = time.time()
        inertia, output, centers = self.single_clustering(
            data_raw, data_new, self.Centroid, model)
        end = time.time()
        print('聚类耗时:%.10f' % (end - start))  # clustering wall time
        print(inertia)
        if inertia < min_inertia:
            min_inertia = inertia
            out = output.copy()
            print("Here:" + str(len(out)))
            out_cent = centers.copy()
    # Persist each cluster's member series of the best run.
    for i in range(len(out)):
        data = pd.DataFrame(out[i])
        data.to_csv('start_point_' + str(self.start_point) + '\\class' +
                    str(i) + '.csv')
    if model == 'DTW' or model == 'K-Shape':
        out_cent = pd.DataFrame(
            np.reshape(out_cent, (self.Centroid, seq_train)))
    elif model == 'K-Means':
        out_cent = pd.DataFrame(out_cent)
    elif model == 'Kernel-KMeans':
        # Kernel k-means has no explicit centers: derive one soft-DTW
        # barycenter per cluster from its members.
        for i in range(len(out)):
            out_cent.append(
                softdtw_barycenter(out[i], max_iter=5, tol=1e-3))
        # BUG FIX: the original referenced the undefined name `seq_length`
        # here (NameError at runtime); the intended value is `seq_train`.
        out_cent = pd.DataFrame(
            np.reshape(out_cent, (self.Centroid, seq_train)))
    out_cent.to_csv('start_point_' + str(self.start_point) + '\\centers.csv')
for series in X: plt.plot(series.ravel(), "k-", alpha=.2) # plot the given barycenter of them plt.plot(barycenter.ravel(), "r-", linewidth=2) # plot the four variants with the same number of iterations and a tolerance of # 1e-3 where applicable ax1 = plt.subplot(4, 1, 1) plt.title("Euclidean barycenter") plot_helper(euclidean_barycenter(X)) plt.subplot(4, 1, 2, sharex=ax1) plt.title("DBA (vectorized version of Petitjean's EM)") plot_helper(dtw_barycenter_averaging(X, max_iter=50, tol=1e-3)) plt.subplot(4, 1, 3, sharex=ax1) plt.title("DBA (subgradient descent approach)") plot_helper(dtw_barycenter_averaging_subgradient(X, max_iter=50, tol=1e-3)) plt.subplot(4, 1, 4, sharex=ax1) plt.title("Soft-DTW barycenter ($\gamma$=1.0)") plot_helper(softdtw_barycenter(X, gamma=1., max_iter=50, tol=1e-3)) # clip the axes for better readability ax1.set_xlim([0, length_of_sequence]) # show the plot(s) plt.tight_layout() plt.show()
# NOTE(review): the statements down to the first plt.yticks([]) use `i` and
# `w`, which are not defined in this chunk -- they look like the tail of a
# loop over the four corner series of a 5x5 grid whose header lies outside
# this view; the flat indentation here is a best-effort reconstruction.
plt.plot(X_out[i].ravel(),
         color=matplotlib.colors.rgb2hex(get_color(w)),
         linewidth=2)
# Label each corner series with its index.
plt.text(X_out[i].shape[0], 0., "$X_%d$" % i,
         horizontalalignment="right",
         verticalalignment="baseline",
         fontsize=24)
plt.xticks([])
plt.yticks([])
# Fill the interior of the 5x5 grid with soft-DTW barycenters whose weights
# bilinearly interpolate between the four corner series.
for pos in range(2, 25):
    if pos in [1, 5, 21, 25]:
        # Corner positions hold the original series, drawn elsewhere.
        continue
    plt.subplot(5, 5, pos)
    idxr, idxc = row_col(pos, 5)
    # Bilinear interpolation weights over the 4x4 cell grid; they sum to 1.
    w = numpy.array([0.] * 4)
    w[0] = (4 - idxr) * (4 - idxc) / 16
    w[1] = (4 - idxr) * idxc / 16
    w[2] = idxr * (4 - idxc) / 16
    w[3] = idxr * idxc / 16
    plt.plot(softdtw_barycenter(X=X_out, weights=w).ravel(),
             color=matplotlib.colors.rgb2hex(get_color(w)),
             linewidth=2)
    plt.xticks([])
    plt.yticks([])
plt.tight_layout()
plt.show()
def soft_dtw_avg(class_x):
    """Return the soft-DTW barycenter of each class, stacked in one array.

    :param class_x: sequence of per-class collections of time series
    :return: np.ndarray with one barycenter per entry of ``class_x``
    """
    # Build the list directly with a comprehension instead of pre-allocating
    # a list of empty lists and overwriting each slot by index.
    class_avg = [softdtw_barycenter(series_set, gamma=1., max_iter=100)
                 for series_set in class_x]
    return np.array(class_avg)
def softdtw_augment_train_set(x_train, y_train, classes, num_synthetic_ts,
                              max_neighbors=5):
    """Generate synthetic series per class via weighted soft-DTW barycenters.

    For each class, repeatedly picks a random representative, finds a random
    number of its soft-DTW nearest neighbors, and averages them with
    distance-decayed weights to create a new synthetic series.

    :param x_train: 3D array (n_samples, length, n_channels) of series
    :param y_train: 1D array of class labels aligned with x_train
    :param classes: iterable of the class labels to augment
    :param num_synthetic_ts: number of synthetic series to create per class
    :param max_neighbors: upper bound on the random neighbor count
    :return: (synthetic_x_train, synthetic_y_train) as numpy arrays
    """
    from tslearn.neighbors import KNeighborsTimeSeries
    from tslearn.barycenters import softdtw_barycenter
    from tslearn.metrics import gamma_soft_dtw

    # synthetic train set and labels
    synthetic_x_train = []
    synthetic_y_train = []

    # loop through each class
    for c in classes:
        # BUG FIX: the original selected np.where(y_train == 0), so every
        # class was augmented from class 0's series; use the current class.
        c_x_train = x_train[np.where(y_train == c)[0]]

        if len(c_x_train) == 1:
            # skip if there is only one time series per set
            continue

        # compute appropriate gamma for softdtw for the entire class
        class_gamma = gamma_soft_dtw(c_x_train)

        # loop through the number of synthetic examples needed
        generated_samples = 0
        while generated_samples < num_synthetic_ts:
            # Choose a random representative for the class
            representative_indices = np.arange(len(c_x_train))
            random_representative_index = np.random.choice(
                representative_indices, size=1, replace=False)
            random_representative = c_x_train[random_representative_index]

            # Choose a random number of neighbors (between 1 and one minus
            # the total number of class representatives)
            random_number_of_neighbors = int(
                np.random.uniform(1, max_neighbors, size=1))
            knn = KNeighborsTimeSeries(
                n_neighbors=random_number_of_neighbors + 1,
                metric='softdtw',
                metric_params={'gamma': class_gamma}).fit(c_x_train)
            random_neighbor_distances, random_neighbor_indices = \
                knn.kneighbors(X=random_representative, return_distance=True)
            random_neighbor_indices = random_neighbor_indices[0]
            random_neighbor_distances = random_neighbor_distances[0]
            # Index 0 is the representative itself (distance 0), so index 1
            # is the nearest distinct neighbor.
            nearest_neighbor_distance = np.sort(random_neighbor_distances)[1]

            # Gather the neighbor series (multivariate: length x channels).
            random_neighbors = np.zeros(
                (random_number_of_neighbors + 1, c_x_train.shape[1],
                 c_x_train.shape[2]),
                dtype=float)
            for j, neighbor_index in enumerate(random_neighbor_indices):
                random_neighbors[j, :] = c_x_train[neighbor_index]

            # Choose a random weight vector (and then normalize it):
            # weight halves each time the distance doubles past the nearest.
            weights = np.exp(
                np.log(0.5) * random_neighbor_distances /
                nearest_neighbor_distance)
            weights /= np.sum(weights)

            # Compute the weighted soft-DTW barycenter with a gamma value
            # specific to the chosen neighbors.
            random_neighbors_gamma = gamma_soft_dtw(random_neighbors)
            generated_sample = softdtw_barycenter(
                random_neighbors, weights=weights,
                gamma=random_neighbors_gamma)
            synthetic_x_train.append(generated_sample)
            synthetic_y_train.append(c)

            # Repeat until the desired number of samples for this class.
            generated_samples += 1

    # return the synthetic set
    return np.array(synthetic_x_train), np.array(synthetic_y_train)