def _check_full_length(centroids):
    """Check that provided centroids are full-length (i.e. not padded with
    NaNs).

    If some centroids are found to be padded with NaNs, TimeSeriesResampler
    is used to resample the centroids.
    """
    resampler = TimeSeriesResampler(sz=centroids.shape[1])
    return resampler.fit_transform(centroids)
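# A minimal sketch of the behavior above, using only public tslearn calls;
# the toy centroids are hypothetical. to_time_series_dataset NaN-pads the
# shorter series, and resampling to the common size removes the padding.
import numpy as np
from tslearn.utils import to_time_series_dataset
from tslearn.preprocessing import TimeSeriesResampler

centroids = to_time_series_dataset([[1., 2., 3.], [1., 2., 3., 4.]])
print(np.isnan(centroids).any())  # True: the 3-point series is NaN-padded
full = _check_full_length(centroids)
print(np.isnan(full).any())       # False: both centroids now have length 4
print(full.shape)                 # (2, 4, 1)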
def tslearn_format_export(self, other_data=None):
    """Export the data attribute to the format used by tslearn for
    clustering.

    Parameters:
        other_data: optional dict to export instead of self.ss.get_data().

    Returns:
        None
    """
    df = []
    dn = []
    if self.ss.days:
        size_max = 170
    else:
        size_max = 750
    if other_data is not None:
        data_dict = other_data
    else:
        data_dict = self.ss.get_data()
    for k, v in data_dict.items():
        if not self.check_equal(v["Valeur"].values):
            if self.size_min < len(v["Valeur"].values) < size_max:
                df.append(v["Valeur"].values)
                dn.append(k)
                self.capteurs_names.append(k)
    df_set = to_time_series_dataset(df)
    if self.sampler != 0:
        df_set = TimeSeriesResampler(self.sampler).fit_transform(df_set)
    self.ts = df_set
    self.ts_name = dn
def fit(self, X):
    self._X_fit = to_time_series_dataset(X)
    self.weights = _set_weights(self.weights, self._X_fit.shape[0])
    if self.barycenter_ is None:
        if check_equal_size(self._X_fit):
            self.barycenter_ = EuclideanBarycenter.fit(self, self._X_fit)
        else:
            resampled_X = TimeSeriesResampler(
                sz=self._X_fit.shape[1]).fit_transform(self._X_fit)
            self.barycenter_ = EuclideanBarycenter.fit(self, resampled_X)
    if self.max_iter > 0:
        # The function works with vectors, so we need to vectorize
        # barycenter_.
        res = minimize(self._func, self.barycenter_.ravel(),
                       method=self.method, jac=True, tol=self.tol,
                       options=dict(maxiter=self.max_iter, disp=False))
        return res.x.reshape(self.barycenter_.shape)
    else:
        return self.barycenter_
def load_tslearn_data():
    """Time series data with variable length."""
    X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
    X_train = X_train[y_train < 4]  # Keep first 3 classes
    np.random.shuffle(X_train)
    # Keep only 50 time series
    X_train = TimeSeriesScalerMeanVariance().fit_transform(X_train[:50])
    # Make time series shorter
    X_train = TimeSeriesResampler(sz=40).fit_transform(X_train)
    X_train = X_train.reshape(50, -1)
    return X_train
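# Quick shape check for the loader above: 50 univariate series are kept,
# each resampled to length 40 and then flattened, so the result is (50, 40).
X = load_tslearn_data()
print(X.shape)  # (50, 40)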
def Preprocess(self, x=None):
    """Reshape the data into the form (batch, len(data) // time_span)."""
    if x is None:
        self.n_data = len(self.data) // self.time_span
        self.n_use = self.time_span * self.n_data
        ts = self.data.loc[:self.data.index[self.n_use - 1]]
        ts = np.array(ts.TEMPERATURE).reshape(1, -1)
        ts = TimeSeriesScalerMeanVariance().fit_transform(ts)
        ts = np.array(ts).reshape(self.n_data, -1)
        ts = TimeSeriesResampler(sz=self.batch).fit_transform(ts)
        self.ts = ts
    else:
        self.x_data = len(x) // self.time_span
        self.x_use = self.time_span * self.x_data
        ts = x.loc[:x.index[self.x_use - 1]]
        ts = np.array(ts.TEMPERATURE).reshape(1, -1)
        ts = TimeSeriesScalerMeanVariance().fit_transform(ts)
        ts = np.array(ts).reshape(self.x_data, -1)
        ts = TimeSeriesResampler(sz=self.batch).fit_transform(ts)
        return ts
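# Shape bookkeeping for Preprocess, with hypothetical settings
# time_span=24 and batch=12: a 100-row series gives n_data = 100 // 24 = 4
# chunks of 24 consecutive TEMPERATURE readings (the last 4 rows are left
# over for later handling), and each chunk is resampled to length 12,
# yielding a (4, 12, 1) dataset ready for clustering.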
def softdtw_barycenter(X, gamma=1.0, weights=None, method="L-BFGS-B",
                       tol=1e-3, max_iter=50, init=None):
    """Compute barycenter (time series averaging) under the soft-DTW
    geometry.

    Parameters
    ----------
    X : array-like, shape=(n_ts, sz, d)
        Time series dataset.
    gamma : float
        Regularization parameter.
        Lower is less smoothed (closer to true DTW).
    weights : None or array
        Weights of each X[i]. Must be the same size as len(X).
    method : string
        Optimization method, passed to `scipy.optimize.minimize`.
        Default: L-BFGS.
    tol : float
        Tolerance of the method used.
    max_iter : int
        Maximum number of iterations.

    Examples
    --------
    >>> time_series = [[1, 2, 3, 4], [1, 2, 4, 5]]
    >>> euc_bar = euclidean_barycenter(time_series)
    >>> stdw_bar = softdtw_barycenter(time_series, max_iter=0)
    >>> stdw_bar.shape
    (4, 1)
    >>> numpy.alltrue(numpy.abs(euc_bar - stdw_bar) < 1e-9)  # Because 0 iterations were performed
    True
    >>> softdtw_barycenter(time_series, max_iter=5).shape
    (4, 1)
    """
    X_ = to_time_series_dataset(X)
    weights = _set_weights(weights, X_.shape[0])
    if init is None:
        if check_equal_size(X_):
            barycenter = euclidean_barycenter(X_, weights)
        else:
            resampled_X = TimeSeriesResampler(sz=X_.shape[1]).fit_transform(X_)
            barycenter = euclidean_barycenter(resampled_X, weights)
    else:
        barycenter = init
    if max_iter > 0:
        f = lambda Z: _softdtw_func(Z, X_, weights, barycenter, gamma)
        # The function works with vectors so we need to vectorize barycenter.
        res = minimize(f, barycenter.ravel(), method=method, jac=True,
                       tol=tol, options=dict(maxiter=max_iter, disp=False))
        return res.x.reshape(barycenter.shape)
    else:
        return barycenter
def _fit_one_init(self, X, x_squared_norms, rs):
    n_ts, sz, d = time_series_dataset_shape(X)
    if check_equal_size(X):
        X_ = to_equal_sized_dataset(X)
    else:
        X_ = TimeSeriesResampler(sz=sz).fit_transform(X)
    self.cluster_centers_ = _k_init(
        X_.reshape((n_ts, -1)), self.n_clusters, x_squared_norms,
        rs).reshape((-1, sz, d))
    old_inertia = numpy.inf
    for it in range(self.max_iter):
        self._assign(X)
        if self.verbose:
            print("%.3f" % self.inertia_, end=" --> ")
        self._update_centroids(X)
        if numpy.abs(old_inertia - self.inertia_) < self.tol:
            break
        old_inertia = self.inertia_
    if self.verbose:
        print("")
    return self
def read_tsdata(rootpath, str1, str2):
    pdata = []
    labelsave = {}
    label = []
    # Read the data labels and store them
    for root, dirs, file in os.walk(rootpath):
        for files in file:
            if files.find(str2) >= 0:
                labelfile = open(rootpath + files)
                for line in labelfile:
                    labelstr = line.split(',')
                    labelsave[labelstr[0]] = labelstr[1].replace('\n', '')
                labelfile.close()
    # Read the data, resample it to a fixed number of dimensions, and store it
    for root, dirs, file in os.walk(rootpath):
        for files in file:
            if files.find(str1) >= 0:
                print(rootpath + files)
                a = np.loadtxt(rootpath + files)
                x1 = a[:, 1]
                x1 = smooth(x1)
                # Min-max normalization
                _range = np.max(x1) - np.min(x1)
                x1 = (x1 - np.min(x1)) / _range
                x1 = TimeSeriesResampler(sz=300).fit_transform(x1)
                x1 = x1.ravel()
                ax = plt.gca()
                ax.invert_yaxis()
                plt.plot(x1)
                plt.show()
                pdata.append(x1)
                label.append(labelsave[files])
    # sam = reduce(operator.add, sam)
    return pdata, label
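# Hypothetical call: under ./data/, file names containing "label" hold
# "name,label" lines and file names containing "data" hold the raw series
# (second column is the signal); both substrings are illustrative only.
pdata, label = read_tsdata("./data/", "data", "label")
print(len(pdata), len(label))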
def predict(self, x):
    ts = self.Preprocess(x=x)
    pred = self.km.predict(ts)
    cluster = []
    for i in range(self.x_data):
        list_item = [pred[i]] * self.time_span
        cluster.extend(list_item)
    # When data is left over, build a single time series with the
    # Resampler and predict on it.
    if self.x_use != len(x):
        self.x_c = x.loc[x.index[self.x_use]:]
        self.x_c = np.array(self.x_c.TEMPERATURE).reshape(1, -1)
        self.x_batch = TimeSeriesResampler(sz=self.batch).fit_transform(
            self.x_c)
        y_pred_c = [int(self.km.predict(self.x_batch))] * self.x_c.shape[1]
        cluster.extend(y_pred_c)
    x["CLUSTER"] = cluster
    self.draw_graph(x=x)
def run_time_series_kmeans(ts_data: pd.DataFrame, labels: list,
                           sample: int) -> pd.DataFrame:
    # Drop our geo cols
    features_df = ts_data.drop(columns=labels)
    geo_labels_df = ts_data.filter(labels)
    # tslearn TimeSeriesKMeans cluster
    ts_km = TimeSeriesKMeans(n_clusters=5, n_init=1, metric='dtw',
                             max_iter=5, max_iter_barycenter=5,
                             dtw_inertia=True)
    # Re-sample by year
    features_resampled = TimeSeriesResampler(
        sz=sample).fit_transform(features_df.values)
    dtw_predict = ts_km.fit_predict(features_resampled)
    # DTW predictions df
    dtw_predict_df = pd.DataFrame(dtw_predict,
                                  columns=['dtw_cluster_prediction'])
    # Merge cluster prediction to geo data
    geo_clusters_df = geo_labels_df.merge(dtw_predict_df, how='outer',
                                          left_index=True, right_index=True)
    geo_clusters_df = geo_clusters_df.astype(
        {'dtw_cluster_prediction': float})
    return geo_clusters_df
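# Hypothetical call with a toy frame: one geo column and three yearly
# observation columns, resampled to sz=3 (all names and values made up).
df = pd.DataFrame({
    'geo_id': ['a', 'b', 'c', 'd', 'e'],
    '2018': [1.0, 2.0, 3.0, 4.0, 5.0],
    '2019': [1.1, 2.2, 2.9, 4.2, 5.1],
    '2020': [0.9, 2.1, 3.1, 3.9, 4.8],
})
print(run_time_series_kmeans(df, labels=['geo_id'], sample=3))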
def classification(self):
    """Classify the data with KShape.

    Leftover data is bulked up with TimeSeriesResampler so it can be
    used as well. After classification, a CLUSTER column is added to
    self.data.
    """
    self.Preprocess()
    self.y_pred = self.km.fit_predict(self.ts)
    # Build the CLUSTER column
    self.cluster = []
    for i in range(self.n_data):
        list_item = [self.y_pred[i]] * self.time_span
        self.cluster.extend(list_item)
    # When data is left over, build a single time series with the
    # Resampler and predict on it.
    if self.n_use != len(self.data):
        self.ts_c = self.data.loc[self.data.index[self.n_use]:]
        self.ts_c = np.array(self.ts_c.TEMPERATURE).reshape(1, -1)
        self.ts_batch = TimeSeriesResampler(sz=self.batch).fit_transform(
            self.ts_c)
        self.y_pred_c = [int(self.km.predict(self.ts_batch))
                         ] * self.ts_c.shape[1]
        self.cluster.extend(self.y_pred_c)
    self.data["CLUSTER"] = self.cluster
print(ts_list)
# formatted_time_series = to_time_series(df['氨氮'])
# formatted_dataset = to_time_series_dataset([df['time'], df['氨氮']])
# print(formatted_time_series.shape)
# print(formatted_dataset.shape)

seed = 0
np.random.seed(seed)

my_first_time_series = [1, 3, 4, 2]
my_second_time_series = [1, 2, 3, 4]
my_third_time_series = [4, 3, 2, 1]
my_fourth_time_series = [2, 6, 8, 9, 20]
# formatted_dataset = to_time_series_dataset(
#     [my_first_time_series, my_second_time_series,
#      my_third_time_series, my_fourth_time_series])
formatted_dataset = to_time_series_dataset(ts_list)

X_train = formatted_dataset
X_train = TimeSeriesScalerMeanVariance().fit_transform(X_train)
X_train = TimeSeriesResampler(sz=80).fit_transform(X_train)
sz = X_train.shape[1]

# Keep the number of clusters with the best silhouette score
best_score = 0.0
best_n_cluster = None
best_y_pred = None
best_cluster_centers_ = None
for i in np.arange(2, 7):
    sdtw_km = TimeSeriesKMeans(n_clusters=i, metric="softdtw",
                               metric_params={"gamma": .01}, verbose=True,
                               random_state=seed)
    y_pred = sdtw_km.fit_predict(X_train)
    score = ss(X_train, sdtw_km.labels_, metric='softdtw')
    if score > best_score:
        best_score = score
        best_n_cluster = i
        best_y_pred = y_pred
        best_cluster_centers_ = sdtw_km.cluster_centers_
              optimizer=simple_adam, metrics=['accuracy'])

# Train the model
b_size = 2
max_epochs = 50
print("Starting training ")
h = model.fit(np.array(data), np.array(label), batch_size=b_size,
              epochs=max_epochs, shuffle=True, verbose=1)
print("Training finished \n")

# Make up an arbitrary time series as a quick test
unknown = np.array(
    [[0.1, 0.2, 0.5, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]],
    dtype=np.float32)
# unknown = np.array([[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.6, 0.6, 0.6, 0.5,
#                      0.4, 0.3, 0.2, 0.1, 0.1]], dtype=np.float32)

# Resample to 300 points and reshape so it can be fed to the model
un = TimeSeriesResampler(sz=300).fit_transform(unknown)
un = un.ravel()
s = []
s.append(un)

# Predict
predicted = model.predict(np.array(s))
print(predicted)
def exec_ts_resampler(X, size):
    data = TimeSeriesResampler(sz=size).fit_transform(X)
    return data
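# Example usage on a toy ragged dataset (values hypothetical): the two
# series are NaN-padded by to_time_series_dataset, then both are brought
# to a common length of 10.
from tslearn.utils import to_time_series_dataset

X = to_time_series_dataset([[1, 2, 3], [1, 2, 3, 4, 5]])
print(exec_ts_resampler(X, size=10).shape)  # (2, 10, 1)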
def _init_avg(self, X):
    if X[0].shape[0] == self.barycenter_size and check_equal_size(X):
        return X.mean(axis=0)
    else:
        X_ = TimeSeriesResampler(
            sz=self.barycenter_size).fit_transform(X)
        return X_.mean(axis=0)
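# A minimal sketch of the ragged-input fallback above, assuming a
# barycenter size of 4 (toy values): every series is resampled to that
# length and the barycenter is initialized as their element-wise mean.
X = to_time_series_dataset([[1., 2., 3.], [1., 2., 3., 4., 5.]])
X_ = TimeSeriesResampler(sz=4).fit_transform(X)  # shape (2, 4, 1)
init_avg = X_.mean(axis=0)                       # shape (4, 1)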
def pca_show(origin_data, labels, num_cluster):
    # Unify the lengths of the time series set
    no_nan = origin_data.dropna(axis='columns')
    min_len = len(no_nan.columns)

    # Collect the minimum length of each time series set in a list
    # min_lens = []
    # data_len = len(data)
    # for i in range(0, data_len):
    #     min = len(data[i][0])
    #     for j in range(0, len(data[i])):
    #         if len(data[i][j]) < min:
    #             min = len(data[i][j])
    #     min_lens.append(min)

    # Unify the lengths of the time series set
    # result_re = []
    # for i in range(0, data_len):
    #     result_ = TimeSeriesResampler(sz=min_lens[i]).fit_transform(data[i])
    #     result_ = result_.reshape(len(data[i]), min_lens[i])
    #     result_re.append(result_)

    result_ = TimeSeriesResampler(sz=min_len).fit_transform(origin_data)
    result_ = result_.reshape(result_.shape[0], min_len)

    # Normalize the numeric variables
    result_norm = Standard(pd.DataFrame(result_))
    # result_norm = []
    # for i in range(0, data_len):
    #     norm = StandardScaler().fit_transform(result_re[i])
    #     result_norm.append(norm)

    # Run PCA (create a PCA object with 2 principal components)
    pca = PCA(n_components=2)
    result_pca = pca.fit_transform(result_norm)
    # pca = PCA(n_components=2)
    # result_pca = []
    # for i in range(0, data_len):
    #     pca_ = pca.fit_transform(result_norm[i])
    #     result_pca.append(pca_)

    data = []
    data_outlier = [[]]
    for i in range(num_cluster):
        data.append([])
    list_value = result_pca.tolist()
    for i in range(len(labels)):
        if labels[i] != -1:
            data[labels[i]].append(list_value[i])
        else:
            data_outlier[0].append(list_value[i])

    # Draw the graph
    fig = go.Figure()
    data_np = np.array(data, dtype=object)
    data_outlier_np = np.array(data_outlier, dtype=object)
    for i in range(0, num_cluster):
        fig.add_trace(
            go.Scatter(x=[dt[0] for dt in data_np[i]],
                       y=[dt[1] for dt in data_np[i]],
                       mode='markers',
                       name='Cluster' + str(i + 1)))
    if data_outlier_np[0].shape[0] > 0:
        fig.add_trace(
            go.Scatter(x=[dt[0] for dt in data_outlier_np[0]],
                       y=[dt[1] for dt in data_outlier_np[0]],
                       mode='markers',
                       name='Outlier'))
    graph = html.Div(
        style={},
        children=[
            html.Div(["2-DIM VISUALIZATION"], className='subtitle'),
            html.Div([html.Div(dcc.Graph(id='pca_show', figure=fig))])
        ])
    return graph
def softdtw_barycenter(X, gamma=1.0, weights=None, method="L-BFGS-B",
                       tol=1e-3, max_iter=50, init=None):
    """Compute barycenter (time series averaging) under the soft-DTW
    geometry.

    Soft-DTW was originally presented in [1]_.

    Parameters
    ----------
    X : array-like, shape=(n_ts, sz, d)
        Time series dataset.
    gamma : float
        Regularization parameter.
        Lower is less smoothed (closer to true DTW).
    weights : None or array
        Weights of each X[i]. Must be the same size as len(X).
        If None, uniform weights are used.
    method : string
        Optimization method, passed to `scipy.optimize.minimize`.
        Default: L-BFGS.
    tol : float
        Tolerance of the method used.
    max_iter : int
        Maximum number of iterations.
    init : array or None (default: None)
        Initial barycenter to start from for the optimization process.
        If `None`, the euclidean barycenter is used as a starting point.

    Returns
    -------
    numpy.array of shape (bsz, d) where `bsz` is the size of the `init` \
            array if provided or `sz` otherwise
        Soft-DTW barycenter of the provided time series dataset.

    Examples
    --------
    >>> time_series = [[1, 2, 3, 4], [1, 2, 4, 5]]
    >>> softdtw_barycenter(time_series, max_iter=5)
    array([[1.25161574],
           [2.03821705],
           [3.5101956 ],
           [4.36140605]])
    >>> time_series = [[1, 2, 3, 4], [1, 2, 3, 4, 5]]
    >>> softdtw_barycenter(time_series, max_iter=5)
    array([[1.21349933],
           [1.8932251 ],
           [2.67573269],
           [3.51057026],
           [4.33645802]])

    References
    ----------
    .. [1] M. Cuturi, M. Blondel "Soft-DTW: a Differentiable Loss Function
       for Time-Series," ICML 2017.
    """
    X_ = to_time_series_dataset(X)
    weights = _set_weights(weights, X_.shape[0])
    if init is None:
        if check_equal_size(X_):
            barycenter = euclidean_barycenter(X_, weights)
        else:
            resampled_X = TimeSeriesResampler(sz=X_.shape[1]).fit_transform(X_)
            barycenter = euclidean_barycenter(resampled_X, weights)
    else:
        barycenter = init
    if max_iter > 0:
        X_ = numpy.array([to_time_series(d, remove_nans=True) for d in X_])

        def f(Z):
            return _softdtw_func(Z, X_, weights, barycenter, gamma)

        # The function works with vectors so we need to vectorize barycenter.
        res = minimize(f, barycenter.ravel(), method=method, jac=True,
                       tol=tol, options=dict(maxiter=max_iter, disp=False))
        return res.x.reshape(barycenter.shape)
    else:
        return barycenter
# License: BSD 3 clause

import numpy
import matplotlib.pyplot as plt

from tslearn.clustering import TimeSeriesKMeans
from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance, \
    TimeSeriesResampler

seed = 0
numpy.random.seed(seed)
X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
X_train = X_train[y_train < 4]  # Keep first 3 classes
numpy.random.shuffle(X_train)
# Keep only 50 time series
X_train = TimeSeriesScalerMeanVariance().fit_transform(X_train[:50])
# Make time series shorter
X_train = TimeSeriesResampler(sz=40).fit_transform(X_train)
sz = X_train.shape[1]

# Euclidean k-means
print("Euclidean k-means")
km = TimeSeriesKMeans(n_clusters=3, verbose=True, random_state=seed)
y_pred = km.fit_predict(X_train)

plt.figure()
for yi in range(3):
    plt.subplot(3, 3, yi + 1)
    for xx in X_train[y_pred == yi]:
        plt.plot(xx.ravel(), "k-", alpha=.2)
    plt.plot(km.cluster_centers_[yi].ravel(), "r-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
data_path = "./"
out_path = "./"
# data_path = "../data/"
# out_path = "../outputs/"

seed = 0
np.random.seed(seed)

# df = pd.read_hdf("filtered_data.hdf5", key="zeal")
xtrain = pickle.load(open(data_path + "training_data.pck", "rb"))
ytrain = pickle.load(open(data_path + "training_labels.pck", "rb"))

# x_train = TimeSeriesScalerMinMax().fit_transform(xtrain[:260])  # shapes comparison
x_train = TimeSeriesScalerMeanVariance().fit_transform(xtrain[:500])  # variance comparison
x_train = TimeSeriesResampler(sz=500).fit_transform(x_train)
sz = x_train.shape[1]

print("DBA k-means")
dba_km = TimeSeriesKMeans(n_clusters=10,
                          n_init=1,
                          metric="dtw",
                          verbose=True,
                          max_iter_barycenter=10,
                          random_state=seed)
y_pred = dba_km.fit_predict(x_train)

plt.figure()
for yi in range(10):
    plt.subplot(10, 1, yi + 1)