Example #1
def _check_full_length(centroids):
    """Check that provided centroids are full-length (ie. not padded with
    nans).

    If some centroids are found to be padded with nans, TimeSeriesResampler is
    used to resample the centroids.
    """
    resampler = TimeSeriesResampler(sz=centroids.shape[1])
    return resampler.fit_transform(centroids)
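A minimal usage sketch, assuming the helper above is in scope: `to_time_series_dataset` pads ragged input with nans, and the helper resamples that padding away.

import numpy as np
from tslearn.utils import to_time_series_dataset

# Ragged input: the shorter series gets nan-padded by to_time_series_dataset.
centroids = to_time_series_dataset([[1., 2., 3., 4.], [1., 3., 4.]])
print(np.isnan(centroids).any())      # True: padding present

full = _check_full_length(centroids)  # every centroid resampled to sz=4
print(np.isnan(full).any())           # False: padding gone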
Example #2
    def tslearn_format_export(self, other_data=None):
        """
        Export the data attribute to the format used by tslearn for clustering.

        Parameters:
            NA

        Returns:
            NA
        """
        df = []
        dn = []
        if self.ss.days:
            size_max = 170
        else:
            size_max = 750
        if other_data is not None:
            data_dict = other_data
        else:
            data_dict = self.ss.get_data()
        for k, v in data_dict.items():
            if not self.check_equal(v["Valeur"].values):
                if len(v["Valeur"].values) > self.size_min and len(
                        v["Valeur"].values) < size_max:
                    df.append(v["Valeur"].values)
                    dn.append(k)
                    self.capteurs_names.append(k)
        df_set = to_time_series_dataset(df)
        if self.sampler != 0:
            df_set = TimeSeriesResampler(self.sampler).fit_transform(df_set)
        self.ts = df_set
        self.ts_name = dn
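For context, a stand-alone sketch of what this export produces; the resampling size of 100 is an assumption, not a value taken from the class above.

from tslearn.utils import to_time_series_dataset
from tslearn.preprocessing import TimeSeriesResampler

series = [[1., 2., 3.], [4., 5., 6., 7., 8.]]
df_set = to_time_series_dataset(series)
print(df_set.shape)                    # (2, 5, 1): nan-padded to the longest
df_set = TimeSeriesResampler(100).fit_transform(df_set)
print(df_set.shape)                    # (2, 100, 1)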
Example #3
        def fit(self, X):
            self._X_fit = to_time_series_dataset(X)
            self.weights = _set_weights(self.weights, self._X_fit.shape[0])
            if self.barycenter_ is None:
                if check_equal_size(self._X_fit):
                    self.barycenter_ = EuclideanBarycenter.fit(
                        self, self._X_fit)
                else:
                    resampled_X = TimeSeriesResampler(
                        sz=self._X_fit.shape[1]).fit_transform(self._X_fit)
                    self.barycenter_ = EuclideanBarycenter.fit(
                        self, resampled_X)

            if self.max_iter > 0:
                # The function works with vectors so we need to vectorize
                # barycenter_.
                res = minimize(self._func,
                               self.barycenter_.ravel(),
                               method=self.method,
                               jac=True,
                               tol=self.tol,
                               options=dict(maxiter=self.max_iter, disp=False))
                return res.x.reshape(self.barycenter_.shape)
            else:
                return self.barycenter_
Example #4
import numpy as np
from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance, TimeSeriesResampler

def load_tslearn_data():
    """ Load Trace time series, scale them, resample to length 40, and flatten """
    X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
    X_train = X_train[y_train < 4]  # Keep first 3 classes
    np.random.shuffle(X_train)
    X_train = TimeSeriesScalerMeanVariance().fit_transform(X_train[:50])  # Keep only 50 time series
    X_train = TimeSeriesResampler(sz=40).fit_transform(X_train)  # Make time series shorter
    X_train = X_train.reshape(50,-1)
    return X_train
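A usage sketch, assuming the imports added above are available:

X = load_tslearn_data()
print(X.shape)   # (50, 40): 50 scaled series, each resampled to 40 points, flattened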
Example #5
 def Preprocess(self, x=None):
     """
     Reshape the data into shape (batch, len(data)//time_span).
     """
     if x is None:
         self.n_data = len(self.data) // self.time_span
         self.n_use = self.time_span * self.n_data
         ts = self.data.loc[:self.data.index[self.n_use - 1]]
         ts = np.array(ts.TEMPERATURE).reshape(1, -1)
         ts = TimeSeriesScalerMeanVariance().fit_transform(ts)
         ts = np.array(ts).reshape(self.n_data, -1)
         ts = TimeSeriesResampler(sz=self.batch).fit_transform(ts)
         self.ts = ts
     else:
         self.x_data = len(x) // self.time_span
         self.x_use = self.time_span * self.x_data
         ts = x.loc[:x.index[self.x_use - 1]]
         ts = np.array(ts.TEMPERATURE).reshape(1, -1)
         ts = TimeSeriesScalerMeanVariance().fit_transform(ts)
         ts = np.array(ts).reshape(self.x_data, -1)
         ts = TimeSeriesResampler(sz=self.batch).fit_transform(ts)
         return ts
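The same pipeline as a stand-alone sketch on synthetic data; `time_span=24` and `batch=12` are assumed values for the instance attributes.

import numpy as np
from tslearn.preprocessing import TimeSeriesScalerMeanVariance, TimeSeriesResampler

time_span, batch = 24, 12                     # assumed attribute values
raw = np.sin(np.linspace(0, 50, 250))         # 250 points -> 10 full windows
n_data = len(raw) // time_span
ts = raw[:n_data * time_span].reshape(1, -1)  # drop the 10 leftover points
ts = TimeSeriesScalerMeanVariance().fit_transform(ts)
ts = np.array(ts).reshape(n_data, -1)         # (10, 24)
ts = TimeSeriesResampler(sz=batch).fit_transform(ts)
print(ts.shape)                               # (10, 12, 1)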
Example #6
def softdtw_barycenter(X, gamma=1.0, weights=None, method="L-BFGS-B", tol=1e-3, max_iter=50, init=None):
    """Compute barycenter (time series averaging) under the soft-DTW geometry.

    Parameters
    ----------
    X : array-like, shape=(n_ts, sz, d)
        Time series dataset.
    gamma: float
        Regularization parameter.
        Lower is less smoothed (closer to true DTW).
    weights: None or array
        Weights of each X[i]. Must be the same size as len(X).
    method: string
        Optimization method, passed to `scipy.optimize.minimize`.
        Default: L-BFGS.
    tol: float
        Tolerance of the method used.
    max_iter: int
        Maximum number of iterations.
    init: array or None (default: None)
        Initial barycenter to start from for the optimization process.
        If `None`, euclidean barycenter is used as a starting point.

    Examples
    --------
    >>> time_series = [[1, 2, 3, 4], [1, 2, 4, 5]]
    >>> euc_bar = euclidean_barycenter(time_series)
    >>> stdw_bar = softdtw_barycenter(time_series, max_iter=0)
    >>> stdw_bar.shape
    (4, 1)
    >>> numpy.alltrue(numpy.abs(euc_bar - stdw_bar) < 1e-9)  # Because 0 iterations were performed
    True
    >>> softdtw_barycenter(time_series, max_iter=5).shape
    (4, 1)
    """
    X_ = to_time_series_dataset(X)
    weights = _set_weights(weights, X_.shape[0])
    if init is None:
        if check_equal_size(X_):
            barycenter = euclidean_barycenter(X_, weights)
        else:
            resampled_X = TimeSeriesResampler(sz=X_.shape[1]).fit_transform(X_)
            barycenter = euclidean_barycenter(resampled_X, weights)
    else:
        barycenter = init

    if max_iter > 0:
        f = lambda Z: _softdtw_func(Z, X_, weights, barycenter, gamma)
        # The function works with vectors so we need to vectorize barycenter.
        res = minimize(f, barycenter.ravel(), method=method, jac=True, tol=tol,
                       options=dict(maxiter=max_iter, disp=False))
        return res.x.reshape(barycenter.shape)
    else:
        return barycenter
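A usage sketch through tslearn's public module (assuming the `tslearn.barycenters` layout), showing the `weights` parameter documented above:

import numpy
from tslearn.barycenters import softdtw_barycenter

series = [[1., 2., 3., 4.], [0., 3., 4., 5.], [2., 2., 3., 5.]]
# Weight the first series twice as heavily as the other two.
bar = softdtw_barycenter(series, gamma=0.5,
                         weights=numpy.array([2., 1., 1.]))
print(bar.shape)   # (4, 1)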
Example #7
    def _fit_one_init(self, X, x_squared_norms, rs):
        n_ts, sz, d = time_series_dataset_shape(X)
        if check_equal_size(X):
            X_ = to_equal_sized_dataset(X)
        else:
            X_ = TimeSeriesResampler(sz=sz).fit_transform(X)
        self.cluster_centers_ = _k_init(X_.reshape(
            (n_ts, -1)), self.n_clusters, x_squared_norms, rs).reshape(
                (-1, sz, d))
        old_inertia = numpy.inf

        for it in range(self.max_iter):
            self._assign(X)
            if self.verbose:
                print("%.3f" % self.inertia_, end=" --> ")
            self._update_centroids(X)

            if numpy.abs(old_inertia - self.inertia_) < self.tol:
                break
            old_inertia = self.inertia_
        if self.verbose:
            print("")

        return self
Example #8
def read_tsdata(rootpath, str1, str2):
    pdata = []
    labelsave = {}
    label = []
    # Read the data labels and store them
    for root, dirs, file in os.walk(rootpath):
        for files in file:
            if files.find(str2) >= 0:
                labelfile = open(rootpath + files)
                for line in labelfile:
                    labelstr = line.split(',')
                    labelsave[labelstr[0]] = labelstr[1].replace('\n', '')
                labelfile.close()

    # Read the data, resample to a fixed length, and store
    for root, dirs, file in os.walk(rootpath):
        for files in file:
            if files.find(str1) >= 0:
                print(rootpath + files)
                a = np.loadtxt(rootpath + files)
                x1 = a[:, 1]
                x1 = smooth(x1)
                _range = np.max(x1) - np.min(x1)
                x1 = (x1 - np.min(x1)) / _range
                x1 = TimeSeriesResampler(sz=300).fit_transform(x1)
                x1 = x1.ravel()

                ax = plt.gca()
                ax.invert_yaxis()
                plt.plot(x1)
                plt.show()
                pdata.append(x1)
                label.append(labelsave[files])

    # sam = reduce(operator.add, sam)
    return pdata, label
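The per-file core of this loop, sketched on synthetic data (no file I/O, and the `smooth` step is omitted since it is not defined here):

import numpy as np
from tslearn.preprocessing import TimeSeriesResampler

x1 = np.sin(np.linspace(0, 6, 120))                 # stand-in for a[:, 1]
x1 = (x1 - np.min(x1)) / (np.max(x1) - np.min(x1))  # min-max scale to [0, 1]
x1 = TimeSeriesResampler(sz=300).fit_transform(x1).ravel()
print(x1.shape)   # (300,)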
Example #9
 def predict(self, x):
     ts = self.Preprocess(x=x)
     pred = self.km.predict(ts)
     cluster = []
     for i in range(self.x_data):
         list_item = [pred[i]] * self.time_span
         cluster.extend(list_item)
     # If data is left over, build a single time series with the Resampler and predict on it.
     if self.x_use != len(x):
         self.x_c = x.loc[x.index[self.x_use]:]
         self.x_c = np.array(self.x_c.TEMPERATURE).reshape(1, -1)
         self.x_batch = TimeSeriesResampler(sz=self.batch).fit_transform(
             self.x_c)
         y_pred_c = [int(self.km.predict(self.x_batch))] * self.x_c.shape[1]
         cluster.extend(y_pred_c)
     x["CLUSTER"] = cluster
     self.draw_graph(x=x)
Example #10
def run_time_series_kmeans(ts_data: pd.DataFrame, labels: list, sample: int) -> pd.DataFrame:
    # drop our geo cols
    features_df = ts_data.drop(columns=labels)
    geo_labels_df = ts_data.filter(labels)

    # tslearn TimeSeriesKMeans cluster
    ts_km = TimeSeriesKMeans(n_clusters=5, n_init=1, metric='dtw', max_iter=5, max_iter_barycenter=5, dtw_inertia=True)

    # re-sample by year
    features_resampled = TimeSeriesResampler(sz=sample).fit_transform(features_df.values)
    dtw_predict = ts_km.fit_predict(features_resampled)

    # DTW predictions df
    dtw_predict_df = pd.DataFrame(dtw_predict, columns=['dtw_cluster_prediction'])

    # merge cluster prediction to geo data
    geo_clusters_df = geo_labels_df.merge(dtw_predict_df, how='outer', left_index=True, right_index=True)
    geo_clusters_df = geo_clusters_df.astype({'dtw_cluster_prediction': float})
    return geo_clusters_df
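A usage sketch with a toy frame; the column names are hypothetical, and imports of pandas plus the tslearn names used above are assumed.

import pandas as pd

df = pd.DataFrame({
    "region": ["a", "b", "c", "d", "e"],   # geo label columns (hypothetical)
    "geoid":  [1, 2, 3, 4, 5],
    "2018":   [1., 2., 3., 4., 5.],        # yearly feature columns
    "2019":   [2., 3., 4., 5., 6.],
    "2020":   [1., 1., 2., 2., 3.],
})
out = run_time_series_kmeans(df, labels=["region", "geoid"], sample=3)
print(out.columns.tolist())   # ['region', 'geoid', 'dtw_cluster_prediction']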
Example #11
 def classification(self):
     """
     KShape で分類する。
     使わなかったデータは、TimeSeriesResampler でかさ増しして使う
     分類後に、self.data にcluster 列を作る
     """
     self.Preprocess()
     self.y_pred = self.km.fit_predict(self.ts)
     # Build the cluster column
     self.cluster = []
     for i in range(self.n_data):
         list_item = [self.y_pred[i]] * self.time_span
         self.cluster.extend(list_item)
     # If data is left over, build a single time series with the Resampler and predict on it.
     if self.n_use != len(self.data):
         self.ts_c = self.data.loc[self.data.index[self.n_use]:]
         self.ts_c = np.array(self.ts_c.TEMPERATURE).reshape(1, -1)
         self.ts_batch = TimeSeriesResampler(sz=self.batch).fit_transform(
             self.ts_c)
         self.y_pred_c = [int(self.km.predict(self.ts_batch))
                          ] * self.ts_c.shape[1]
         self.cluster.extend(self.y_pred_c)
     self.data["CLUSTER"] = self.cluster
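A stand-alone sketch of that leftover handling: with 250 points and an assumed time_span of 24, ten points remain unused, and they are stretched into a single batch-sized series.

import numpy as np
from tslearn.preprocessing import TimeSeriesResampler

raw = np.arange(250, dtype=float)
n_use = (len(raw) // 24) * 24                  # 240 points used by full windows
leftover = raw[n_use:].reshape(1, -1)          # shape (1, 10)
ts_batch = TimeSeriesResampler(sz=12).fit_transform(leftover)
print(ts_batch.shape)                          # (1, 12, 1): one series to predict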
Example #12
import numpy as np
from tslearn.utils import to_time_series_dataset
from tslearn.preprocessing import TimeSeriesScalerMeanVariance, TimeSeriesResampler
from tslearn.clustering import TimeSeriesKMeans, silhouette_score as ss

print(ts_list)
# formatted_time_series = to_time_series(df['氨氮'])
# formatted_dataset = to_time_series_dataset([df['time'], df['氨氮']])
# print(formatted_time_series.shape)
# print(formatted_dataset.shape)
seed = 0
np.random.seed(seed)
my_first_time_series = [1, 3, 4, 2]
my_second_time_series = [1, 2, 3, 4]
my_third_time_series = [4, 3, 2, 1]
my_fourth_time_series = [2, 6, 8, 9, 20]
# formatted_dataset = to_time_series_dataset([my_first_time_series, my_second_time_series, my_third_time_series, my_fourth_time_series])
formatted_dataset = to_time_series_dataset(ts_list)
X_train = formatted_dataset
X_train = TimeSeriesScalerMeanVariance().fit_transform(X_train)
X_train = TimeSeriesResampler(sz=80).fit_transform(X_train)
sz = X_train.shape[1]
best_score = 0.0
best_n_cluster = None
best_y_pred = None
best_cluster_centers_ = None
for i in np.arange(2, 7):
    sdtw_km = TimeSeriesKMeans(n_clusters=i,
                               metric="softdtw",
                               metric_params={"gamma": .01},
                               verbose=True,
                               random_state=seed)
    y_pred = sdtw_km.fit_predict(X_train)
    score = ss(X_train, sdtw_km.labels_, metric='softdtw')
    if score > best_score:
        best_score = score
        best_n_cluster = i
        best_y_pred = y_pred
        best_cluster_centers_ = sdtw_km.cluster_centers_
Example #13
              optimizer=simple_adam,
              metrics=['accuracy'])

# Train the model
b_size = 2
max_epochs = 50
print("Starting training ")
h = model.fit(np.array(data),
              np.array(label),
              batch_size=b_size,
              epochs=max_epochs,
              shuffle=True,
              verbose=1)
print("Training finished \n")

# Make up an arbitrary time series to test with
unknown = np.array(
    [[0.1, 0.2, 0.5, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]],
    dtype=np.float32)
# unknown = np.array([[0.1,0.2,0.3,0.4,0.5,0.6,0.6,0.6,0.6,0.5,0.4,0.3,0.2,0.1,0.1]], dtype=np.float32)

# Resample to 300 points and reshape so it can be fed to the model
un = TimeSeriesResampler(sz=300).fit_transform(unknown)
un = un.ravel()
s = []
s.append(un)

# Predict
predicted = model.predict(np.array(s))
print(predicted)
Example #14
from tslearn.preprocessing import TimeSeriesResampler

def exec_ts_resampler(X, size):
    data = TimeSeriesResampler(sz=size).fit_transform(X)
    return data
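A usage sketch: a ragged dataset goes in, an equal-length resampled one comes out.

import numpy as np
from tslearn.utils import to_time_series_dataset

X = to_time_series_dataset([[0., 1., 2.], [0., 2., 4., 6.]])
print(exec_ts_resampler(X, size=5).shape)   # (2, 5, 1)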
Example #15
 def _init_avg(self, X):
     if X[0].shape[0] == self.barycenter_size and check_equal_size(X):
         return X.mean(axis=0)
     else:
         X_ = TimeSeriesResampler(sz=self.barycenter_size).fit_transform(X)
         return X_.mean(axis=0)
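An equivalent free-standing computation, with `barycenter_size = 4` as an assumed value:

from tslearn.preprocessing import TimeSeriesResampler
from tslearn.utils import to_time_series_dataset

X = to_time_series_dataset([[1., 2., 3.], [2., 3., 4., 5.]])
X_ = TimeSeriesResampler(sz=4).fit_transform(X)
init_bar = X_.mean(axis=0)    # element-wise average, shape (4, 1)
print(init_bar.shape)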
Example #16
def pca_show(origin_data, labels, num_cluster):
    # Unify the lengths of the time series set
    trimmed = origin_data.dropna(axis='columns')
    min_len = len(trimmed.columns)
    # Collect the minimum length of each time series set in a list
    # min_lens=[]
    # data_len=len(data)
    # for i in range(0,data_len):
    #     min=len(data[i][0])
    #     for j in range(0,len(data[i])):
    #         if len(data[i][j]) < min:
    #             min = len(data[i][j])
    #     min_lens.append(min)

    # Unify the lengths of the time series set
    # result_re=[]
    # for i in range(0,data_len):
    #     result_ = TimeSeriesResampler(sz=min_lens[i]).fit_transform(data[i])
    #     result_=result_.reshape(len(data[i]),min_lens[i])
    #     result_re.append(result_)
    result_ = TimeSeriesResampler(sz=min_len).fit_transform(origin_data)
    result_ = result_.reshape(result_.shape[0], min_len)

    result_norm = Standard(pd.DataFrame(result_))
    # Normalize numeric variables
    # result_norm=[]
    # for i in range(0, data_len):
    #     norm = StandardScaler().fit_transform(result_re[i])
    #     result_norm.append(norm)

    # Run principal component analysis
    pca = PCA(n_components=2)  # Create a PCA object (2 principal components)
    result_pca = pca.fit_transform(result_norm)
    # Run principal component analysis
    # Create a PCA object (2 principal components)
    # pca = PCA(n_components=2)
    # result_pca=[]
    # for i in range(0,data_len):
    #     pca_ = pca.fit_transform(result_norm[i])
    #     result_pca.append(pca_)
    data = []
    data_outlier = [[]]
    for i in range(num_cluster):
        data.append([])
    list_value = result_pca.tolist()
    for i in range(len(labels)):
        if labels[i] != -1:
            data[labels[i]].append(list_value[i])
        else:
            data_outlier[0].append(list_value[i])
    # Draw the graph
    fig = go.Figure()
    data_np = np.array(data, dtype=object)
    data_outlier_np = np.array(data_outlier, dtype=object)

    for i in range(0, num_cluster):
        fig.add_trace(
            go.Scatter(x=[dt[0] for dt in data_np[i]],
                       y=[dt[1] for dt in data_np[i]],
                       mode='markers',
                       name='Cluster' + str(i + 1)))
    if data_outlier_np[0].shape[0] > 0:
        fig.add_trace(
            go.Scatter(x=[dt[0] for dt in data_outlier_np[0]],
                       y=[dt[1] for dt in data_outlier_np[0]],
                       mode='markers',
                       name='Outlier'))
    graph = html.Div(style={},
                     children=[
                         html.Div(["2-DIM VISUALIZATION"],
                                  className='subtitle'),
                         html.Div(
                             [html.Div(dcc.Graph(id='pca_show', figure=fig))])
                     ])

    return graph
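The resample-reshape-PCA core of the function above, sketched on synthetic data (the `Standard` scaler and the plotly/dash parts are omitted):

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from tslearn.preprocessing import TimeSeriesResampler

origin_data = pd.DataFrame(np.random.rand(6, 30))
min_len = len(origin_data.dropna(axis='columns').columns)
result_ = TimeSeriesResampler(sz=min_len).fit_transform(origin_data.values)
result_ = result_.reshape(result_.shape[0], min_len)
coords = PCA(n_components=2).fit_transform(result_)
print(coords.shape)   # (6, 2)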
Example #17
def softdtw_barycenter(X,
                       gamma=1.0,
                       weights=None,
                       method="L-BFGS-B",
                       tol=1e-3,
                       max_iter=50,
                       init=None):
    """Compute barycenter (time series averaging) under the soft-DTW [1]
    geometry.

    Soft-DTW was originally presented in [1]_.

    Parameters
    ----------
    X : array-like, shape=(n_ts, sz, d)
        Time series dataset.
    gamma: float
        Regularization parameter.
        Lower is less smoothed (closer to true DTW).
    weights: None or array
        Weights of each X[i]. Must be the same size as len(X).
        If None, uniform weights are used.
    method: string
        Optimization method, passed to `scipy.optimize.minimize`.
        Default: L-BFGS.
    tol: float
        Tolerance of the method used.
    max_iter: int
        Maximum number of iterations.
    init: array or None (default: None)
        Initial barycenter to start from for the optimization process.
        If `None`, euclidean barycenter is used as a starting point.

    Returns
    -------
    numpy.array of shape (bsz, d) where `bsz` is the size of the `init` array \
            if provided or `sz` otherwise
        Soft-DTW barycenter of the provided time series dataset.

    Examples
    --------
    >>> time_series = [[1, 2, 3, 4], [1, 2, 4, 5]]
    >>> softdtw_barycenter(time_series, max_iter=5)
    array([[1.25161574],
           [2.03821705],
           [3.5101956 ],
           [4.36140605]])
    >>> time_series = [[1, 2, 3, 4], [1, 2, 3, 4, 5]]
    >>> softdtw_barycenter(time_series, max_iter=5)
    array([[1.21349933],
           [1.8932251 ],
           [2.67573269],
           [3.51057026],
           [4.33645802]])

    References
    ----------
    .. [1] M. Cuturi, M. Blondel "Soft-DTW: a Differentiable Loss Function for
       Time-Series," ICML 2017.
    """
    X_ = to_time_series_dataset(X)
    weights = _set_weights(weights, X_.shape[0])
    if init is None:
        if check_equal_size(X_):
            barycenter = euclidean_barycenter(X_, weights)
        else:
            resampled_X = TimeSeriesResampler(sz=X_.shape[1]).fit_transform(X_)
            barycenter = euclidean_barycenter(resampled_X, weights)
    else:
        barycenter = init

    if max_iter > 0:
        X_ = numpy.array([to_time_series(d, remove_nans=True) for d in X_])

        def f(Z):
            return _softdtw_func(Z, X_, weights, barycenter, gamma)

        # The function works with vectors so we need to vectorize barycenter.
        res = minimize(f,
                       barycenter.ravel(),
                       method=method,
                       jac=True,
                       tol=tol,
                       options=dict(maxiter=max_iter, disp=False))
        return res.x.reshape(barycenter.shape)
    else:
        return barycenter
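A sketch of warm-starting from a user-supplied initial barycenter (values arbitrary); per the docstring above, the result keeps the shape of `init`.

import numpy

series = [[1., 2., 3., 4.], [1., 2., 4., 5.]]
init = numpy.array([[1.], [2.], [3.], [4.]])
bar = softdtw_barycenter(series, max_iter=5, init=init)
print(bar.shape)   # (4, 1): matches the shape of init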
Example #18
# License: BSD 3 clause

import numpy
import matplotlib.pyplot as plt

from tslearn.clustering import TimeSeriesKMeans
from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance, TimeSeriesResampler

seed = 0
numpy.random.seed(seed)
X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
X_train = X_train[y_train < 4]  # Keep first 3 classes
numpy.random.shuffle(X_train)
X_train = TimeSeriesScalerMeanVariance().fit_transform(X_train[:50])  # Keep only 50 time series
X_train = TimeSeriesResampler(sz=40).fit_transform(X_train)  # Make time series shorter
sz = X_train.shape[1]

# Euclidean k-means
print("Euclidean k-means")
km = TimeSeriesKMeans(n_clusters=3, verbose=True, random_state=seed)
y_pred = km.fit_predict(X_train)

plt.figure()
for yi in range(3):
    plt.subplot(3, 3, yi + 1)
    for xx in X_train[y_pred == yi]:
        plt.plot(xx.ravel(), "k-", alpha=.2)
    plt.plot(km.cluster_centers_[yi].ravel(), "r-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)

import pickle
import numpy as np

data_path = "./"
out_path = "./"

# data_path = "../data/"
# out_path = "../outputs/"

seed = 0
np.random.seed(seed)

# df = pd.read_hdf("filtered_data.hdf5", key="zeal")
xtrain = pickle.load(open(data_path + "training_data.pck","rb"))
ytrain = pickle.load(open(data_path + "training_labels.pck","rb"))

# x_train = TimeSeriesScalerMinMax().fit_transform(xtrain[:260]) #shapes comparison
x_train = TimeSeriesScalerMeanVariance().fit_transform(xtrain[:500]) #variance comparison
x_train = TimeSeriesResampler(sz=500).fit_transform(x_train)
sz = x_train.shape[1]

print("DBA k-means")
dba_km = TimeSeriesKMeans(n_clusters=10,
                          n_init=1,
                          metric="dtw",
                          verbose=True,
                          max_iter_barycenter=10,
                          random_state=seed)

y_pred = dba_km.fit_predict(x_train)

plt.figure()
for yi in range(10):
    plt.subplot(10, 1, yi+1)