Exemplo n.º 1
0
    def _precompute_cross_dist(self, X, other_X=None):
        """Compute the cross-distance matrix between ``X`` and ``other_X``.

        Switches the estimator into "precomputed" mode (the original
        time-series metric is stashed in ``self._ts_metric``) and returns the
        pairwise distance matrix computed with that stashed metric.

        Parameters
        ----------
        X : array-like
            Query time series dataset.
        other_X : array-like or None
            Reference dataset; defaults to the fitted series ``self._ts_fit``.

        Returns
        -------
        numpy.ndarray
            Cross-distance matrix of shape (n_queries, n_references).

        Raises
        ------
        ValueError
            If the recorded metric is not one of dtw/ctw/softdtw/sax.
        """
        if other_X is None:
            other_X = self._ts_fit

        # Remember the real metric, then flip to precomputed mode so that
        # downstream sklearn machinery consumes the matrix we return.
        self._ts_metric = self.metric
        self.metric = "precomputed"

        metric_params = self._get_metric_params()

        # NaNs are allowed here: variable-length series are NaN-padded.
        X = check_array(X, allow_nd=True, force_all_finite=False)
        X = to_time_series_dataset(X)

        ts_metric = self._ts_metric
        if ts_metric == "dtw":
            return cdist_dtw(X, other_X, n_jobs=self.n_jobs, **metric_params)
        if ts_metric == "ctw":
            return cdist_ctw(X, other_X, **metric_params)
        if ts_metric == "softdtw":
            return cdist_soft_dtw(X, other_X, **metric_params)
        if ts_metric == "sax":
            # SAX needs the raw series symbolized before distances are taken.
            X = self._sax_preprocess(X, **metric_params)
            return cdist_sax(X, self._sax.breakpoints_avg_,
                             self._sax._X_fit_dims_[1], other_X,
                             n_jobs=self.n_jobs)
        raise ValueError("Invalid metric recorded: %s" % ts_metric)
Exemplo n.º 2
0
 def _assign(self, X, update_class_attributes=True):
     """Assign every series in ``X`` to its nearest cluster center.

     Parameters
     ----------
     X : numpy.ndarray, shape (n_ts, sz, d)
         Time series dataset to assign.
     update_class_attributes : bool
         When True, also refresh ``labels_`` and ``inertia_`` on the
         estimator.

     Returns
     -------
     numpy.ndarray
         Index of the matched cluster for each series.

     Raises
     ------
     ValueError
         If ``self.metric`` is not one of 'dtw', 'softdtw', 'euclidean'.
     """
     if self.metric == "euclidean":
         flat_X = X.reshape((X.shape[0], -1))
         flat_centers = self.cluster_centers_.reshape((self.n_clusters, -1))
         dists = cdist(flat_X, flat_centers, metric="euclidean")
     elif self.metric == "dtw":
         dists = cdist_dtw(X, self.cluster_centers_)
     elif self.metric == "softdtw":
         dists = cdist_soft_dtw(X, self.cluster_centers_,
                                gamma=self.gamma_sdtw)
     else:
         raise ValueError(
             "Incorrect metric: %s (should be one of 'dtw', 'softdtw', 'euclidean')"
             % self.metric)
     matched_labels = dists.argmin(axis=1)
     if update_class_attributes:
         self.labels_ = matched_labels
         _check_no_empty_cluster(self.labels_, self.n_clusters)
         # DTW-based inertia may be requested even when clustering was done
         # with a different metric; recompute distances in that case only.
         if self.dtw_inertia and self.metric != "dtw":
             inertia_dists = cdist_dtw(X, self.cluster_centers_)
         else:
             inertia_dists = dists
         self.inertia_ = _compute_inertia(inertia_dists, self.labels_,
                                          self._squared_inertia)
     return matched_labels
    def predict_proba(self, X):
        """Predict the class probabilities for the provided data

        Parameters
        ----------
        X : array-like, shape (n_ts, sz, d)
            Test samples.

        Returns
        -------
        array, shape = (n_ts, n_classes)
            Array of predicted class probabilities
        """
        if self.metric in VARIABLE_LENGTH_METRICS:
            self._ts_metric = self.metric
            # Temporarily switch to precomputed distances. The original
            # metric is restored in the ``finally`` block so that an
            # exception anywhere below cannot leave the estimator stuck
            # with metric == "precomputed".
            self.metric = "precomputed"
            try:
                if self.metric_params is None:
                    metric_params = {}
                else:
                    metric_params = self.metric_params.copy()
                    # n_jobs / verbose come from the estimator itself, not
                    # from the user-supplied metric parameters.
                    metric_params.pop("n_jobs", None)
                    metric_params.pop("verbose", None)
                check_is_fitted(self, '_ts_fit')
                # NaN padding of variable-length series is allowed here.
                X = check_array(X, allow_nd=True, force_all_finite=False)
                X = to_time_series_dataset(X)
                if self._ts_metric == "dtw":
                    X_ = cdist_dtw(X,
                                   self._ts_fit,
                                   n_jobs=self.n_jobs,
                                   verbose=self.verbose,
                                   **metric_params)
                elif self._ts_metric == "softdtw":
                    X_ = cdist_soft_dtw(X, self._ts_fit, **metric_params)
                else:
                    raise ValueError("Invalid metric recorded: %s" %
                                     self._ts_metric)
                pred = super(KNeighborsTimeSeriesClassifier,
                             self).predict_proba(X_)
            finally:
                self.metric = self._ts_metric
            return pred
        else:
            check_is_fitted(self, '_X_fit')
            X = check_array(X, allow_nd=True)
            X = to_time_series_dataset(X)
            X_ = to_sklearn_dataset(X)
            X_ = check_dims(X_, self._X_fit, extend=False)
            return super(KNeighborsTimeSeriesClassifier,
                         self).predict_proba(X_)
Exemplo n.º 4
0
 def _transform(self, X):
     """Return the distance from each series in ``X`` to every cluster center.

     Parameters
     ----------
     X : numpy.ndarray, shape (n_ts, sz, d)
         Time series dataset.

     Returns
     -------
     numpy.ndarray, shape (n_ts, n_clusters)
         Distance matrix under the configured metric.

     Raises
     ------
     ValueError
         If ``self.metric`` is not one of 'dtw', 'softdtw', 'euclidean'.
     """
     metric_params = self._get_metric_params()
     if self.metric == "euclidean":
         flat_X = X.reshape((X.shape[0], -1))
         flat_centers = self.cluster_centers_.reshape((self.n_clusters, -1))
         return cdist(flat_X, flat_centers, metric="euclidean")
     if self.metric == "dtw":
         return cdist_dtw(X, self.cluster_centers_, n_jobs=self.n_jobs,
                          verbose=self.verbose, **metric_params)
     if self.metric == "softdtw":
         return cdist_soft_dtw(X, self.cluster_centers_, **metric_params)
     raise ValueError("Incorrect metric: %s (should be one of 'dtw', "
                      "'softdtw', 'euclidean')" % self.metric)
Exemplo n.º 5
0
 def _assign(self, X, update_class_attributes=True):
     """Assign every series in ``X`` to its nearest cluster center.

     Metric parameters are taken from ``self.metric_params``; the legacy
     ``gamma_sdtw`` key is mapped to ``gamma`` and parallelism settings are
     taken from the estimator rather than the parameter dict.

     Parameters
     ----------
     X : numpy.ndarray, shape (n_ts, sz, d)
         Time series dataset to assign.
     update_class_attributes : bool
         When True, also refresh ``labels_`` and ``inertia_``.

     Returns
     -------
     numpy.ndarray
         Index of the matched cluster for each series.

     Raises
     ------
     ValueError
         If ``self.metric`` is not one of 'dtw', 'softdtw', 'euclidean'.
     """
     if self.metric_params is None:
         metric_params = {}
     else:
         metric_params = self.metric_params.copy()
     # Backward-compatible alias: gamma_sdtw -> gamma (soft-DTW smoothing).
     if "gamma_sdtw" in metric_params:
         metric_params["gamma"] = metric_params.pop("gamma_sdtw")
     # n_jobs is driven by the estimator, never by the metric params.
     metric_params.pop("n_jobs", None)
     if self.metric == "euclidean":
         flat_X = X.reshape((X.shape[0], -1))
         flat_centers = self.cluster_centers_.reshape((self.n_clusters, -1))
         dists = cdist(flat_X, flat_centers, metric="euclidean")
     elif self.metric == "dtw":
         dists = cdist_dtw(X,
                           self.cluster_centers_,
                           n_jobs=self.n_jobs,
                           verbose=self.verbose,
                           **metric_params)
     elif self.metric == "softdtw":
         dists = cdist_soft_dtw(X, self.cluster_centers_, **metric_params)
     else:
         raise ValueError("Incorrect metric: %s (should be one of 'dtw', "
                          "'softdtw', 'euclidean')" % self.metric)
     matched_labels = dists.argmin(axis=1)
     if update_class_attributes:
         self.labels_ = matched_labels
         _check_no_empty_cluster(self.labels_, self.n_clusters)
         # DTW inertia may be requested even for non-DTW clustering.
         if self.dtw_inertia and self.metric != "dtw":
             inertia_dists = cdist_dtw(X,
                                       self.cluster_centers_,
                                       n_jobs=self.n_jobs,
                                       verbose=self.verbose)
         else:
             inertia_dists = dists
         self.inertia_ = _compute_inertia(inertia_dists, self.labels_,
                                          self._squared_inertia)
     return matched_labels
Exemplo n.º 6
0
# Overlay 15 individual sample densities (faint black) against the cluster
# density curve rescaled back to the original units.
for i in range(0, 15):
    plt.plot(x, sample5density[i], 'k-', alpha=.2)
plt.plot(x,
         scaler_density_train.inverse_transform(density_5),
         'r-',
         label='density')
plt.xlabel('hours of the day')
plt.ylabel('density')
plt.title('k=4')
plt.legend()
plt.show()

# Similarity between centroids of the clusters.
from tslearn.metrics import soft_dtw, cdist_soft_dtw
similarity = []
# Pairwise soft-DTW self-distance matrix over the centroids.
# NOTE(review): matrix.max() below does not exclude the diagonal; soft-DTW
# self-similarity terms are not zero, so the maximum may be a centroid's
# similarity with itself — confirm this is intended.
matrix = cdist_soft_dtw(centroids, gamma=1.)
matrix
sim = matrix.max()
similarity.append(sim)

similarity = np.array(similarity)
diss = list(-similarity)

# NOTE(review): `cluster` has 6 entries (k = 2..7) but `diss` holds a single
# value, so plt.plot(cluster, diss) raises a shape mismatch as written.
# Presumably the similarity computation above originally ran inside a loop
# over the number of clusters (this looks like a notebook extract) — restore
# that loop before reusing this snippet.
cluster = np.arange(2, 8)
plt.title('soft-DTW similarity measure S60')
plt.plot(cluster, diss)
plt.xlabel('n° of cluster')
plt.ylabel('similarity between closest clusters')
plt.show()

# Visualization.
    def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
        """Finds the K-neighbors of a point.

        Returns indices of and distances to the neighbors of each point.

        Parameters
        ----------
        X : array-like, shape (n_ts, sz, d)
            The query time series.
            If not provided, neighbors of each indexed point are returned.
            In this case, the query point is not considered its own neighbor.
        n_neighbors : int
            Number of neighbors to get (default is the value passed to the
            constructor).
        return_distance : boolean, optional. Defaults to True.
            If False, distances will not be returned

        Returns
        -------
        dist : array
            Array representing the distance to points, only present if
            return_distance=True
        ind : array
            Indices of the nearest points in the population matrix.
        """
        if self.metric in VARIABLE_LENGTH_METRICS:
            self._ts_metric = self.metric
            # Temporarily switch to precomputed distances. The original
            # metric is restored in the ``finally`` block so that an
            # exception anywhere below cannot leave the estimator stuck
            # with metric == "precomputed".
            self.metric = "precomputed"
            try:
                if self.metric_params is None:
                    metric_params = {}
                else:
                    metric_params = self.metric_params.copy()
                    # n_jobs / verbose come from the estimator itself, not
                    # from the user-supplied metric parameters.
                    metric_params.pop("n_jobs", None)
                    metric_params.pop("verbose", None)
                check_is_fitted(self, '_ts_fit')
                # NOTE(review): unlike the fixed-length branch below, this
                # branch does not handle X=None (check_array would reject
                # it) — confirm whether self-neighbor queries are supposed
                # to be supported for variable-length metrics.
                X = check_array(X, allow_nd=True, force_all_finite=False)
                X = to_time_series_dataset(X)
                if self._ts_metric == "dtw":
                    X_ = cdist_dtw(X,
                                   self._ts_fit,
                                   n_jobs=self.n_jobs,
                                   verbose=self.verbose,
                                   **metric_params)
                elif self._ts_metric == "softdtw":
                    X_ = cdist_soft_dtw(X, self._ts_fit, **metric_params)
                else:
                    raise ValueError("Invalid metric recorded: %s" %
                                     self._ts_metric)
                pred = KNeighborsTimeSeriesMixin.kneighbors(
                    self,
                    X=X_,
                    n_neighbors=n_neighbors,
                    return_distance=return_distance)
            finally:
                self.metric = self._ts_metric
            return pred
        else:
            check_is_fitted(self, '_X_fit')
            if X is None:
                X_ = None
            else:
                X = check_array(X, allow_nd=True)
                X = to_time_series_dataset(X)
                X_ = to_sklearn_dataset(X)
                X_ = check_dims(X_, self._X_fit, extend=False)
            return KNeighborsTimeSeriesMixin.kneighbors(
                self,
                X=X_,
                n_neighbors=n_neighbors,
                return_distance=return_distance)
Exemplo n.º 8
0
def silhouette_score(X,
                     labels,
                     metric=None,
                     sample_size=None,
                     metric_params=None,
                     random_state=None,
                     **kwds):
    """Compute the mean Silhouette Coefficient of all samples (cf. [1]_ and [2]_).

    Read more in the `scikit-learn documentation
    <http://scikit-learn.org/stable/modules/clustering.html#silhouette-coefficient>`_.

    Parameters
    ----------
    X : array [n_ts, n_ts] if metric == "precomputed", or, \
             [n_ts, sz, d] otherwise
        Array of pairwise distances between time series, or a time series dataset.
    labels : array, shape = [n_ts]
         Predicted labels for each time series.
    metric : string, or callable
        The metric to use when calculating distance between time series.
        Should be one of {'dtw', 'softdtw', 'euclidean'} or a callable distance
        function.
        If X is the distance array itself, use ``metric="precomputed"``.
    sample_size : int or None
        The size of the sample to use when computing the Silhouette Coefficient
        on a random subset of the data.
        If ``sample_size is None``, no sampling is used.
    metric_params : dict or None
        Parameter values for the chosen metric. Value associated to the
        `"gamma_sdtw"` key corresponds to the gamma parameter in Soft-DTW.
    random_state : int, RandomState instance or None, optional (default=None)
        The generator used to randomly select a subset of samples.  If int,
        random_state is the seed used by the random number generator; If
        RandomState instance, random_state is the random number generator; If
        None, the random number generator is the RandomState instance used by
        `np.random`. Used when ``sample_size is not None``.
    **kwds : optional keyword parameters
        Any further parameters are passed directly to the distance function.

    Returns
    -------
    silhouette : float
        Mean Silhouette Coefficient for all samples.

    References
    ----------
    .. [1] `Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
       Interpretation and Validation of Cluster Analysis". Computational
       and Applied Mathematics 20: 53-65.
       <http://www.sciencedirect.com/science/article/pii/0377042787901257>`_
    .. [2] `Wikipedia entry on the Silhouette Coefficient
           <https://en.wikipedia.org/wiki/Silhouette_(clustering)>`_

    Examples
    --------
    >>> import numpy
    >>> from tslearn.generators import random_walks
    >>> from tslearn.metrics import cdist_dtw
    >>> X = random_walks(n_ts=50, sz=32, d=1)
    >>> labels = numpy.random.randint(2, size=50)
    >>> s_sc = silhouette_score(X, labels, metric="dtw")
    >>> s_sc2 = silhouette_score(X, labels, metric="softdtw")
    >>> s_sc2b = silhouette_score(X, labels, metric="softdtw", metric_params={"gamma_sdtw": 2.})
    >>> s_sc3 = silhouette_score(cdist_dtw(X), labels, metric="precomputed")
    """
    sklearn_metric = None
    if metric_params is None:
        metric_params = {}
    if metric == "precomputed":
        # X already holds pairwise distances; hand it straight to sklearn.
        sklearn_X = X
    elif metric == "dtw":
        sklearn_X = cdist_dtw(X)
    elif metric == "softdtw":
        # Only pass gamma when the caller supplied one, so that
        # cdist_soft_dtw's own default applies otherwise.
        gamma = metric_params.get("gamma_sdtw", None)
        if gamma is not None:
            sklearn_X = cdist_soft_dtw(X, gamma=gamma)
        else:
            sklearn_X = cdist_soft_dtw(X)
    elif metric == "euclidean":
        sklearn_X = cdist(X, X, metric="euclidean")
    else:
        # Callable metric (or default DTW when metric is None): flatten the
        # series for sklearn and rebuild each one inside the wrapper,
        # dropping the NaN padding used for variable-length datasets.
        X_ = to_time_series_dataset(X)
        n, sz, d = X_.shape
        sklearn_X = X_.reshape((n, -1))
        if metric is None:
            metric = dtw
        sklearn_metric = lambda x, y: metric(
            to_time_series(x.reshape((sz, d)), remove_nans=True),
            to_time_series(y.reshape((sz, d)), remove_nans=True))
    return sklearn_silhouette_score(
        X=sklearn_X,
        labels=labels,
        metric="precomputed" if sklearn_metric is None else sklearn_metric,
        sample_size=sample_size,
        random_state=random_state,
        **kwds)
Exemplo n.º 9
0
def test_kmeans():
    """Smoke-test TimeSeriesKMeans across metrics, inits and cluster counts.

    Note: the same ``rng`` instance is threaded through successive fits, so
    the statement order below is significant for reproducibility.
    """
    n, sz, d = 15, 10, 3
    rng = np.random.RandomState(0)
    time_series = rng.randn(n, sz, d)

    # Euclidean metric: labels must match a direct argmin over flattened
    # distances, and predict() must agree with the labels from fit().
    km = TimeSeriesKMeans(n_clusters=3, metric="euclidean", max_iter=5,
                          verbose=False, random_state=rng).fit(time_series)
    dists = cdist(time_series.reshape((n, -1)),
                  km.cluster_centers_.reshape((3, -1)))
    np.testing.assert_allclose(km.labels_, dists.argmin(axis=1))
    np.testing.assert_allclose(km.labels_, km.predict(time_series))

    # Same consistency checks with the DTW metric (DBA centroids).
    km_dba = TimeSeriesKMeans(n_clusters=3,
                              metric="dtw",
                              max_iter=5,
                              verbose=False,
                              random_state=rng).fit(time_series)
    dists = cdist_dtw(time_series, km_dba.cluster_centers_)
    np.testing.assert_allclose(km_dba.labels_, dists.argmin(axis=1))
    np.testing.assert_allclose(km_dba.labels_, km_dba.predict(time_series))

    # Same consistency checks with the soft-DTW metric.
    km_sdtw = TimeSeriesKMeans(n_clusters=3,
                               metric="softdtw",
                               max_iter=5,
                               verbose=False,
                               random_state=rng).fit(time_series)
    dists = cdist_soft_dtw(time_series, km_sdtw.cluster_centers_)
    np.testing.assert_allclose(km_sdtw.labels_, dists.argmin(axis=1))
    np.testing.assert_allclose(km_sdtw.labels_, km_sdtw.predict(time_series))

    # More clusters than samples: fit cannot succeed, estimator stays unfit.
    km_nofit = TimeSeriesKMeans(n_clusters=101,
                                verbose=False,
                                random_state=rng).fit(time_series)
    assert(km_nofit._X_fit is None)

    # Variable-length series must be accepted by every metric/init combo.
    X_bis = to_time_series_dataset([[1, 2, 3, 4],
                                    [1, 2, 3],
                                    [2, 5, 6, 7, 8, 9]])
    TimeSeriesKMeans(n_clusters=2, verbose=False, max_iter=5,
                     metric="softdtw", random_state=0).fit(X_bis)
    TimeSeriesKMeans(n_clusters=2, verbose=False, max_iter=5,
                     metric="dtw", random_state=0,
                     init="random").fit(X_bis)
    TimeSeriesKMeans(n_clusters=2, verbose=False, max_iter=5,
                     metric="dtw", random_state=0,
                     init="k-means++").fit(X_bis)
    TimeSeriesKMeans(n_clusters=2, verbose=False, max_iter=5,
                     metric="dtw", init=X_bis[:2]).fit(X_bis)

    # Barycenter size (nb of timestamps)
    # Case 1. kmeans++ / random init
    n, sz, d = 15, 10, 1
    n_clusters = 3
    time_series = rng.randn(n, sz, d)

    # With kmeans++/random init, barycenters keep the input series length.
    sizes_all_same_series = [sz] * n_clusters
    km_euc = TimeSeriesKMeans(n_clusters=3,
                              metric="euclidean",
                              max_iter=5,
                              verbose=False,
                              init="k-means++",
                              random_state=rng).fit(time_series)
    np.testing.assert_equal(sizes_all_same_series,
                            [ts_size(b) for b in km_euc.cluster_centers_])
    km_dba = TimeSeriesKMeans(n_clusters=3,
                              metric="dtw",
                              max_iter=5,
                              verbose=False,
                              init="random",
                              random_state=rng).fit(time_series)
    np.testing.assert_equal(sizes_all_same_series,
                            [ts_size(b) for b in km_dba.cluster_centers_])

    # Case 2. forced init
    barys = to_time_series_dataset([[1., 2., 3.],
                                    [1., 2., 2., 3., 4.],
                                    [3., 2., 1.]])
    sizes_all_same_bary = [barys.shape[1]] * n_clusters
    # If Euclidean is used, barycenters size should be that of the input series
    km_euc = TimeSeriesKMeans(n_clusters=3,
                              metric="euclidean",
                              max_iter=5,
                              verbose=False,
                              init=barys,
                              random_state=rng)
    np.testing.assert_raises(ValueError, km_euc.fit, time_series)

    # With elastic metrics, forced-init barycenters keep the init's length.
    km_dba = TimeSeriesKMeans(n_clusters=3,
                              metric="dtw",
                              max_iter=5,
                              verbose=False,
                              init=barys,
                              random_state=rng).fit(time_series)
    np.testing.assert_equal(sizes_all_same_bary,
                            [ts_size(b) for b in km_dba.cluster_centers_])
    km_sdtw = TimeSeriesKMeans(n_clusters=3,
                               metric="softdtw",
                               max_iter=5,
                               verbose=False,
                               init=barys,
                               random_state=rng).fit(time_series)
    np.testing.assert_equal(sizes_all_same_bary,
                            [ts_size(b) for b in km_sdtw.cluster_centers_])

    # A simple dataset, can we extract the correct number of clusters?
    time_series = to_time_series_dataset([[1, 2, 3],
                                   [7, 8, 9, 11],
                                   [.1, .2, 2.],
                                   [1, 1, 1, 9],
                                   [10, 20, 30, 1000]])
    preds = TimeSeriesKMeans(n_clusters=3, metric="dtw", max_iter=5,
                             random_state=rng).fit_predict(time_series)
    np.testing.assert_equal(set(preds), set(range(3)))
    preds = TimeSeriesKMeans(n_clusters=4, metric="dtw", max_iter=5,
                             random_state=rng).fit_predict(time_series)
    np.testing.assert_equal(set(preds), set(range(4)))
Exemplo n.º 10
0
 def metric_fun(x, y):
     # Soft-DTW cross-distance between datasets x and y.
     # NOTE(review): ``metric_params`` is a free variable resolved from the
     # enclosing scope — confirm it is defined wherever this closure is built.
     return cdist_soft_dtw(x, y, **metric_params)
Exemplo n.º 11
0
def test_kmeans():
    n, sz, d = 15, 10, 3
    rng = np.random.RandomState(0)
    time_series = rng.randn(n, sz, d)

    km = TimeSeriesKMeans(n_clusters=3,
                          metric="euclidean",
                          max_iter=5,
                          verbose=False,
                          random_state=rng).fit(time_series)
    dists = cdist(time_series.reshape((n, -1)),
                  km.cluster_centers_.reshape((3, -1)))
    np.testing.assert_allclose(km.labels_, dists.argmin(axis=1))
    np.testing.assert_allclose(km.labels_, km.predict(time_series))

    km_dba = TimeSeriesKMeans(n_clusters=3,
                              metric="dtw",
                              max_iter=5,
                              verbose=False,
                              random_state=rng).fit(time_series)
    dists = cdist_dtw(time_series, km_dba.cluster_centers_)
    np.testing.assert_allclose(km_dba.labels_, dists.argmin(axis=1))
    np.testing.assert_allclose(km_dba.labels_, km_dba.predict(time_series))

    km_sdtw = TimeSeriesKMeans(n_clusters=3,
                               metric="softdtw",
                               max_iter=5,
                               verbose=False,
                               random_state=rng).fit(time_series)
    dists = cdist_soft_dtw(time_series, km_sdtw.cluster_centers_)
    np.testing.assert_allclose(km_sdtw.labels_, dists.argmin(axis=1))
    np.testing.assert_allclose(km_sdtw.labels_, km_sdtw.predict(time_series))

    km_nofit = TimeSeriesKMeans(n_clusters=101,
                                verbose=False,
                                random_state=rng).fit(time_series)
    assert (km_nofit._X_fit is None)

    X_bis = to_time_series_dataset([[1, 2, 3, 4], [1, 2, 3],
                                    [2, 5, 6, 7, 8, 9]])
    TimeSeriesKMeans(n_clusters=2,
                     verbose=False,
                     max_iter=5,
                     metric="softdtw",
                     random_state=0).fit(X_bis)
    TimeSeriesKMeans(n_clusters=2,
                     verbose=False,
                     max_iter=5,
                     metric="dtw",
                     random_state=0,
                     init="random").fit(X_bis)
    TimeSeriesKMeans(n_clusters=2,
                     verbose=False,
                     max_iter=5,
                     metric="dtw",
                     random_state=0,
                     init="k-means++").fit(X_bis)
    TimeSeriesKMeans(n_clusters=2,
                     verbose=False,
                     max_iter=5,
                     metric="dtw",
                     init=X_bis[:2]).fit(X_bis)