예제 #1
0
def test_serialize_global_alignment_kernel_kmeans():
    """Run the not-fitted / fitted helper checks on a KernelKMeans model.

    The model is handed to ``_check_not_fitted`` before ``fit`` and to
    ``_check_params_predict`` (for the ``predict`` method) afterwards.
    """
    n_series, series_len, n_dims = 15, 10, 3
    # Fixed seed so the random dataset is the same on every run.
    dataset = numpy.random.RandomState(0).randn(n_series, series_len, n_dims)

    model = KernelKMeans(max_iter=5, verbose=False, n_clusters=3)

    # Helper is invoked while the estimator has not been fitted yet.
    _check_not_fitted(model)

    model.fit(dataset)

    # Helper is invoked on the fitted estimator with the training data.
    _check_params_predict(model, dataset, ['predict'])
예제 #2
0
def test_variable_length_clustering():
    """Fit several clusterers on series of different lengths.

    Only checks that ``fit`` runs without raising; the resulting clusters
    are not inspected.
    """
    # TODO: here we just check that they can accept variable-length TS, not
    # that they do clever things
    raw_series = [
        [1, 2, 3, 4],
        [1, 2, 3],
        [2, 5, 6, 7, 8, 9],
        [3, 5, 6, 7, 8],
    ]
    X = to_time_series_dataset(raw_series)
    # One generator is shared by all estimators below, in this exact order.
    rng = np.random.RandomState(0)

    KernelKMeans(n_clusters=2, random_state=rng).fit(X)

    for metric_name in ("dtw", "softdtw"):
        TimeSeriesKMeans(n_clusters=2,
                         metric=metric_name,
                         random_state=rng).fit(X)
예제 #3
0
def test_kernel_kmeans():
    """Check three KernelKMeans configurations on one random dataset."""
    n_series, series_len, n_dims = 15, 10, 3
    rng = np.random.RandomState(0)
    time_series = rng.randn(n_series, series_len, n_dims)

    # Settings common to every model built in this test.
    shared = {"verbose": False, "max_iter": 5, "random_state": rng}

    # Labels stored by fit must match a fresh predict on the training data.
    model = KernelKMeans(n_clusters=3, **shared).fit(time_series)
    np.testing.assert_allclose(model.labels_, model.predict(time_series))

    # Far more clusters (101) than series (15): the test expects `_X_fit`
    # to be left as None in that situation.
    model = KernelKMeans(n_clusters=101, **shared).fit(time_series)
    assert model._X_fit is None

    # With an "rbf" kernel the test expects `sigma_gak_` to be None.
    rbf_params = {"gamma": 1.}
    model = KernelKMeans(n_clusters=2,
                         kernel="rbf",
                         kernel_params=rbf_params,
                         **shared).fit(time_series)
    assert model.sigma_gak_ is None
예제 #4
0
 def single_clustering(self, data_raw, data_new, centroid_num, model):
     """
     Run a single clustering pass with the selected model.

     :param data_raw: samples that are grouped by label in the result
     :param data_new: samples the model is fitted on (same order as data_raw)
     :param centroid_num: number of clusters to request
     :param model: one of 'K-Means', 'DTW', 'K-Shape' or 'Kernel-KMeans';
         any other value fits nothing and leaves the defaults in place
     :return: tuple ``(inertia, groups, centers)`` where ``groups`` maps each
         cluster index in ``range(centroid_num)`` to the list of ``data_raw``
         items carrying that label; ``centers`` stays an empty list for
         'Kernel-KMeans' because it is never assigned in that branch
     """
     seed = 0
     np.random.seed(seed)
     labels, inertia, centers = [], [], []
     if model == 'K-Means':
         fitted = KMeans(n_clusters=centroid_num).fit(data_new)
         labels, centers = fitted.labels_, fitted.cluster_centers_
         # Sum of the distances from each point to the centroid of its
         # cluster; smaller is better.
         inertia = fitted.inertia_
     elif model == 'DTW':
         softdtw_params = {"gamma": 1.0}
         fitted = TimeSeriesKMeans(n_clusters=centroid_num,
                                   metric='softdtw',
                                   max_iter=2,
                                   max_iter_barycenter=2,
                                   metric_params=softdtw_params,
                                   random_state=0,
                                   verbose=True).fit(data_new)
         labels, centers = fitted.labels_, fitted.cluster_centers_
         inertia = fitted.inertia_
     elif model == "K-Shape":
         fitted = KShape(n_clusters=centroid_num,
                         verbose=True,
                         random_state=seed).fit(data_new)
         labels, centers = fitted.labels_, fitted.cluster_centers_
         inertia = fitted.inertia_
     elif model == "Kernel-KMeans":
         # Only the first 100 samples are used for this model.
         data_new, data_raw = data_new[:100], data_raw[:100]
         gak_params = {"sigma": "auto"}
         fitted = KernelKMeans(n_clusters=centroid_num,
                               kernel="gak",
                               kernel_params=gak_params,
                               max_iter=2,
                               tol=1e-4,
                               verbose=True).fit(data_new)
         labels = fitted.labels_
         inertia = fitted.inertia_

     # Bucket the raw samples by the label assigned to the same position.
     groups = {cluster_id: [] for cluster_id in range(centroid_num)}
     for position in range(len(data_raw)):
         groups[labels[position]].append(data_raw[position])
     return inertia, groups, centers
예제 #5
0
def cluster(num_domains, diff_kernel, min_seg_size, seg_numdomians_ratio,
            distance_matrix, min_domain_size, clustering_method, alpha_helices,
            max_alpha_helix_size_to_merge):
    """Cluster positions into domains and post-process the labels.

    Parameters
    ----------
    num_domains : int
        Number of clusters requested from the clustering algorithm.
    diff_kernel : array-like
        Precomputed affinity/kernel matrix passed to the clusterer's ``fit``.
    min_seg_size
        Forwarded to ``remove_short_segments``.
    seg_numdomians_ratio
        Forwarded to ``remove_redundant_segments``.
    distance_matrix : array-like
        Precomputed distances, used by the segment clean-up helpers and by
        ``silhouette_score``.
    min_domain_size : int
        Minimum number of positions every final label must cover.
    clustering_method : str
        ``'spectral'`` or ``'kernel-kmeans'``.
    alpha_helices : iterable
        Items are indexed as ``[0]`` (first position) and ``[1]`` (last
        position, inclusive).
    max_alpha_helix_size_to_merge : int
        Helices of at most this length are forced onto a single label.

    Returns
    -------
    tuple or str
        ``(labels, labels, sil_score)`` on success (the label array appears
        twice), or the string ``'error'`` when the method is unknown, any
        step raises, fewer than ``num_domains`` labels survive, or a label
        covers fewer than ``min_domain_size`` positions.
    """
    try:

        if clustering_method == 'spectral':
            clustering = SpectralClustering(n_clusters=num_domains,
                                            assign_labels="kmeans",
                                            random_state=0,
                                            affinity='precomputed',
                                            n_init=100).fit(diff_kernel)
        elif clustering_method == 'kernel-kmeans':
            clustering = KernelKMeans(n_clusters=num_domains,
                                      random_state=0,
                                      n_init=100,
                                      kernel='precomputed').fit(diff_kernel)
        else:
            # Unknown method: report failure explicitly instead of relying
            # on an unbound-variable error further down.
            return 'error'

        # Work on a copy so the fitted estimator's labels stay untouched.
        labels = clustering.labels_.copy()

        # Short helices that straddle several clusters are assigned
        # entirely to their most frequent label.
        for alpha_helix in alpha_helices:
            first, last = alpha_helix[0], alpha_helix[1]
            helix_size = last - first + 1
            if helix_size <= max_alpha_helix_size_to_merge:
                counter = collections.Counter(labels[first:last + 1])
                if len(counter) > 1:
                    most_common = counter.most_common(1)[0][0]
                    labels[first:last + 1] = [most_common] * helix_size

        # Return values of both helpers are ignored; presumably they edit
        # `labels` in place -- TODO confirm against their definitions.
        remove_short_segments(labels, min_seg_size, distance_matrix)

        remove_redundant_segments(labels, num_domains, seg_numdomians_ratio,
                                  distance_matrix)

        if len(set(labels)) < num_domains:
            return 'error'

        sil_score = silhouette_score(distance_matrix,
                                     labels=labels,
                                     metric="precomputed")

        for label in set(labels):
            if np.count_nonzero(labels == label) < min_domain_size:
                return 'error'

        return labels, labels, sil_score

    except Exception:
        # Any ordinary failure maps to the 'error' sentinel callers expect.
        # KeyboardInterrupt / SystemExit are deliberately not caught so the
        # program can still be interrupted or exited.
        return 'error'
예제 #6
0
    def _fit(self, X: TimeSeriesInstances, y=None) -> np.ndarray:
        """Fit the wrapped tslearn KernelKMeans on the training data.

        Parameters
        ----------
        X : np.ndarray (2d or 3d array of shape (n_instances, series_length) or shape
            (n_instances, n_dimensions, series_length))
            Training time series instances to cluster.
        y: ignored, exists for API consistency reasons.

        Notes
        -----
        The fitted ``labels_``, ``inertia_`` and ``n_iter_`` of the tslearn
        estimator are copied onto ``self``.

        NOTE(review): the signature is annotated ``-> np.ndarray`` but this
        method has no ``return`` statement, so it returns ``None`` -- confirm
        what the base class expects.
        """
        from tslearn.clustering import KernelKMeans as TsLearnKernelKMeans

        # Only a literal ``True`` switches verbosity on; it is passed to
        # tslearn as 1, everything else as 0.
        verbosity = 1 if self.verbose is True else 0

        if self._tslearn_kernel_k_means is None:
            # Built only when no estimator is stored yet; an existing one is
            # reused as-is on later calls.
            self._tslearn_kernel_k_means = TsLearnKernelKMeans(
                n_clusters=self.n_clusters,
                kernel=self.kernel,
                max_iter=self.max_iter,
                tol=self.tol,
                n_init=self.n_init,
                kernel_params=self.kernel_params,
                n_jobs=self.n_jobs,
                verbose=verbosity,
                random_state=self.random_state,
            )

        estimator = self._tslearn_kernel_k_means
        estimator.fit(X)

        # Mirror the fitted attributes on the wrapper.
        self.labels_ = estimator.labels_
        self.inertia_ = estimator.inertia_
        self.n_iter_ = estimator.n_iter_
예제 #7
0
def main(argv):
    """Cluster a time-series dataset with the selected algorithms.

    Parses the command-line options, loads and preprocesses the data, runs
    every requested clustering algorithm, draws one row of subplots per
    algorithm, and finally saves the figure and the label arrays under
    ``./out/<timestamp>/``.

    Parameters
    ----------
    argv : list of str
        Command-line arguments as accepted by ``getopt.getopt`` (i.e.
        without the program name). See ``help_message`` below for the
        supported options.

    Notes
    -----
    * Exits with status 2 on invalid options or on a ``-c`` value that is
      not a positive integer.
    * Label arrays are stored in the module-level globals
      ``euclidean_clustered_data``, ``dtw_clustered_data``,
      ``soft_dtw_clustered_data``, ``k_shape_clustered_data`` and
      ``gak_clustered_data``.
    * Blocks on ``input()`` at the end so the interactive plot stays open.
    """
    # define global timer to obtain global execution time
    start_global = timer()

    # define globals variables
    global euclidean_clustered_data, \
        dtw_clustered_data, \
        soft_dtw_clustered_data, \
        k_shape_clustered_data, \
        gak_clustered_data

    #############################################################################################
    # Input arguments parsing
    #############################################################################################

    # define help message
    help_message = \
        'clustering.py -h \n\n' \
        'usage: clustering.py [-c <number_clusters>] [-i <input_file>] [-ansEDSKG] \n' \
        'by default: processing input data (without any sampling)' \
        '(euclidean, dtw, soft-dtw and GAK k-means, k-shape)\n' \
        'options list: \n' \
        '  -c / --clusters <number_clusters>  # set number of clusters (default 3) \n\n' \
        '  -i / --ifile <input_file>          # set input filename \n' \
        '  -n / --normalise                   # normalise input data \n' \
        '  -s / --standardise                 # standardise input data \n\n' \
        '  -a / --all                         # perform all 5 implemented methods of clustering: \n' \
        '                                       euclidean, dtw, soft-dtw, gak k-means and k-shape\n' \
        '  -E / --euclidean                   # perform euclidean k-means clustering \n' \
        '  -D / --dtw                         # perform dtw k-means clustering \n' \
        '  -S / --soft-dtw                    # perform soft-dtw k-means clustering \n' \
        '  -K / --k-shape                     # perform k-shape clustering \n' \
        '  -G / --gak                         # perform GAK k-means clustering \n'

    # Create new object to save arguments
    i_args = Arguments()

    # number of rows in plot to create correct number of subplots
    # default = 3 (raw data plus distribution histograms)
    n_rows_plot = 3

    # define validation rules for arguments
    try:
        opts, args = getopt.getopt(
            argv,
            "hc:i:nsaEDSKG",
            [
                "help",
                "clusters=",
                "ifile=",
                "normalise",
                "standardise",
                "all",
                "euclidean",
                "dtw",
                "soft-dtw",
                "k-shape",
                "gak"
            ]
        )
    except getopt.GetoptError:
        print(help_message)
        sys.exit(2)

    # parse arguments
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(help_message)
            sys.exit()
        elif opt in ("-c", "--clusters"):
            # getopt yields option values as strings; the cluster count is
            # later used by range() and add_gridspec(), which need an int.
            try:
                i_args.number_clusters = int(arg)
            except ValueError:
                print(help_message)
                sys.exit(2)
            if i_args.number_clusters < 1:
                print(help_message)
                sys.exit(2)
        elif opt in ("-i", "--ifile"):
            # NOTE(review): input_file is stored here but import_data() below
            # is called without arguments -- confirm how the file name
            # reaches the loader.
            i_args.input_file = arg
        elif opt in ("-n", "--normalise"):
            i_args.normalise_data = True
        elif opt in ("-s", "--standardise"):
            i_args.standardise_data = True
        elif opt in ("-E", "--euclidean"):
            n_rows_plot += 1
            i_args.euclidean_clustering = True
        elif opt in ("-D", "--dtw"):
            n_rows_plot += 1
            i_args.dtw_clustering = True
        elif opt in ("-S", "--soft-dtw"):
            n_rows_plot += 1
            i_args.soft_dtw_clustering = True
        elif opt in ("-K", "--k-shape"):
            n_rows_plot += 1
            i_args.k_shape_clustering = True
        elif opt in ("-G", "--gak"):
            n_rows_plot += 1
            i_args.gak_clustering = True
        elif opt in ("-a", "--all"):
            n_rows_plot = 8
            i_args.euclidean_clustering = True
            i_args.dtw_clustering = True
            i_args.soft_dtw_clustering = True
            i_args.k_shape_clustering = True
            i_args.gak_clustering = True

    # normalise maximum number of subplots levels
    n_rows_plot = 8 if n_rows_plot > 8 else n_rows_plot

    # Column whose axes carries the row title: the second subplot when there
    # are at least two clusters, otherwise the only one. Using a fixed index
    # of 1 would leave the title axes undefined for a single cluster.
    title_cluster = min(1, i_args.number_clusters - 1)

    #############################################################################################
    # Raw data processing stage
    #############################################################################################

    # set style to matplotlib plot
    try:
        mpl.style.use('seaborn')
    except OSError:
        # Newer matplotlib releases no longer ship the style under this
        # name; the same style is available as 'seaborn-v0_8'.
        mpl.style.use('seaborn-v0_8')

    # set seed value and seed the generator
    seed = 0
    numpy.random.seed(seed)

    # import data and print first 5 rows
    raw_data = import_data()
    print(raw_data.head())

    # convert raw data to the format which can be used by tslearn
    # (3-d dimensional array)
    # BUILT functionality: adjust all time series to one size
    # (NaN values are appended to the shorter ones)
    formatted_data = to_time_series_dataset(raw_data)

    # print shape of new array
    print(formatted_data.shape)

    # obtain number of measuring
    n_measuring = formatted_data.shape[1]

    # define figure, grid_spec to create layout of the plot
    fig = plt.figure(constrained_layout=True)
    grid_spec = fig.add_gridspec(
        n_rows_plot,
        i_args.number_clusters
    )

    # set A4 size to figure
    fig.set_size_inches(8.5, 11.75)

    # setup count of layers of subplots
    # (rows 0-1 hold the raw data, row 2 the histograms)
    count_layer = 3
    # setup first subplot and draw raw time series
    f_ax_raw_data = fig.add_subplot(grid_spec[:2, :])

    for xx in formatted_data:
        f_ax_raw_data.plot(xx.ravel(), alpha=.2)

    formatted_data_min = formatted_data.min()
    formatted_data_max = formatted_data.max()
    # draw title for chart with min and max values
    f_ax_raw_data.set_title('Raw Data (min = %.2f, max = %.2f)' %(formatted_data_min, formatted_data_max))

    # obtain and print executing time of data processing stage to console,
    timer_tick = get_time_tick(start_global)
    # interactive mode so plt.show() does not block the remaining stages
    plt.ion()
    plt.show()

    print("Raw data processing time: %s" % timer_tick)

    #############################################################################################
    # Data preprocessing stage
    #############################################################################################

    start = timer()

    # Convert NaNs to value predicted by interpolation
    # linearly interpolate for NaN/NaNs
    n_nan_changes = 0
    for ind in range(formatted_data.shape[0]):
        mask = numpy.isnan(formatted_data[ind])
        n_nan_changes += mask.sum()
        formatted_data[ind][mask] = numpy.interp(
            numpy.flatnonzero(mask),
            numpy.flatnonzero(~mask),
            formatted_data[ind][~mask]
        )
    print("%d NaN values was/were interpolated" % n_nan_changes)

    # Scaling
    # to know should we use normalization or standardization, we need to see
    # the distribution of values.

    # take one random measuring per cluster column to draw histograms
    random_indexes = numpy.random.choice(n_measuring, i_args.number_clusters, replace=False)

    # create new arrays with values of randomly chosen measurements
    histogram_data = formatted_data[:, random_indexes]

    # draw histograms
    for i_histogram in range(i_args.number_clusters):
        f_ax_histogram = fig.add_subplot(grid_spec[2, i_histogram])
        f_ax_histogram.hist(
            histogram_data[:, i_histogram],
            bins=25, density=True
        )

        f_ax_histogram.text(0.55, 0.98,
                            'Measurement #%d' % random_indexes[i_histogram],
                            transform=plt.gca().transAxes,
                            color="navy"
                            )
        if i_histogram == title_cluster:
            preprocessing = ''
            if i_args.normalise_data:
                preprocessing += "normalised"
                if i_args.standardise_data:
                    preprocessing += " and standardised"
            elif i_args.standardise_data:
                preprocessing += "standardised"

            preprocessing = '' if preprocessing == '' else "(data will be %s)" % preprocessing
            f_ax_histogram.set_title(
                "Distributions histograms %s" % preprocessing,
                color='navy', y=1, pad=14
            )

    # if no processing data option chosen continue with raw data
    processed_data = formatted_data

    # since for this concrete challenge data the distributions are more/less
    # Gaussian/Normal we can use standardization

    # normalize data: Min-Max scaling ranging between 0 and 1
    if i_args.normalise_data:
        processed_data = TimeSeriesScalerMinMax().fit_transform(processed_data)
        print("Data was normalised")

    # standardize data: scaling technique where the values are centered around
    # the mean with a unit standard deviation
    if i_args.standardise_data:
        processed_data = TimeSeriesScalerMeanVariance().fit_transform(processed_data)
        print("Data was standardised")

    # obtain max value of data (to be used in visualization subplots)
    max_data = processed_data.max() * 1.2
    min_data = processed_data.min() * 1.2

    timer_tick = get_time_tick(start)
    print("#############################################################################################")
    print("Data processing stage elapsed time: %s" % timer_tick)

    #############################################################################################
    # Implementing Euclidean k-means clustering algorithm
    #############################################################################################

    if i_args.euclidean_clustering:

        start = timer()
        print("Euclidean k-means")

        # define parameters of the model of the algorithm
        k_means_euclidean = TimeSeriesKMeans(
            n_clusters=i_args.number_clusters,
            verbose=True,
            random_state=seed,
            n_jobs=4
        )

        # calculate cluster's label array
        euclidean_clustered_data = k_means_euclidean.fit_predict(processed_data)

        # draw subplots with attributed clusters of time series as well as
        # cluster centers' lines
        for i_cluster in range(i_args.number_clusters):
            f_ax_euclidean = create_figure_axes(fig, grid_spec, count_layer, i_cluster,
                                                n_measuring, min_data, max_data,
                                                processed_data, euclidean_clustered_data, 'tab:blue')

            f_ax_euclidean.plot(
                k_means_euclidean.cluster_centers_[i_cluster].ravel(),
                "tab:green"
            )

            if i_cluster == title_cluster:
                middle_axis = f_ax_euclidean

        # increment count of filled layer of subplots
        count_layer += 1

        # obtain processing time, print it to console and
        # add it to the title of the series of subplots
        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "Euclidean $k$-means (%s)" % timer_tick,
            color='tab:green', y=1, pad=14
        )
        print("#############################################################################################")
        print("Euclidean k-means time processing: %s" % timer_tick)

    #############################################################################################
    # Implementing DTW k-means clustering algorithm
    # use dtw (Dynamic Time Warping Distance) metric to calculate
    # distance between means
    #############################################################################################

    if i_args.dtw_clustering:

        start = timer()
        print("DTW k-means")
        k_means_DTW = TimeSeriesKMeans(n_clusters=i_args.number_clusters,
                                       n_init=3,
                                       metric="dtw",
                                       verbose=True,
                                       max_iter_barycenter=10,
                                       random_state=seed,
                                       n_jobs=6
                                       )
        dtw_clustered_data = k_means_DTW.fit_predict(processed_data)

        for i_cluster in range(i_args.number_clusters):
            f_ax_dtw = create_figure_axes(fig, grid_spec, count_layer, i_cluster,
                                          n_measuring, min_data, max_data,
                                          processed_data, dtw_clustered_data, 'tab:blue')

            f_ax_dtw.plot(
                k_means_DTW.cluster_centers_[i_cluster].ravel(),
                "tab:red"
            )
            if i_cluster == title_cluster:
                middle_axis = f_ax_dtw

        # increment count of filled layer of subplots
        count_layer += 1

        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "DTW $k$-means (%s)" % timer_tick,
            color='tab:red', y=1, pad=14
        )
        print("#############################################################################################")
        print("DTW k-means time processing: %s" % timer_tick)

    #############################################################################################
    # Implementing soft DTW k-means clustering algorithm
    # use soft dtw (Dynamic Time Warping Distance) metric to calculate
    # distance between means
    #############################################################################################

    if i_args.soft_dtw_clustering:

        start = timer()
        print("Soft-DTW k-means")
        k_means_soft_DTW = TimeSeriesKMeans(n_clusters=i_args.number_clusters,
                                            metric="softdtw",
                                            metric_params={"gamma": .025},
                                            verbose=True,
                                            random_state=seed,
                                            n_jobs=6
                                            )
        soft_dtw_clustered_data = k_means_soft_DTW.fit_predict(processed_data)

        for i_cluster in range(i_args.number_clusters):
            f_ax_soft_dtw = create_figure_axes(fig, grid_spec, count_layer, i_cluster,
                                               n_measuring, min_data, max_data,
                                               processed_data, soft_dtw_clustered_data, 'tab:blue')

            f_ax_soft_dtw.plot(
                k_means_soft_DTW.cluster_centers_[i_cluster].ravel(),
                "tab:purple"
            )

            if i_cluster == title_cluster:
                middle_axis = f_ax_soft_dtw

        # increment count of filled layer of subplots
        count_layer += 1

        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "Soft-DTW $k$-means (%s)" % timer_tick,
            color='tab:purple', y=1, pad=14
        )
        print("#############################################################################################")
        print("Soft-DTW k-means time processing: %s" % timer_tick)

    #############################################################################################
    # Implementing k-Shape clustering algorithm
    #############################################################################################

    if i_args.k_shape_clustering:

        start = timer()
        print("K-Shape")
        k_shape = KShape(n_clusters=i_args.number_clusters,
                         verbose=True,
                         random_state=seed
                         )
        k_shape_clustered_data = k_shape.fit_predict(processed_data)

        for i_cluster in range(i_args.number_clusters):

            # widen the axis range when a cluster centre lies outside the
            # range derived from the data
            min_axe_value = min(min_data, k_shape.cluster_centers_[i_cluster].ravel().min())
            max_axe_value = max(max_data, k_shape.cluster_centers_[i_cluster].ravel().max())

            f_ax_k_shape = create_figure_axes(fig, grid_spec, count_layer, i_cluster,
                                              n_measuring, min_axe_value, max_axe_value,
                                              processed_data, k_shape_clustered_data, 'tab:blue')

            f_ax_k_shape.plot(
                k_shape.cluster_centers_[i_cluster].ravel(),
                "tab:orange"
            )

            if i_cluster == title_cluster:
                middle_axis = f_ax_k_shape

        # increment count of filled layer of subplots
        count_layer += 1

        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "$K$-Shape (%s)" % timer_tick,
            color='tab:orange', y=1, pad=14
        )
        print("#############################################################################################")
        print("K-Shape time processing: %s" % timer_tick)

    #############################################################################################
    # Implementing Global Alignment kernel k-means clustering algorithm
    # since kernel is used, there is no centroid of the cluster
    #############################################################################################

    if i_args.gak_clustering:

        start = timer()
        print("GAK-k-means")
        gak_k_means = KernelKMeans(n_clusters=i_args.number_clusters,
                                   kernel="gak",
                                   kernel_params={"sigma": "auto"},
                                   n_init=10,
                                   verbose=True,
                                   random_state=seed,
                                   n_jobs=6
                                   )

        gak_clustered_data = gak_k_means.fit_predict(processed_data)

        for i_cluster in range(i_args.number_clusters):
            f_ax_gak_k_means = create_figure_axes(fig, grid_spec, count_layer, i_cluster,
                                                  n_measuring, min_data, max_data,
                                                  processed_data, gak_clustered_data, 'tab:blue')

            if i_cluster == title_cluster:
                middle_axis = f_ax_gak_k_means

        # increment count of filled layer of subplots
        count_layer += 1

        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "Global Alignment kernel $k$-means (%s)" % timer_tick,
            color='tab:cyan', y=1, pad=14)
        print("#############################################################################################")
        print("GAK k-means time processing: %s" % timer_tick)

    #############################################################################################

    # return string with current datetime
    now = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")

    # define the name of the directory to be created
    path = "./out/%s" % now

    print("#############################################################################################")
    try:
        # makedirs also creates the parent "./out" directory when missing
        os.makedirs(path)
    except OSError:
        print("Creation of the directory %s failed" % path)
    else:
        print("Successfully created the directory %s " % path)

    try:
        # save figure as pdf to out folder
        fig.savefig("./out/%s/visual_result.pdf" % now)

        # save clustering results
        if i_args.euclidean_clustering:
            numpy.savetxt(
                "./out/%s/euclidean_clustering_result.csv" % now,
                euclidean_clustered_data,
                delimiter=","
            )
        if i_args.dtw_clustering:
            numpy.savetxt(
                "./out/%s/dtw_clustering_result.csv" % now,
                dtw_clustered_data,
                delimiter=","
            )
        if i_args.soft_dtw_clustering:
            numpy.savetxt(
                "./out/%s/soft_dtw_clustering_result.csv" % now,
                soft_dtw_clustered_data,
                delimiter=","
            )
        if i_args.k_shape_clustering:
            numpy.savetxt(
                "./out/%s/k_shape_clustering_result.csv" % now,
                k_shape_clustered_data,
                delimiter=","
            )
        if i_args.gak_clustering:
            numpy.savetxt(
                "./out/%s/gak_clustering_result.csv" % now,
                gak_clustered_data,
                delimiter=","
            )
    except (RuntimeError, OSError):
        # file-system failures (missing directory, permissions) surface as
        # OSError, so they are reported the same way as RuntimeError
        print("Saving results failed")
    else:
        print("Successfully saved results in the path %s " % path)

    #############################################################################################

    # obtain and print global executing time
    timer_tick = get_time_tick(start_global)
    print("#############################################################################################")
    print("All algorithms elapsed time: % s" % timer_tick)

    #############################################################################################

    # render and show plot
    # plt.show()
    plt.draw()
    plt.pause(0.001)
    # keep the process (and the interactive window) alive until confirmed
    input("Press [enter] to finish.")
    print("#############################################################################################")
예제 #8
0
# Load the table of events whose true cluster is already known.
# NOTE: read_pickle unpickles arbitrary objects -- only load trusted files.
# Presumably a DataFrame with one column per cluster name -- TODO confirm.
true_clusters_known = pd.read_pickle('data/known_true_clusters_ids.pkl')
all_clusters_data = []
# Name of the column (cluster) whose events are processed below.
cl = 'Tree'
# Disabled exploratory code (left as found; the bare "for" line is incomplete):
# min_size = 1280
# for
# for ev in true_clusters_known[cl].dropna():
#     e = Event(ev, 0, -1, 'resampled').data.shape[0]
#     if min_size > e:
#         min_size = e
# print(e)

# Collect one data table per non-missing event id of the chosen cluster.
for ev in true_clusters_known[cl].dropna():
    e = Event(ev, 0, -1, 'resampled')
    # Keep every column of e.res() except the one flagged 'Time (s)' in
    # e.data.columns.
    selected_data = e.res().loc[:, e.data.columns != 'Time (s)']
    all_clusters_data.append(selected_data)

#%%
seed = 0
# Stack the tables into a single tslearn dataset array.
formatted_dataset = to_time_series_dataset(all_clusters_data)
# Replace every NaN entry with 0 before scaling and clustering.
formatted_dataset[np.isnan(formatted_dataset)] = 0
#%%
# Scale the dataset, then cluster it into 3 groups with GAK kernel k-means
# (20 initialisations, fixed seed).
X_train = TimeSeriesScalerMeanVariance().fit_transform(formatted_dataset)
gak_km = KernelKMeans(n_clusters=3,
                      kernel="gak",
                      kernel_params={"sigma": "auto"},
                      n_init=20,
                      verbose=True,
                      random_state=seed)
# Cluster label assigned to each series in X_train.
y_pred = gak_km.fit_predict(X_train)
#%%
예제 #9
0
class TimeSeriesKernelKMeans(BaseClusterer):
    """Kernel k-means clusterer wrapping tslearn's ``KernelKMeans``.

    Parameters
    ----------
    n_clusters: int, defaults = 8
        The number of clusters to form as well as the number of
        centroids to generate.
    kernel : string, or callable (default: "gak")
        The kernel should either be "gak", in which case the Global Alignment
        Kernel is used, or a value that is accepted as a metric by
        scikit-learn's ``sklearn.metrics.pairwise.pairwise_kernels``.
    n_init: int, defaults = 10
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final result will be the best output of n_init
        consecutive runs in terms of inertia.
    max_iter: int, defaults = 300
        Maximum number of iterations of the k-means algorithm for a single
        run.
    tol: float, defaults = 1e-4
        Relative tolerance with regards to Frobenius norm of the difference
        in the cluster centers of two consecutive iterations to declare
        convergence.
    kernel_params : dict or None (default: None)
        Kernel parameters to be passed to the kernel function.
        None means no kernel parameter is set.
        For Global Alignment Kernel, the only parameter of interest is `sigma`.
        If set to 'auto', it is computed based on a sampling of the training
        set (cf. ``tslearn.metrics.sigma_gak``).
        If no specific value is set for `sigma`, it defaults to 1.
    verbose: bool, defaults = False
        Verbosity mode. Only the literal ``True`` turns verbose output on.
    n_jobs : int or None, optional (default=None)
        The number of jobs to run in parallel for GAK cross-similarity matrix
        computations.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See scikit-learn's
        `Glossary <https://scikit-learn.org/stable/glossary.html#term-n-jobs>`_
        for more details.
    random_state: int or np.random.RandomState instance or None, defaults = None
        Determines random number generation for centroid initialization.

    Attributes
    ----------
    cluster_centers_: None
        Initialised to None in the constructor and never assigned by this
        wrapper.
    labels_: np.ndarray (1d array of shape (n_instance,))
        Index of the cluster each training time series belongs to.
    inertia_: float
        Sum of squared distances of samples to their closest cluster center, weighted by
        the sample weights if provided.
    n_iter_: int
        Number of iterations run (0 before fitting).
    """

    _tags = {
        "capability:multivariate": True,
    }

    def __init__(
        self,
        n_clusters: int = 8,
        kernel: str = "gak",
        n_init: int = 10,
        max_iter: int = 300,
        tol: float = 1e-4,
        kernel_params: Union[dict, None] = None,
        verbose: bool = False,
        n_jobs: Union[int, None] = None,
        random_state: Union[int, RandomState, None] = None,
    ):
        # Fail early when the optional tslearn backend is not installed.
        _check_soft_dependencies("tslearn", severity="error", object=self)

        # Hyper-parameters are stored verbatim; they are only forwarded to
        # tslearn when ``_fit`` runs.
        self.kernel = kernel
        self.n_init = n_init
        self.max_iter = max_iter
        self.tol = tol
        self.kernel_params = kernel_params
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.random_state = random_state

        # Fitted-state placeholders, overwritten by ``_fit``.
        self.cluster_centers_ = None
        self.labels_ = None
        self.inertia_ = None
        self.n_iter_ = 0

        # Backing tslearn estimator, created in ``_fit``.
        self._tslearn_kernel_k_means = None

        super().__init__(n_clusters=n_clusters)

    def _fit(self, X: TimeSeriesInstances, y=None) -> None:
        """Fit time series clusterer to training data.

        Builds a tslearn ``KernelKMeans`` from the current hyper-parameters,
        fits it on ``X`` and copies ``labels_``, ``inertia_`` and ``n_iter_``
        onto this estimator.

        Parameters
        ----------
        X : np.ndarray (2d or 3d array of shape (n_instances, series_length) or shape
            (n_instances, n_dimensions, series_length))
            Training time series instances to cluster.
        y: ignored, exists for API consistency reasons.

        Returns
        -------
        None
            The fitted state is stored on the instance.
        """
        from tslearn.clustering import KernelKMeans as TsLearnKernelKMeans

        # tslearn is handed an integer flag; only the literal True enables it.
        verbose = 1 if self.verbose is True else 0

        # Always build a fresh backend estimator: reusing one left over from
        # an earlier fit would silently ignore hyper-parameters that were
        # changed on this object in between.
        self._tslearn_kernel_k_means = TsLearnKernelKMeans(
            n_clusters=self.n_clusters,
            kernel=self.kernel,
            max_iter=self.max_iter,
            tol=self.tol,
            n_init=self.n_init,
            kernel_params=self.kernel_params,
            n_jobs=self.n_jobs,
            verbose=verbose,
            random_state=self.random_state,
        )
        self._tslearn_kernel_k_means.fit(X)
        self.labels_ = self._tslearn_kernel_k_means.labels_
        self.inertia_ = self._tslearn_kernel_k_means.inertia_
        self.n_iter_ = self._tslearn_kernel_k_means.n_iter_

    def _predict(self, X: TimeSeriesInstances, y=None) -> np.ndarray:
        """Predict the closest cluster each sample in X belongs to.

        Delegates to the tslearn estimator created by ``_fit``.

        Parameters
        ----------
        X : np.ndarray (2d or 3d array of shape (n_instances, series_length) or shape
            (n_instances, n_dimensions, series_length))
            Time series instances to predict their cluster indexes.
        y: ignored, exists for API consistency reasons.

        Returns
        -------
        np.ndarray (1d array of shape (n_instances,))
            Index of the cluster each time series in X belongs to.
        """
        return self._tslearn_kernel_k_means.predict(X)

    @classmethod
    def get_test_params(cls, parameter_set="default"):
        """Return testing parameter settings for the estimator.

        Parameters
        ----------
        parameter_set : str, default="default"
            Name of the set of test parameters to return, for use in tests.
            This estimator defines a single set, so the value is not
            inspected and the same parameters are always returned.

        Returns
        -------
        params : dict
            Parameters to create a testing instance of the class, i.e.
            `MyClass(**params)` creates a valid test instance.
        """
        # Small, single-run configuration so test instances fit quickly.
        params = {
            "n_clusters": 2,
            "kernel": "gak",
            "n_init": 1,
            "max_iter": 1,
            "tol": 1e-4,
            "kernel_params": None,
            "verbose": False,
            "n_jobs": 1,
            "random_state": 1,
        }
        return params

    def _score(self, X, y=None):
        """Return the absolute value of the fitted ``inertia_``.

        ``X`` and ``y`` are accepted for API consistency and not used.
        """
        return np.abs(self.inertia_)