Example #1
from tslearn.clustering import TimeSeriesKMeans, GlobalAlignmentKernelKMeans
from tslearn.metrics import sigma_gak


class KMeans():
    def __init__(self,
                 n_clusters,
                 algorithm='GlobalAlignmentKernelKMeans',
                 random_seed=0):
        '''
            initialize KMeans clustering model with specific kernel

            hyperparameters:
                n_clusters:         number of clusters in Kmeans model
                algorithm:          which kernel to use for model, options 
                                    are 'GlobalAlignmentKernelKMeans' and 'TimeSeriesKMeans'
                random_seed:        random seed with which to initialize Kmeans
        '''
        if algorithm not in ('GlobalAlignmentKernelKMeans', 'TimeSeriesKMeans'):
            raise ValueError(
                "algorithm must be one of 'GlobalAlignmentKernelKMeans' "
                "or 'TimeSeriesKMeans'"
            )
        self.n_clusters = n_clusters
        self.random_seed = random_seed
        self.algorithm = algorithm
        self.km = None

    def fit(self, train):
        '''
            fit KMeans clustering model on training data

            parameters:
                train                : training time series

            returns: cluster assignments for the training data
        '''

        if self.algorithm == 'TimeSeriesKMeans':
            self.km = TimeSeriesKMeans(n_clusters=self.n_clusters,
                                       n_init=20,
                                       verbose=True,
                                       random_state=self.random_seed)
        else:
            self.km = GlobalAlignmentKernelKMeans(
                n_clusters=self.n_clusters,
                sigma=sigma_gak(train),
                n_init=20,
                verbose=True,
                random_state=self.random_seed)
        return self.km.fit_predict(train)

    def predict(self, test):
        '''
            predict clusters for time series in the test data set

            parameters:
                test:     test time series on which to predict clusters

            returns: clusters for test data set
        '''
        return self.km.predict(test)
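
A minimal usage sketch for this wrapper, assuming the tslearn imports above; the toy data and hyperparameter values below are illustrative assumptions, not part of the original repo:

import numpy as np

# 30 univariate time series of length 40
toy = np.random.RandomState(0).randn(30, 40, 1)

model = KMeans(n_clusters=3, algorithm='TimeSeriesKMeans', random_seed=0)
train_labels = model.fit(toy)     # fit() returns cluster assignments
test_labels = model.predict(toy)  # reuse the fitted model on new data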
Example #2
    def fit(self, train):
        '''
            fit KMeans clustering model on training data

            parameters:
                train                : training time series
        ''' 

        if self.algorithm == 'TimeSeriesKMeans':
            self.km = TimeSeriesKMeans(n_clusters=self.n_clusters,
                                       n_init=20,
                                       verbose=True,
                                       random_state=self.random_seed)
        else:
            self.km = GlobalAlignmentKernelKMeans(n_clusters=self.n_clusters,
                                                  sigma=sigma_gak(train),
                                                  n_init=20,
                                                  verbose=True,
                                                  random_state=self.random_seed)
        self.km.fit(train)
Example #3
def test_serialize_global_alignment_kernel_kmeans():
    n, sz, d = 15, 10, 3
    rng = numpy.random.RandomState(0)
    X = rng.randn(n, sz, d)

    gak_km = GlobalAlignmentKernelKMeans(n_clusters=3, verbose=False,
                                         max_iter=5)

    _check_not_fitted(gak_km)

    gak_km.fit(X)

    _check_params_predict(gak_km, X, ['predict'])
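
`_check_not_fitted` and `_check_params_predict` are private helpers from the test suite this snippet was taken from and are not shown here. A rough sketch of the round-trip property the serialization check verifies, assuming pickle as the format (the real helpers cover more cases):

import pickle
import numpy

def pickle_round_trip_check(model, X):
    # a fitted model should survive serialization with identical predictions
    before = model.predict(X)
    restored = pickle.loads(pickle.dumps(model))
    numpy.testing.assert_array_equal(before, restored.predict(X))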
Example #4
def test_variable_length_clustering():
    # TODO: here we just check that they can accept variable-length TS, not
    # that they do clever things
    X = to_time_series_dataset([[1, 2, 3, 4], [1, 2, 3], [2, 5, 6, 7, 8, 9],
                                [3, 5, 6, 7, 8]])
    rng = np.random.RandomState(0)

    clf = GlobalAlignmentKernelKMeans(n_clusters=2, random_state=rng)
    clf.fit(X)

    clf = TimeSeriesKMeans(n_clusters=2, metric="dtw", random_state=rng)
    clf.fit(X)

    clf = TimeSeriesKMeans(n_clusters=2, metric="softdtw", random_state=rng)
    clf.fit(X)
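
These fits succeed because `to_time_series_dataset` pads the shorter series with NaN up to the longest length, and tslearn's variable-length-aware metrics ignore the padding. A quick way to see the padded shape (expected output noted in comments):

import numpy as np
from tslearn.utils import to_time_series_dataset

X = to_time_series_dataset([[1, 2, 3, 4], [1, 2, 3], [2, 5, 6, 7, 8, 9],
                            [3, 5, 6, 7, 8]])
print(X.shape)               # (4, 6, 1): 4 series padded to length 6, 1 dim
print(np.isnan(X[1, :, 0]))  # the length-3 series is NaN-padded at the end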
Example #5
def test_gak_kmeans():
    n, sz, d = 15, 10, 3
    rng = np.random.RandomState(0)
    time_series = rng.randn(n, sz, d)

    gak_km = GlobalAlignmentKernelKMeans(n_clusters=3,
                                         verbose=False,
                                         max_iter=5,
                                         random_state=rng).fit(time_series)
    np.testing.assert_allclose(gak_km.labels_, gak_km.predict(time_series))

    # 101 clusters cannot all be non-empty with only 15 series, so every
    # initialization fails and the model is left unfitted (_X_fit stays None)
    gak_km = GlobalAlignmentKernelKMeans(n_clusters=101,
                                         verbose=False,
                                         max_iter=5,
                                         random_state=rng).fit(time_series)
    assert gak_km._X_fit is None
Example #6

    def do_gak(days, km_size):
        """
        From a time series DataFrame `days` (one row per observation, with a
        day index column `n_day_` and a value column `val_`), build km_size
        clusters using the GAK algorithm.
        """
        # Arrange data for our lib
        unq = days["n_day_"].unique()
        values = [days[days["n_day_"] == l]["val_"].values for l in unq]
        formatted_dataset = to_time_series_dataset(values)

        # Configure our kmeans
        gak = GlobalAlignmentKernelKMeans(n_clusters=km_size,
                                          verbose=False,
                                          random_state=42)

        y_pred = gak.fit_predict(formatted_dataset)

        return gak, y_pred
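
A toy invocation sketch, assuming `do_gak` is reachable at module level and that `days` follows the `n_day_`/`val_` schema the function reads; the values are made up:

import pandas as pd

days = pd.DataFrame({
    "n_day_": [0, 0, 0, 1, 1, 1, 2, 2, 2],
    "val_":   [1.0, 2.0, 3.0, 1.1, 2.1, 2.9, 5.0, 6.0, 7.0],
})
gak, labels = do_gak(days, km_size=2)  # one cluster label per distinct day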
Example #7
import warnings

import numpy as np
import networkx as nx
import scipy.sparse.linalg
from scipy import sparse
from skimage.io import imread
from skimage.segmentation import felzenszwalb, slic
from skimage.future import graph
from tslearn.clustering import GlobalAlignmentKernelKMeans
from tslearn.metrics import sigma_gak


def Kernel_K_Means(img_path, sp_met='slic', num_cuts=3):

    m_img = imread(img_path)

    # superpixels method
    if sp_met == 'felzenszwalb':
        segments = felzenszwalb(m_img, scale=10, sigma=0.5, min_size=100)
    elif sp_met == 'slic':
        segments = slic(m_img, compactness=30, n_segments=400)
    else:
        raise ValueError("unknown superpixels method: %r" % sp_met)
        
    # image -> eigenvectors
    g = graph.rag_mean_color(m_img, segments)
    w = nx.to_scipy_sparse_matrix(g, format='csc')
    entries = w.sum(axis=0)
    d = sparse.dia_matrix((entries, 0), shape=w.shape).tocsc()
    m = w.shape[0]
    d2 = d.copy()
    d2.data = np.reciprocal(np.sqrt(d2.data, out=d2.data), out=d2.data)
    matrix = d2 * (d - w) * d2

    # matrix eigen-decomposition, scipy.sparse.linalg
    vals, vectors = scipy.sparse.linalg.eigsh(matrix, which='SM', k=min(100, m - 2))
    vals, vectors = np.real(vals), np.real(vectors)
    
    # keep the second-smallest eigenvector (the Fiedler vector); the
    # commented-out lines are the earlier attempt that kept three of them
#     index1, index2, index3 = np.argsort(vals)[0], np.argsort(vals)[1], np.argsort(vals)[2]
#     ev1, ev2, ev3 = vectors[:, index1], vectors[:, index2], vectors[:, index3]
    index = np.argsort(vals)[1]
    X_train = vectors[:, index]
    
    # Kernel K-means (`seed` is expected to be defined at module level)
    gak_km = GlobalAlignmentKernelKMeans(n_clusters=num_cuts,
                                         sigma=sigma_gak(X_train),
                                         n_init=20,
                                         verbose=True,
                                         random_state=seed)
    sp_label = gak_km.fit_predict(X_train)
    
    # map each superpixel's cluster back to its pixels (helper not shown here)
    p_label, labels = pixels_label(m_img, segments, sp_label)

    return p_label
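
`pixels_label` is not included in this example. A hypothetical minimal reconstruction, assuming `segments` holds one superpixel id per pixel so fancy indexing can broadcast cluster ids to pixels (an assumption, not the repo's actual helper):

import numpy as np

def pixels_label(m_img, segments, sp_label):
    # hypothetical: sp_label[i] is the cluster of superpixel i, so indexing
    # by the segments map yields a per-pixel label image
    sp_label = np.asarray(sp_label)
    p_label = sp_label[segments]
    return p_label, np.unique(sp_label)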
Example #8
    def fit(self, train):
        """
        fit KMeans clustering model on training data

        parameters:
            train                : training time series
        """

        if self.algorithm == "TimeSeriesKMeans":
            self.km = TimeSeriesKMeans(
                n_clusters=self.n_clusters,
                n_init=20,
                verbose=True,
                random_state=self.random_seed,
            )
        else:
            self.km = GlobalAlignmentKernelKMeans(
                n_clusters=self.n_clusters,
                sigma=sigma_gak(train),
                n_init=20,
                verbose=True,
                random_state=self.random_seed,
            )
        return self.km.fit_predict(train)
Example #9
import numpy as np
import matplotlib.pyplot as plt

from tslearn.clustering import GlobalAlignmentKernelKMeans
from tslearn.metrics import sigma_gak, cdist_gak
from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

# tslearn:
# Physical signals cannot be fed in directly, so they are quantized into
# discrete time series: denser sampling means more data and higher precision,
# sparser sampling means less data and lower accuracy.
seed = 0
np.random.seed(seed)
X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
X_train = X_train[y_train < 4]
np.random.shuffle(X_train)

# TimeSeriesScalerMeanVariance rescales each series to the given mu (mean)
# and std (standard deviation), i.e. a z-score normalization.
X_train = TimeSeriesScalerMeanVariance().fit_transform(X_train[:50])
sz = X_train.shape[1]

# GlobalAlignmentKernelKMeans compares whole series with the GAK kernel,
# which, like DTW (Dynamic Time Warping), aligns the series before measuring
# similarity. sigma is the kernel bandwidth, estimated from X_train with
# sigma_gak; n_init is how many center re-initializations to try, and
# verbose=True prints progress.
gak_km = GlobalAlignmentKernelKMeans(n_clusters=3,
                                     sigma=sigma_gak(X_train),
                                     n_init=20,
                                     verbose=True,
                                     random_state=seed)
y_pred = gak_km.fit_predict(X_train)
plt.figure()
for yi in range(3):
    plt.subplot(3, 1, 1 + yi)
    for xx in X_train[y_pred == yi]:
        plt.plot(xx.ravel(), "k-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
    plt.title("Cluster %d" % (yi + 1))

plt.tight_layout()

plt.show()
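
`cdist_gak` is imported above but never used. A short sketch of using it to inspect the GAK kernel matrix that drives this clustering; the inspection is an addition for illustration, not part of the original script:

# pairwise GAK similarities between the 50 scaled series, using the same
# bandwidth that the clustering model used
K = cdist_gak(X_train, sigma=sigma_gak(X_train))
print(K.shape)  # (50, 50)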
Example #10
import numpy as np
import matplotlib.pyplot as plt

from tslearn.clustering import GlobalAlignmentKernelKMeans
from tslearn.metrics import sigma_gak
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn.utils import to_time_series_dataset

hum_sub = np.loadtxt('../../HUM_subs.csv', delimiter=',', skiprows=1)
print(hum_sub.shape)

X = to_time_series_dataset(hum_sub)
print(X.shape)
X = TimeSeriesScalerMeanVariance().fit_transform(X)
sz = X.shape[1]

seed = 0
np.random.seed(seed)

nclust = 4
gak_km = GlobalAlignmentKernelKMeans(n_clusters=nclust,
                                     sigma=sigma_gak(X),
                                     n_init=20,
                                     verbose=True,
                                     random_state=seed)
y_pred = gak_km.fit_predict(X)

print(gak_km.inertia_)
print(y_pred + 1)

plt.figure()
for yi in range(nclust):
    plt.subplot(nclust, 1, 1 + yi)
    for xx in X[y_pred == yi]:
        plt.plot(xx.ravel(), "k-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
    plt.title("Cluster %d" % (yi + 1))

plt.tight_layout()
plt.show()