Example No. 1
from tslearn.clustering import GlobalAlignmentKernelKMeans, TimeSeriesKMeans
from tslearn.metrics import sigma_gak


class KMeans:
    def __init__(self,
                 n_clusters,
                 algorithm='GlobalAlignmentKernelKMeans',
                 random_seed=0):
        '''
            initialize KMeans clustering model with specific kernel

            hyperparameters:
                n_clusters:         number of clusters in the KMeans model
                algorithm:          which kernel to use for the model; options
                                    are 'GlobalAlignmentKernelKMeans' and 'TimeSeriesKMeans'
                random_seed:        random seed with which to initialize KMeans
        '''
        if algorithm not in ('GlobalAlignmentKernelKMeans', 'TimeSeriesKMeans'):
            raise ValueError(
                "algorithm must be one of 'GlobalAlignmentKernelKMeans' or 'TimeSeriesKMeans'"
            )
        self.n_clusters = n_clusters
        self.random_seed = random_seed
        self.algorithm = algorithm
        self.km = None

    def fit(self, train):
        '''
            fit KMeans clustering model on training data

            parameters:
                train:    training time series

            returns: cluster assignments for the training series
        '''

        if self.algorithm == 'TimeSeriesKMeans':
            self.km = TimeSeriesKMeans(n_clusters=self.n_clusters,
                                       n_init=20,
                                       verbose=True,
                                       random_state=self.random_seed)
        else:
            self.km = GlobalAlignmentKernelKMeans(
                n_clusters=self.n_clusters,
                sigma=sigma_gak(train),
                n_init=20,
                verbose=True,
                random_state=self.random_seed)
        return self.km.fit_predict(train)

    def predict(self, test):
        '''
            predict clusters for time series in the test data set

            parameters:
                test:     test time series on which to predict clusters

            returns: clusters for the test data set
        '''
        return self.km.predict(test)
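
A minimal usage sketch of the wrapper above, assuming the tslearn imports shown with the class and the "Trace" dataset that also appears in Example No. 4:

from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

# load and normalize a small time-series dataset
X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
X_train = TimeSeriesScalerMeanVariance().fit_transform(X_train[:50])
X_test = TimeSeriesScalerMeanVariance().fit_transform(X_test[:20])

# fit with the GAK kernel and assign clusters to unseen series
model = KMeans(n_clusters=3, algorithm='GlobalAlignmentKernelKMeans', random_seed=0)
train_clusters = model.fit(X_train)   # fit() returns the training assignments
test_clusters = model.predict(X_test)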
Example No. 2
from tslearn.clustering import GlobalAlignmentKernelKMeans
from tslearn.utils import to_time_series_dataset


def do_gak(days, km_size):
    """
    From a time-series DataFrame (days, one series per value of the n_day_
    column), creates km_size clusters using the GAK kernel k-means algorithm.
    """
    # Arrange data for our lib: one series per day
    unq = days["n_day_"].unique()
    values = [days[days["n_day_"] == l]["val_"].values for l in unq]
    formatted_dataset = to_time_series_dataset(values)

    # Configure our kmeans
    gak = GlobalAlignmentKernelKMeans(n_clusters=km_size,
                                      verbose=False,
                                      random_state=42)

    y_pred = gak.fit_predict(formatted_dataset)

    return gak, y_pred
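
A small usage sketch for do_gak, with a synthetic days DataFrame shaped the way the function expects (one n_day_ label and a val_ value per observation); the column contents here are made up purely for illustration:

import numpy as np
import pandas as pd

# three synthetic daily series of 24 points each
rng = np.random.default_rng(0)
days = pd.DataFrame({
    "n_day_": np.repeat([1, 2, 3], 24),
    "val_": rng.normal(size=3 * 24),
})

gak, y_pred = do_gak(days, km_size=2)
print(y_pred)  # one cluster index per day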
Example No. 3
import warnings

import networkx as nx
import numpy as np
import scipy.sparse.linalg
from scipy import sparse
from skimage.io import imread          # assumed source of imread; the original may use another library
from skimage.segmentation import felzenszwalb, slic
from skimage.future import graph
from tslearn.clustering import GlobalAlignmentKernelKMeans
from tslearn.metrics import sigma_gak

seed = 0  # random seed used below; the value is assumed, matching the other examples in this listing


def Kernel_K_Means(img_path, sp_met='slic', num_cuts=3):

    m_img = imread(img_path)

    # superpixels method
    if sp_met == 'felzenszwalb':
        segments = felzenszwalb(m_img, scale=10, sigma=0.5, min_size=100)
    elif sp_met == 'slic':
        segments = slic(m_img, compactness=30, n_segments=400)
    else:
        warnings.warn("Warning Message: no superpixels method parameter")
        raise ValueError("sp_met must be 'felzenszwalb' or 'slic'")
        
    # image -> eigenvectors
    g = graph.rag_mean_color(m_img, segments)
    w = nx.to_scipy_sparse_matrix(g, format='csc')
    entries = w.sum(axis=0)
    d = sparse.dia_matrix((entries, 0), shape=w.shape).tocsc()
    m = w.shape[0]
    d2 = d.copy()
    d2.data = np.reciprocal(np.sqrt(d2.data, out=d2.data), out=d2.data)
    matrix = d2 * (d - w) * d2
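    # at this point, matrix is the symmetric normalized graph Laplacian of the
    # region adjacency graph: D^(-1/2) (D - W) D^(-1/2); its eigenvectors for the
    # smallest eigenvalues give the spectral embedding clustered below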

    # matrix eigen-decomposition, scipy.sparse.linalg
    vals, vectors = scipy.sparse.linalg.eigsh(matrix, which='SM', k=min(100, m - 2))
    vals, vectors = np.real(vals), np.real(vectors)
    
    # keep the eigenvector of the second-smallest eigenvalue (the Fiedler vector)
#     index1, index2, index3 = np.argsort(vals)[0], np.argsort(vals)[1], np.argsort(vals)[2]
#     ev1, ev2, ev3 = vectors[:, index1], vectors[:, index2], vectors[:, index3]
    index = np.argsort(vals)[1]
    X_train = vectors[:, index]
    
    # Kernel K-means
    gak_km = GlobalAlignmentKernelKMeans(n_clusters=num_cuts, sigma=sigma_gak(X_train),
                                         n_init=20, verbose=True, random_state=seed)
    sp_label = gak_km.fit_predict(X_train)
    
    # get pixel label
    p_label, labels = pixels_label(m_img, segments, sp_label)

    return p_label
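
pixels_label is not defined in this snippet; the sketch below is a hypothetical stand-in, assuming segments is the per-pixel superpixel label image returned by slic/felzenszwalb and sp_label holds one cluster index per superpixel, in the order of np.unique(segments):

def pixels_label(m_img, segments, sp_label):
    # map each superpixel id to its cluster, then broadcast that cluster to every pixel
    sp_ids = np.unique(segments)
    cluster_of_sp = dict(zip(sp_ids, sp_label))
    p_label = np.vectorize(cluster_of_sp.get)(segments)
    return p_label, sp_label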
Example No. 4
import numpy as np
import matplotlib.pyplot as plt

from tslearn.clustering import GlobalAlignmentKernelKMeans
from tslearn.metrics import sigma_gak, cdist_gak
from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

# tslearn:
seed = 0                                 # Physical data cannot be fed to the model directly, so it is quantized: a finer grid gives more samples and higher precision, a coarser grid gives fewer samples and lower accuracy.
np.random.seed(seed)                     # time-series data are such discrete samples
X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
X_train = X_train[y_train < 4]
np.random.shuffle(X_train)

#          TimeSeriesScalerMeanVariance takes mu (mean) and std (standard deviation) as parameters => normalizes each series to zero mean and unit variance.
X_train = TimeSeriesScalerMeanVariance().fit_transform(X_train[:50])
sz = X_train.shape[1]
gak_km = GlobalAlignmentKernelKMeans(n_clusters=3, sigma=sigma_gak(X_train),    # Why GlobalAlignmentKernelKMeans? Because it compares time series over their whole range with a kernel; recall that to measure time-series similarity accurately we use DTW (Dynamic Time Warping) distances.
                                     n_init=20, verbose=True, random_state=seed)     # sigma is the kernel bandwidth, here estimated from X_train with sigma_gak.
        # n_init is how many times the cluster centers are re-initialized; verbose controls whether progress is printed.
y_pred = gak_km.fit_predict(X_train)
plt.figure()
for yi in range(3):
    plt.subplot(3, 1, 1+ yi)
    for xx in X_train[y_pred == yi]:
        plt.plot(xx.ravel(), "k-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
    plt.title("Cluster %d" % (yi + 1))

plt.tight_layout()

plt.show()
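
The comments above mention DTW as the usual way to compare time series; for comparison, a minimal sketch of the same clustering with tslearn's DTW-based TimeSeriesKMeans instead of the GAK kernel (it also exposes cluster_centers_ that can be plotted):

from tslearn.clustering import TimeSeriesKMeans

dtw_km = TimeSeriesKMeans(n_clusters=3, metric="dtw",
                          n_init=5, verbose=True, random_state=seed)
y_pred_dtw = dtw_km.fit_predict(X_train)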
Example No. 5
import numpy as np
import matplotlib.pyplot as plt

from tslearn.clustering import GlobalAlignmentKernelKMeans
from tslearn.metrics import sigma_gak
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn.utils import to_time_series_dataset

X = to_time_series_dataset(hum_sub)  # hum_sub: the raw series, defined elsewhere in the original script
print(X.shape)
X = TimeSeriesScalerMeanVariance().fit_transform(X)
sz = X.shape[1]

seed = 0
np.random.seed(seed)

nclust = 4
gak_km = GlobalAlignmentKernelKMeans(n_clusters=nclust,
                                     sigma=sigma_gak(X),
                                     n_init=20,
                                     verbose=True,
                                     random_state=seed)
y_pred = gak_km.fit_predict(X)

print(gak_km.inertia_)
print(y_pred + 1)

plt.figure()
for yi in range(nclust):
    plt.subplot(nclust, 1, 1 + yi)
    for xx in X[y_pred == yi]:
        plt.plot(xx.ravel(), "k-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
    plt.title("Cluster %d" % (yi + 1))

plt.tight_layout()
plt.show()
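
Since the snippet prints gak_km.inertia_, a natural follow-up is to compare that inertia across several values of nclust (a simple elbow-style check); a sketch reusing X and seed from above:

inertias = {}
for k in range(2, 7):
    km_k = GlobalAlignmentKernelKMeans(n_clusters=k,
                                       sigma=sigma_gak(X),
                                       n_init=20,
                                       random_state=seed)
    km_k.fit(X)
    inertias[k] = km_k.inertia_

for k, inertia in sorted(inertias.items()):
    print(k, inertia)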