from tslearn.clustering import GlobalAlignmentKernelKMeans, TimeSeriesKMeans
from tslearn.metrics import sigma_gak


class KMeans():
    def __init__(self, n_clusters, algorithm='GlobalAlignmentKernelKMeans',
                 random_seed=0):
        '''
        initialize KMeans clustering model with a specific kernel

        hyperparameters:
            n_clusters: number of clusters in the KMeans model
            algorithm: which kernel to use for the model, options are
                'GlobalAlignmentKernelKMeans' and 'TimeSeriesKMeans'
            random_seed: random seed with which to initialize KMeans
        '''
        if algorithm not in ('GlobalAlignmentKernelKMeans', 'TimeSeriesKMeans'):
            raise ValueError("algorithm must be one of "
                             "'GlobalAlignmentKernelKMeans' or 'TimeSeriesKMeans'")
        self.n_clusters = n_clusters
        self.random_seed = random_seed
        self.algorithm = algorithm
        self.km = None

    def fit(self, train):
        '''
        fit KMeans clustering model on training data

        parameters:
            train: training time series
        '''
        if self.algorithm == 'TimeSeriesKMeans':
            self.km = TimeSeriesKMeans(n_clusters=self.n_clusters, n_init=20,
                                       verbose=True,
                                       random_state=self.random_seed)
        else:
            self.km = GlobalAlignmentKernelKMeans(n_clusters=self.n_clusters,
                                                  sigma=sigma_gak(train),
                                                  n_init=20, verbose=True,
                                                  random_state=self.random_seed)
        return self.km.fit_predict(train)

    def predict(self, test):
        '''
        predict clusters for time series in the test data set

        parameters:
            test: test time series on which to predict clusters

        returns: clusters for the test data set
        '''
        return self.km.predict(test)
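# Minimal usage sketch for the wrapper class above (the toy data and
# variable names below are assumptions, not part of the original snippet).
import numpy as np

rng = np.random.RandomState(0)
toy_series = rng.randn(20, 30, 1)          # 20 series, 30 timestamps, 1 dim

model = KMeans(n_clusters=3, algorithm='GlobalAlignmentKernelKMeans')
train_labels = model.fit(toy_series[:15])  # fit returns cluster assignments
test_labels = model.predict(toy_series[15:])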
def fit(self, train):
    '''
    fit KMeans clustering model on training data

    parameters:
        train: training time series
    '''
    if self.algorithm == 'TimeSeriesKMeans':
        self.km = TimeSeriesKMeans(n_clusters=self.n_clusters, n_init=20,
                                   verbose=True,
                                   random_state=self.random_seed)
    else:
        self.km = GlobalAlignmentKernelKMeans(n_clusters=self.n_clusters,
                                              sigma=sigma_gak(train),
                                              n_init=20, verbose=True,
                                              random_state=self.random_seed)
    self.km.fit(train)
def test_serialize_global_alignment_kernel_kmeans():
    n, sz, d = 15, 10, 3
    rng = numpy.random.RandomState(0)
    X = rng.randn(n, sz, d)

    gak_km = GlobalAlignmentKernelKMeans(n_clusters=3, verbose=False,
                                         max_iter=5)

    _check_not_fitted(gak_km)

    gak_km.fit(X)

    _check_params_predict(gak_km, X, ['predict'])
def test_variable_length_clustering():
    # TODO: here we just check that they can accept variable-length TS, not
    # that they do clever things
    X = to_time_series_dataset([[1, 2, 3, 4], [1, 2, 3], [2, 5, 6, 7, 8, 9],
                                [3, 5, 6, 7, 8]])
    rng = np.random.RandomState(0)

    clf = GlobalAlignmentKernelKMeans(n_clusters=2, random_state=rng)
    clf.fit(X)

    clf = TimeSeriesKMeans(n_clusters=2, metric="dtw", random_state=rng)
    clf.fit(X)

    clf = TimeSeriesKMeans(n_clusters=2, metric="softdtw", random_state=rng)
    clf.fit(X)
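# Minimal sketch of how to_time_series_dataset handles variable-length input
# (the toy values below are an assumption, not part of the original test):
# shorter series are NaN-padded into a single (n_ts, max_sz, d) array.
import numpy as np
from tslearn.utils import to_time_series_dataset

X = to_time_series_dataset([[1, 2, 3, 4], [1, 2, 3]])
print(X.shape)               # (2, 4, 1)
print(np.isnan(X[1, 3, 0]))  # True: the shorter series was padded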
def test_gak_kmeans():
    n, sz, d = 15, 10, 3
    rng = np.random.RandomState(0)
    time_series = rng.randn(n, sz, d)

    gak_km = GlobalAlignmentKernelKMeans(n_clusters=3, verbose=False,
                                         max_iter=5,
                                         random_state=rng).fit(time_series)
    np.testing.assert_allclose(gak_km.labels_, gak_km.predict(time_series))

    # Asking for more clusters (101) than samples (15): fitting fails and
    # the model is left unfitted.
    gak_km = GlobalAlignmentKernelKMeans(n_clusters=101, verbose=False,
                                         max_iter=5,
                                         random_state=rng).fit(time_series)
    assert gak_km._X_fit is None
def do_gak(days, km_size):
    """
    From a time series DataFrame `days` (with columns "n_day_" and "val_"),
    create km_size clusters using the GAK kernel k-means algorithm.
    """
    # Arrange data for our lib: one series of "val_" per unique "n_day_"
    unq = days["n_day_"].unique()
    values = [days[days["n_day_"] == day]["val_"].values for day in unq]
    formatted_dataset = to_time_series_dataset(values)

    # Configure our kmeans
    gak = GlobalAlignmentKernelKMeans(n_clusters=km_size, verbose=False,
                                      random_state=42)
    y_pred = gak.fit_predict(formatted_dataset)
    return gak, y_pred
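# Minimal usage sketch for do_gak, given the imports its module is assumed
# to provide (the toy DataFrame below is an assumption, not part of the
# original snippet).
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
days = pd.DataFrame({
    "n_day_": np.repeat(np.arange(6), 24),   # 6 days, 24 values each
    "val_": rng.randn(6 * 24),
})
gak_model, labels = do_gak(days, km_size=2)
print(labels)  # one cluster id per day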
import warnings

import numpy as np
import scipy.sparse.linalg
from scipy import sparse
import networkx as nx
from skimage.io import imread
from skimage.segmentation import felzenszwalb, slic
from skimage.future import graph
from tslearn.clustering import GlobalAlignmentKernelKMeans
from tslearn.metrics import sigma_gak

# NOTE: the imports above are reconstructed from usage and may differ from
# the original module. `seed` and `pixels_label` are assumed to be defined
# elsewhere in that module (a random-seed constant and a helper that maps
# superpixel labels back to pixels).


def Kernel_K_Means(img_path, sp_met='slic', num_cuts=3):
    m_img = imread(img_path)

    # superpixels method
    if sp_met == 'felzenszwalb':
        segments = felzenszwalb(m_img, scale=10, sigma=0.5, min_size=100)
    elif sp_met == 'slic':
        segments = slic(m_img, compactness=30, n_segments=400)
    else:
        warnings.warn("Warning Message: no superpixels method parameter")

    # image -> eigenvectors of the normalized graph Laplacian
    g = graph.rag_mean_color(m_img, segments)
    w = nx.to_scipy_sparse_matrix(g, format='csc')
    entries = w.sum(axis=0)
    d = sparse.dia_matrix((entries, 0), shape=w.shape).tocsc()
    m = w.shape[0]
    d2 = d.copy()
    d2.data = np.reciprocal(np.sqrt(d2.data, out=d2.data), out=d2.data)
    matrix = d2 * (d - w) * d2

    # matrix eigen-decomposition, scipy.sparse.linalg
    vals, vectors = scipy.sparse.linalg.eigsh(matrix, which='SM',
                                              k=min(100, m - 2))
    vals, vectors = np.real(vals), np.real(vectors)

    # get first K eigenvectors
    # index1, index2, index3 = np.argsort(vals)[0], np.argsort(vals)[1], np.argsort(vals)[2]
    # ev1, ev, ev3 = vectors[:, index1], vectors[:, index2], vectors[:, index3]
    index = np.argsort(vals)[1]
    X_train = vectors[:, index]

    # Kernel K-means
    gak_km = GlobalAlignmentKernelKMeans(n_clusters=num_cuts,
                                         sigma=sigma_gak(X_train),
                                         n_init=20, verbose=True,
                                         random_state=seed)
    sp_label = gak_km.fit_predict(X_train)

    # get pixel label
    p_label, labels = pixels_label(m_img, segments, sp_label)
    return p_label
def fit(self, train):
    """
    fit KMeans clustering model on training data

    parameters:
        train: training time series
    """
    if self.algorithm == "TimeSeriesKMeans":
        self.km = TimeSeriesKMeans(
            n_clusters=self.n_clusters,
            n_init=20,
            verbose=True,
            random_state=self.random_seed,
        )
    else:
        self.km = GlobalAlignmentKernelKMeans(
            n_clusters=self.n_clusters,
            sigma=sigma_gak(train),
            n_init=20,
            verbose=True,
            random_state=self.random_seed,
        )
    return self.km.fit_predict(train)
import numpy as np
import matplotlib.pyplot as plt

from tslearn.clustering import GlobalAlignmentKernelKMeans
from tslearn.metrics import sigma_gak, cdist_gak
from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

# tslearn:
seed = 0
# Physical data cannot be fed in directly, so it is 'quantized': a finer
# sampling means more data and higher precision; a coarser sampling means
# less data and lower accuracy.
np.random.seed(seed)

# Time series data is discrete data.
X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
X_train = X_train[y_train < 4]
np.random.shuffle(X_train)

# TimeSeriesScalerMeanVariance takes mu (mean) and std (standard deviation)
# as parameters => rescales each series to a target mean and variance.
X_train = TimeSeriesScalerMeanVariance().fit_transform(X_train[:50])
sz = X_train.shape[1]

# Why GlobalAlignmentKernelKMeans? It compares time series over their whole
# range; to measure time series accurately we rely on DTW (Dynamic Time
# Warping)-style alignment, which the GAK kernel builds on.
# sigma is passed as a parameter: sigma_gak estimates a suitable kernel
# bandwidth (a standard-deviation-like scale) from X_train.
# n_init is how many times the cluster centers are re-initialized;
# verbose controls whether progress is printed.
gak_km = GlobalAlignmentKernelKMeans(n_clusters=3,
                                     sigma=sigma_gak(X_train),
                                     n_init=20,
                                     verbose=True,
                                     random_state=seed)
y_pred = gak_km.fit_predict(X_train)

plt.figure()
for yi in range(3):
    plt.subplot(3, 1, 1 + yi)
    for xx in X_train[y_pred == yi]:
        plt.plot(xx.ravel(), "k-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
    plt.title("Cluster %d" % (yi + 1))
plt.tight_layout()
plt.show()
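# cdist_gak is imported above but never used; this is a minimal sketch (an
# addition, not part of the original script) of inspecting the GAK kernel
# matrix that the clustering relies on, using the same bandwidth:
K = cdist_gak(X_train, sigma=sigma_gak(X_train))
print(K.shape)       # (50, 50) pairwise GAK similarity matrix
print(K.diagonal())  # self-similarities are 1 (tslearn's GAK is normalized)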
import numpy as np
import matplotlib.pyplot as plt

from tslearn.clustering import GlobalAlignmentKernelKMeans
from tslearn.metrics import sigma_gak
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn.utils import to_time_series_dataset

hum_sub = np.loadtxt('../../HUM_subs.csv', delimiter=',', skiprows=1)
print(hum_sub.shape)

X = to_time_series_dataset(hum_sub)
print(X.shape)
X = TimeSeriesScalerMeanVariance().fit_transform(X)
sz = X.shape[1]

seed = 0
np.random.seed(seed)
nclust = 4

gak_km = GlobalAlignmentKernelKMeans(n_clusters=nclust,
                                     sigma=sigma_gak(X),
                                     n_init=20,
                                     verbose=True,
                                     random_state=seed)
y_pred = gak_km.fit_predict(X)
print(gak_km.inertia_)
print(y_pred + 1)

plt.figure()
for yi in range(nclust):
    plt.subplot(nclust, 1, 1 + yi)
    for xx in X[y_pred == yi]:
        plt.plot(xx.ravel(), "k-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
    plt.title("Cluster %d" % (yi + 1))
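# Minimal follow-up sketch (an addition, not part of the original script):
# render the figure and write each row's 1-based cluster assignment next to
# its row index. The output filename is an assumption.
plt.tight_layout()
plt.show()

assignments = np.column_stack([np.arange(len(y_pred)), y_pred + 1])
np.savetxt('hum_sub_clusters.csv', assignments, fmt='%d', delimiter=',',
           header='row,cluster', comments='')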