def merge_crossover(ind1, ind2):
    """Merge shapelets from one individual with similar ones from the other.

    For every shapelet in each individual, the most similar shapelet
    (according to the Global Alignment Kernel) in the other individual is
    located, and the pair is replaced by their Euclidean barycenter with
    NaN padding removed.

    Parameters
    ----------
    ind1, ind2 : sequence of 1-D arrays
        Two sets of shapelets. Both are mutated in place.

    Returns
    -------
    tuple
        The two (mutated) individuals ``(ind1, ind2)``.
    """
    # Construct a pairwise similarity matrix using GAK, with the bandwidth
    # estimated from all shapelets of both individuals.
    _all = list(ind1) + list(ind2)
    similarity_matrix = cdist_gak(ind1, ind2, sigma=sigma_gak(_all))

    # Iterate over shapelets in `ind1` and merge them with shapelets
    # from `ind2`
    for row_idx in range(similarity_matrix.shape[0]):
        # Remove all elements equal to 1.0 (identical shapelets).
        mask = similarity_matrix[row_idx, :] != 1.0
        non_equals = similarity_matrix[row_idx, :][mask]
        if len(non_equals):
            # BUG FIX: `np.argmax(non_equals)` indexes the *filtered* array,
            # not the full row — map it back to a real column index so the
            # correct shapelet of `ind2` is selected.
            max_col_idx = np.where(mask)[0][np.argmax(non_equals)]
            ts1 = list(ind1[row_idx])  # list() already copies
            ts2 = list(ind2[max_col_idx])
            # Merge them and strip the NaNs introduced by padding series
            # of unequal length.
            ind1[row_idx] = euclidean_barycenter([ts1, ts2])
            ind1[row_idx] = ind1[row_idx][~np.isnan(ind1[row_idx])]

    # Apply the same for the elements in ind2.
    for col_idx in range(similarity_matrix.shape[1]):
        mask = similarity_matrix[:, col_idx] != 1.0
        non_equals = similarity_matrix[:, col_idx][mask]
        if len(non_equals):
            # Same filtered-index fix as above, for rows of the matrix.
            max_row_idx = np.where(mask)[0][np.argmax(non_equals)]
            ts1 = list(ind1[max_row_idx])
            ts2 = list(ind2[col_idx])
            ind2[col_idx] = euclidean_barycenter([ts1, ts2])
            ind2[col_idx] = ind2[col_idx][~np.isnan(ind2[col_idx])]

    return ind1, ind2
def fit(self, train):
    """Fit a k-means clustering model on the training time series.

    The estimator type is chosen from ``self.algorithm``: either a
    standard ``TimeSeriesKMeans`` or a ``GlobalAlignmentKernelKMeans``
    whose GAK bandwidth is estimated from the training data. The fitted
    model is stored on ``self.km``.

    Parameters
    ----------
    train : array-like
        Training time series dataset.
    """
    if self.algorithm == 'TimeSeriesKMeans':
        model = TimeSeriesKMeans(n_clusters=self.n_clusters,
                                 n_init=20,
                                 verbose=True,
                                 random_state=self.random_seed)
    else:
        # Kernel k-means with sigma estimated from the training set.
        model = GlobalAlignmentKernelKMeans(n_clusters=self.n_clusters,
                                            sigma=sigma_gak(train),
                                            n_init=20,
                                            verbose=True,
                                            random_state=self.random_seed)
    self.km = model
    self.km.fit(train)
def fit(self, train):
    """Fit a k-means clustering model and return the cluster labels.

    Depending on ``self.algorithm``, either ``TimeSeriesKMeans`` or
    ``GlobalAlignmentKernelKMeans`` (with a data-estimated GAK sigma) is
    constructed, stored on ``self.km``, and fitted.

    Parameters
    ----------
    train : array-like
        Training time series dataset.

    Returns
    -------
    array-like
        Cluster labels from ``fit_predict`` on the training data.
    """
    if self.algorithm == "TimeSeriesKMeans":
        clusterer = TimeSeriesKMeans(
            n_clusters=self.n_clusters,
            n_init=20,
            verbose=True,
            random_state=self.random_seed,
        )
    else:
        # Kernel k-means; the GAK bandwidth is estimated from `train`.
        clusterer = GlobalAlignmentKernelKMeans(
            n_clusters=self.n_clusters,
            sigma=sigma_gak(train),
            n_init=20,
            verbose=True,
            random_state=self.random_seed,
        )
    self.km = clusterer
    return self.km.fit_predict(train)
def fit(self, X, y=None, sample_weight=None):
    """Compute kernel k-means clustering.

    Parameters
    ----------
    X : array-like of shape=(n_ts, sz, d)
        Time series dataset.

    y
        Ignored

    sample_weight : array-like of shape=(n_ts, ) or None (default: None)
        Weights to be given to time series in the learning process. By
        default, all time series weights are equal.
    """
    # Warn users who set `sigma` away from its default of 1. directly:
    # the constructor parameter is deprecated in favor of `kernel_params`.
    if self.sigma != 1.:
        warnings.warn(
            "Setting `sigma` directly as a parameter for KernelKMeans "
            "and GlobalAlignmentKernelKMeans is deprecated in version "
            "0.4 and will be removed in 0.6. Use `kernel_params` "
            "instead.",
            DeprecationWarning, stacklevel=2)

    # Validate input; non-finite values are allowed (variable-length
    # series padding) and 3-d arrays are accepted.
    X = check_array(X, allow_nd=True, force_all_finite=False)
    X = check_dims(X)
    sample_weight = _check_sample_weight(sample_weight=sample_weight, X=X)

    # Allow a few extra restarts beyond n_init in case some runs fail
    # with an empty cluster.
    max_attempts = max(self.n_init, 10)

    kernel_params = self._get_kernel_params()
    if self.kernel == "gak":
        # Resolve the GAK bandwidth: user-supplied value, or estimated
        # from the data when set to "auto".
        self.sigma_gak_ = kernel_params.get("sigma", 1.)
        if self.sigma_gak_ == "auto":
            self.sigma_gak_ = sigma_gak(X)
    else:
        self.sigma_gak_ = None

    # Reset fitted state before running the restarts below.
    self.labels_ = None
    self.inertia_ = None
    self.sample_weight_ = None
    self._X_fit = None
    # n_iter_ will contain the number of iterations the most
    # successful run required.
    self.n_iter_ = 0

    n_samples = X.shape[0]
    # Precompute the kernel (Gram) matrix once; all restarts reuse it.
    K = self._get_kernel(X)
    sw = (sample_weight if sample_weight is not None
          else numpy.ones(n_samples))
    self.sample_weight_ = sw
    rs = check_random_state(self.random_state)

    # Run up to `n_init` successful initializations, keeping the labels
    # of the run with the lowest inertia. A run that raises
    # EmptyClusterError is discarded and retried, up to `max_attempts`
    # total attempts.
    last_correct_labels = None
    min_inertia = numpy.inf
    n_attempts = 0
    n_successful = 0
    while n_successful < self.n_init and n_attempts < max_attempts:
        try:
            if self.verbose and self.n_init > 1:
                print("Init %d" % (n_successful + 1))
            n_attempts += 1
            self._fit_one_init(K, rs)
            if self.inertia_ < min_inertia:
                last_correct_labels = self.labels_
                min_inertia = self.inertia_
                self.n_iter_ = self._iter
            n_successful += 1
        except EmptyClusterError:
            if self.verbose:
                print("Resumed because of empty cluster")
    # Expose the best run's results (only if at least one run succeeded).
    if n_successful > 0:
        self.labels_ = last_correct_labels
        self.inertia_ = min_inertia
        self._X_fit = X
    return self
from tslearn.clustering import GlobalAlignmentKernelKMeans
from tslearn.metrics import sigma_gak, cdist_gak
from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

# Fix the RNG seed so the demo is reproducible.
seed = 0
np.random.seed(seed)

# Load the "Trace" dataset and keep only the first three classes.
X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
X_train = X_train[y_train < 4]
np.random.shuffle(X_train)

# Normalize the first 50 series to zero mean / unit variance.
X_train = TimeSeriesScalerMeanVariance().fit_transform(X_train[:50])
sz = X_train.shape[1]

# Kernel k-means using the Global Alignment Kernel; the bandwidth `sigma`
# is estimated from the training data, and 20 restarts are performed.
gak_km = GlobalAlignmentKernelKMeans(n_clusters=3,
                                     sigma=sigma_gak(X_train),
                                     n_init=20,
                                     verbose=True,
                                     random_state=seed)
y_pred = gak_km.fit_predict(X_train)

# Plot each cluster's member series in its own subplot.
plt.figure()
for cluster_idx in range(3):
    plt.subplot(3, 1, 1 + cluster_idx)
    for series in X_train[y_pred == cluster_idx]:
        plt.plot(series.ravel(), "k-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
    plt.title("Cluster %d" % (cluster_idx + 1))
plt.tight_layout()
plt.show()
from tslearn.clustering import GlobalAlignmentKernelKMeans

# Load pre-extracted subsequences from CSV (header row skipped).
hum_sub = np.loadtxt('../../HUM_subs.csv', delimiter=',', skiprows=1)
print(hum_sub.shape)

# Convert to tslearn's 3-d time-series format and normalize each series.
X = to_time_series_dataset(hum_sub)
print(X.shape)
X = TimeSeriesScalerMeanVariance().fit_transform(X)
sz = X.shape[1]

# Reproducible clustering into 4 groups with GAK kernel k-means.
seed = 0
np.random.seed(seed)
nclust = 4
gak_km = GlobalAlignmentKernelKMeans(n_clusters=nclust,
                                     sigma=sigma_gak(X),
                                     n_init=20,
                                     verbose=True,
                                     random_state=seed)
y_pred = gak_km.fit_predict(X)
print(gak_km.inertia_)
print(y_pred + 1)

# One subplot per cluster, showing all of its member series.
plt.figure()
for cluster_idx in range(nclust):
    plt.subplot(nclust, 1, 1 + cluster_idx)
    for series in X[y_pred == cluster_idx]:
        plt.plot(series.ravel(), "k-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
import numpy
import matplotlib.pyplot as plt
from tslearn.clustering import GlobalAlignmentKernelKMeans
from tslearn.metrics import sigma_gak, cdist_gak
from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

# Seed the RNG so the shuffle (and hence the demo) is reproducible.
seed = 0
numpy.random.seed(seed)

X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
X_train = X_train[y_train < 4]  # Keep first 3 classes
numpy.random.shuffle(X_train)
# Keep only 50 time series, normalized to zero mean / unit variance.
X_train = TimeSeriesScalerMeanVariance().fit_transform(X_train[:50])
sz = X_train.shape[1]

# GAK kernel k-means: 3 clusters, sigma estimated from the data,
# 20 restarts.
gak_km = GlobalAlignmentKernelKMeans(n_clusters=3,
                                     sigma=sigma_gak(X_train),
                                     n_init=20,
                                     verbose=True,
                                     random_state=seed)
y_pred = gak_km.fit_predict(X_train)

# Visualize every cluster in its own subplot.
plt.figure()
for cluster_idx in range(3):
    plt.subplot(3, 1, 1 + cluster_idx)
    for series in X_train[y_pred == cluster_idx]:
        plt.plot(series.ravel(), "k-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
    plt.title("Cluster %d" % (cluster_idx + 1))
plt.tight_layout()
plt.show()
from tslearn.clustering import GlobalAlignmentKernelKMeans
from tslearn.metrics import sigma_gak, cdist_gak
from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

# Reproducible run: fix the RNG seed before shuffling.
seed = 0
numpy.random.seed(seed)

X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
X_train = X_train[y_train < 4]  # Keep first 3 classes
numpy.random.shuffle(X_train)
X_train = TimeSeriesScalerMeanVariance().fit_transform(
    X_train[:50])  # Keep only 50 time series
sz = X_train.shape[1]

# Cluster with GAK kernel k-means (3 clusters, data-estimated sigma,
# 20 restarts).
gak_km = GlobalAlignmentKernelKMeans(n_clusters=3,
                                     sigma=sigma_gak(X_train),
                                     n_init=20,
                                     verbose=True,
                                     random_state=seed)
y_pred = gak_km.fit_predict(X_train)

# Draw each cluster's series in a dedicated subplot.
plt.figure()
for cluster_idx in range(3):
    plt.subplot(3, 1, 1 + cluster_idx)
    for series in X_train[y_pred == cluster_idx]:
        plt.plot(series.ravel(), "k-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
    plt.title("Cluster %d" % (cluster_idx + 1))
plt.tight_layout()