def elbowmethod(df: pd.DataFrame):
    """
    Find the best number of clusters and plot the silhouette score
    against the number of clusters.

    Args:
        df (pd.DataFrame): columns are (country (index), year_week, value)

    Returns:
        int: best number of clusters
    """
    import numpy as np
    import matplotlib.pyplot as plt
    from tslearn.clustering import TimeSeriesKMeans, silhouette_score

    scores = []
    K = range(2, df.shape[0])
    for k in K:
        # Build and fit the model
        model = TimeSeriesKMeans(n_clusters=k, metric="softdtw", max_iter=50)
        model.fit(df.values[..., np.newaxis])
        scores.append(
            silhouette_score(df.values[..., np.newaxis], model.labels_,
                             metric="softdtw"))

    plt.plot(K, scores, 'bx-')
    plt.xlabel('Values of K')
    plt.ylabel('Silhouette score')
    plt.title('Choosing K by silhouette score')
    plt.show()

    # K starts at 2, so shift the argmax accordingly
    best_num_cluster = np.argmax(scores) + 2
    return best_num_cluster
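# A minimal usage sketch for `elbowmethod` above. The data are hypothetical:
# a 6x20 random frame stands in for the wide country-by-week table.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
toy_df = pd.DataFrame(rng.rand(6, 20),
                      index=[f"country_{i}" for i in range(6)])
k = elbowmethod(toy_df)
print("Suggested number of clusters:", k)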
def test_serialize_timeserieskmeans():
    n, sz, d = 15, 10, 3
    rng = numpy.random.RandomState(0)
    X = rng.randn(n, sz, d)

    dba_km = TimeSeriesKMeans(n_clusters=3, n_init=2, metric="dtw",
                              verbose=True, max_iter_barycenter=10)
    _check_not_fitted(dba_km)
    dba_km.fit(X)
    _check_params_predict(dba_km, X, ['predict'])

    sdtw_km = TimeSeriesKMeans(n_clusters=3, metric="softdtw",
                               metric_params={"gamma": .01}, verbose=True)
    _check_not_fitted(sdtw_km)
    sdtw_km.fit(X)
    _check_params_predict(sdtw_km, X, ['predict'])
def extract_clusters(self, new_features):
    print("Extracting clusters ...")
    km = TimeSeriesKMeans(n_clusters=2, random_state=42)
    km.fit(new_features)
    y_label = km.labels_
    new_features['km_clusters'] = y_label
    print("Extracting clusters ... DONE.")
    return new_features
def cluster_annotation_dimension(data, n_clusters=3):
    data = np.array([list(moving_average(d, 2)) for d in data])
    clustering = TimeSeriesKMeans(n_clusters=n_clusters, metric="euclidean")
    clustering.fit(data)
    clusters = []
    for cluster_ix in range(n_clusters):
        cl = np.where(clustering.labels_ == cluster_ix)[0]
        clusters.append(data[cl])
    return clusters
def init(X, l, k):
    # A good initial start improves the convergence speed
    seed = 0
    sdtw_km = TimeSeriesKMeans(n_clusters=k, metric="euclidean",
                               max_iter=10, random_state=seed)
    sdtw_km.fit(X)
    # One-hot membership matrix built from the k-means labels, plus noise
    G_init = np.zeros((sdtw_km.labels_.size, sdtw_km.labels_.max() + 1))
    G_init[np.arange(sdtw_km.labels_.size), sdtw_km.labels_] = 1
    G_init = G_init + np.random.rand(G_init.shape[0], G_init.shape[1])
    # Cluster centers perturbed by noise scaled by 2**psi
    # (`psi` is presumably defined at module scope)
    F_init = sdtw_km.cluster_centers_[:, :, 0] + 2**psi * np.random.rand(k, l)
    return F_init, G_init
def get_fitted_model(dataset, clusters=3, timepoints=0, verbose=False,
                     return_dataset=True):
    if timepoints != 0:
        dataset = dataset[:, :timepoints]
    if verbose:
        print(f'Segmenting data into {clusters} clusters...')
    model = TimeSeriesKMeans(
        n_clusters=clusters,
        n_init=10,
        metric='dtw',  # Dynamic time warping
        verbose=verbose,
        n_jobs=-1  # Use all cores
    )
    model.fit(dataset)
    return model
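# A minimal usage sketch for `get_fitted_model` above; the random 20x50
# array is invented and stands in for real series.
import numpy as np
from tslearn.clustering import TimeSeriesKMeans  # module-level import assumed by the snippet above

rng = np.random.RandomState(42)
toy_series = rng.randn(20, 50)  # 20 univariate series, 50 time points each
dtw_model = get_fitted_model(toy_series, clusters=3, verbose=True)
print(dtw_model.labels_)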
def main():
    X1 = to_time_series_dataset(mock_dataset_muscle1)
    y1 = mock_labels
    X_train1 = X1[:-2]
    y_train1 = y1[:-2]
    X_test1 = X1[-2:]
    y_test1 = y1[-2:]

    # clf1 = KNeighborsTimeSeriesClassifier(n_neighbors=5, metric="dtw")
    clf1 = TimeSeriesKMeans(metric="dtw")
    clf1.fit(X_train1, y_train1)  # note: k-means ignores the labels
    pred_train1 = clf1.predict(X_train1)
    pred_test1 = clf1.predict(X_test1)
    print("TRAINING SET 1")
    print("Prediction: " + str(pred_test1))
    print("Actual: " + str(y_test1))
    print("\n")

    X2 = to_time_series_dataset(mock_dataset_muscle2)
    y2 = mock_labels
    X_train2 = X2[:-2]
    y_train2 = y2[:-2]
    X_test2 = X2[-2:]
    y_test2 = y2[-2:]

    clf2 = TimeSeriesKMeans(metric="dtw")
    # clf2 = KNeighborsTimeSeriesClassifier(n_neighbors=5, metric="dtw")
    clf2.fit(X_train2, y_train2)  # note: k-means ignores the labels
    pred_train2 = clf2.predict(X_train2)
    pred_test2 = clf2.predict(X_test2)
    print("TRAINING SET 2")
    print("Prediction: " + str(pred_test2))
    print("Actual: " + str(y_test2))
    print("\n")

    # Stack the two sets of cluster predictions plus time as features
    times_train = mock_times[:-2]
    times_test = mock_times[-2:]
    X_train = np.stack((pred_train1, pred_train2, times_train)).transpose()
    X_test = np.stack((pred_test1, pred_test2, times_test)).transpose()
    y_train = np.array(mock_labels[:-2]).reshape((len(X_train), ))
    y_test = mock_labels[-2:]

    sgd = SGDClassifier()
    sgd.fit(X_train, y_train)
    pred = sgd.predict(X_test)
    print("ENSEMBLE")
    print("Prediction: " + str(pred))
    print("Actual: " + str(y_test))
    print("Score: " + str(sgd.score(X_test, y_test)))
def run():
    parser = cli_parser()
    args = parser.parse_args()
    nii = image.index_img(args.input, slice(0, 30))
    masker = input_data.NiftiMasker()
    data = masker.fit_transform(nii)
    # Subsample voxels (every 80th) to keep training tractable
    ds = to_time_series_dataset(data.T[::80, :])
    model = TimeSeriesKMeans(n_clusters=2, metric="dtw", max_iter=15)
    model.fit(ds)
    all_voxels = to_time_series_dataset(data.T)
    mask = model.predict(all_voxels)
    mask_nii = masker.inverse_transform(mask)
    mask_nii.to_filename(args.output)
def get_fitted_model(dataset, clusters=3, start=0, end=0, verbose=False,
                     return_dataset=True):
    if end != 0 or start != 0:
        dataset = dataset[:, start:end]
    if verbose:
        print(f'Segmenting data into {clusters} clusters...')
    metric_params = {'global_constraint': 'sakoe_chiba'}
    model = TimeSeriesKMeans(
        n_clusters=clusters,
        n_init=1,
        metric='dtw',  # Dynamic time warping
        verbose=verbose,
        n_jobs=-1,  # Use all cores
        metric_params=metric_params)
    model.fit(dataset)
    return model
def multi_plot(row, col, fs_tuple, sy_bool, sx_bool, X, num_cluster, lineW,
               ts=False, labels=None):
    if ts:
        # Model generation
        model = TimeSeriesKMeans(n_clusters=num_cluster, tol=1e-05,
                                 metric='euclidean')
        fitted_model = model.fit(X)
        labels = fitted_model.predict(X)

    f, axes = plt.subplots(row, col, figsize=fs_tuple, sharey=sy_bool,
                           sharex=sx_bool)
    labelsize = 10
    fontsize = 10
    cluster_pool = np.unique(labels)
    for index, i_cluster in enumerate(cluster_pool):
        sub_mat = X[labels == i_cluster, :]
        # Unravel the flat index into (row, col) grid coordinates
        figrow, figcol = np.unravel_index(index, (row, col))
        # Plot
        if row > 1 and col > 1:
            for i_curve in range(np.shape(sub_mat)[0]):
                axes[figrow, figcol].plot(sub_mat[i_curve, :], 'r',
                                          linewidth=lineW)
            # After plotting, modify the axes
            for i_col in range(col):
                axes[-1, i_col].set_xticks([0, 16, 32, 48, 64, 80])
                axes[-1, i_col].set_xticklabels(
                    [str(300 + 80 * i) for i in np.arange(6)])
                axes[-1, i_col].set_xlabel('Wavelength [nm]',
                                           fontsize=fontsize)
                axes[-1, i_col].tick_params(axis='x', labelsize=labelsize)
            for i_row in range(row):
                axes[i_row, 0].set_yticks([-1, 0, 1])
                axes[i_row, 0].tick_params(axis='y', labelsize=labelsize)
        elif row > 1 and col == 1:
            # With a single column, `axes` is one-dimensional
            for i_curve in range(np.shape(sub_mat)[0]):
                axes[figrow].plot(sub_mat[i_curve, :], 'r', linewidth=lineW)
            axes[-1].set_xticks([0, 16, 32, 48, 64, 80])
            axes[-1].set_xticklabels(
                [str(300 + 80 * i) for i in np.arange(6)])
            axes[-1].set_xlabel('Wavelength [nm]', fontsize=fontsize)
            axes[-1].tick_params(axis='x', labelsize=labelsize)
            for i_row in range(row):
                axes[i_row].set_yticks([-1, 0, 1])
                axes[i_row].tick_params(axis='y', labelsize=labelsize)
        elif row == 1 and col > 1:
            # With a single row, `axes` is one-dimensional
            for i_curve in range(np.shape(sub_mat)[0]):
                axes[figcol].plot(sub_mat[i_curve, :], 'r', linewidth=lineW)
            for i_col in range(col):
                axes[i_col].set_xticks([0, 16, 32, 48, 64, 80])
                axes[i_col].set_xticklabels(
                    [str(300 + 80 * i) for i in np.arange(6)])
                axes[i_col].set_xlabel('Wavelength [nm]', fontsize=fontsize)
                axes[i_col].tick_params(axis='x', labelsize=labelsize)
            axes[0].set_yticks([-1, 0, 1])
            axes[0].tick_params(axis='y', labelsize=labelsize)
    return (f, axes)
def calc_kmeans(df_scaled, metric, n_clusters, name):
    # Cache the fitted model on disk so repeated runs are cheap
    file_name = 'models/ts_{}_{}.pickle'.format(name, n_clusters)
    if not path.exists(file_name):
        ts_kmeans = TimeSeriesKMeans(n_clusters=n_clusters, metric=metric,
                                     n_jobs=20, max_iter=10)
        ts_kmeans.fit(df_scaled)
        with open(file_name, 'wb') as f:
            pickle.dump(ts_kmeans, f)
    else:
        ts_kmeans = pickle.load(open(file_name, 'rb'))

    for cluster_number in range(n_clusters):
        plt.plot(ts_kmeans.cluster_centers_[cluster_number, :, 0].T,
                 label=cluster_number)
    plt.title("Cluster centroids")
    plt.legend()
    plt.show()
    return ts_kmeans
def find_kmeans(df_scaled, metric, clusters):
    distortions = []
    silhouette = []
    daviesbouldin = []
    K = range(1, clusters)
    for k in tqdm(K):
        kmeanModel = TimeSeriesKMeans(n_clusters=k, metric=metric,
                                      n_jobs=20, max_iter=10)
        # kmeanModel = TimeSeriesKMeans(n_clusters=k, metric="euclidean",
        #                               n_jobs=6, max_iter=10)
        kmeanModel.fit(df_scaled)
        distortions.append(kmeanModel.inertia_)
        # Silhouette and Davies-Bouldin are undefined for a single cluster
        if k > 1:
            silhouette.append(silhouette_score(df_scaled, kmeanModel.labels_))
            daviesbouldin.append(
                davies_bouldin_score(df_scaled, kmeanModel.labels_))

    plt.figure(figsize=(10, 4))
    plt.plot(K, distortions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('Elbow Method')
    plt.show()

    plt.figure(figsize=(10, 4))
    plt.plot(K[1:], silhouette, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Silhouette score')
    plt.title('Silhouette')
    plt.show()

    plt.figure(figsize=(10, 4))
    plt.plot(K[1:], daviesbouldin, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Davies-Bouldin score')
    plt.title('Davies-Bouldin')
    plt.show()
def tsKMeans_num_cluster(X, n_trials, max_n_cluster):
    import sklearn.metrics

    min_n_cluster = 2
    v_clusters = np.arange(min_n_cluster, max_n_cluster)
    n_seeds = n_trials
    # Recorder: one silhouette score per (cluster count, seed) pair
    sc_recorder = np.zeros((len(v_clusters), n_seeds))
    for i_seed in range(n_seeds):
        for num_cluster in v_clusters:
            model = TimeSeriesKMeans(n_clusters=num_cluster, tol=1e-05,
                                     metric='euclidean', random_state=i_seed)
            fitted_model = model.fit(X)
            y_pred = fitted_model.predict(X)
            s_sc = sklearn.metrics.silhouette_score(X, y_pred,
                                                    metric='euclidean')
            sc_recorder[num_cluster - min_n_cluster, i_seed] = s_sc
    return sc_recorder
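# A minimal usage sketch: average the per-seed silhouette scores from
# `tsKMeans_num_cluster` and recover the best k. The toy data are invented;
# the function hard-codes min_n_cluster = 2, so the argmax is offset by 2.
import numpy as np
from tslearn.clustering import TimeSeriesKMeans  # module-level import assumed by the snippet above

rng = np.random.RandomState(0)
toy_X = rng.randn(30, 40)
scores = tsKMeans_num_cluster(toy_X, n_trials=3, max_n_cluster=6)
best_k = 2 + int(np.argmax(scores.mean(axis=1)))
print("Best number of clusters:", best_k)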
def construct_model(best_num_cluster: int, df: pd.DataFrame):
    """
    Construct the model, test the stability of its clusters and produce
    cluster plots.

    Args:
        best_num_cluster (int): best number of clusters
        df (pd.DataFrame): columns are (country (index), year_week, value)

    Returns:
        model
    """
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.metrics import normalized_mutual_info_score

    model = TimeSeriesKMeans(n_clusters=best_num_cluster, metric="softdtw",
                             max_iter=50)

    # Test whether the clusters are stable: fit twice and compare labelings
    model1 = TimeSeriesKMeans(n_clusters=best_num_cluster, metric="softdtw",
                              max_iter=50)
    model1.fit(df.values[..., np.newaxis])
    model2 = TimeSeriesKMeans(n_clusters=best_num_cluster, metric="softdtw",
                              max_iter=50)
    model2.fit(df.values[..., np.newaxis])
    # NMI is scaled between 0 (no mutual information) and 1 (perfect agreement)
    print(normalized_mutual_info_score(model1.labels_, model2.labels_))

    # Plot the model
    model.fit(df.values[..., np.newaxis])
    plt.figure()
    sz = df.shape[1]
    ylim = df.values.max()
    for yi in range(best_num_cluster):
        plt.subplot(best_num_cluster, best_num_cluster, yi + 1)
        # for xx in X_train[y_pred == yi]:
        #     plt.plot(xx.ravel(), "k-", alpha=.2)
        plt.plot(model.cluster_centers_[yi].ravel(), "r-")
        plt.xlim(0, sz)
        plt.ylim(0, ylim)
        plt.text(0.55, 0.85, 'Cluster %d' % (yi + 1),
                 transform=plt.gca().transAxes)
        if yi == 1:
            plt.title("DTW $k$-means")
    return model
def kmeans(n_shapelets, shp_len, n_draw=1000):
    """Sample subseries from the timeseries and apply K-Means on them"""
    # `X` and `self` are presumably captured from the enclosing scope
    # Sample `n_draw` subseries of length `shp_len`
    n_ts, sz = len(X), self._min_length
    indices_ts = np.random.choice(n_ts, size=n_draw, replace=True)
    start_idx = np.random.choice(sz - shp_len + 1, size=n_draw, replace=True)
    end_idx = start_idx + shp_len

    subseries = np.zeros((n_draw, shp_len))
    for i in range(n_draw):
        subseries[i] = X[indices_ts[i]][start_idx[i]:end_idx[i]]

    tskm = TimeSeriesKMeans(n_clusters=n_shapelets, metric="euclidean",
                            verbose=False)
    return tskm.fit(subseries).cluster_centers_
def kmeans(X, n_shapelets, min_len, max_len, n_draw=None):
    """Sample subseries from the timeseries and apply K-Means on them"""
    if n_shapelets == 1:
        return random_shapelet(X, n_shapelets, min_len, max_len)
    if n_draw is None:
        n_draw = max(n_shapelets, int(np.sqrt(len(X))))
    # Sample `n_draw` subseries of a random common length `shp_len`
    shp_len = np.random.randint(4, min(min_len, max_len))
    indices_ts = np.random.choice(len(X), size=n_draw, replace=True)
    start_idx = np.random.choice(min_len - shp_len, size=n_draw, replace=True)
    end_idx = start_idx + shp_len

    subseries = np.zeros((n_draw, shp_len))
    for i in range(n_draw):
        subseries[i] = X[indices_ts[i]][start_idx[i]:end_idx[i]]

    tskm = TimeSeriesKMeans(n_clusters=n_shapelets, metric="euclidean",
                            verbose=False)
    return tskm.fit(subseries).cluster_centers_
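# A minimal usage sketch for the shapelet-sampling `kmeans` above; the toy
# dataset and the explicit n_draw are invented for illustration.
import numpy as np
from tslearn.clustering import TimeSeriesKMeans  # module-level import assumed by the snippet above

rng = np.random.RandomState(0)
toy_X = rng.randn(15, 60)  # 15 series of length 60
candidates = kmeans(toy_X, n_shapelets=3, min_len=60, max_len=60, n_draw=50)
print(candidates.shape)  # (3, shp_len, 1): one centroid per shapelet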
def subsequence_clustering(sequence, changepoints, y_label='y', norm=False):
    """
    Clusters subsequences of a time series indicated by the changepoints
    variable. Uses the silhouette score to determine the number of clusters.

    :param sequence: np array of the time series
    :param changepoints: detected changepoints on which subsequences are built
    :param y_label: name of the y-label in the plot
    :param norm: normalise data using MinMaxScaler
    :return: dict mapping cluster labels to their subsequences
    """
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from tslearn.clustering import TimeSeriesKMeans, silhouette_score
    from tslearn.utils import to_time_series_dataset
    from tslearn.preprocessing import TimeSeriesScalerMinMax

    # Split the sequence at the changepoints
    sub_ids = []
    x_index = []
    X = []
    i = 0
    end_p = [len(sequence) - 1]
    for cp in changepoints + end_p:
        X.append(sequence[i:cp])
        index = 'sub_' + str(i) + '_' + str(cp)
        sub_ids.append(index)
        x_index.append([x_id for x_id in range(i, cp + 1)])
        i = cp

    # Normalise the data (y = (x - min) / (max - min))
    if norm:
        X = TimeSeriesScalerMinMax().fit_transform(X)
    X = to_time_series_dataset(X)

    # Find the optimal number of clusters by looping through different
    # configurations and storing the respective silhouette scores
    sil_scores = {}
    for n in range(2, len(changepoints)):
        model_tst = TimeSeriesKMeans(n_clusters=n, metric="dtw", n_init=10)
        model_tst.fit(X)
        sil_scores[n] = silhouette_score(X, model_tst.predict(X),
                                         metric="dtw")

    opt_k = max(sil_scores, key=sil_scores.get)
    print('Number of Clusters in subsequence clustering: ' + str(opt_k))
    model = TimeSeriesKMeans(n_clusters=opt_k, metric="dtw", n_init=10)
    labels = model.fit_predict(X)
    print(labels)

    # Build a helper df to map subsequences to their cluster labels
    df_cluster = pd.DataFrame(list(zip(sub_ids, x_index, model.labels_)),
                              columns=['metric', 'x_index', 'cluster'])
    cluster_metrics_dict = df_cluster.groupby(
        ['cluster'])['metric'].apply(lambda x: [x for x in x]).to_dict()

    print('Plotting Clusters')
    # Plot changepoints as vertical lines
    for cp in changepoints:
        plt.axvline(x=cp, ls=':', lw=2, c='0.65')

    # Preprocessing for cluster-based plotting
    x_scat = []
    y_scat = []
    cluster = []
    for index, row in df_cluster.iterrows():
        x_seq = row['x_index']
        x_scat.extend(x_seq)
        y_seq = sequence[x_seq[0]:x_seq[-1] + 1]
        y_scat.extend(y_seq)
        label_seq = [row['cluster']]
        cluster.extend(label_seq * len(x_seq))
        # plt.scatter(x_seq, y_seq, label=label_seq)

    # Cluster-based plotting
    x_scat = np.array(x_scat)
    y_scat = np.array(y_scat)
    for c in np.unique(cluster):
        i = np.where(cluster == c)
        plt.scatter(x_scat[i], y_scat[i], label=c)
    plt.legend()
    plt.title('Subsequence k-means Clustering')
    plt.xlabel('Time index')
    plt.ylabel(y_label)
    plt.show()

    return cluster_metrics_dict
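# A minimal usage sketch for `subsequence_clustering` above: a sine wave
# with an amplitude shift and hand-picked changepoints (in practice these
# would come from a changepoint detector).
import numpy as np

t = np.linspace(0, 20, 400)
toy_seq = np.sin(t)
toy_seq[200:] *= 3  # regime change half-way through
result = subsequence_clustering(toy_seq, changepoints=[100, 200, 300])
print(result)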
model = TimeSeriesKMeans(n_clusters=2, metric='dtw', verbose=1)

fig, axes = plt.subplots(1, 3)
for axis in axes:
    axis.set_xlabel('Time')
    axis.set_ylabel('Signal')
    axis.set_ylim((-1.1, 1.1))
for datum in data[0:5]:
    axes[0].plot(datum, color='green')
for datum in data[300:305]:
    axes[1].plot(datum, color='red')
axes[0].set_title('Examples of training data')
axes[1].set_title('Examples of training data')

model.fit(data)

# Count how the series before and after index 300 (the two ground-truth
# classes, judging by the variable names) split across the two clusters
sines_cluster_1 = 0
tris_cluster_1 = 0
sines_cluster_2 = 0
tris_cluster_2 = 0
# this comparator is broken!!
for label in model.labels_[:300]:
    if label:
        sines_cluster_1 += 1
    else:
        tris_cluster_1 += 1
for label in model.labels_[300:]:
    if label:
        sines_cluster_2 += 1
    else:
        tris_cluster_2 += 1  # completing the truncated branch by symmetry with the first loop
class SKU_Clusterer:
    def __init__(self, *args, **kwargs):
        # Clustering params
        self.classifier = None
        self.clusters_indices = {}
        self.n_clusters = int(kwargs['n_clusters'])
        self.use_kmeans = self.n_clusters > 0
        self.kmeans_iterations = int(kwargs['k_means_n_iterations'])
        self.k_means_metric = kwargs.get('k_means_metric', 'euclidean')
        if self.k_means_metric not in ['dtw', 'euclidean', 'softdtw']:
            # `print` taking a verbosity kwarg suggests a project-level
            # logging helper rather than the builtin
            print('invalid k_means metric, setting to `euclidean`',
                  verbosity=1)
            self.k_means_metric = 'euclidean'
        # RNN params
        self.n_epochs = [int(p) for p in kwargs['rnn_epochs'].split(';')]
        self.n_steps = [int(p) for p in kwargs['n_steps'].split(';')][0]
        self.encoder_output_units = [
            int(p) for p in kwargs['encoder_output_units'].split(';')]
        self.decoder_output_units = [
            int(p) for p in kwargs['decoder_output_units'].split(';')]
        self.batch_size = [int(p) for p in kwargs['batch_size'].split(';')]
        self.early_stopping = [kwargs['early_stopping'].split(';')]
        self.discriminative_cols = kwargs.get('discriminative_columns', None)
        if self.discriminative_cols:
            self.discriminative_cols = self.discriminative_cols.strip().split(';')
        # Paths
        self.sku_path = kwargs['sku_path']
        self.autoencoder_path = './models/autoencoder.pkl'
        self.encoder_path = './models/encoder.pkl'
        self.decoder_path = './models/decoder.pkl'
        self.classifier_path = './models/classifier.pkl'
        self.kmeans_path = './models/kmeans_model.pkl'
        self.embedder_path = './models/embedder.pkl'
        self.config_path = './models/clusterer_config.pkl'
        # Other params
        self.full_dataset = kwargs.get('full_dataset', False)
        self.cold_start = True if kwargs['cold_start'] == 'True' else False
        self.encoding = kwargs.get('encoding', 'utf8')
        self._load_datasets = (self._load_datasets_full
                               if self.full_dataset == 'True'
                               else self._load_datasets_partial)
        if not self.cold_start:
            self.load_configuration()

    def filter_dataset(self, df):
        chosen_cols = []
        for c in self.discriminative_cols:
            if c not in df.columns:
                print(f'invalid column name: `{c}`, omitting...', verbosity=1)
            else:
                chosen_cols.append(c)
        self.discriminative_cols = chosen_cols
        if self.discriminative_cols != []:
            print(f'RUNNING FILTERING on columns: {", ".join(self.discriminative_cols)}')
            df = df.filter(items=self.discriminative_cols)
        else:
            print('No discriminative columns passed, running algorithm on all columns')
        return df

    def _load_datasets_partial(self):
        datasets = []
        for file in os.listdir(self.sku_path):
            df = pd.read_csv(os.path.join(self.sku_path, file),
                             encoding=self.encoding, sep=';')
            df = self.filter_dataset(df)
            # Split each file into non-overlapping chunks of n_steps rows
            n_splits = df.shape[0] // self.n_steps
            trim = df.shape[0] % self.n_steps
            df = df[trim:]
            for split_idx in range(n_splits):
                chunk = df[split_idx * self.n_steps:(split_idx + 1) * self.n_steps]
                datasets.append(chunk.values)
        return np.array(datasets, dtype=np.float64)

    def _load_datasets_full(self):
        datasets = []
        for file in os.listdir(self.sku_path):
            df = pd.read_csv(os.path.join(self.sku_path, file),
                             encoding=self.encoding, sep=';')
            df = self.filter_dataset(df)
            # Sliding window of length n_steps with stride 1
            for offset in range(df.shape[0] - self.n_steps):
                chunk = df[offset:offset + self.n_steps]
                datasets.append(chunk.values)
        return np.array(datasets, dtype=np.float64)

    def load_configuration(self):
        if not os.path.exists(self.config_path):
            print('Config file not found...', verbosity=1)
            return
        config = open(self.config_path, "rb")
        self.clusters_indices = load(config)
        self.n_clusters = load(config)
        self.use_kmeans = load(config)
        self.train_dataset = load(config)
        config.close()

    def save_configuration(self):
        config = open(self.config_path, "wb")
        dump(self.clusters_indices, config)
        dump(self.n_clusters, config)
        dump(self.use_kmeans, config)
        dump(self.train_dataset, config)
        config.close()

    def load_models(self, cold_start=False):
        models_exists = os.path.isfile(self.autoencoder_path) \
            and os.path.isfile(self.encoder_path) \
            and os.path.isfile(self.decoder_path)
        classifier_exists = os.path.isfile(self.classifier_path)
        kmeans_exists = os.path.isfile(self.kmeans_path)
        embedder_exists = os.path.isfile(self.embedder_path)
        if not (models_exists and classifier_exists):
            print('NO MODELS FOUND, COLD START REQUIRED...', verbosity=1)
        if not cold_start and models_exists:
            print('AUTOENCODER MODELS EXISTS, LOADING...')
            self.autoenc = load_model(self.autoencoder_path)
            self.enc = load_model(self.encoder_path)
            self.dec = load_model(self.decoder_path)
        if not cold_start and classifier_exists:
            print('CLASSIFIER MODEL EXISTS, LOADING...')
            with open(self.classifier_path, 'rb') as model_file:
                self.classifier = load(model_file)
        if not cold_start and kmeans_exists:
            print('K_MEANS MODEL EXISTS, LOADING...')
            with open(self.kmeans_path, 'rb') as model_file:
                self.k_means_classifier = load(model_file)
        if not cold_start and embedder_exists:
            with open(self.embedder_path, 'rb') as model_file:
                self.embedder = load(model_file)
        return (models_exists and classifier_exists and embedder_exists
                and not cold_start)

    def train(self, dataset=None):
        if dataset is None:
            dataset = self._load_datasets()
        n_features = dataset.shape[-1]
        if not self.load_models(self.cold_start):
            # Talos scan over autoencoder hyperparameters
            params = {
                'n_steps': [self.n_steps],
                'n_features': [n_features],
                'epochs': self.n_epochs,
                'enc_units': self.encoder_output_units,
                'dec_units': self.decoder_output_units,
                'batch_size': self.batch_size,
                'early_stopping': self.early_stopping,
                'scan': [True]
            }
            results = talos.Scan(dataset, np.zeros_like(dataset),
                                 params=params,
                                 model=create_autoencoder_models)
            best_params = results.data.sort_values(
                by=['val_loss'], ascending=True).iloc[0].to_dict()
            best_params['scan'] = False
            print('\n', '=' * 30,
                  '\nBEST AUTOENCODER HYPERPARAMETERS:\n',
                  '\n'.join([f'{key} = {value}'
                             for key, value in best_params.items()]),
                  '\n', '=' * 30)
            self.autoenc, self.enc, self.dec = create_autoencoder_models(
                dataset, np.zeros_like(dataset), params=best_params)

            hist = self.autoenc.history.history
            loss = hist['loss']
            val_loss = hist['val_loss']
            plt.figure(figsize=(10, 7))
            plt.plot(loss, label='training_loss')
            plt.plot(val_loss, label='validation_loss')
            plt.legend()
            plt.title('Autoencoder loss')
            plt.savefig('./loss/autoencoder_loss.png')

            self.train_dataset = dataset
            classifier_inputs = self.enc.predict(dataset)
            self.embedder = TSNE(n_components=2, perplexity=40,
                                 random_state=42)
            embedded = self.embedder.fit_transform(classifier_inputs)
            if not self.use_kmeans:
                print('CLUSTER COUNT NOT SPECIFIED, CALCULATING CLUSTER NUMBER...',
                      verbosity=1)
                self.u_classifier = DBSCAN(eps=3, n_jobs=-1)
                classes = self.u_classifier.fit_predict(embedded)
                self.n_clusters = len(set(classes))
                self.use_kmeans = True
            self.k_means_classifier = TimeSeriesKMeans(
                n_clusters=self.n_clusters,
                metric=self.k_means_metric,
                n_init=self.kmeans_iterations,
                verbose=True,
                max_iter=1000)
            self.k_means_classifier.fit(embedded)
            self.k_means_classifier.transform = self.k_means_classifier.predict  # hotfix: alias transform to predict
            self.clusters_indices = self.k_means_classifier.fit_predict(embedded)
            self.classifier = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
            self.classifier.fit(embedded, self.clusters_indices)
            with open(self.classifier_path, 'wb') as model_file:
                dump(self.classifier, model_file)
            with open(self.embedder_path, 'wb') as model_file:
                dump(self.embedder, model_file)
            with open(self.kmeans_path, 'wb') as model_file:
                dump(self.k_means_classifier, model_file)
            self.save_configuration()

            # =================================================================
            # Cluster visualisation
            # =================================================================
            clusters = self.k_means_classifier.transform(embedded)
            unique_clusters = set(clusters)
            plt.figure()
            for clas in unique_clusters:
                c = generate_color()
                mask = clusters == clas
                filtered = embedded[mask]
                plt.scatter(filtered[:, 0], filtered[:, 1], c=c,
                            label=f'cluster {clas + 1}')
            plt.legend()
            plt.savefig('./clusters/clusters.png')

    def embed(self, dataset):
        flattened = self.enc.predict(dataset)
        embedded = self.embedder.fit_transform(flattened)
        return embedded

    def predict(self, sample):
        result = self.enc.predict(sample)
        return result

    def predict_class(self, sample, plot_cluster=False):
        extended_dataset = np.vstack((
            self.train_dataset,
            sample.reshape(-1, *sample.shape)
        ))
        embedded_space = self.embed(extended_dataset)
        sample_coords = embedded_space[-1]
        nbrs = NearestNeighbors(n_neighbors=6,
                                algorithm='ball_tree').fit(embedded_space[:-1])
        distances, indices = nbrs.kneighbors(sample_coords.reshape(1, -1))
        n_classes, classes_counts = np.unique(self.clusters_indices[indices],
                                              return_counts=True)
        # Majority vote among the nearest neighbours
        cls = n_classes[np.argmax(classes_counts)]
        print(distances)
        print(indices)
        print(self.clusters_indices[indices])
        print(cls)
        if plot_cluster:
            plt.figure()
            plt.scatter(embedded_space[:, 0], embedded_space[:, 1])
            plt.scatter(sample_coords[0], sample_coords[1], marker='x',
                        c='red')
        return cls, distances, indices

    def compress_dataset(self, dataset):
        return self.enc.predict(dataset)

    def cluster(self, dataset, sample=None, plot_clusters=False):
        if sample is not None:
            dataset = np.vstack((sample, dataset))
        compressed_dataset = self.compress_dataset(dataset)
        embedded_dataset = self.embedder.fit_transform(compressed_dataset)
        classes = self.k_means_classifier.fit_predict(embedded_dataset)
        if plot_clusters:
            plt.figure()
            unique_clusters = set(classes)
            for clas in unique_clusters:
                c = generate_color()
                mask = classes == clas
                filtered = embedded_dataset[mask]
                plt.scatter(filtered[:, 0], filtered[:, 1], c=c,
                            label=f'cluster {clas + 1}')
            if sample is not None:
                plt.scatter(embedded_dataset[0, 0], embedded_dataset[0, 1],
                            c='red', marker='x')
            plt.legend()
        return dataset, classes
# y_kmeans = kmeans.predict(my_array)
# plt.scatter(my_array[:, 0], my_array[:, 3], c=y_kmeans, s=50, cmap='viridis')
# plt.show()
# centroids = kmeans.cluster_centers_
# labels = kmeans.labels_
# print(my_array[500, 3])
# print(centroids)
# print(len(labels))

no_clust = 10
t_series = to_time_series(my_array)
kmeans = TimeSeriesKMeans(n_clusters=no_clust, metric="euclidean",
                          max_iter=8, random_state=0)
kmeans.fit(t_series)
print("The cluster centers are:", kmeans.cluster_centers_)
print("Each time series belongs to:", kmeans.labels_)
labels = kmeans.labels_
y_kmeans = kmeans.predict(t_series)

# Scatter the cluster assignments at three time indices (0, 182, 364)
plt.scatter(t_series[:, 0, 1], [2 for _ in range(length)], c=y_kmeans,
            s=30, cmap='viridis')
plt.scatter(t_series[:, 182, 1], [1.5 for _ in range(length)], c=y_kmeans,
            s=30, cmap='viridis')
plt.scatter(t_series[:, 364, 1], [1 for _ in range(length)], c=y_kmeans,
            s=30, cmap='viridis')  # trailing args assumed from the two parallel calls above
def k_means_clustering(sd_log):
    """
    k_means clustering of all features using dtw for multivariate time series

    :param sd_log: sd_log object
    :return: cluster_metrics_dict: dict with clusters as key and features as values
    """
    from tslearn.clustering import TimeSeriesKMeans, silhouette_score
    from tslearn.utils import to_time_series_dataset
    from tslearn.preprocessing import TimeSeriesScalerMinMax

    data = sd_log.data
    # TODO handle outliers
    tmp = sd_log.waiting_time
    data.drop(columns=[sd_log.waiting_time], inplace=True)

    # Get data as numpy array
    X = []
    for col in data.columns:
        X.append(sd_log.get_points(col))

    # Normalize the data (y = (x - min) / (max - min))
    data_norm = data.copy()
    for column in data_norm.columns:
        data_norm[column] = (data_norm[column] - data_norm[column].min()) / (
            data_norm[column].max() - data_norm[column].min())
    X = TimeSeriesScalerMinMax().fit_transform(X)
    X = to_time_series_dataset(X)

    # Find the optimal number of clusters by looping through different
    # configurations and storing the respective silhouette scores
    sil_scores = {}
    for n in range(2, len(data.columns)):
        model_tst = TimeSeriesKMeans(n_clusters=n, metric="dtw", n_init=10)
        model_tst.fit(X)
        sil_scores[n] = silhouette_score(X, model_tst.predict(X), metric="dtw")

    opt_k = max(sil_scores, key=sil_scores.get)
    model = TimeSeriesKMeans(n_clusters=opt_k, metric="dtw", n_init=10)
    labels = model.fit_predict(X)
    print(labels)

    # Build a helper df to map metrics to their cluster labels
    df_cluster = pd.DataFrame(list(zip(data.columns, model.labels_)),
                              columns=['metric', 'cluster'])

    # Make some helper dictionaries and lists
    cluster_metrics_dict = df_cluster.groupby(
        ['cluster'])['metric'].apply(lambda x: [x for x in x]).to_dict()
    cluster_len_dict = df_cluster['cluster'].value_counts().to_dict()
    clusters_dropped = [cluster for cluster in cluster_len_dict
                        if cluster_len_dict[cluster] == 1]
    clusters_final = [cluster for cluster in cluster_len_dict
                      if cluster_len_dict[cluster] > 1]

    print('Plotting Clusters')
    fig, axs = plt.subplots(opt_k)  # , figsize=(10, 5))
    # fig.suptitle('Clusters')
    row_i = 0
    # column_j = 0
    # For each label there is, plot every series with that label
    for cluster in cluster_metrics_dict:
        for feat in cluster_metrics_dict[cluster]:
            axs[row_i].plot(data_norm[feat], label=feat, alpha=0.4)
            axs[row_i].legend(loc="best")
        if len(cluster_metrics_dict[cluster]) > 100:
            # TODO draw mean in red if more than one cluster
            tmp = np.nanmean(np.vstack(cluster), axis=1)
            axs[row_i].plot(tmp, c="red")
        axs[row_i].set_title("Cluster " + str(cluster))
        row_i += 1
        # column_j += 1
        # if column_j % k == 0:
        #     row_i += 1
        #     column_j = 0
    plt.show()

    # Return dict {cluster_id: features}
    return cluster_metrics_dict
D1, D2, A2 = DWT_db2(data)
DWTed_test.append(D1)
DWTed_test.append(D2)  # assumed: the unpack above suggests D2 here, not D1 twice
DWTed_test.append(A2)
test.append(data[0:1024].to_numpy().reshape(-1, 1))  # assumed 1024, matching the chunk boundaries below
test.append(data[1024:2048].to_numpy().reshape(-1, 1))
test.append(data[2048:3072].to_numpy().reshape(-1, 1))
test.append(data[3072:4096].to_numpy().reshape(-1, 1))
# DWTed_test = random.sample(DWTed_test, len(DWTed_test))
# test = random.sample(test, len(test))

"""EEG signals classification using the K-means clustering and a multilayer
perceptron neural network model (Umut Orhan 2011)"""

# K-means clustering:
model = TimeSeriesKMeans(n_clusters=2, metric="softdtw", max_iter=5)
model.fit(np.array(train))
pred = model.predict(np.array(test))
pred

a = np.zeros((320,), dtype=int)
b = np.ones((80,), dtype=int)
true = np.concatenate([a, b])
confusion_matrix(true, pred)

centers = model.cluster_centers_
centers = np.array([centers[0].flatten(), centers[1].flatten()])
centers
plt.plot(centers[0], color='red')
X_train = to_time_series_dataset(data_train[:, 1:])  # assumed by symmetry with X_test below
y_train = data_train[:, 0].astype(int)
X_test = to_time_series_dataset(data_test[:, 1:])
y_test = data_test[:, 0].astype(int)

# Standardize each series to zero mean and unit variance
X_train = TimeSeriesScalerMeanVariance(mu=0., std=1.).fit_transform(X_train)
X_test = TimeSeriesScalerMeanVariance(mu=0., std=1.).fit_transform(X_test)

classes = len(np.unique(data_train[:, 0]))
km = TimeSeriesKMeans(n_clusters=5, max_iter=10, n_init=10,
                      metric="euclidean", verbose=0, random_state=2019)
km.fit(X_train)

print(i, file=f)
preds = km.predict(X_train)
ars = adjusted_rand_score(data_train[:, 0], preds)
print("Adjusted Rand Index on Training Set:", ars, file=f)
kMeansDF.loc[i, "Train ARS"] = ars

preds_test = km.predict(X_test)
ars = adjusted_rand_score(data_test[:, 0], preds_test)
print("Adjusted Rand Index on Test Set:", ars, file=f)
kMeansDF.loc[i, "Test ARS"] = ars
print(file=f)

kMeansTime = timer.elapsed_time()
print("Time to Run k-Means Experiment in Minutes:",
      kMeansTime / 60)  # assumed: elapsed seconds converted to the minutes the label promises
def time_clustering(state_df, day_0, days_before=30, date_col='date',
                    region_col='county', target_col='cases'):
    """
    input:
        state_df: pandas dataframe that contains the data
        day_0: int, first day of prediction
        days_before: how many days of data before day_0 to use for time clustering
    output:
        clusters: list of lists of clusters
    """
    cluster_state_df = state_df.copy()
    # Daily growth rate: today's count over yesterday's, minus one
    cluster_state_df['GrowthRate'] = (
        state_df.groupby(region_col)[target_col].shift(0) /
        state_df.groupby(region_col)[target_col].shift(1) - 1)
    cluster_state_df = get_in_date_range(
        cluster_state_df,
        first_date=mod_date(day_0, -days_before),
        last_date=mod_date(day_0, 0),
        date_col=date_col)
    cluster_state_df = cluster_state_df.loc[
        :, cluster_state_df.columns.intersection(
            [region_col, date_col, 'GrowthRate'])]
    cluster_state_df = cluster_state_df[
        ~cluster_state_df.isin([np.nan, np.inf, -np.inf]).any(axis=1)].copy(
            deep=True)
    time_series = cluster_state_df.groupby(region_col)['GrowthRate'].apply(list)
    time_series_list = to_time_series_dataset([t for t in time_series])
    regions = cluster_state_df[region_col].unique().tolist()

    # number_of_regions = 4
    # inertias = []
    #
    # for k in range(1, number_of_regions, 1):
    #     print("k is: ", k)
    #     model = TimeSeriesKMeans(n_clusters=k, metric="dtw", max_iter=100,
    #                              dtw_inertia=True, n_jobs=-1)
    #     model.fit(time_series_list)
    #     inertias.append(model.inertia_)
    #
    # kn = KneeLocator(range(1, number_of_regions, 1), inertias,
    #                  curve='convex', direction='decreasing')
    # print("Optimal value of clusters is: ", kn.knee)
    #
    # plt.xlabel('number of clusters k')
    # plt.ylabel('Sum of squared distances')
    # plt.plot(range(1, number_of_regions, 1), inertias, 'bx-')
    # plt.vlines(kn.knee, plt.ylim()[0], plt.ylim()[1], linestyles='dashed')

    model = TimeSeriesKMeans(n_clusters=2, metric="dtw", max_iter=100,
                             dtw_inertia=True, n_jobs=-1)
    model.fit(time_series_list)

    clusters = [[] for _ in range(0, 2, 1)]
    for i in range(len(model.labels_)):
        clusters[model.labels_[i]].append(regions[i])
    return clusters
import numpy as np
from sklearn.metrics import adjusted_rand_score, accuracy_score, adjusted_mutual_info_score

nameDataset = "Coffee"
trainFeatDataset = 0.2
testPath = "./" + nameDataset + "/" + nameDataset + ".tsv"
listOut, series, listOfClass, listForDTW = util.adaptTimeSeries(testPath)
seedTS = util.extractFeature(listOut, series, trainFeatDataset)
print("Class Found: " + str(len(seedTS.keys())))

centroid = util.getCentroid(seedTS)
X_train = util.castTimeSeries(listOut)
centroid = util.castTimeSeries(centroid)
listCentr = []
for clust in centroid:
    listCentr.append(clust)
X = np.array(listCentr, np.float64)

# Seed k-means with the per-class centroids as initial centers
model = TimeSeriesKMeans(n_clusters=len(seedTS.keys()), metric="dtw",
                         max_iter=10, init=X)
model.fit(X_train)

groundTruth = [int(i) for i in list(series)]
print("Labels Discovered")
print(list(model.labels_))
print("Original Labels")
print(groundTruth)
print("Adjusted Rand Index")
print(adjusted_rand_score(model.labels_, groundTruth))
def tsKMeans_simple(X, n_cluster, random_state):
    model = TimeSeriesKMeans(n_clusters=n_cluster, tol=1e-05,
                             metric='euclidean', random_state=random_state)
    fitted_model = model.fit(X)
    y_pred = fitted_model.predict(X)
    return y_pred
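# A minimal usage sketch for `tsKMeans_simple` on invented toy data.
import numpy as np
from tslearn.clustering import TimeSeriesKMeans  # module-level import assumed by the snippet above

rng = np.random.RandomState(0)
toy_X = rng.randn(12, 25)  # 12 univariate series of length 25
labels = tsKMeans_simple(toy_X, n_cluster=3, random_state=0)
print(labels)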
def run_clustering_methods(
    data,
    n_clusters,
    path_fig,
    path_out,
    hist_plot,
    cluster_plot,
):
    """run clustering method on temporal distance files, and output cluster
    labels and a few diagnostic plots"""
    model = TimeSeriesKMeans(n_clusters=n_clusters, metric="dtw",
                             random_state=seed)
    model.fit(data)

    os.chdir(path_fig)
    ax = sns.histplot(data=model.labels_, kde=True, discrete=True)
    # `idx` is not defined until the loop below; the cluster count is the
    # natural label here
    ax.set(xlabel='DTW K-means clusters={}'.format(str(n_clusters)))
    plt.savefig("hist-" + hist_plot + 'cluster_n-' + str(n_clusters) + ".svg",
                transparent=True, dpi=1200)
    plt.close("all")

    plt.figure()
    sz = data.shape[1]
    for cluster_id in range(0, max(model.labels_ + 1)):
        idx = model.labels_ == cluster_id
        data_clustered = data[np.array(idx), ]
        plt.subplot(3, 3, cluster_id + 1)
        for xx in data_clustered:
            plt.plot(xx.ravel(), "k-", alpha=.2)
        # Smooth the cluster center with a Savitzky-Golay filter for display
        plt.plot(savgol_filter(model.cluster_centers_[cluster_id].ravel(), 7, 2),
                 "r-", linewidth=2.5)
        plt.xlim(0, sz)
        plt.ylim(0, 1.2)
        plt.text(0.55, 0.85, 'Cluster %d' % (cluster_id),
                 transform=plt.gca().transAxes)
    plt.tight_layout()
    plt.savefig('nclus' + str(n_clusters) + cluster_plot + '.svg')
    plt.close("all")

    os.chdir(path_out)
    np.save('labels_nclus_' + str(n_clusters), model.labels_)
    return model.labels_