def main():
    X1 = to_time_series_dataset(mock_dataset_muscle1)
    y1 = mock_labels
    X_train1 = X1[:-2]
    y_train1 = y1[:-2]
    X_test1 = X1[-2:]
    y_test1 = y1[-2:]

    # clf1 = KNeighborsTimeSeriesClassifier(n_neighbors=5, metric="dtw")
    clf1 = TimeSeriesKMeans(metric="dtw")
    clf1.fit(X_train1, y_train1)
    pred_train1 = clf1.predict(X_train1)
    pred_test1 = clf1.predict(X_test1)
    print("TRAINING SET 1")
    print("Prediction: " + str(pred_test1))
    print("Actual: " + str(y_test1))
    print("\n")

    X2 = to_time_series_dataset(mock_dataset_muscle2)
    y2 = mock_labels
    X_train2 = X2[:-2]
    y_train2 = y2[:-2]
    X_test2 = X2[-2:]
    y_test2 = y2[-2:]

    clf2 = TimeSeriesKMeans(metric="dtw")
    # clf2 = KNeighborsTimeSeriesClassifier(n_neighbors=5, metric="dtw")
    clf2.fit(X_train2, y_train2)
    pred_train2 = clf2.predict(X_train2)
    pred_test2 = clf2.predict(X_test2)
    print("TRAINING SET 2")
    print("Prediction: " + str(pred_test2))
    print("Actual: " + str(y_test2))
    print("\n")

    times_train = mock_times[:-2]
    times_test = mock_times[-2:]
    X_train = np.stack((pred_train1, pred_train2, times_train)).transpose()
    X_test = np.stack((pred_test1, pred_test2, times_test)).transpose()
    y_train = np.array(mock_labels[:-2]).reshape((len(X_train),))
    y_test = mock_labels[-2:]

    sgd = SGDClassifier()
    sgd.fit(X_train, y_train)
    pred = sgd.predict(X_test)
    print("ENSEMBLE")
    print("Prediction: " + str(pred))
    print("Actual: " + str(y_test))
    print("Score: " + str(sgd.score(X_test, y_test)))
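# Note: TimeSeriesKMeans is an unsupervised clusterer, so the y values passed to fit()
# above are ignored and the predicted integers are arbitrary cluster ids that need not
# line up with mock_labels. The commented-out KNeighborsTimeSeriesClassifier is the
# supervised alternative; a minimal sketch of that variant (the helper name is
# illustrative, the data names reuse those from main() above):
from tslearn.neighbors import KNeighborsTimeSeriesClassifier

def classify_muscle_signals(X_train, y_train, X_test):
    # 5-nearest-neighbour classification under the DTW metric
    clf = KNeighborsTimeSeriesClassifier(n_neighbors=5, metric="dtw")
    clf.fit(X_train, y_train)
    return clf.predict(X_test)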
def get_dds_km(cl_lab, ds, z='LEV0', h0=0, h1=24, nc=4):
    # %%
    dds = get_ds_for_dtw_kmeans(cl_lab, ds, z, h0=h0, h1=h1)
    from tslearn.clustering import TimeSeriesKMeans
    from tslearn.metrics import dtw
    km = TimeSeriesKMeans(nc, metric='dtw',
                          metric_params={'sakoe_chiba_radius': 4},
                          random_state=789)
    km.fit(dds.values)
    _ = km.cluster_centers_
    for c in _:
        plt.plot(c)
    plt.show()
    # %%
    labs = km.predict(dds.values)
    lb = xr.zeros_like(dds['date'], dtype=int) + labs
    # %%
    dds['labs'] = lb
    # %%
    dds['labs'].reset_coords(drop=True). \
        to_dataframe()['labs'].value_counts(). \
        sort_index(). \
        plot.bar()
    plt.show()
    # %%
    # dds['hour'] = xr.zeros_like(dds['time'], dtype=float) + \
    #     np.arange(0, 24, .5)
    # dds['nday'] = xr.zeros_like(
    #     dds['date'], dtype=int) + \
    #     np.arange(len(dds['date']))
    # dds = dds.swap_dims({'date': 'nday'})
    # dds = dds.swap_dims({'time': 'hour'})
    return dds, km
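# The metric_params above are meant to restrict DTW to a Sakoe-Chiba band of radius 4,
# i.e. the warping path may deviate at most 4 samples from the diagonal. A minimal sketch
# of the same constraint applied to the standalone dtw() distance imported above (s1 and
# s2 are illustrative series, not taken from the function):
import numpy as np
from tslearn.metrics import dtw

s1 = np.sin(np.linspace(0, 2 * np.pi, 48))
s2 = np.sin(np.linspace(0, 2 * np.pi, 48) + 0.3)
d = dtw(s1, s2, global_constraint="sakoe_chiba", sakoe_chiba_radius=4)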
def get_cluster_labels(actions, x, n_clusters):
    km = TimeSeriesKMeans(n_clusters=n_clusters, metric='dtw').fit(x['train'])
    actions_split = {}
    for type in ['train', 'dev', 'test']:
        actions_split[type] = actions[actions['type'] == type]
        labels = km.predict(x[type])
        actions_split[type].loc[:, 'label'] = labels
    actions = pd.concat(
        [actions_split[type] for type in ['train', 'dev', 'test']])
    return actions
def run():
    parser = cli_parser()
    args = parser.parse_args()
    nii = image.index_img(args.input, slice(0, 30))
    masker = input_data.NiftiMasker()
    data = masker.fit_transform(nii)
    ds = to_time_series_dataset(data.T[::80, :])
    model = TimeSeriesKMeans(n_clusters=2, metric="dtw", max_iter=15)
    model.fit(ds)
    all = to_time_series_dataset(data.T)
    mask = model.predict(all)
    mask_nii = masker.inverse_transform(mask)
    mask_nii.to_filename(args.output)
# clustering
from tslearn.clustering import TimeSeriesKMeans, KernelKMeans, silhouette_score

# fit the algorithm on train data
# tune the hyperparameters; possible metrics: euclidean, dtw, softdtw
km_dba = TimeSeriesKMeans(n_clusters=4, metric="softdtw", max_iter=5,
                          max_iter_barycenter=5,
                          random_state=0).fit(multivariate_time_series_train)
km_dba.cluster_centers_.shape

# prediction on train data
prediction_train = km_dba.fit_predict(multivariate_time_series_train, y=None)
len(prediction_train)

# prediction on test data
prediction_test = km_dba.predict(multivariate_time_series_test)
len(prediction_test)
prediction_test

# clustering quality (silhouette score) on the train data
silhouette_score(multivariate_time_series_train, prediction_train,
                 metric="softdtw")

# clustering quality (silhouette score) on the test data
silhouette_score(multivariate_time_series_test, prediction_test,
                 metric="softdtw")

############################################ k=2 #########################################
# select randomly time series from first cluster
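# Small caveat on the fit_predict() call above, stated tentatively: fit_predict refits the
# model on the training data even though km_dba was already fitted. The same train-set
# assignments are available without a second fit, e.g.:
prediction_train = km_dba.labels_                                   # labels from the existing fit
prediction_train = km_dba.predict(multivariate_time_series_train)   # same assignments, no refit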
def subseqeuence_clustering(sequence, changepoints, y_label='y', norm=False):
    """
    Clusters subsequences of a time series, as indicated by the changepoints variable.
    Uses the silhouette score to determine the number of clusters.
    :param y_label: name of the y-label in the plot
    :param norm: normalise the data using MinMaxScaler
    :param sequence: np array of the time series
    :param changepoints: detected changepoints on which the subsequences are built
    :return:
    """
    from tslearn.clustering import TimeSeriesKMeans, silhouette_score
    from tslearn.utils import to_time_series_dataset
    from tslearn.preprocessing import TimeSeriesScalerMinMax

    sub_ids = []
    x_index = []
    X = []
    i = 0
    end_p = [len(sequence) - 1]
    for cp in changepoints + end_p:
        X.append(sequence[i:cp])
        index = 'sub_' + str(i) + '_' + str(cp)
        sub_ids.append(index)
        x_index.append([x_id for x_id in range(i, cp + 1)])
        i = cp

    # Normalize the data (y = (x - min) / (max - min))
    if norm:
        X = TimeSeriesScalerMinMax().fit_transform(X)
    X = to_time_series_dataset(X)

    # Find the optimal number of clusters:
    # loop through different numbers of clusters and store the respective silhouette scores
    sil_scores = {}
    for n in range(2, len(changepoints)):
        model_tst = TimeSeriesKMeans(n_clusters=n, metric="dtw", n_init=10)
        model_tst.fit(X)
        sil_scores[n] = (silhouette_score(X, model_tst.predict(X), metric="dtw"))

    opt_k = max(sil_scores, key=sil_scores.get)
    print('Number of Clusters in subsequence clustering: ' + str(opt_k))
    model = TimeSeriesKMeans(n_clusters=opt_k, metric="dtw", n_init=10)
    labels = model.fit_predict(X)
    print(labels)

    # build helper df to map metrics to their cluster labels
    df_cluster = pd.DataFrame(list(zip(sub_ids, x_index, model.labels_)),
                              columns=['metric', 'x_index', 'cluster'])
    cluster_metrics_dict = df_cluster.groupby(
        ['cluster'])['metric'].apply(lambda x: [x for x in x]).to_dict()

    print('Plotting Clusters')
    # plot changepoints as vertical lines
    for cp in changepoints:
        plt.axvline(x=cp, ls=':', lw=2, c='0.65')

    # preprocessing for cluster-based plotting
    x_scat = []
    y_scat = []
    cluster = []
    for index, row in df_cluster.iterrows():
        x_seq = row['x_index']
        x_scat.extend(x_seq)
        y_seq = sequence[x_seq[0]:x_seq[-1] + 1]
        y_scat.extend(y_seq)
        label_seq = [row['cluster']]
        cluster.extend(label_seq * len(x_seq))
        # plt.scatter(x_seq, y_seq, label=label_seq)

    # cluster-based plotting
    x_scat = np.array(x_scat)
    y_scat = np.array(y_scat)
    for c in np.unique(cluster):
        i = np.where(cluster == c)
        plt.scatter(x_scat[i], y_scat[i], label=c)
    plt.legend()
    plt.title('Subsequence k-means Clustering')
    plt.xlabel('Time index')
    plt.ylabel(y_label)
    plt.show()

    return cluster_metrics_dict
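# The silhouette loop above recomputes pairwise DTW distances for every candidate k. A
# possible speed-up (a sketch, not taken from the function above) is to precompute the DTW
# distance matrix once with tslearn.metrics.cdist_dtw and pass it to silhouette_score with
# metric="precomputed"; X and the estimator settings mirror the ones used above.
from tslearn.metrics import cdist_dtw
from tslearn.clustering import TimeSeriesKMeans, silhouette_score

def silhouette_by_k(X, k_values):
    dists = cdist_dtw(X)  # pairwise DTW distances, computed once
    scores = {}
    for k in k_values:
        labels = TimeSeriesKMeans(n_clusters=k, metric="dtw", n_init=10).fit_predict(X)
        scores[k] = silhouette_score(dists, labels, metric="precomputed")
    return scores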
def test_kmeans():
    n, sz, d = 15, 10, 3
    rng = np.random.RandomState(0)
    time_series = rng.randn(n, sz, d)

    km = TimeSeriesKMeans(n_clusters=3, metric="euclidean", max_iter=5,
                          verbose=False, random_state=rng).fit(time_series)
    dists = cdist(time_series.reshape((n, -1)),
                  km.cluster_centers_.reshape((3, -1)))
    np.testing.assert_allclose(km.labels_, dists.argmin(axis=1))
    np.testing.assert_allclose(km.labels_, km.predict(time_series))

    km_dba = TimeSeriesKMeans(n_clusters=3, metric="dtw", max_iter=5,
                              verbose=False, random_state=rng).fit(time_series)
    dists = cdist_dtw(time_series, km_dba.cluster_centers_)
    np.testing.assert_allclose(km_dba.labels_, dists.argmin(axis=1))
    np.testing.assert_allclose(km_dba.labels_, km_dba.predict(time_series))

    km_sdtw = TimeSeriesKMeans(n_clusters=3, metric="softdtw", max_iter=5,
                               verbose=False, random_state=rng).fit(time_series)
    dists = cdist_soft_dtw(time_series, km_sdtw.cluster_centers_)
    np.testing.assert_allclose(km_sdtw.labels_, dists.argmin(axis=1))
    np.testing.assert_allclose(km_sdtw.labels_, km_sdtw.predict(time_series))

    km_nofit = TimeSeriesKMeans(n_clusters=101, verbose=False,
                                random_state=rng).fit(time_series)
    assert (km_nofit._X_fit is None)

    X_bis = to_time_series_dataset([[1, 2, 3, 4],
                                    [1, 2, 3],
                                    [2, 5, 6, 7, 8, 9]])
    TimeSeriesKMeans(n_clusters=2, verbose=False, max_iter=5,
                     metric="softdtw", random_state=0).fit(X_bis)
    TimeSeriesKMeans(n_clusters=2, verbose=False, max_iter=5,
                     metric="dtw", random_state=0, init="random").fit(X_bis)
    TimeSeriesKMeans(n_clusters=2, verbose=False, max_iter=5,
                     metric="dtw", random_state=0, init="k-means++").fit(X_bis)
    TimeSeriesKMeans(n_clusters=2, verbose=False, max_iter=5,
                     metric="dtw", init=X_bis[:2]).fit(X_bis)

    # Barycenter size (nb of timestamps)
    # Case 1. kmeans++ / random init
    n, sz, d = 15, 10, 1
    n_clusters = 3
    time_series = rng.randn(n, sz, d)
    sizes_all_same_series = [sz] * n_clusters
    km_euc = TimeSeriesKMeans(n_clusters=3, metric="euclidean", max_iter=5,
                              verbose=False, init="k-means++",
                              random_state=rng).fit(time_series)
    np.testing.assert_equal(sizes_all_same_series,
                            [ts_size(b) for b in km_euc.cluster_centers_])
    km_dba = TimeSeriesKMeans(n_clusters=3, metric="dtw", max_iter=5,
                              verbose=False, init="random",
                              random_state=rng).fit(time_series)
    np.testing.assert_equal(sizes_all_same_series,
                            [ts_size(b) for b in km_dba.cluster_centers_])

    # Case 2. forced init
    barys = to_time_series_dataset([[1., 2., 3.],
                                    [1., 2., 2., 3., 4.],
                                    [3., 2., 1.]])
    sizes_all_same_bary = [barys.shape[1]] * n_clusters
    # If Euclidean is used, barycenter sizes should match that of the input series
    km_euc = TimeSeriesKMeans(n_clusters=3, metric="euclidean", max_iter=5,
                              verbose=False, init=barys, random_state=rng)
    np.testing.assert_raises(ValueError, km_euc.fit, time_series)

    km_dba = TimeSeriesKMeans(n_clusters=3, metric="dtw", max_iter=5,
                              verbose=False, init=barys,
                              random_state=rng).fit(time_series)
    np.testing.assert_equal(sizes_all_same_bary,
                            [ts_size(b) for b in km_dba.cluster_centers_])
    km_sdtw = TimeSeriesKMeans(n_clusters=3, metric="softdtw", max_iter=5,
                               verbose=False, init=barys,
                               random_state=rng).fit(time_series)
    np.testing.assert_equal(sizes_all_same_bary,
                            [ts_size(b) for b in km_sdtw.cluster_centers_])

    # A simple dataset: can we extract the correct number of clusters?
    time_series = to_time_series_dataset([[1, 2, 3],
                                          [7, 8, 9, 11],
                                          [.1, .2, 2.],
                                          [1, 1, 1, 9],
                                          [10, 20, 30, 1000]])
    preds = TimeSeriesKMeans(n_clusters=3, metric="dtw", max_iter=5,
                             random_state=rng).fit_predict(time_series)
    np.testing.assert_equal(set(preds), set(range(3)))

    preds = TimeSeriesKMeans(n_clusters=4, metric="dtw", max_iter=5,
                             random_state=rng).fit_predict(time_series)
    np.testing.assert_equal(set(preds), set(range(4)))
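# As the test above suggests, to_time_series_dataset pads ragged inputs with NaN so that
# series of different lengths fit in one array; DTW and soft-DTW tolerate that padding,
# whereas Euclidean k-means expects equal-length series. A minimal sketch with illustrative
# data (not taken from the test suite):
from tslearn.utils import to_time_series_dataset
from tslearn.clustering import TimeSeriesKMeans

X_ragged = to_time_series_dataset([[1, 2, 3], [1, 2, 3, 4], [9, 9, 8, 9]])  # NaN-padded to length 4
labels = TimeSeriesKMeans(n_clusters=2, metric="dtw",
                          max_iter=5, random_state=0).fit_predict(X_ragged)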
def k_means_clustering(sd_log):
    """
    k_means clustering of all features using dtw for multivariate time series
    :param sd_log: sd_log object
    :return: cluster_metrics_dict: dict with clusters as key and features as values
    """
    from tslearn.clustering import TimeSeriesKMeans, silhouette_score
    from tslearn.utils import to_time_series_dataset
    from tslearn.preprocessing import TimeSeriesScalerMinMax

    data = sd_log.data
    # TODO handle outliers
    tmp = sd_log.waiting_time
    data.drop(columns=[sd_log.waiting_time], inplace=True)

    X = []
    # Get data as numpy array
    for col in data.columns:
        X.append(sd_log.get_points(col))

    # Normalize the data (y = (x - min) / (max - min))
    data_norm = data.copy()
    for column in data_norm.columns:
        data_norm[column] = (data_norm[column] - data_norm[column].min()) / (
                data_norm[column].max() - data_norm[column].min())
    X = TimeSeriesScalerMinMax().fit_transform(X)
    X = to_time_series_dataset(X)

    # Find the optimal number of clusters:
    # loop through different numbers of clusters and store the respective silhouette scores
    sil_scores = {}
    for n in range(2, len(data.columns)):
        model_tst = TimeSeriesKMeans(n_clusters=n, metric="dtw", n_init=10)
        model_tst.fit(X)
        sil_scores[n] = (silhouette_score(X, model_tst.predict(X), metric="dtw"))

    opt_k = max(sil_scores, key=sil_scores.get)
    model = TimeSeriesKMeans(n_clusters=opt_k, metric="dtw", n_init=10)
    labels = model.fit_predict(X)
    print(labels)

    # build helper df to map metrics to their cluster labels
    df_cluster = pd.DataFrame(list(zip(data.columns, model.labels_)),
                              columns=['metric', 'cluster'])

    # make some helper dictionaries and lists
    cluster_metrics_dict = df_cluster.groupby(
        ['cluster'])['metric'].apply(lambda x: [x for x in x]).to_dict()
    cluster_len_dict = df_cluster['cluster'].value_counts().to_dict()
    clusters_dropped = [
        cluster for cluster in cluster_len_dict if cluster_len_dict[cluster] == 1
    ]
    clusters_final = [
        cluster for cluster in cluster_len_dict if cluster_len_dict[cluster] > 1
    ]

    print('Plotting Clusters')
    fig, axs = plt.subplots(opt_k)  # , figsize=(10, 5))
    # fig.suptitle('Clusters')
    row_i = 0
    # column_j = 0
    # For each label there is, plot every series with that label
    for cluster in cluster_metrics_dict:
        for feat in cluster_metrics_dict[cluster]:
            axs[row_i].plot(data_norm[feat], label=feat, alpha=0.4)
            axs[row_i].legend(loc="best")
        if len(cluster_metrics_dict[cluster]) > 100:
            # TODO draw mean in red if more than one cluster
            tmp = np.nanmean(np.vstack(cluster), axis=1)
            axs[row_i].plot(tmp, c="red")
        axs[row_i].set_title("Cluster " + str(cluster))
        row_i += 1
        # column_j += 1
        # if column_j % k == 0:
        #     row_i += 1
        #     column_j = 0
    plt.show()
    # return dict {cluster_id: features}
    return cluster_metrics_dict
X_train = TimeSeriesScalerMeanVariance(mu=0., std=1.) \
    .fit_transform(X_train)
X_test = TimeSeriesScalerMeanVariance(mu=0., std=1.) \
    .fit_transform(X_test)

classes = len(np.unique(data_train[:, 0]))
km = TimeSeriesKMeans(n_clusters=5, max_iter=10, n_init=10,
                      metric="euclidean", verbose=0, random_state=2019)
km.fit(X_train)

print(i, file=f)

preds = km.predict(X_train)
ars = adjusted_rand_score(data_train[:, 0], preds)
print("Adjusted Rand Index on Training Set:", ars, file=f)
kMeansDF.loc[i, "Train ARS"] = ars

preds_test = km.predict(X_test)
ars = adjusted_rand_score(data_test[:, 0], preds_test)
print("Adjusted Rand Index on Test Set:", ars, file=f)
kMeansDF.loc[i, "Test ARS"] = ars
print(file=f)

kMeansTime = timer.elapsed_time()
print("Time to Run k-Means Experiment in Minutes:", kMeansTime / 60, file=f)

kMeansDF.to_pickle(
for i in range(0, len(variance_perc)):
    if sum(variance_perc[0:i]) >= 90:
        break
print("Components needed to account for >=90% of variance: " + str(i))
components = i

## --------------------------------------Cluster analysis----------------------------------------
## for theory, see https://scikit-learn.org/stable/modules/clustering.html
## for parameter settings, see https://tslearn.readthedocs.io/en/stable/gen_modules/clustering/tslearn.clustering.TimeSeriesKMeans.html

# Euclidean k-means
print("Euclidean k-means")
km = TimeSeriesKMeans(n_clusters=components, max_iter=5, metric='euclidean',
                      random_state=0).fit(df_array)
cluster_centre = km.cluster_centers_.shape
# time_series_class = km.predict(df_array_std)
time_series_class = km.predict(df_array)
labels = km.labels_
count_labels = list(Counter(labels).values())
inertia = km.inertia_

## plot the % of the clusters
labels_for_plot = list(Counter(labels).keys())
fig1, ax1 = plt.subplots()
ax1.pie(count_labels, labels=labels_for_plot, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.title("% of points distribution per clusters")
props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
plt.text(0.75, -1, 'no_of_samples=' + str(len(labels)),
         verticalalignment='top', bbox=props)
plt.savefig('Clusters_%_distribution.png')  # save before show so the saved figure is not blank
plt.show()
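# A possible alternative to the component-selection loop at the top of this snippet,
# sketched here under the assumption (as above) that variance_perc holds per-component
# percentages summing to roughly 100:
import numpy as np

cum_var = np.cumsum(variance_perc)
components = int(np.searchsorted(cum_var, 90) + 1)  # first index where cumsum >= 90, as a 1-based count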
# plt.show()


# In[6]:


# from scipy.spatial.distance import cdist
from tslearn.clustering import TimeSeriesKMeans

km = TimeSeriesKMeans(n_clusters=3, metric="dtw", max_iter=900,
                      tol=1e-08, random_state=3)
km.fit(X_train)


# In[7]:


predictions = km.predict(X_train)
for c in range(3):
    c_0 = np.argwhere(predictions == c)
    print(c_0.shape[0], end=' ')

c_assign = np.zeros(32)
for k in range(3):
    c_0 = np.argwhere(predictions == k)
    c_assign[c_0] = k
    # print(k, c_0)
print(c_assign)


# In[8]:


import matplotlib.pyplot as plt
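# Side note on the In[7] cell above, assuming X_train holds 32 series as the hard-coded
# np.zeros(32) suggests: the label-assignment loop builds the same values as the
# prediction vector itself, so it can be replaced by a direct copy.
c_assign = predictions.astype(float)  # element-wise identical to the loop's result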
km_dba4 = TimeSeriesKMeans(n_clusters=4, metric="dtw", max_iter=5,
                           max_iter_barycenter=5, random_state=0).fit(X)
km_dba5 = TimeSeriesKMeans(n_clusters=5, metric="dtw", max_iter=5,
                           max_iter_barycenter=5, random_state=0).fit(X)


# In[11]:


km_sdtw3 = TimeSeriesKMeans(n_clusters=3, metric="softdtw", max_iter=5,
                            max_iter_barycenter=5, metric_params={"gamma": .5},
                            random_state=0).fit(X)
km_sdtw4 = TimeSeriesKMeans(n_clusters=4, metric="softdtw", max_iter=5,
                            max_iter_barycenter=5, metric_params={"gamma": .5},
                            random_state=0).fit(X)
km_sdtw5 = TimeSeriesKMeans(n_clusters=5, metric="softdtw", max_iter=5,
                            max_iter_barycenter=5, metric_params={"gamma": .5},
                            random_state=0).fit(X)


# In[12]:


km5_p = km5.predict(X)
km3_p = km3.predict(X)
km_dba3_p = km_dba3.predict(X)
km_dba4_p = km_dba4.predict(X)
km_dba5_p = km_dba5.predict(X)
km_sdtw3_p = km_sdtw3.predict(X)
km_sdtw4_p = km_sdtw4.predict(X)
km_sdtw5_p = km_sdtw5.predict(X)


# In[15]:


l0 = X[np.where(km5_p == 0)]
test_result = test_windows['result']

if distanceMatrix == 'eucl':
    model = TimeSeriesKMeans(n_clusters=kVal,
                             n_init=10).fit(train_data_without_attacks.values)
elif distanceMatrix == 'dtw':
    model = TimeSeriesKMeans(n_clusters=kVal, metric='dtw',
                             n_init=10).fit(train_data_without_attacks.values)

df = pd.DataFrame()
df['result'] = test_result
df['sr_value'] = -1
df['prediction'] = -1

for i in range(len(test_data)):
    pred = model.predict([test_data.loc[i].values])[0]
    closest_centroid = list(itertools.chain(*model.cluster_centers_[pred]))
    residual = test_data.loc[i].values - closest_centroid

    trans = fft(residual)
    magnitudes = np.sqrt(trans.real**2 + trans.imag**2)
    eps_index = np.where(magnitudes <= EPS)[0]
    magnitudes[eps_index] = EPS
    mag_log = np.log(magnitudes)
    mag_log[eps_index] = 0
    spectral = np.exp(mag_log - average_filter(mag_log, n=48))
    trans.real = trans.real * spectral / magnitudes
    trans.imag = trans.imag * spectral / magnitudes
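# average_filter is not defined in the fragment above; in spectral-residual style saliency
# code it is usually a trailing moving average. A hypothetical sketch of such a helper
# (name and signature chosen to match the call above, not taken from the source):
import numpy as np

def average_filter(values, n=3):
    """Trailing moving average of `values` with window n (shorter windows at the start)."""
    if n >= len(values):
        n = len(values)
    res = np.cumsum(values, dtype=float)
    res[n:] = res[n:] - res[:-n]
    res[n:] = res[n:] / n
    for i in range(1, n):
        res[i] /= (i + 1)
    return res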
test.append(data[0:1000].to_numpy().reshape(-1, 1))
test.append(data[1024:2048].to_numpy().reshape(-1, 1))
test.append(data[2048:3072].to_numpy().reshape(-1, 1))
test.append(data[3072:4096].to_numpy().reshape(-1, 1))

# DWTed_test = random.sample(DWTed_test, len(DWTed_test))
# test = random.sample(test, len(test))

"""EEG signals classification using the K-means clustering and a multilayer perceptron
neural network model (Umut Orhan 2011)
"""

# K-means clustering:
model = TimeSeriesKMeans(n_clusters=2, metric="softdtw", max_iter=5)
model.fit(np.array(train))
pred = model.predict(np.array(test))
pred

a = np.zeros((320,), dtype=int)
b = np.ones((80,), dtype=int)
true = np.concatenate([a, b])
confusion_matrix(true, pred)

centers = model.cluster_centers_
centers = np.array([centers[0].flatten(), centers[1].flatten()])
centers

plt.plot(centers[0], color='red')

for dataset in [Z, O]:
    for i in range(1):
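# Note on the confusion_matrix check earlier in this snippet: the cluster ids returned by
# TimeSeriesKMeans are arbitrary (cluster 0 is not guaranteed to correspond to class 0), so
# the matrix may come out with its rows swapped. A permutation-free sanity check, using the
# `true` and `pred` arrays built above, is the adjusted Rand index:
from sklearn.metrics import adjusted_rand_score

adjusted_rand_score(true, pred)  # 1.0 means perfect agreement up to relabelling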
#print(my_array[500,3])
print(centroids)
print(len(labels))'''

no_clust = 10
t_series = to_time_series(my_array)
kmeans = TimeSeriesKMeans(n_clusters=no_clust, metric="euclidean",
                          max_iter=8, random_state=0)
kmeans.fit(t_series)
print("The cluster centers are:", kmeans.cluster_centers_)
print("Each time series belongs to:", kmeans.labels_)
labels = kmeans.labels_
y_kmeans = kmeans.predict(t_series)

plt.scatter(t_series[:, 0, 1], [2 for _ in range(length)],
            c=y_kmeans, s=30, cmap='viridis')
plt.scatter(t_series[:, 182, 1], [1.5 for _ in range(length)],
            c=y_kmeans, s=30, cmap='viridis')
plt.scatter(t_series[:, 364, 1], [1 for _ in range(length)],
            c=y_kmeans, s=30, cmap='viridis')
plt.show()

plt.scatter([i for i in range(3 * 365)], t_series[0, :, 3],