def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None:
    super().__init__(hyperparams=hyperparams, random_seed=random_seed)

    self._knn = KNeighborsTimeSeriesClassifier(
        n_neighbors=self.hyperparams["n_neighbors"],
        metric=self.hyperparams["distance_metric"],
        weights=self.hyperparams["sample_weighting"],
    )
    self._scaler = TimeSeriesScalerMinMax()
    self._is_fit = False
def _func(self, data: np.ndarray, img_info_path: str, roi_state: dict) -> np.ndarray:
    if 'raw_min_max' in roi_state.keys():
        raw_min_max = roi_state['raw_min_max']
    else:
        cnmf_idx = roi_state['cnmf_idx']
        img_info_path = os.path.join(self.proj_path, img_info_path)
        roi_states = pickle.load(open(img_info_path, 'rb'))['roi_states']
        idx_components = roi_states['cnmf_output']['idx_components']
        list_ix = np.argwhere(idx_components == cnmf_idx).ravel().item()
        state = roi_states['states'][list_ix]
        if not state['cnmf_idx'] == cnmf_idx:
            raise ValueError(
                'cnmf_idx from ImgInfoPath dict and DataFrame ROI_State dict do not match.'
            )
        raw_min_max = state['raw_min_max']

    raw_min = raw_min_max['raw_min'][self.option]
    raw_max = raw_min_max['raw_max'][self.option]

    if raw_min >= raw_max:
        self.excluded += 1
        return np.NaN

    return TimeSeriesScalerMinMax(
        value_range=(raw_min, raw_max)).fit_transform(data).ravel()
def tsclusteringN(ts_data, names):
    # Clustering
    # Normalize the series
    ts_dataset = TimeSeriesScalerMinMax().fit_transform(ts_data)
    metric = 'dtw'
    n_clusters = [n for n in range(2, 6)]
    for n in n_clusters:
        print('Number of clusters =', n)
        # With the 'dtw' or 'softdtw' metric, series of different lengths are supported
        km = TimeSeriesKMeans(n_clusters=n, metric=metric, verbose=False,
                              random_state=1).fit(ts_dataset)
        # Clustering result
        print('Cluster labels =', km.labels_)
        # Silhouette values range from -1 to 1. The best number of clusters has a
        # score close to 1 and the smallest spread between clusters in the silhouette
        # plot; here we only check the silhouette score.
        print('Silhouette score =',
              silhouette_score(ts_dataset, km.labels_, metric=metric))
        print()
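# A minimal usage sketch for tsclusteringN above on synthetic data; the
# random_walks call and the series names are illustrative stand-ins, not part
# of the original code.
from tslearn.generators import random_walks

demo_series = random_walks(n_ts=10, sz=50, d=1, random_state=0)
demo_names = ['series_%d' % i for i in range(10)]
tsclusteringN(demo_series, demo_names)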
def __ScaleData(self, input_data):
    '''
    Scale input data to the range [0, 1].

    Parameters:
        input_data : input data to rescale
    '''
    return TimeSeriesScalerMinMax().fit_transform(input_data)
def dataImport(name):
    if not os.path.exists("../Classifier/TimeSeriesFiles/" + name):
        url = "http://www.timeseriesclassification.com/Downloads/%s.zip" % name
        extract_from_zip_url(url, "../Classifier/TimeSeriesFiles/" + name + "/",
                             verbose=False)
    data_train = numpy.loadtxt("../Classifier/TimeSeriesFiles/" + name + "/" +
                               name + "_TRAIN.txt")
    data_test = numpy.loadtxt("../Classifier/TimeSeriesFiles/" + name + "/" +
                              name + "_TEST.txt")
    X_train = to_time_series_dataset(data_train[:, 1:])
    # numpy.int was removed from recent NumPy releases; use an explicit dtype
    y_train = data_train[:, 0].astype(numpy.int64)
    X_test = to_time_series_dataset(data_test[:, 1:])
    y_test = data_test[:, 0].astype(numpy.int64)
    X_train = TimeSeriesScalerMinMax().fit_transform(X_train)
    X_test = TimeSeriesScalerMinMax().fit_transform(X_test)
    return X_train, y_train, X_test, y_test
def __generateRefPrice(self, curPrice, seedPrice, priceRange):
    priceMin = min(curPrice, seedPrice / 1.05 *
                   (1 + numpy.random.uniform(-priceRange * 0.1, priceRange * 0.4)))
    priceMax = max(curPrice, seedPrice * 1.05 *
                   (1 + numpy.random.uniform(-priceRange * 0.4, priceRange * 0.1)))
    data_len = numpy.random.randint(10000, 30000)
    # assert curPrice >= priceMin and curPrice <= priceMax, f"error: {curPrice}, {priceMin}, {priceMax}"

    def smooth_data(data):
        x = numpy.arange(0, len(data), 1)
        x_new = numpy.arange(0, max(x), 0.01)
        func = interpolate.interp1d(x, data, kind='quadratic')
        smoothed = func(x_new)
        return smoothed

    while True:
        dataset = random_walks(n_ts=1, sz=data_len * 2)
        # value_range replaces the deprecated min=/max= keyword arguments
        scaler = TimeSeriesScalerMinMax(value_range=(float(priceMin), float(priceMax)))
        dataset_scaled = scaler.fit_transform(dataset)[0, :, 0]
        for i in range(0, data_len):
            if abs(dataset_scaled[i] - curPrice) / curPrice < 0.001:
                # return list(smooth_data(dataset_scaled[i:i+data_len]))
                with open('price.txt', 'w+') as f:
                    f.writelines([f'{p}\n' for p in dataset_scaled[i:i + data_len]])
                return list(dataset_scaled[i:i + data_len])
def _preprocess_series(self, X):
    if self.scale:
        X = TimeSeriesScalerMinMax().fit_transform(X)
    else:
        X = to_time_series_dataset(X)
    if self.max_size is not None and self.max_size != X.shape[1]:
        if X.shape[1] > self.max_size:
            raise ValueError("Cannot feed model with series of length {} "
                             "max_size is {}".format(X.shape[1], self.max_size))
        X_ = numpy.zeros((X.shape[0], self.max_size, X.shape[2]))
        X_[:, :X.shape[1]] = X
        X_[:, X.shape[1]:] = numpy.nan
        return X_
    else:
        return X
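# Standalone sketch of the scale-then-pad behaviour used by _preprocess_series
# above, assuming a fixed target length; the names pad_series and MAX_SIZE are
# illustrative, not part of the original class.
import numpy
from tslearn.preprocessing import TimeSeriesScalerMinMax

MAX_SIZE = 80

def pad_series(X, max_size=MAX_SIZE):
    # scale each series to [0, 1], then right-pad with NaN up to max_size
    X = TimeSeriesScalerMinMax().fit_transform(X)
    if X.shape[1] > max_size:
        raise ValueError("Cannot feed model with series of length {}: "
                         "max_size is {}".format(X.shape[1], max_size))
    X_ = numpy.full((X.shape[0], max_size, X.shape[2]), numpy.nan)
    X_[:, :X.shape[1]] = X
    return X_

padded = pad_series(numpy.random.rand(5, 60, 1))
print(padded.shape)  # (5, 80, 1)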
def tsclustering(ts_data, names):
    # Normalize the series
    ts_dataset = TimeSeriesScalerMinMax().fit_transform(ts_data)
    n_clusters = 2
    metric = 'dtw'
    # With the 'dtw' or 'softdtw' metric, series of different lengths are supported
    km = TimeSeriesKMeans(n_clusters=n_clusters, metric=metric, verbose=False,
                          random_state=1).fit(ts_dataset)
    # Clustering result
    print('Cluster labels =', km.labels_)
    plot_clustering(km, ts_dataset, names, n_clusters)
# Get the current working directory
working_dir_path = Path.cwd()
sys.path.append(str(working_dir_path))

# Load the dataset
raw_data = pd.read_csv(os_path.join(working_dir_path, "./data/train_curves.csv"),
                       header=None)
time_series_train = to_time_series_dataset(raw_data)
labels_train = genfromtxt(os_path.join(
    working_dir_path, "./data/train_clustering_result.csv"),
    delimiter=',')

# Normalize the time series
time_series_train = TimeSeriesScalerMinMax().fit_transform(time_series_train)

# Get dimensions of the dataset
n_time_series, time_series_size = time_series_train.shape[:2]
n_classes = len(set(labels_train))

# We will extract 2 shapelets and align them with the time series
shapelet_sizes = {10: 2}

# Define the model
shapelet_classification_model = LearningShapelets(
    n_shapelets_per_size=shapelet_sizes,
    weight_regularizer=0.0001,
    optimizer=Adam(lr=0.01),
    max_iter=300,
    verbose=1,
def k_means_clustering(sd_log):
    """
    k-means clustering of all features using DTW for multivariate time series

    :param sd_log: sd_log object
    :return: cluster_metrics_dict: dict with clusters as keys and features as values
    """
    from tslearn.clustering import TimeSeriesKMeans, silhouette_score
    from tslearn.utils import to_time_series_dataset
    from tslearn.preprocessing import TimeSeriesScalerMinMax

    data = sd_log.data
    # TODO handle outliers
    tmp = sd_log.waiting_time
    data.drop(columns=[sd_log.waiting_time], inplace=True)
    X = []
    # Get data as numpy array
    for col in data.columns:
        X.append(sd_log.get_points(col))

    # Normalize the data (y = (x - min) / (max - min))
    data_norm = data.copy()
    for column in data_norm.columns:
        data_norm[column] = (data_norm[column] - data_norm[column].min()) / (
            data_norm[column].max() - data_norm[column].min())
    X = TimeSeriesScalerMinMax().fit_transform(X)
    X = to_time_series_dataset(X)

    # Find the optimal number of clusters by looping through different
    # configurations and storing the respective silhouette scores:
    sil_scores = {}
    for n in range(2, len(data.columns)):
        model_tst = TimeSeriesKMeans(n_clusters=n, metric="dtw", n_init=10)
        model_tst.fit(X)
        sil_scores[n] = silhouette_score(X, model_tst.predict(X), metric="dtw")

    opt_k = max(sil_scores, key=sil_scores.get)
    model = TimeSeriesKMeans(n_clusters=opt_k, metric="dtw", n_init=10)
    labels = model.fit_predict(X)
    print(labels)

    # Build a helper df to map metrics to their cluster labels
    df_cluster = pd.DataFrame(list(zip(data.columns, model.labels_)),
                              columns=['metric', 'cluster'])

    # Make some helper dictionaries and lists
    cluster_metrics_dict = df_cluster.groupby(
        ['cluster'])['metric'].apply(lambda x: [x for x in x]).to_dict()
    cluster_len_dict = df_cluster['cluster'].value_counts().to_dict()
    clusters_dropped = [
        cluster for cluster in cluster_len_dict if cluster_len_dict[cluster] == 1
    ]
    clusters_final = [
        cluster for cluster in cluster_len_dict if cluster_len_dict[cluster] > 1
    ]

    print('Plotting Clusters')
    fig, axs = plt.subplots(opt_k)  # , figsize=(10, 5))
    # fig.suptitle('Clusters')
    row_i = 0
    # column_j = 0
    # For each label, plot every series with that label
    for cluster in cluster_metrics_dict:
        for feat in cluster_metrics_dict[cluster]:
            axs[row_i].plot(data_norm[feat], label=feat, alpha=0.4)
            axs[row_i].legend(loc="best")
        if len(cluster_metrics_dict[cluster]) > 100:
            # TODO draw mean in red if more than one cluster
            tmp = np.nanmean(np.vstack(cluster), axis=1)
            axs[row_i].plot(tmp, c="red")
        axs[row_i].set_title("Cluster " + str(cluster))
        row_i += 1
        # column_j += 1
        # if column_j % k == 0:
        #     row_i += 1
        #     column_j = 0
    plt.show()

    # Return dict {cluster_id: features}
    return cluster_metrics_dict
def main(argv):
    # define global timer to obtain global execution time
    start_global = timer()

    # define global variables
    global euclidean_clustered_data, \
        dtw_clustered_data, \
        soft_dtw_clustered_data, \
        k_shape_clustered_data, \
        gak_clustered_data

    #############################################################################################
    # Input arguments parsing
    #############################################################################################

    # define help message
    help_message = \
        'clustering.py -h \n\n' \
        'usage: clustering.py [-c <number_clusters>] [-i <input_file>] [-ansEDSKG] \n' \
        'by default: processing input data (without any sampling) ' \
        '(euclidean, dtw, soft-dtw and GAK k-means, k-shape)\n' \
        'options list: \n' \
        '  -c / --clusters <number_clusters>  # set number of clusters (default 3) \n\n' \
        '  -i / --ifile <input_file>          # set input filename \n' \
        '  -n / --normalise                   # normalise input data \n' \
        '  -s / --standardise                 # standardise input data \n\n' \
        '  -a / --all                         # perform all 5 implemented methods of clustering: \n' \
        '                                       euclidean, dtw, soft-dtw, gak k-means and k-shape\n' \
        '  -E / --euclidean                   # perform euclidean k-means clustering \n' \
        '  -D / --dtw                         # perform dtw k-means clustering \n' \
        '  -S / --soft-dtw                    # perform soft-dtw k-means clustering \n' \
        '  -K / --k-shape                     # perform k-shape clustering \n' \
        '  -G / --gak                         # perform GAK k-means clustering \n'

    # create a new object to save arguments
    i_args = Arguments()

    # number of rows in the plot, to create the correct number of subplots
    # default = 3 (raw data plus distribution histograms)
    n_rows_plot = 3

    # define validation rules for arguments
    try:
        opts, args = getopt.getopt(
            argv,
            "hc:i:nsaEDSKG",
            [
                "help", "clusters=", "ifile=", "normalise", "standardise",
                "all", "euclidean", "dtw", "soft-dtw", "k-shape", "gak"
            ]
        )
    except getopt.GetoptError:
        print(help_message)
        sys.exit(2)

    # parse arguments
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(help_message)
            sys.exit()
        elif opt in ("-c", "--clusters"):
            # getopt returns strings, so cast to int before using as a count
            i_args.number_clusters = int(arg)
        elif opt in ("-i", "--ifile"):
            i_args.input_file = arg
        elif opt in ("-n", "--normalise"):
            i_args.normalise_data = True
        elif opt in ("-s", "--standardise"):
            i_args.standardise_data = True
        elif opt in ("-E", "--euclidean"):
            n_rows_plot += 1
            i_args.euclidean_clustering = True
        elif opt in ("-D", "--dtw"):
            n_rows_plot += 1
            i_args.dtw_clustering = True
        elif opt in ("-S", "--soft-dtw"):
            n_rows_plot += 1
            i_args.soft_dtw_clustering = True
        elif opt in ("-K", "--k-shape"):
            n_rows_plot += 1
            i_args.k_shape_clustering = True
        elif opt in ("-G", "--gak"):
            n_rows_plot += 1
            i_args.gak_clustering = True
        elif opt in ("-a", "--all"):
            n_rows_plot = 8
            i_args.euclidean_clustering = True
            i_args.dtw_clustering = True
            i_args.soft_dtw_clustering = True
            i_args.k_shape_clustering = True
            i_args.gak_clustering = True

    # cap the number of subplot levels
    n_rows_plot = 8 if n_rows_plot > 8 else n_rows_plot

    #############################################################################################
    # Raw data processing stage
    #############################################################################################

    # set matplotlib plot style
    mpl.style.use('seaborn')

    # set seed value and seed the generator
    seed = 0
    numpy.random.seed(seed)

    # import data and print the first 5 rows
    raw_data = import_data()
    print(raw_data.head())

    # convert raw data to the format which can be used by tslearn
    # (3-dimensional array)
    # built-in functionality: adjust all time series to one size
    # (NaN values are appended to the shorter ones)
    formatted_data = to_time_series_dataset(raw_data)

    # print shape of the new array
    print(formatted_data.shape)

    # obtain the number of measurements
    n_measuring = formatted_data.shape[1]

    # define figure and grid_spec to create the layout of the plot
    fig = plt.figure(constrained_layout=True)
    grid_spec = fig.add_gridspec(
        n_rows_plot,
        i_args.number_clusters
    )

    # set A4 size for the figure
    fig.set_size_inches(8.5, 11.75)

    # set up the count of subplot layers
    count_layer = 3

    # set up the first subplot and draw the raw time series
    f_ax_raw_data = fig.add_subplot(grid_spec[:2, :])
    for xx in formatted_data:
        f_ax_raw_data.plot(xx.ravel(), alpha=.2)
    formatted_data_min = formatted_data.min()
    formatted_data_max = formatted_data.max()

    # draw title for the chart with min and max values
    f_ax_raw_data.set_title('Raw Data (min = %.2f, max = %.2f)'
                            % (formatted_data_min, formatted_data_max))

    # obtain and print the execution time of the data processing stage
    timer_tick = get_time_tick(start_global)
    plt.ion()
    plt.show()
    print("Raw data processing time: %s" % timer_tick)

    #############################################################################################
    # Data preprocessing stage
    #############################################################################################

    start = timer()

    # Convert NaNs to values predicted by linear interpolation
    n_nan_changes = 0
    for ind in range(formatted_data.shape[0]):
        mask = numpy.isnan(formatted_data[ind])
        n_nan_changes += mask.sum()
        formatted_data[ind][mask] = numpy.interp(
            numpy.flatnonzero(mask),
            numpy.flatnonzero(~mask),
            formatted_data[ind][~mask]
        )
    print("%d NaN values was/were interpolated" % n_nan_changes)

    # Scaling
    # To decide between normalization and standardization, we need to see
    # the distribution of values.
    # Take 3 random measurements to draw histograms
    random_indexes = numpy.random.choice(n_measuring,
                                         i_args.number_clusters,
                                         replace=False)

    # create new arrays with values of the randomly chosen measurements
    histogram_data = formatted_data[:, random_indexes]

    # draw histograms
    for i_histogram in range(i_args.number_clusters):
        f_ax_histogram = fig.add_subplot(grid_spec[2, i_histogram])
        f_ax_histogram.hist(
            histogram_data[:, i_histogram], bins=25, density=True
        )
        f_ax_histogram.text(0.55, 0.98,
                            'Measurement #%d' % random_indexes[i_histogram],
                            transform=plt.gca().transAxes,
                            color="navy")
        if i_histogram == 1:
            preprocessing = ''
            if i_args.normalise_data:
                preprocessing += "normalised"
                if i_args.standardise_data:
                    preprocessing += " and standardised"
            elif i_args.standardise_data:
                preprocessing += "standardised"
            preprocessing = '' if preprocessing == '' \
                else "(data will be %s)" % preprocessing
            f_ax_histogram.set_title(
                "Distributions histograms %s" % preprocessing,
                color='navy', y=1, pad=14
            )

    # if no preprocessing option is chosen, continue with raw data
    processed_data = formatted_data

    # since for this concrete challenge data the distributions are more or less
    # Gaussian/Normal, we can use standardization

    # normalize data: min-max scaling ranging between 0 and 1
    if i_args.normalise_data:
        processed_data = TimeSeriesScalerMinMax().fit_transform(processed_data)
        print("Data was normalised")

    # standardize data: scaling technique where the values are centered around
    # the mean with a unit standard deviation
    if i_args.standardise_data:
        processed_data = TimeSeriesScalerMeanVariance().fit_transform(processed_data)
        print("Data was standardised")

    # obtain min/max values of the data (to be used in visualization subplots)
    max_data = processed_data.max() * 1.2
    min_data = processed_data.min() * 1.2

    timer_tick = get_time_tick(start)
    print("#############################################################################################")
    print("Data processing stage elapsed time: %s" % timer_tick)

    #############################################################################################
    # Implementing Euclidean k-means clustering algorithm
    #############################################################################################
    if i_args.euclidean_clustering:
        start = timer()
        print("Euclidean k-means")

        # define parameters of the model of the algorithm
        k_means_euclidean = TimeSeriesKMeans(
            n_clusters=i_args.number_clusters,
            verbose=True,
            random_state=seed,
            n_jobs=4
        )

        # calculate the cluster label array
        euclidean_clustered_data = k_means_euclidean.fit_predict(processed_data)

        # draw subplots with the attributed clusters of time series as well as
        # the cluster centers' lines
        for i_cluster in range(i_args.number_clusters):
            f_ax_euclidean = create_figure_axes(fig, grid_spec, count_layer,
                                                i_cluster, n_measuring,
                                                min_data, max_data,
                                                processed_data,
                                                euclidean_clustered_data,
                                                'tab:blue')
            f_ax_euclidean.plot(
                k_means_euclidean.cluster_centers_[i_cluster].ravel(),
                "tab:green"
            )
            if i_cluster == 1:
                middle_axis = f_ax_euclidean

        # increment the count of filled subplot layers
        count_layer += 1

        # obtain processing time, print it to the console and
        # add it to the title of the series of subplots
        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "Euclidean $k$-means (%s)" % timer_tick,
            color='tab:green', y=1, pad=14
        )
        print("#############################################################################################")
        print("Euclidean k-means time processing: %s" % timer_tick)

    #############################################################################################
    # Implementing DTW k-means clustering algorithm
    # use the dtw (Dynamic Time Warping) metric to calculate
    # distances between means
    #############################################################################################
    if i_args.dtw_clustering:
        start = timer()
        print("DTW k-means")
        k_means_DTW = TimeSeriesKMeans(n_clusters=i_args.number_clusters,
                                       n_init=3,
                                       metric="dtw",
                                       verbose=True,
                                       max_iter_barycenter=10,
                                       random_state=seed,
                                       n_jobs=6)
        dtw_clustered_data = k_means_DTW.fit_predict(processed_data)
        for i_cluster in range(i_args.number_clusters):
            f_ax_dtw = create_figure_axes(fig, grid_spec, count_layer,
                                          i_cluster, n_measuring,
                                          min_data, max_data,
                                          processed_data,
                                          dtw_clustered_data,
                                          'tab:blue')
            f_ax_dtw.plot(
                k_means_DTW.cluster_centers_[i_cluster].ravel(),
                "tab:red"
            )
            if i_cluster == 1:
                middle_axis = f_ax_dtw

        # increment the count of filled subplot layers
        count_layer += 1
        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "DTW $k$-means (%s)" % timer_tick,
            color='tab:red', y=1, pad=14
        )
        print("#############################################################################################")
        print("DTW k-means time processing: %s" % timer_tick)

    #############################################################################################
    # Implementing soft-DTW k-means clustering algorithm
    # use the soft-DTW (Dynamic Time Warping) metric to calculate
    # distances between means
    #############################################################################################
    if i_args.soft_dtw_clustering:
        start = timer()
        print("Soft-DTW k-means")
        k_means_soft_DTW = TimeSeriesKMeans(n_clusters=i_args.number_clusters,
                                            metric="softdtw",
                                            metric_params={"gamma": .025},
                                            verbose=True,
                                            random_state=seed,
                                            n_jobs=6)
        soft_dtw_clustered_data = k_means_soft_DTW.fit_predict(processed_data)
        for i_cluster in range(i_args.number_clusters):
            f_ax_soft_dtw = create_figure_axes(fig, grid_spec, count_layer,
                                               i_cluster, n_measuring,
                                               min_data, max_data,
                                               processed_data,
                                               soft_dtw_clustered_data,
                                               'tab:blue')
            f_ax_soft_dtw.plot(
                k_means_soft_DTW.cluster_centers_[i_cluster].ravel(),
                "tab:purple"
            )
            if i_cluster == 1:
                middle_axis = f_ax_soft_dtw

        # increment the count of filled subplot layers
        count_layer += 1
        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "Soft-DTW $k$-means (%s)" % timer_tick,
            color='tab:purple', y=1, pad=14
        )
        print("#############################################################################################")
        print("Soft-DTW k-means time processing: %s" % timer_tick)

    #############################################################################################
    # Implementing k-Shape clustering algorithm
    #############################################################################################
    if i_args.k_shape_clustering:
        start = timer()
        print("K-Shape")
        k_shape = KShape(n_clusters=i_args.number_clusters,
                         verbose=True,
                         random_state=seed)
        k_shape_clustered_data = k_shape.fit_predict(processed_data)
        for i_cluster in range(i_args.number_clusters):
            min_axe_value = min(min_data,
                                k_shape.cluster_centers_[i_cluster].ravel().min())
            max_axe_value = max(max_data,
                                k_shape.cluster_centers_[i_cluster].ravel().max())
            f_ax_k_shape = create_figure_axes(fig, grid_spec, count_layer,
                                              i_cluster, n_measuring,
                                              min_axe_value, max_axe_value,
                                              processed_data,
                                              k_shape_clustered_data,
                                              'tab:blue')
            f_ax_k_shape.plot(
                k_shape.cluster_centers_[i_cluster].ravel(),
                "tab:orange"
            )
            if i_cluster == 1:
                middle_axis = f_ax_k_shape

        # increment the count of filled subplot layers
        count_layer += 1
        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "$K$-Shape (%s)" % timer_tick,
            color='tab:orange', y=1, pad=14
        )
        print("#############################################################################################")
        print("K-Shape time processing: %s" % timer_tick)

    #############################################################################################
    # Implementing Global Alignment kernel k-means clustering algorithm
    # since a kernel is used, there is no centroid of the cluster
    #############################################################################################
    if i_args.gak_clustering:
        start = timer()
        print("GAK-k-means")
        gak_k_means = KernelKMeans(n_clusters=i_args.number_clusters,
                                   kernel="gak",
                                   kernel_params={"sigma": "auto"},
                                   n_init=10,
                                   verbose=True,
                                   random_state=seed,
                                   n_jobs=6)
        gak_clustered_data = gak_k_means.fit_predict(processed_data)
        for i_cluster in range(i_args.number_clusters):
            f_ax_gak_k_means = create_figure_axes(fig, grid_spec, count_layer,
                                                  i_cluster, n_measuring,
                                                  min_data, max_data,
                                                  processed_data,
                                                  gak_clustered_data,
                                                  'tab:blue')
            if i_cluster == 1:
                middle_axis = f_ax_gak_k_means

        # increment the count of filled subplot layers
        count_layer += 1
        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "Global Alignment kernel $k$-means (%s)" % timer_tick,
            color='tab:cyan', y=1, pad=14)
        print("#############################################################################################")
        print("GAK k-means time processing: %s" % timer_tick)

    #############################################################################################

    # get a string with the current datetime
    now = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")

    # define the name of the directory to be created
    path = "./out/%s" % now
print("#############################################################################################") try: os.mkdir(path) except OSError: print("Creation of the directory %s failed" % path) else: print("Successfully created the directory %s " % path) try: # save figure as pdf to out folder fig.savefig("./out/%s/visual_result.pdf" % now) # save clustering results if i_args.euclidean_clustering: numpy.savetxt( "./out/%s/euclidean_clustering_result.csv" % now, euclidean_clustered_data, delimiter="," ) if i_args.dtw_clustering: numpy.savetxt( "./out/%s/dtw_clustering_result.csv" % now, dtw_clustered_data, delimiter="," ) if i_args.soft_dtw_clustering: numpy.savetxt( "./out/%s/soft_dtw_clustering_result.csv" % now, soft_dtw_clustered_data, delimiter="," ) if i_args.k_shape_clustering: numpy.savetxt( "./out/%s/k_shape_clustering_result.csv" % now, k_shape_clustered_data, delimiter="," ) if i_args.gak_clustering: numpy.savetxt( "./out/%s/gak_clustering_result.csv" % now, gak_clustered_data, delimiter="," ) except RuntimeError: print("Saving results failed") else: print("Successfully saved results in the path %s " % path) ############################################################################################# # obtain and print global executing time timer_tick = get_time_tick(start_global) print("#############################################################################################") print("All algorithms elapsed time: % s" % timer_tick) ############################################################################################# # render and show plot # plt.show() plt.draw() plt.pause(0.001) input("Press [enter] to finish.") print("#############################################################################################")
def normalize(ts, ts_err):
    ts /= (ts_err + 1)  # +1 to avoid zero division
    ts = np.nan_to_num(ts)
    ts = TimeSeriesScalerMinMax().fit_transform(ts)
    return ts
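# Hedged usage sketch for normalize above, on synthetic arrays; the shapes and
# values are illustrative only.
import numpy as np

ts = np.random.rand(3, 120)        # 3 series of length 120
ts_err = np.random.rand(3, 120)    # per-point error estimates
scaled = normalize(ts, ts_err)
print(scaled.shape)                # (3, 120, 1) after tslearn rescaling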
# if data[i]<minVal: # data[i] = minVal # elif data[i]>maxVal: # data[i] = maxVal # #转成tslearn格式 import time data = ReadDataFromFile() from tslearn.utils import to_time_series_dataset formatted_dataset = to_time_series_dataset(list(data.values())) #归一化 from tslearn.preprocessing import TimeSeriesScalerMinMax scaler = TimeSeriesScalerMinMax(value_range=(0., 1.)) storeMinMax = [] for i in range(len(formatted_dataset)): ele = formatted_dataset[i] ele = ele.reshape(ele.shape[0]) storeMinMax.append((min(ele), max(ele))) formatted_dataset[i] = scaler.fit_transform(ele).reshape( formatted_dataset[i].shape[0], formatted_dataset[i].shape[1]) #进行聚类 ## 目标:将已有的时序数据分成若干类,每个类中有一个中心变量,通过对中心变量的预测可以实现对其他时序数据的预测,从而降低时序数据预测的开销 ## 暂时不考虑时序上的位移,只考虑数值上每个类与该类之间存在固定的线性变化,这样通过计算线性关系的算法可以直接算出来 # T1:直接对原始数据进行基于切分的聚类时不可行的,速度太慢。 # 想法一:进行普通归一化。然后以欧式距离/相关性进行聚类,聚类方法为普通K-means递进/DBSCAN/层次聚类
from tslearn.neighbors import KNeighborsTimeSeriesClassifier
from tslearn.preprocessing import TimeSeriesScalerMinMax
from tslearn.datasets import CachedDatasets
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
import numpy as np
import matplotlib.pyplot as plt

# Our pipeline consists of two phases. First, data will be normalized using
# min-max normalization. Afterwards, it is fed to a KNN classifier. For the
# KNN classifier, we tune the n_neighbors and weights hyper-parameters.
n_splits = 3
pipeline = GridSearchCV(
    Pipeline([
        ('normalize', TimeSeriesScalerMinMax()),
        ('knn', KNeighborsTimeSeriesClassifier())
    ]),
    {
        'knn__n_neighbors': [5, 25],
        'knn__weights': ['uniform', 'distance']
    },
    cv=StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
)

X_train, y_train, _, _ = CachedDatasets().load_dataset("Trace")

# Keep only time series of classes 1, 2 and 3
X_train = X_train[y_train > 0]
y_train = y_train[y_train > 0]
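# A possible continuation of the snippet above: fit the grid search on the
# filtered Trace data and inspect the selected hyper-parameters. This mirrors
# the usual scikit-learn workflow and is a sketch, not part of the original.
pipeline.fit(X_train, y_train)
print("Best params:", pipeline.best_params_)
print("Best CV accuracy: %.3f" % pipeline.best_score_)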
from sklearn.pipeline import Pipeline
from tslearn.generators import random_walk_blobs
from tslearn.preprocessing import TimeSeriesScalerMinMax
from tslearn.neighbors import KNeighborsTimeSeriesClassifier, KNeighborsTimeSeries
from tslearn.piecewise import SymbolicAggregateApproximation

numpy.random.seed(0)
n_ts_per_blob, sz, d, n_blobs = 20, 100, 1, 2

# Prepare data
X, y = random_walk_blobs(n_ts_per_blob=n_ts_per_blob,
                         sz=sz,
                         d=d,
                         n_blobs=n_blobs)
# value_range replaces the deprecated min=/max= keyword arguments
scaler = TimeSeriesScalerMinMax(value_range=(0., 1.))  # Rescale time series
X_scaled = scaler.fit_transform(X)

indices_shuffle = numpy.random.permutation(n_ts_per_blob * n_blobs)
X_shuffle = X_scaled[indices_shuffle]
y_shuffle = y[indices_shuffle]

X_train = X_shuffle[:n_ts_per_blob * n_blobs // 2]
X_test = X_shuffle[n_ts_per_blob * n_blobs // 2:]
y_train = y_shuffle[:n_ts_per_blob * n_blobs // 2]
y_test = y_shuffle[n_ts_per_blob * n_blobs // 2:]

# Nearest neighbor search
knn = KNeighborsTimeSeries(n_neighbors=3, metric="dtw")
knn.fit(X_train, y_train)
dists, ind = knn.kneighbors(X_test)
def row_wise_minmax_scaling(x):
    '''
    Takes a 2D array and scales each row to the range [0.0, 1.0].
    '''
    scaler = TimeSeriesScalerMinMax(value_range=(0.0, 1.0))
    return scaler.fit_transform(x).squeeze()
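# Small usage sketch for row_wise_minmax_scaling above; the input array is
# illustrative.
import numpy as np

x = np.array([[1.0, 2.0, 3.0],
              [10.0, 20.0, 40.0]])
print(row_wise_minmax_scaling(x))
# each row is scaled independently to [0.0, 1.0]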
except RuntimeError as e:
    print(e)

# Set a seed to ensure determinism
numpy.random.seed(42)

# Load the Trace dataset
X_train, y_train, _, _ = CachedDatasets().load_dataset("Trace")

# Filter out classes 2 and 4
mask = numpy.isin(y_train, [1, 3])
X_train = X_train[mask]
y_train = y_train[mask]

# Normalize the time series
X_train = TimeSeriesScalerMinMax().fit_transform(X_train)

# Get statistics of the dataset
n_ts, ts_sz = X_train.shape[:2]
n_classes = len(set(y_train))

# We will extract 1 shapelet and align it with a time series
shapelet_sizes = {20: 1}

# Define the model and fit it using the training data
shp_clf = LearningShapelets(n_shapelets_per_size=shapelet_sizes,
                            weight_regularizer=0.001,
                            optimizer=Adam(lr=0.01),
                            max_iter=250,
                            verbose=0,
                            scale=False,
from tslearn.generators import random_walk_blobs
from tslearn.preprocessing import TimeSeriesScalerMinMax
from tslearn.neighbors import KNeighborsTimeSeriesClassifier, \
    KNeighborsTimeSeries
from tslearn.piecewise import SymbolicAggregateApproximation

numpy.random.seed(0)
n_ts_per_blob, sz, d, n_blobs = 20, 100, 1, 2

# Prepare data
X, y = random_walk_blobs(n_ts_per_blob=n_ts_per_blob,
                         sz=sz,
                         d=d,
                         n_blobs=n_blobs)
scaler = TimeSeriesScalerMinMax(value_range=(0., 1.))  # Rescale time series
X_scaled = scaler.fit_transform(X)

indices_shuffle = numpy.random.permutation(n_ts_per_blob * n_blobs)
X_shuffle = X_scaled[indices_shuffle]
y_shuffle = y[indices_shuffle]

X_train = X_shuffle[:n_ts_per_blob * n_blobs // 2]
X_test = X_shuffle[n_ts_per_blob * n_blobs // 2:]
y_train = y_shuffle[:n_ts_per_blob * n_blobs // 2]
y_test = y_shuffle[n_ts_per_blob * n_blobs // 2:]

# Nearest neighbor search
knn = KNeighborsTimeSeries(n_neighbors=3, metric="dtw")
knn.fit(X_train, y_train)
dists, ind = knn.kneighbors(X_test)
class KaninePrimitive(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    Primitive that applies the k nearest neighbor classification algorithm to time
    series data. The tslearn KNeighborsTimeSeriesClassifier implementation is wrapped.

    Training inputs: 1) Feature dataframe, 2) Target dataframe
    Outputs: Dataframe with predictions for specific time series at specific future time instances

    Arguments:
        hyperparams {Hyperparams} -- D3M Hyperparameter object

    Keyword Arguments:
        random_seed {int} -- random seed (default: {0})
    """

    metadata = metadata_base.PrimitiveMetadata({
        # Simply a UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        "id": "2d6d3223-1b3c-49cc-9ddd-50f571818268",
        "version": __version__,
        "name": "kanine",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        "keywords": [
            "time series",
            "knn",
            "k nearest neighbor",
            "time series classification",
        ],
        "source": {
            "name": __author__,
            "contact": __contact__,
            "uris": [
                # Unstructured URIs.
                "https://github.com/Yonder-OSS/D3M-Primitives",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        "installation": [
            {
                "type": "PIP",
                "package": "cython",
                "version": "0.29.14"
            },
            {
                "type": metadata_base.PrimitiveInstallationType.PIP,
                "package_uri":
                    "git+https://github.com/Yonder-OSS/D3M-Primitives.git@{git_commit}#egg=yonder-primitives"
                    .format(git_commit=utils.current_git_commit(
                        os.path.dirname(__file__)), ),
            },
        ],
        # The same path the primitive is registered with entry points in setup.py.
        "python_path": "d3m.primitives.time_series_classification.k_neighbors.Kanine",
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
"algorithm_types": [ metadata_base.PrimitiveAlgorithmType.K_NEAREST_NEIGHBORS, ], "primitive_family": metadata_base.PrimitiveFamily.TIME_SERIES_CLASSIFICATION, }) def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None: super().__init__(hyperparams=hyperparams, random_seed=random_seed) self._knn = KNeighborsTimeSeriesClassifier( n_neighbors=self.hyperparams["n_neighbors"], metric=self.hyperparams["distance_metric"], weights=self.hyperparams["sample_weighting"], ) self._scaler = TimeSeriesScalerMinMax() self._is_fit = False def get_params(self) -> Params: if not self._is_fit: return Params(scaler=None, classifier=None, output_columns=None) return Params(scaler=self._scaler, classifier=self._knn, output_columns=self._output_columns) def set_params(self, *, params: Params) -> None: self._scaler = params['scaler'] self._knn = params['classifier'] self._output_columns = params['output_columns'] self._is_fit = all(param is not None for param in params.values()) def _get_cols(self, input_metadata): """ private util function that finds grouping column from input metadata Arguments: input_metadata {D3M Metadata object} -- D3M Metadata object for input frame Returns: list[int] -- list of column indices annotated with GroupingKey metadata """ # find column with ts value through metadata grouping_column = input_metadata.list_columns_with_semantic_types( ("https://metadata.datadrivendiscovery.org/types/GroupingKey", )) return grouping_column def _get_value_col(self, input_metadata): """ private util function that finds the value column from input metadata Arguments: input_metadata {D3M Metadata object} -- D3M Metadata object for input frame Returns: int -- index of column that contains time series value after Time Series Formatter primitive """ # find attribute column but not file column attributes = input_metadata.list_columns_with_semantic_types( ('https://metadata.datadrivendiscovery.org/types/Attribute', )) # this is assuming alot, but timeseries formaters typicaly place value column at the end attribute_col = attributes[-1] return attribute_col def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: """ Sets primitive's training data Arguments: inputs {Inputs} -- D3M dataframe containing attributes outputs {Outputs} -- D3M dataframe containing targets """ # load and reshape training data self._output_columns = outputs.columns outputs = np.array(outputs) n_ts = outputs.shape[0] ts_sz = inputs.shape[0] // n_ts attribute_col = self._get_value_col(inputs.metadata) self._X_train = inputs.iloc[:, attribute_col].values.reshape(n_ts, ts_sz) self._y_train = np.array(outputs).reshape(-1, ) def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: """ Fits KNN model using training data from set_training_data and hyperparameters Keyword Arguments: timeout {float} -- timeout, not considered (default: {None}) iterations {int} -- iterations, not considered (default: {None}) Returns: CallResult[None] """ scaled = self._scaler.fit_transform(self._X_train) self._knn.fit(scaled, self._y_train) self._is_fit = True return CallResult(None, has_finished=self._is_fit) def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: """ Produce primitive's classifications for new time series data Arguments: inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target Keyword Arguments: timeout {float} -- timeout, not considered (default: {None}) iterations {int} -- iterations, not considered 
            (default: {None})

        Raises:
            PrimitiveNotFittedError: if primitive not fit

        Returns:
            CallResult[Outputs] -- dataframe with a column containing a predicted class
            for each input time series
        """
        if not self._is_fit:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        # find column with ts value through metadata
        grouping_column = self._get_cols(inputs.metadata)

        n_ts = inputs.iloc[:, grouping_column[0]].nunique()
        ts_sz = inputs.shape[0] // n_ts

        attribute_col = self._get_value_col(inputs.metadata)
        x_vals = inputs.iloc[:, attribute_col].values.reshape(n_ts, ts_sz)

        # make predictions
        scaled = self._scaler.transform(x_vals)
        preds = self._knn.predict(scaled)

        # create output frame
        result_df = container.DataFrame({self._output_columns[0]: preds},
                                        generate_metadata=True)
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            ("https://metadata.datadrivendiscovery.org/types/PredictedTarget"),
        )

        return CallResult(result_df, has_finished=True)
def get_color(weights):
    baselines = numpy.zeros((4, 3))
    weights = numpy.array(weights).reshape(1, 4)
    for i, c in enumerate(["r", "g", "b", "y"]):
        baselines[i] = matplotlib.colors.ColorConverter().to_rgb(c)
    return numpy.dot(weights, baselines).ravel()


numpy.random.seed(0)
X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
X_out = numpy.empty((4, X_train.shape[1], X_train.shape[2]))

plt.figure()
for i in range(4):
    X_out[i] = X_train[y_train == (i + 1)][0]
X_out = TimeSeriesScalerMinMax().fit_transform(X_out)

for i, pos in enumerate([1, 5, 21, 25]):
    plt.subplot(5, 5, pos)
    w = [0.] * 4
    w[i] = 1.
    plt.plot(X_out[i].ravel(),
             color=matplotlib.colors.rgb2hex(get_color(w)),
             linewidth=2)
    plt.text(X_out[i].shape[0], 0., "$X_%d$" % i,
             horizontalalignment="right",
             verticalalignment="baseline",
             fontsize=24)
    plt.xticks([])
    plt.yticks([])
""" from __future__ import print_function # Author: Romain Tavenard # License: BSD 3 clause import numpy import matplotlib.pyplot as plt from tslearn.datasets import CachedDatasets from tslearn.preprocessing import TimeSeriesScalerMinMax from tslearn.svm import TimeSeriesSVC numpy.random.seed(0) X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace") X_train = TimeSeriesScalerMinMax().fit_transform(X_train) X_test = TimeSeriesScalerMinMax().fit_transform(X_test) clf = TimeSeriesSVC(kernel="gak", gamma=.1, sz=X_train.shape[1], d=X_train.shape[2]) clf.fit(X_train, y_train) print(("Correct classification rate:", clf.score(X_test, y_test))) n_classes = len(set(y_train)) plt.figure() support_vectors = clf.support_vectors_time_series_(X_train) for i, cl in enumerate(set(y_train)): plt.subplot(n_classes, 1, i + 1)
len(first_train)

from toolz.itertoolz import sliding_window, partition

# For every day of the train set, store the flow observations
days_first = list(partition(48, first_train))
days_first
len(days_first)

# From list to multidimensional array
days_first = np.asarray(days_first)
days_first

from tslearn.utils import to_time_series, to_time_series_dataset

# Create a univariate series for the normalized flow observations
first_time_series = to_time_series(days_first)
print(first_time_series.shape)

# Normalize the time series
from tslearn.preprocessing import TimeSeriesScalerMeanVariance, TimeSeriesScalerMinMax

first_time_series = TimeSeriesScalerMinMax(value_range=(0.0, 1.0)).fit_transform(first_time_series)
# first_time_series = TimeSeriesScalerMeanVariance(mu=0.0, std=1.0).fit_transform(first_time_series)
print(first_time_series.shape)

# Treatment of the second variable
second_train = df.loc[:, 'Density']
second_train = np.array(second_train)
second_train = second_train.reshape((len(second_train), 1))

# From array to list
second_train = second_train.tolist()
len(second_train)

# For every day of the train set, store the density observations
days_second = list(partition(48, second_train))
days_second
len(days_second)
class KaninePrimitive(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    Primitive that applies the k nearest neighbor classification algorithm to time
    series data. The tslearn KNeighborsTimeSeriesClassifier implementation is wrapped.
    """

    metadata = metadata_base.PrimitiveMetadata({
        "id": "2d6d3223-1b3c-49cc-9ddd-50f571818268",
        "version": __version__,
        "name": "kanine",
        "keywords": [
            "time series",
            "knn",
            "k nearest neighbor",
            "time series classification",
        ],
        "source": {
            "name": __author__,
            "contact": __contact__,
            "uris": [
                "https://github.com/kungfuai/d3m-primitives",
            ],
        },
        "installation": [
            {
                "type": "PIP",
                "package": "cython",
                "version": "0.29.16"
            },
            {
                "type": metadata_base.PrimitiveInstallationType.PIP,
                "package_uri":
                    "git+https://github.com/kungfuai/d3m-primitives.git@{git_commit}#egg=kf-d3m-primitives"
                    .format(git_commit=utils.current_git_commit(
                        os.path.dirname(__file__)), ),
            },
        ],
        "python_path": "d3m.primitives.time_series_classification.k_neighbors.Kanine",
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.K_NEAREST_NEIGHBORS,
        ],
        "primitive_family": metadata_base.PrimitiveFamily.TIME_SERIES_CLASSIFICATION,
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        self._knn = KNeighborsTimeSeriesClassifier(
            n_neighbors=self.hyperparams["n_neighbors"],
            metric=self.hyperparams["distance_metric"],
            weights=self.hyperparams["sample_weighting"],
        )
        self._scaler = TimeSeriesScalerMinMax()
        self._is_fit = False

    def get_params(self) -> Params:
        if not self._is_fit:
            return Params(scaler=None, classifier=None, output_columns=None)

        return Params(
            scaler=self._scaler,
            classifier=self._knn,
            output_columns=self._output_columns,
        )

    def set_params(self, *, params: Params) -> None:
        self._scaler = params["scaler"]
        self._knn = params["classifier"]
        self._output_columns = params["output_columns"]
        self._is_fit = all(param is not None for param in params.values())

    def _get_cols(self, input_metadata):
        """Private util function that finds the grouping column from input metadata

        Arguments:
            input_metadata {D3M Metadata object} -- D3M Metadata object for input frame

        Returns:
            list[int] -- list of column indices annotated with GroupingKey metadata
        """
        # find column with ts value through metadata
        grouping_column = input_metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/GroupingKey", ))

        return grouping_column

    def _get_value_col(self, input_metadata):
        """Private util function that finds the value column from input metadata

        Arguments:
            input_metadata {D3M Metadata object} -- D3M Metadata object for input frame

        Returns:
            int -- index of the column that contains the time series value after the
            Time Series Formatter primitive
        """
        # find attribute column but not file column
        attributes = input_metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/Attribute", ))
        # this assumes a lot, but time series formatters typically place the value column at the end
        attribute_col = attributes[-1]
        return attribute_col

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        """Sets primitive's training data

        Arguments:
            inputs {Inputs} -- D3M dataframe containing attributes
            outputs {Outputs} -- D3M dataframe containing targets
        """
        # load and reshape training data
        self._output_columns = outputs.columns
        outputs = np.array(outputs)
        n_ts = outputs.shape[0]
        ts_sz = inputs.shape[0] // n_ts

        attribute_col = self._get_value_col(inputs.metadata)
        self._X_train = inputs.iloc[:, attribute_col].values.reshape(n_ts, ts_sz)
        self._y_train = np.array(outputs).reshape(-1, )

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        """Fits KNN model using training data from set_training_data and hyperparameters

        Keyword Arguments:
            timeout {float} -- timeout, not considered (default: {None})
            iterations {int} -- iterations, not considered (default: {None})

        Returns:
            CallResult[None]
        """
        scaled = self._scaler.fit_transform(self._X_train)
        self._knn.fit(scaled, self._y_train)
        self._is_fit = True

        return CallResult(None, has_finished=self._is_fit)

    def produce(self, *, inputs: Inputs, timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """Produce primitive's classifications for new time series data

        Arguments:
            inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target

        Keyword Arguments:
            timeout {float} -- timeout, not considered (default: {None})
            iterations {int} -- iterations, not considered (default: {None})

        Raises:
            PrimitiveNotFittedError: if primitive not fit

        Returns:
            CallResult[Outputs] -- dataframe with a column containing a predicted class
            for each input time series
        """
        if not self._is_fit:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        # find column with ts value through metadata
        grouping_column = self._get_cols(inputs.metadata)

        n_ts = inputs.iloc[:, grouping_column[0]].nunique()
        ts_sz = inputs.shape[0] // n_ts

        attribute_col = self._get_value_col(inputs.metadata)
        x_vals = inputs.iloc[:, attribute_col].values.reshape(n_ts, ts_sz)

        # make predictions
        scaled = self._scaler.transform(x_vals)
        preds = self._knn.predict(scaled)

        # create output frame
        result_df = container.DataFrame({self._output_columns[0]: preds},
                                        generate_metadata=True)
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            ("https://metadata.datadrivendiscovery.org/types/PredictedTarget"),
        )

        return CallResult(result_df, has_finished=True)
def subseqeuence_clustering(sequence, changepoints, y_label='y', norm=False):
    """
    Clusters subsequences of a time series indicated by the changepoints variable.
    Uses the silhouette score to determine the number of clusters.

    :param y_label: name of the y-label in the plot
    :param norm: normalise data using min-max scaling
    :param sequence: np array of the time series
    :param changepoints: detected changepoints on which the subsequences are built
    :return:
    """
    from tslearn.clustering import TimeSeriesKMeans, silhouette_score
    from tslearn.utils import to_time_series_dataset
    from tslearn.preprocessing import TimeSeriesScalerMinMax

    sub_ids = []
    x_index = []
    X = []
    i = 0
    end_p = [len(sequence) - 1]
    for cp in changepoints + end_p:
        X.append(sequence[i:cp])
        index = 'sub_' + str(i) + '_' + str(cp)
        sub_ids.append(index)
        x_index.append([x_id for x_id in range(i, cp + 1)])
        i = cp

    # Normalize the data (y = (x - min) / (max - min))
    if norm:
        X = TimeSeriesScalerMinMax().fit_transform(X)
    X = to_time_series_dataset(X)

    # Find the optimal number of clusters by looping through different
    # configurations and storing the respective silhouette scores:
    sil_scores = {}
    for n in range(2, len(changepoints)):
        model_tst = TimeSeriesKMeans(n_clusters=n, metric="dtw", n_init=10)
        model_tst.fit(X)
        sil_scores[n] = silhouette_score(X, model_tst.predict(X), metric="dtw")

    opt_k = max(sil_scores, key=sil_scores.get)
    print('Number of clusters in subsequence clustering: ' + str(opt_k))
    model = TimeSeriesKMeans(n_clusters=opt_k, metric="dtw", n_init=10)
    labels = model.fit_predict(X)
    print(labels)

    # Build a helper df to map metrics to their cluster labels
    df_cluster = pd.DataFrame(list(zip(sub_ids, x_index, model.labels_)),
                              columns=['metric', 'x_index', 'cluster'])
    cluster_metrics_dict = df_cluster.groupby(
        ['cluster'])['metric'].apply(lambda x: [x for x in x]).to_dict()

    print('Plotting Clusters')
    # Plot changepoints as vertical lines
    for cp in changepoints:
        plt.axvline(x=cp, ls=':', lw=2, c='0.65')

    # Preprocessing for cluster-based plotting
    x_scat = []
    y_scat = []
    cluster = []
    for index, row in df_cluster.iterrows():
        x_seq = row['x_index']
        x_scat.extend(x_seq)
        y_seq = sequence[x_seq[0]:x_seq[-1] + 1]
        y_scat.extend(y_seq)
        label_seq = [row['cluster']]
        cluster.extend(label_seq * len(x_seq))
        # plt.scatter(x_seq, y_seq, label=label_seq)

    # Cluster-based plotting
    x_scat = np.array(x_scat)
    y_scat = np.array(y_scat)
    for c in np.unique(cluster):
        i = np.where(cluster == c)
        plt.scatter(x_scat[i], y_scat[i], label=c)
    plt.legend()
    plt.title('Subsequence k-means Clustering')
    plt.xlabel('Time index')
    plt.ylabel(y_label)
    plt.show()

    return cluster_metrics_dict
def tsScale(ts):
    tsc = TimeSeriesScalerMinMax(value_range=(-1, 1))
    scaled_ts = tsc.fit_transform(ts)
    return scaled_ts
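# Minimal usage sketch for tsScale above, using a synthetic random-walk dataset;
# the generator call is illustrative only.
from tslearn.generators import random_walks

demo = random_walks(n_ts=4, sz=30, d=1, random_state=0)
demo_scaled = tsScale(demo)
print(demo_scaled.min(), demo_scaled.max())  # each series now spans [-1, 1]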