def distance(instance_a, instance_b, **params):
    """Compute the distance between two nested time-series instances.

    Both instances are converted to 2D numpy arrays, transposed, and then
    passed together with ``params`` to ``distance_measure``.
    """
    # TODO: convert only a specific dimension rather than the whole instance?
    array_a, array_b = (
        np.transpose(from_nested_to_2d_array(instance, return_numpy=True))
        for instance in (instance_a, instance_b)
    )
    return distance_measure(array_a, array_b, **params)
def fit(self, X: NumpyOrDF, y: NumpyOrDF = None):
    """
    Fit the clustering algorithm on the dataset X.

    Parameters
    ----------
    X: Numpy array or Dataframe
        sktime data_frame or numpy array to train the model on. A DataFrame
        is converted to a 2D numpy array before fitting.
    y: Numpy array or Dataframe, default = None
        Labels for training. Not read by this method; kept for consistency.

    Returns
    -------
    self
        Fitted estimator
    """
    training_data = (
        from_nested_to_2d_array(X, return_numpy=True)
        if isinstance(X, pd.DataFrame)
        else X
    )

    self._check_params(training_data)
    self._fit(training_data)

    self._is_fitted = True
    return self
def transform(self, X, y=None):
    """Concatenate multivariate panel data into univariate panel data.

    The series of all columns are joined in time: the input is first
    tabularized (all columns side by side) and the resulting 2D table is
    then converted back into a nested frame.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_samples, n_features]
        Nested dataframe with time-series in cells.
    y : ignored

    Returns
    -------
    Xt : pandas DataFrame
        Transformed pandas DataFrame with same number of rows and single
        column
    """
    self.check_is_fitted()
    X = check_X(X)

    # Pick the tabularizer that matches the container returned by check_X.
    to_tabular = (
        from_nested_to_2d_array
        if isinstance(X, pd.DataFrame)
        else from_3d_numpy_to_2d_array
    )
    return from_2d_array_to_nested(to_tabular(X))
def test_from_nested_to_2d_array(n_instances, n_columns, n_timepoints):
    """Tabularizing a nested frame keeps the index and flattens all columns."""
    nested, _ = make_classification_problem(n_instances, n_columns, n_timepoints)

    tabular = from_nested_to_2d_array(nested)

    # One row per instance, one column per (column, time point) pair.
    expected_shape = (n_instances, n_columns * n_timepoints)
    assert tabular.shape == expected_shape
    assert tabular.index.equals(nested.index)
def test_output_format_dim(len_series, n_instances, n_components):
    """PCATransformer output is a DataFrame with the expected dimensions."""
    np.random.seed(42)
    raw = pd.DataFrame(data=np.random.randn(n_instances, len_series))
    X = from_2d_array_to_nested(raw)

    transformer = PCATransformer(n_components=n_components)
    Xt = transformer.fit_transform(X)

    # Output type and number of rows.
    assert isinstance(Xt, pd.DataFrame)
    assert Xt.shape[0] == X.shape[0]

    # Number of principal components in the output.
    n_output_columns = from_nested_to_2d_array(Xt).shape[1]
    n_input_columns = from_nested_to_2d_array(X).shape[1]
    assert n_output_columns == min(n_components, n_input_columns)
def _combine_data_frames(self, dataFrames, weighting_factor, col_names):
    """
    Combine two descriptor dataframes into a single dataframe.

    Helper for the shape_dtw class, used when the
    shape_descriptor_function is set to "compound". For every column and
    every row, the values of the first dataframe are followed by the values
    of the second dataframe multiplied by ``weighting_factor``.

    Parameters
    ----------
    dataFrames : sequence
        The two dataframes to combine; only elements 0 and 1 are read.
    weighting_factor : number
        Multiplier applied to the values taken from the second dataframe.
    col_names : iterable
        Column labels of the result. Each label is also used to index the
        combined array.

    Returns
    -------
    pandas DataFrame whose cells are pandas Series.
    """
    first_desc, second_desc = dataFrames[0], dataFrames[1]

    # Convert every column of both dataframes into a 2D numpy array.
    first_arrays = [
        from_nested_to_2d_array(first_desc[name], return_numpy=True)
        for name in first_desc.columns
    ]
    second_arrays = [
        from_nested_to_2d_array(second_desc[name], return_numpy=True)
        for name in second_desc.columns
    ]

    # Row by row: first descriptor values, then the weighted second ones.
    combined = np.asarray(
        [
            [
                list(first_arrays[dim][row])
                + list(second_arrays[dim][row] * weighting_factor)
                for row in range(len(first_arrays[dim]))
            ]
            for dim in range(len(first_arrays))
        ]
    )

    # Wrap each combined row in a Series and assemble the nested dataframe.
    df = pd.DataFrame()
    for col in col_names:
        df[col] = [
            pd.Series(combined[col][row]) for row in range(len(combined[col]))
        ]
    return df
def row_first(X):
    """Tabularize each column of X and join the first resulting columns.

    A Series input is wrapped in a one-column DataFrame first. For every
    column of X only column 0 of its tabularized form is kept; the kept
    columns are concatenated side by side.
    """
    if isinstance(X, pd.Series):
        X = pd.DataFrame(X)

    first_columns = []
    for _, column in X.items():
        tabular = from_nested_to_2d_array(column)
        first_columns.append(pd.Series(tabular.iloc[:, 0]))

    return pd.concat(first_columns, axis=1)
def test_tsfresh_extractor(default_fc_parameters):
    """The extracted "__mean" feature equals the row mean of the raw data."""
    X, y = make_classification_problem()
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    extractor = TSFreshFeatureExtractor(
        default_fc_parameters=default_fc_parameters, disable_progressbar=True
    )
    Xt = extractor.fit_transform(X_train, y_train)

    mean_features = Xt.filter(like="__mean", axis=1)
    actual = mean_features.values.ravel()

    tabular_train = from_nested_to_2d_array(X_train)
    expected = tabular_train.mean(axis=1).values

    # Sanity check of the reference value before comparing the full vectors.
    assert expected[0] == X_train.iloc[0, 0].mean()
    np.testing.assert_allclose(actual, expected)
def transform(self, X, y=None):
    """
    Transform a data frame of time series data into HOG1D features.

    Parameters
    ----------
    X : a pandas dataframe of shape = [n_samples, num_dims]
        The training input samples.
    y : ignored

    Returns
    -------
    dims: a pandas data frame of shape = [n_samples, num_dims]
        Each cell holds a pandas Series with the values returned by
        ``_calculate_hog1ds`` for the corresponding input series.
    """
    # Validate state and input.
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=False, coerce_to_pandas=True)

    # Basic facts about the input frame.
    n_cases = X.shape[0]
    column_names = X.columns
    series_length = len(X.iloc[0, 0])

    # The parameters are validated against the length of the first series.
    self._check_parameters(series_length)

    result = pd.DataFrame()
    for name in column_names:
        # One column of the nested frame as a 2D numpy array.
        values = from_nested_to_2d_array(pd.DataFrame(X[name]), return_numpy=True)

        # HOG1Ds of every time series in this column.
        hog1ds = np.asarray(
            [self._calculate_hog1ds(values[idx]) for idx in range(n_cases)]
        )

        # Store each transformed row as a Series in the output column.
        result[name] = [pd.Series(row) for row in hog1ds]

    return result
def read_dataset(root_dir, dataset_name):
    """Load one UCR dataset from its .ts files and z-normalise every series.

    Parameters
    ----------
    root_dir : str
        Root directory; any '-temp' substring is removed before use. The
        data is read from ``<root>/UCRArchive_2018/<name>/<name>_TRAIN.ts``
        and ``..._TEST.ts``.
    dataset_name : str
        Name of the dataset folder / file prefix.

    Returns
    -------
    dict
        ``{dataset_name: (x_train, y_train, x_test, y_test)}`` holding
        copies of the normalised arrays and of the labels.
    """

    def _znormalise(values):
        # Rows with zero standard deviation are divided by 1 instead of 0.
        std_ = values.std(axis=1, keepdims=True)
        std_[std_ == 0] = 1.0
        return (values - values.mean(axis=1, keepdims=True)) / std_

    # For UCR
    archive_dir = root_dir.replace('-temp', '') + '/' + 'UCRArchive_2018'
    dataset_prefix = archive_dir + '/' + dataset_name + '/' + dataset_name

    # The commented-out original alternative read the matching .arff files
    # with load_from_arff_to_dataframe instead.
    x_train, y_train = load_from_tsfile_to_dataframe(dataset_prefix + '_TRAIN.ts')
    x_test, y_test = load_from_tsfile_to_dataframe(dataset_prefix + '_TEST.ts')

    x_train = from_nested_to_2d_array(x_train, return_numpy=True)
    x_test = from_nested_to_2d_array(x_test, return_numpy=True)

    # znorm
    x_train = _znormalise(x_train)
    x_test = _znormalise(x_test)

    return {
        dataset_name: (
            x_train.copy(),
            y_train.copy(),
            x_test.copy(),
            y_test.copy(),
        )
    }
def test_pca_results(n_components):
    """PCATransformer reproduces the result of PCA on the same data."""
    np.random.seed(42)
    X = pd.DataFrame(data=np.random.randn(10, 5))

    # Reference result from sklearn.
    reference = PCA(n_components=n_components).fit_transform(X)

    # Same data through the sktime transformer, in nested format.
    nested = from_2d_array_to_nested(X)
    transformer = PCATransformer(n_components=n_components)
    transformed = transformer.fit_transform(nested)

    expected = np.asarray(reference)
    actual = np.asarray(from_nested_to_2d_array(transformed))
    assert np.allclose(expected, actual)
def test_padding_transformer():
    """Default padding yields 29 time points for each of the 12 dimensions."""
    # Load the data.
    dataset = "JapaneseVowels"
    X_train, y_train = _load_dataset(dataset, split="train", return_X_y=True)
    X_test, y_test = _load_dataset(dataset, split="test", return_X_y=True)

    transformer = PaddingTransformer()
    padded = transformer.fit_transform(X_train)

    # Once tabularized, the data has 12 dimensions, each padded to a
    # length of 29.
    tabular = from_nested_to_2d_array(padded)
    assert len(tabular.columns) == 29 * 12
def test_padding_fill_value_transformer():
    """Padding with pad_length=40 yields 40 time points per dimension."""
    # Load the data.
    dataset = "JapaneseVowels"
    X_train, y_train = _load_dataset(dataset, split="train", return_X_y=True)
    X_test, y_test = _load_dataset(dataset, split="test", return_X_y=True)

    transformer = PaddingTransformer(pad_length=40, fill_value=1)
    padded = transformer.fit_transform(X_train)

    # Once tabularized, the data has 12 dimensions, each padded to the
    # requested length of 40.
    tabular = from_nested_to_2d_array(padded)
    assert len(tabular.columns) == 40 * 12
def test_truncation_paramterised_transformer():
    """TruncationTransformer(2, 10) leaves 8 time points per dimension."""
    # Load the data.
    dataset = "JapaneseVowels"
    X_train, y_train = _load_dataset(dataset, split="train", return_X_y=True)
    X_test, y_test = _load_dataset(dataset, split="test", return_X_y=True)

    transformer = TruncationTransformer(2, 10)
    truncated = transformer.fit_transform(X_train)

    # Once tabularized, the data has 12 dimensions, each truncated to
    # (10 - 2) time points.
    tabular = from_nested_to_2d_array(truncated)
    assert len(tabular.columns) == 8 * 12
def transform(self, X, y=None):
    """
    Replace every time series by the gradients of its fitted lines.

    Parameters
    ----------
    X : a pandas dataframe of shape = [n_samples, num_dims]
        The training input samples.
    y : ignored

    Returns
    -------
    df: a pandas data frame with one column per input column; each cell
        holds a pandas Series with the values returned by
        ``_get_gradients_of_lines`` for the corresponding input series.
    """
    # Validate state and input.
    self.check_is_fitted()
    X = check_X(X, coerce_to_pandas=True)

    # Basic facts about the input frame.
    series_length = len(X.iloc[0, 0])
    n_cases = X.shape[0]
    column_names = X.columns

    # The parameters are validated against the length of the first series.
    self._check_parameters(series_length)

    result = pd.DataFrame()
    for name in column_names:
        # One column of the nested frame as a 2D numpy array.
        values = from_nested_to_2d_array(pd.DataFrame(X[name]), return_numpy=True)

        # Gradients for every series in this column.
        gradients = np.asarray(
            [self._get_gradients_of_lines(values[idx]) for idx in range(n_cases)]
        )

        # Store each transformed row as a Series in the output column.
        result[name] = [pd.Series(row) for row in gradients]

    return result
def plot_cluster_algorithm(model: BaseClusterer, predict_series: NumpyOrDF, k: int):
    """
    Plot the output of a clustering algorithm, one subplot per cluster.

    Parameters
    ----------
    model: BaseClusterer
        Clustering model to plot. Its ``predict`` and ``get_centers``
        methods are called.
    predict_series: Numpy or Dataframe
        The series to predict the values for. A DataFrame is converted to
        a 2D numpy array first.
    k: int
        Number of centers; also the number of subplot rows.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    _check_soft_dependencies("matplotlib")
    import matplotlib.pyplot as plt
    import matplotlib.patches as mpatches

    if isinstance(predict_series, pd.DataFrame):
        predict_series = from_nested_to_2d_array(predict_series, return_numpy=True)

    # NOTE(review): this figure is never drawn on, because plt.subplots below
    # creates its own figure; the figsize was presumably meant for that one.
    # Left unchanged to keep the current output - confirm the intent.
    plt.figure(figsize=(5, 10))
    plt.rcParams["figure.dpi"] = 100

    indexes = model.predict(predict_series)
    centers = model.get_centers()
    series_values = TimeSeriesLloydsPartitioning.get_cluster_values(
        indexes, predict_series, k
    )

    # squeeze=False always returns a 2D array of Axes. With the default,
    # k == 1 yields a bare Axes object and indexing it would fail.
    _, axes = plt.subplots(nrows=k, ncols=1, squeeze=False)
    for i in range(k):
        _plot(series_values[i], centers[i], axes[i, 0])

    blue_patch = mpatches.Patch(color="blue", label="Series that belong to the cluster")
    red_patch = mpatches.Patch(color="red", label="Cluster centers")
    plt.legend(
        handles=[red_patch, blue_patch],
        loc="upper center",
        bbox_to_anchor=(0.5, -0.40),
        fancybox=True,
        shadow=True,
        ncol=5,
    )
    plt.tight_layout()
    plt.show()
def _perform_paa_along_dim(self, X):
    """Reduce every series in X to the means of ``self.num_intervals`` frames.

    Each series is split into frames of equal, possibly fractional, length
    (series length / ``self.num_intervals``). A time point that straddles
    two frames contributes proportionally to both.

    Parameters
    ----------
    X : nested pandas structure accepted by ``from_nested_to_2d_array``

    Returns
    -------
    pandas DataFrame with a single column ``0`` whose cells are pandas
    Series of frame means, one per input series.
    """
    X = from_nested_to_2d_array(X, return_numpy=True)

    n_cases = X.shape[0]
    series_length = X.shape[1]
    result = pd.DataFrame()
    paa_series = []

    for case_idx in range(n_cases):
        series = X[case_idx, :]

        frames = []
        frames_done = 0
        filled = 0
        frame_length = series_length / self.num_intervals
        running_sum = 0

        for t in range(series_length):
            remaining = frame_length - filled

            if remaining > 1:
                # The whole time point belongs to the current frame.
                running_sum += series[t]
                filled += 1
            else:
                # Only the remaining fraction of it fits into the frame.
                running_sum += remaining * series[t]
                filled += remaining

            if filled == frame_length:
                frames.append(running_sum / frame_length)
                frames_done += 1

                # The rest of this time point starts the next frame.
                carry = 1 - remaining
                running_sum = carry * series[t]
                filled = carry

        # if the last frame was lost due to double imprecision
        if frames_done == self.num_intervals - 1:
            frames.append(running_sum / frame_length)

        paa_series.append(pd.Series(frames))

    result[0] = paa_series

    return result
def transform(self, X, y=None):
    """
    Replace every time series by its extracted wavelet coefficients.

    Parameters
    ----------
    X : a pandas dataframe of shape = [n_samples, num_dims]
        The training input samples.
    y : ignored

    Returns
    -------
    dims: a pandas data frame of shape = [n_samples, num_dims]
        Each cell holds a pandas Series with one row of the output of
        ``_extract_wavelet_coefficients``.
    """
    # Validate state, input and parameters.
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=False, coerce_to_pandas=True)
    self._check_parameters()

    result = pd.DataFrame()
    for name in X.columns:
        # One column of the nested frame as a 2D numpy array.
        values = from_nested_to_2d_array(pd.DataFrame(X[name]), return_numpy=True)

        coefficients = np.asarray(self._extract_wavelet_coefficients(values))

        # Store each transformed row as a Series in the output column.
        result[name] = [pd.Series(row) for row in coefficients]

    return result
def transform(self, X, y=None):
    """Transform nested pandas dataframe into tabular dataframe.

    Parameters
    ----------
    X : pandas DataFrame
        Nested dataframe with pandas series or numpy arrays in cells.
    y : array-like, optional (default=None)
        Ignored.

    Returns
    -------
    Xt : pandas DataFrame
        Transformed dataframe with only primitives in cells.
    """
    self.check_is_fitted()
    X = check_X(X)

    # Pick the tabularizer that matches the container returned by check_X.
    converter = (
        from_nested_to_2d_array
        if isinstance(X, pd.DataFrame)
        else from_3d_numpy_to_2d_array
    )
    return converter(X)
def _transform_single_feature(self, X, feature): """transforms data into a specified catch22 feature Parameters ---------- X : pandas DataFrame, input time series feature : int, catch22 feature id or String, catch22 feature name. Returns ------- Numpy array containing a catch22 feature for each input series """ if isinstance(feature, (int, np.integer)) or isinstance( feature, (float, np.float) ): if feature > 21 or feature < 0: raise ValueError("Invalid catch22 feature ID") elif isinstance(feature, str): if feature in feature_names: feature = feature_names.index(feature) else: raise ValueError("Invalid catch22 feature name") else: raise ValueError("catch22 feature name or ID required") if isinstance(X, pd.DataFrame): X = from_nested_to_2d_array(X, return_numpy=True) n_instances = X.shape[0] X = np.reshape(X, (n_instances, -1)) c22_list = Parallel(n_jobs=self.n_jobs)( delayed(self._transform_case_single)( X[i], feature, ) for i in range(n_instances) ) return np.asarray(c22_list)
def test_dft_mft():
    """Compare SFA's ``_mft`` output with ``_discrete_fourier_transform``.

    First with a single window covering the whole series, then with a
    sliding window of length 140, with and without ``norm``.
    """
    # load training data
    X, Y = load_gunpoint(split="train", return_X_y=True)
    X_tab = from_nested_to_2d_array(X, return_numpy=True)

    word_length = 6
    alphabet_size = 4

    # Single DFT transformation: the window spans the whole series.
    window_size = np.shape(X_tab)[1]

    p = SFA(
        word_length=word_length,
        alphabet_size=alphabet_size,
        window_size=window_size,
        binning_method="equi-depth",
    ).fit(X, Y)
    dft = p._discrete_fourier_transform(X_tab[0])
    mft = p._mft(X_tab[0])

    # NOTE(review): the comparison is one-sided (no abs()), so it only fails
    # when mft exceeds dft by the tolerance - confirm this is intended.
    assert (mft - dft < 0.0001).all()

    # Windowed DFT transformation
    for norm in [True, False]:
        for window_size in [140]:
            p = SFA(
                word_length=word_length,
                norm=norm,
                alphabet_size=alphabet_size,
                window_size=window_size,
                binning_method="equi-depth",
            ).fit(X, Y)
            mft = p._mft(X_tab[0])
            # Each row of the mft output is compared with the DFT of the
            # window starting at the same offset.
            for i in range(len(X_tab[0]) - window_size + 1):
                dft_transformed = p._discrete_fourier_transform(
                    X_tab[0, i:window_size + i])
                assert (mft[i] - dft_transformed < 0.001).all()

            # NOTE(review): nesting of these two asserts is reconstructed
            # from a collapsed source line - confirm against the original.
            # One output row per window position, word_length values each.
            assert len(mft) == len(X_tab[0]) - window_size + 1
            assert len(mft[0]) == word_length
def predict(self, X: NumpyOrDF) -> NumpyArray:
    """
    Predict cluster indexes with the already trained clustering algorithm.

    Parameters
    ----------
    X: Numpy array or Dataframe
        sktime data_frame or numpy array to predict cluster for. A
        DataFrame is converted to a 2D numpy array first.

    Returns
    -------
    Numpy_Array: np.array
        Index of the cluster each sample belongs to
    """
    self.check_is_fitted()

    if not isinstance(X, pd.DataFrame):
        return self._predict(X)

    as_array = from_nested_to_2d_array(X, return_numpy=True)
    return self._predict(as_array)
def _transform_single_feature(self, X, feature): """transforms data into the catch22 features Parameters ---------- X : pandas DataFrame, input time series feature : int, catch22 feature id or String, catch22 feature name. Returns ------- Numpy array containing a catch22 feature for each input series """ if isinstance(feature, (int, np.integer)) or isinstance( feature, (float, np.float) ): if feature > 21 or feature < 0: raise ValueError("Invalid catch22 feature ID") elif isinstance(feature, str): if feature in feature_names: feature = feature_names.index(feature) else: raise ValueError("Invalid catch22 feature name") else: raise ValueError("catch22 feature name or ID required") if isinstance(X, pd.DataFrame): X = from_nested_to_2d_array(X, return_numpy=True) n_instances = X.shape[0] X = np.reshape(X, (n_instances, -1)) c22_list = [] for i in range(n_instances): series = X[i, :].tolist() c22_val = features[feature](series) c22_list.append(c22_val) return np.array(c22_list)
def predict(self, X: NumpyOrDF, y=None) -> NumpyArray:
    """
    Return cluster center index for data samples.

    Parameters
    ----------
    X: 2D np.array with shape (n_instances, n_timepoints)
        or pd.DataFrame in nested format
        panel of time series to cluster
    y: ignored, exists for API consistency reasons

    Returns
    -------
    Numpy_Array: 1D np.array of length n_instances
        Index of the cluster each sample belongs to
    """
    self.check_is_fitted()

    # Nested frames are flattened to 2D before validation and prediction.
    samples = (
        from_nested_to_2d_array(X, return_numpy=True)
        if isinstance(X, pd.DataFrame)
        else X
    )
    self._check_params(samples)

    return self._predict(samples)
def fit(self, X: NumpyOrDF, y=None):
    """
    Fit the clustering algorithm on the dataset X

    Parameters
    ----------
    X: 2D np.array with shape (n_instances, n_timepoints)
        or pd.DataFrame in nested format
        panel of univariate time series to train the clustering model on
    y: ignored, exists for API consistency reasons

    Returns
    -------
    reference to self
    """
    # Nested frames are flattened to 2D before validation and fitting.
    samples = (
        from_nested_to_2d_array(X, return_numpy=True)
        if isinstance(X, pd.DataFrame)
        else X
    )

    self._check_params(samples)
    self._fit(samples)

    self._is_fitted = True
    return self
def _univariate_nested_df_to_array(X):
    """Convert a nested univariate DataFrame into a 2D numpy array.

    Parameters
    ----------
    X : nested pandas DataFrame

    Returns
    -------
    The result of ``from_nested_to_2d_array`` with a numpy array requested.
    """
    # The keyword is ``return_numpy``, as at every other call site of
    # from_nested_to_2d_array; ``return_array`` is not the name they use.
    return from_nested_to_2d_array(X, return_numpy=True)
def _univariate_nested_df_to_array(X):
    """Tabularize a nested univariate DataFrame.

    Thin wrapper around ``from_nested_to_2d_array`` called with
    ``return_numpy=False``.
    """
    # NOTE(review): despite the "_to_array" name, return_numpy=False asks
    # for the non-numpy result - confirm which return type callers expect.
    tabular = from_nested_to_2d_array(X, return_numpy=False)
    return tabular