def distance(instance_a, instance_b, **params):
    """Compute the distance between two nested instances.

    Each instance is tabularized into a 2d numpy array and transposed so
    that time points run down the rows, then handed to ``distance_measure``
    together with any extra keyword parameters.
    """
    # TODO: use a specific dimension rather than the whole instance?
    a = np.transpose(from_nested_to_2d_array(instance_a, return_numpy=True))
    b = np.transpose(from_nested_to_2d_array(instance_b, return_numpy=True))
    return distance_measure(a, b, **params)
def test_row_transformer_transform_inverse_transform():
    """Round-trip a RowTransformer and check the inverse recovers the data."""
    X, y = load_gunpoint(return_X_y=True)
    transformer = RowTransformer(StandardScaler())
    transformed = transformer.fit_transform(X)
    recovered = transformer.inverse_transform(transformed)

    # Shape is preserved and cells still hold series (series-to-series).
    assert recovered.shape == X.shape
    cell = recovered.iloc[0, 0]
    assert isinstance(cell, (pd.Series, np.ndarray))

    # Values round-trip up to small numerical error.
    expected = from_nested_to_2d_array(X).values
    actual = from_nested_to_2d_array(recovered).values
    np.testing.assert_array_almost_equal(expected, actual, decimal=5)
def test_from_nested_to_2d_array(n_instances, n_columns, n_timepoints):
    """Tabularizing a nested frame flattens columns and keeps the index."""
    nested, _ = make_classification_problem(n_instances, n_columns, n_timepoints)
    tabular = from_nested_to_2d_array(nested)

    expected_shape = (n_instances, n_columns * n_timepoints)
    assert tabular.shape == expected_shape
    assert tabular.index.equals(nested.index)
def transform(self, X, y=None):
    """Concatenate multivariate time series/panel data into long
    univariate time series/panel data by simply concatenating the time
    series in time.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_samples, n_features]
        Nested dataframe with time-series in cells.
    y : ignored, present for interface compatibility.

    Returns
    -------
    Xt : pandas DataFrame
        Transformed pandas DataFrame with the same number of rows and a
        single column.
    """
    self.check_is_fitted()
    X = check_X(X)

    # Tabularize all columns into one wide 2d structure, then re-nest the
    # result into a single column of concatenated series.
    tabularize = (
        from_nested_to_2d_array
        if isinstance(X, pd.DataFrame)
        else from_3d_numpy_to_2d_array
    )
    return from_2d_array_to_nested(tabularize(X))
def test_output_format_dim(len_series, n_instances, n_components):
    """PCATransformer returns a nested frame with the expected dimensions."""
    np.random.seed(42)
    data = pd.DataFrame(data=np.random.randn(n_instances, len_series))
    X = from_2d_array_to_nested(data)

    Xt = PCATransformer(n_components=n_components).fit_transform(X)

    # Output type and number of rows.
    assert isinstance(Xt, pd.DataFrame)
    assert Xt.shape[0] == X.shape[0]

    # Number of principal components is capped by the input dimensionality.
    n_cols_out = from_nested_to_2d_array(Xt).shape[1]
    n_cols_in = from_nested_to_2d_array(X).shape[1]
    assert n_cols_out == min(n_components, n_cols_in)
def _combine_data_frames(self, dataFrames, weighting_factor, col_names):
    """Combine two shape-descriptor dataframes into a single dataframe.

    Used by shape_dtw when ``shape_descriptor_function`` is "compound":
    for every dimension and instance, the second descriptor is scaled by
    ``weighting_factor`` and appended to the first descriptor's series.
    """
    first_desc, second_desc = dataFrames[0], dataFrames[1]

    # Tabularize each column of both dataframes into 2d numpy arrays.
    first_arrays = [
        from_nested_to_2d_array(first_desc[c], return_numpy=True)
        for c in first_desc.columns
    ]
    second_arrays = [
        from_nested_to_2d_array(second_desc[c], return_numpy=True)
        for c in second_desc.columns
    ]

    # Per dimension and instance: first descriptor followed by the
    # weighted second descriptor.
    combined = np.asarray(
        [
            [
                list(first_arrays[d][i])
                + list(second_arrays[d][i] * weighting_factor)
                for i in range(len(first_arrays[d]))
            ]
            for d in range(len(first_arrays))
        ]
    )

    # Re-nest the result into a pandas DataFrame, one pd.Series per cell.
    # NOTE(review): ``combined`` is indexed positionally by ``col``, so this
    # assumes ``col_names`` are integer positions 0..n_dims-1 — confirm
    # against callers.
    df = pd.DataFrame()
    for col in col_names:
        df[col] = [pd.Series(inst) for inst in combined[col]]
    return df
def _handle_tabularizer_args(*args, **kwargs):
    # The Tabularizer transforms a nested pd.DataFrame/3d numpy array into
    # a 2d numpy array, so the inverse transform goes from a 2d numpy array
    # back to a nested pd.DataFrame/3d array.
    # TODO refactor Tabularizer as series-as-features composition
    # meta-estimator, rather than transformer, or introduce a special
    # transformer type.
    X, y = args
    if kwargs.get("return_numpy"):
        return from_3d_numpy_to_2d_array(X), y
    return from_nested_to_2d_array(X), y
def row_first(X):
    """Tabularize each column of ``X`` and stack the resulting first
    tabular columns side by side into a single dataframe.

    A pd.Series input is promoted to a single-column DataFrame first.
    """
    if isinstance(X, pd.Series):
        X = pd.DataFrame(X)
    pieces = []
    for _, column in X.items():
        flat = from_nested_to_2d_array(column)
        pieces.append(pd.Series(flat.iloc[:, 0]))
    return pd.concat(pieces, axis=1)
def test_tsfresh_extractor(default_fc_parameters):
    """The extracted ``__mean`` feature matches a manually computed mean."""
    X, y = make_classification_problem()
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    extractor = TSFreshFeatureExtractor(
        default_fc_parameters=default_fc_parameters,
        disable_progressbar=True,
    )
    Xt = extractor.fit_transform(X_train, y_train)

    actual = Xt.filter(like="__mean", axis=1).values.ravel()
    expected = from_nested_to_2d_array(X_train).mean(axis=1).values

    # Sanity-check the reference computation against the raw first cell.
    assert expected[0] == X_train.iloc[0, 0].mean()
    np.testing.assert_allclose(actual, expected)
def _apply_rowwise(self, func, X, y=None):
    """Apply ``func`` (transform or inverse_transform) to every cell of X.

    Parameters
    ----------
    func : callable
        Function applied to each series/cell of the data container.
    X : nested pandas DataFrame
        Series-as-features data; coerced to pandas before processing.
    y : ignored, present for interface compatibility.

    Returns
    -------
    Xt : pandas DataFrame
        Transformed container; tabularized when ``func`` turned out to be
        a series-to-primitive transform.
    """
    self.check_is_fitted()
    X = check_X(X, coerce_to_pandas=True)

    # Fast path: pandas ``apply`` per column. Relatively quick, but breaks
    # when ``func`` expects 2d input (a column cell is 1d).
    try:
        Xt = pd.concat(
            [pd.Series(col.apply(func)) for _, col in X.items()], axis=1
        )
    # Robust fallback: explicit loops over columns and rows, converting
    # each cell into a 2d dataframe before applying ``func``. Slow but
    # works for transformers that require 2d input. The broad except is
    # deliberate: any failure of the fast path falls through to this.
    except Exception:
        cols_t = []
        for c in range(X.shape[1]):  # loop over columns
            col = X.iloc[:, c]
            rows_t = []
            for row in col:  # loop over rows in each column
                row_2d = pd.DataFrame(row)  # convert into 2d dataframe
                rows_t.append(func(row_2d).ravel())  # apply transform
            cols_t.append(rows_t)
        # Concatenate transformed columns back into a nested dataframe.
        Xt = _concat_nested_arrays(cols_t)

    # Tabularize/unnest series-to-primitive transforms: when every cell
    # holds a length-1 series/array, flatten to a primitive-valued frame.
    first_cell = Xt.iloc[0, 0]
    if isinstance(first_cell, (pd.Series, np.ndarray)) and len(first_cell) == 1:
        Xt = from_nested_to_2d_array(Xt)
    return Xt
def transform(self, X, y=None):
    """Transform a dataframe of time series into their HOG1D descriptors.

    Parameters
    ----------
    X : a pandas dataframe of shape = [n_samples, num_dims]
        The training input samples.
    y : ignored, present for interface compatibility.

    Returns
    -------
    df : a pandas data frame of shape = [n_samples, num_dims]
    """
    # Check the data
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=False, coerce_to_pandas=True)

    # Get information about the dataframe
    num_insts = X.shape[0]
    col_names = X.columns
    num_atts = len(X.iloc[0, 0])

    # Check the parameters are appropriate
    self._check_parameters(num_atts)

    df = pd.DataFrame()
    for col in col_names:
        # Convert one column of the dataframe to a 2d numpy array.
        arr = from_nested_to_2d_array(pd.DataFrame(X[col]), return_numpy=True)

        # Compute the HOG1Ds of each time series.
        # Fix: the loop index previously shadowed the ``y`` parameter.
        hog1ds = np.asarray(
            [self._calculate_hog1ds(arr[i]) for i in range(num_insts)]
        )

        # Re-nest the transformed rows into the output column.
        df[col] = [pd.Series(inst) for inst in hog1ds]
    return df
def test_pca_results(n_components):
    """PCATransformer agrees numerically with sklearn's PCA."""
    np.random.seed(42)
    data = pd.DataFrame(data=np.random.randn(10, 5))

    # Reference: plain sklearn PCA on the tabular data.
    expected = PCA(n_components=n_components).fit_transform(data)

    # sktime: the same data in nested representation.
    nested = from_2d_array_to_nested(data)
    result = PCATransformer(n_components=n_components).fit_transform(nested)

    assert np.allclose(
        np.asarray(expected), np.asarray(from_nested_to_2d_array(result))
    )
def test_padding_transformer():
    """Padding brings every JapaneseVowels series to the maximum length."""
    name = "JapaneseVowels"
    X_train, y_train = _load_dataset(name, split="train", return_X_y=True)
    X_test, y_test = _load_dataset(name, split="test", return_X_y=True)

    Xt = PaddingTransformer().fit_transform(X_train)

    # The data has 12 dimensions and padding extends each series to the
    # dataset's maximal length of 29, so tabularizing yields 29 * 12 columns.
    tabular = from_nested_to_2d_array(Xt)
    assert len(tabular.columns) == 29 * 12
def test_truncation_paramterised_transformer():
    """Truncating to the interval [2, 10) leaves series of length 8."""
    name = "JapaneseVowels"
    X_train, y_train = _load_dataset(name, split="train", return_X_y=True)
    X_test, y_test = _load_dataset(name, split="test", return_X_y=True)

    Xt = TruncationTransformer(2, 10).fit_transform(X_train)

    # The data has 12 dimensions and each series is truncated to
    # (10 - 2) = 8 points, so tabularizing yields 8 * 12 columns.
    tabular = from_nested_to_2d_array(Xt)
    assert len(tabular.columns) == 8 * 12
def test_padding_fill_value_transformer():
    """Padding with explicit pad_length and fill_value reaches length 40."""
    name = "JapaneseVowels"
    X_train, y_train = _load_dataset(name, split="train", return_X_y=True)
    X_test, y_test = _load_dataset(name, split="test", return_X_y=True)

    Xt = PaddingTransformer(pad_length=40, fill_value=1).fit_transform(X_train)

    # The data has 12 dimensions and every series is padded to 40 points,
    # so tabularizing yields 40 * 12 columns.
    tabular = from_nested_to_2d_array(Xt)
    assert len(tabular.columns) == 40 * 12
def transform(self, X, y=None):
    """Transform each series into the gradients of its fitted line segments.

    Parameters
    ----------
    X : a pandas dataframe of shape = [n_samples, num_dims]
        The training input samples.
    y : ignored, present for interface compatibility.

    Returns
    -------
    df : a pandas data frame of shape = [num_intervals, num_dims]
    """
    # Check the data
    self.check_is_fitted()
    X = check_X(X, coerce_to_pandas=True)

    # Get information about the dataframe
    n_timepoints = len(X.iloc[0, 0])
    num_instances = X.shape[0]
    col_names = X.columns
    self._check_parameters(n_timepoints)

    df = pd.DataFrame()
    for col in col_names:
        # Convert one column of the dataframe to a 2d numpy array.
        arr = from_nested_to_2d_array(pd.DataFrame(X[col]), return_numpy=True)

        # Calculate the gradients for every instance.
        # Fix: the loop index previously shadowed the ``y`` parameter.
        gradients = np.asarray(
            [self._get_gradients_of_lines(arr[i]) for i in range(num_instances)]
        )

        # Re-nest the transformed rows into the output column.
        df[col] = [pd.Series(inst) for inst in gradients]
    return df
def _perform_paa_along_dim(self, X):
    """Apply Piecewise Aggregate Approximation (PAA) to one dimension.

    Tabularizes ``X`` and averages each series over ``self.num_intervals``
    equal-width frames. Frame boundaries may fall between samples, so a
    single sample can contribute fractionally to two adjacent frames.

    Parameters
    ----------
    X : nested pandas DataFrame (single dimension/column)
        Series to compress; converted to a 2d numpy array internally.

    Returns
    -------
    dims : pandas DataFrame
        One column (key 0) whose cells are pd.Series holding the
        ``num_intervals`` frame means for each instance.
    """
    X = from_nested_to_2d_array(X, return_numpy=True)
    num_atts = X.shape[1]
    num_insts = X.shape[0]
    dims = pd.DataFrame()
    data = []
    for i in range(num_insts):
        series = X[i, :]
        frames = []
        current_frame = 0
        # how much of the current frame is filled so far (may be fractional)
        current_frame_size = 0
        # frames are equal-width but not necessarily integer-width
        frame_length = num_atts / self.num_intervals
        frame_sum = 0
        for n in range(num_atts):
            # capacity left in the current frame
            remaining = frame_length - current_frame_size
            if remaining > 1:
                # sample fits entirely inside the current frame
                frame_sum += series[n]
                current_frame_size += 1
            else:
                # sample straddles a frame boundary: only the fraction
                # ``remaining`` of it belongs to the current frame
                frame_sum += remaining * series[n]
                current_frame_size += remaining
            if current_frame_size == frame_length:
                # frame complete: record its mean and seed the next frame
                # with the leftover fraction of the current sample
                frames.append(frame_sum / frame_length)
                current_frame += 1
                frame_sum = (1 - remaining) * series[n]
                current_frame_size = 1 - remaining
        # if the last frame was lost due to double imprecision
        if current_frame == self.num_intervals - 1:
            frames.append(frame_sum / frame_length)
        data.append(pd.Series(frames))
    dims[0] = data
    return dims
def transform(self, X, y=None):
    """Transform a dataframe of time series into wavelet coefficients.

    Parameters
    ----------
    X : a pandas dataframe of shape = [n_samples, num_dims]
        The training input samples.
    y : ignored, present for interface compatibility.

    Returns
    -------
    df : a pandas data frame of shape = [n_samples, num_dims]
    """
    # Check the data
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=False, coerce_to_pandas=True)
    self._check_parameters()

    df = pd.DataFrame()
    for col in X.columns:
        # Tabularize this column, then extract the wavelet coefficients.
        arr = from_nested_to_2d_array(pd.DataFrame(X[col]), return_numpy=True)
        coefficients = np.asarray(self._extract_wavelet_coefficients(arr))

        # Re-nest one pd.Series per instance into the output column.
        df[col] = [pd.Series(inst) for inst in coefficients]
    return df
def transform(self, X, y=None):
    """Transform nested pandas dataframe into tabular dataframe.

    Parameters
    ----------
    X : pandas DataFrame or 3d numpy array
        Nested dataframe with pandas series or numpy arrays in cells.
    y : array-like, optional (default=None)
        Ignored; present for interface compatibility.

    Returns
    -------
    Xt : pandas DataFrame
        Transformed dataframe with only primitives in cells.
    """
    self.check_is_fitted()
    X = check_X(X)
    # Nested pandas input is tabularized directly; anything else is
    # treated as a 3d numpy panel.
    if not isinstance(X, pd.DataFrame):
        return from_3d_numpy_to_2d_array(X)
    return from_nested_to_2d_array(X)
def test_dft_mft():
    """The MFT matches the plain DFT, both whole-series and windowed."""
    # load training data
    X, Y = load_gunpoint(split="train", return_X_y=True)
    X_tab = from_nested_to_2d_array(X, return_numpy=True)

    word_length = 6
    alphabet_size = 4

    # Single DFT transformation: one window covering the whole series.
    full_window = np.shape(X_tab)[1]
    sfa = SFA(
        word_length=word_length,
        alphabet_size=alphabet_size,
        window_size=full_window,
        binning_method="equi-depth",
    ).fit(X, Y)

    dft = sfa._discrete_fourier_transform(X_tab[0])
    mft = sfa._mft(X_tab[0])
    assert (mft - dft < 0.0001).all()

    # Windowed DFT transformation, with and without normalization.
    for norm in [True, False]:
        for window_size in [140]:
            sfa = SFA(
                word_length=word_length,
                norm=norm,
                alphabet_size=alphabet_size,
                window_size=window_size,
                binning_method="equi-depth",
            ).fit(X, Y)

            mft = sfa._mft(X_tab[0])
            n_windows = len(X_tab[0]) - window_size + 1
            for i in range(n_windows):
                window = X_tab[0, i : window_size + i]
                dft_transformed = sfa._discrete_fourier_transform(window)
                assert (mft[i] - dft_transformed < 0.001).all()

            assert len(mft) == n_windows
            assert len(mft[0]) == word_length
def _transform_single_feature(self, X, feature):
    """Compute one catch22 feature for every input series.

    Parameters
    ----------
    X : pandas DataFrame or 2d numpy array
        Input time series.
    feature : int or str
        catch22 feature id (0-21) or catch22 feature name.

    Returns
    -------
    Numpy array containing the requested catch22 feature for each input
    series.
    """
    # Resolve the feature argument to an integer id in [0, 21].
    if isinstance(feature, int):
        if feature > 21 or feature < 0:
            raise ValueError("Invalid catch22 feature ID")
    elif isinstance(feature, str):
        if feature not in feature_names:
            raise ValueError("Invalid catch22 feature name")
        feature = feature_names.index(feature)
    else:
        raise ValueError("Feature name or ID required")

    if isinstance(X, pd.DataFrame):
        X = from_nested_to_2d_array(X, return_numpy=True)

    n_instances = X.shape[0]
    X = np.reshape(X, (n_instances, -1))

    # Apply the selected feature function to each series in turn.
    extract = features[feature]
    return np.array([extract(X[i, :].tolist()) for i in range(n_instances)])