def test_vectorization_series_to_hier_proba(method, mtype):
    """Test that forecaster vectorization works for Hierarchical data, predict_proba.

    This test passes Hierarchical data to the ARIMA forecaster which internally
    has an implementation for Series only, so the BaseForecaster has to vectorize.
    """
    hierarchy_levels = (2, 4)
    y = _make_hierarchical(hierarchy_levels=hierarchy_levels, random_state=84)
    y = convert(y, from_type="pd_multiindex_hier", to_type=mtype)

    est = ARIMA().fit(y)
    y_pred = getattr(est, method)([1, 2, 3])

    if method in ["predict_interval", "predict_quantiles"]:
        # proba return mtypes are named "pred_interval"/"pred_quantiles"
        expected_mtype = method.replace("ict", "")
    elif method in ["predict_var"]:
        expected_mtype = "pd_multiindex_hier"
    else:
        # bug fix: the exception was previously constructed but never raised,
        # so the unreachable-state guard did nothing
        raise RuntimeError(f"bug in test, unreachable state, method {method} queried")

    valid, _, _ = check_is_mtype(y_pred, expected_mtype, return_metadata=True)

    msg = (
        f"vectorization of forecaster method {method} does not work for test example "
        f"of mtype {mtype}, using the ARIMA forecaster"
    )

    assert valid, msg
def _fit(self, X, y=None):
    """Fit dilations and biases to input time series.

    Parameters
    ----------
    X : 3D np.ndarray of shape = [n_instances, n_dimensions, series_length]
        panel of time series to transform
    y : ignored argument for interface compatibility

    Returns
    -------
    self
    """
    X = convert(
        X.astype(np.float64),
        from_type="numpy3D",
        to_type="numpyflat",
        as_scitype="Panel",
    )

    if self.normalise:
        # z-normalise each series; small epsilon guards against zero std
        mean = X.mean(axis=-1, keepdims=True)
        std = X.std(axis=-1, keepdims=True) + 1e-8
        X = (X - mean) / std

    # fit parameters on the raw series and on its first difference
    self.parameter = self._get_parameter(X)
    self.parameter1 = self._get_parameter(np.diff(X, 1))

    return self
def _combine_data_frames(self, dataFrames, weighting_factor, col_names):
    """Combine two dataframes together into a single dataframe.

    Used when the shape_descriptor_function is set to "compound".

    Parameters
    ----------
    dataFrames : sequence of two nested pandas DataFrames to combine
    weighting_factor : numeric, scaling applied to the second descriptor's values
    col_names : column labels for the returned dataframe

    Returns
    -------
    df : pandas DataFrame with the concatenated series in cells
    """
    first_desc = dataFrames[0]
    second_desc = dataFrames[1]

    first_desc_array = []
    second_desc_array = []

    # Convert the dataframes into arrays
    for x in first_desc.columns:
        first_desc_array.append(
            convert(first_desc[x], from_type="nested_univ", to_type="numpyflat")
        )

    for x in second_desc.columns:
        # bug fix: this loop previously converted first_desc[x], so the
        # second descriptor was silently ignored in the compound result
        second_desc_array.append(
            convert(second_desc[x], from_type="nested_univ", to_type="numpyflat")
        )

    # Concatenate the arrays together, weighting the second descriptor
    res = []
    for x in range(len(first_desc_array)):
        dim1 = []
        for y in range(len(first_desc_array[x])):
            dim2 = []
            dim2.extend(first_desc_array[x][y])
            dim2.extend(second_desc_array[x][y] * weighting_factor)
            dim1.append(dim2)
        res.append(dim1)

    res = np.asarray(res)

    # Convert back to a nested pandas dataframe
    # NOTE(review): res[col] indexes the array by column label — this assumes
    # col_names are positional integers; verify against callers
    df = pd.DataFrame()
    for col in col_names:
        colToAdd = []
        for row in range(len(res[col])):
            inst = res[col][row]
            colToAdd.append(pd.Series(inst))
        df[col] = colToAdd
    return df
def _make_panel(
    n_instances=20,
    n_columns=1,
    n_timepoints=20,
    y=None,
    all_positive=False,
    random_state=None,
    return_mtype="pd-multiindex",
):
    """Generate sktime compatible test data, Panel data formats.

    Parameters
    ----------
    n_instances : int, optional, default=20
        number of instances per series in the panel
    n_columns : int, optional, default=1
        number of variables in the time series
    n_timepoints : int, optional, default=20
        number of time points in each series
    y : None (default), or 1D np.darray or 1D array-like, shape (n_instances, )
        if passed, return will be generated with association to y
    all_positive : bool, optional, default=False
        whether series contain only positive values when generated
    random_state : None (default) or int
        if int is passed, will be used in numpy RandomState for generation
    return_mtype : str, sktime Panel mtype str, default="pd-multiindex"
        see sktime.datatypes.MTYPE_LIST_PANEL for a full list of admissible strings
        see sktime.datatypes.MTYPE_REGISTER for an short explanation of formats
        see examples/AA_datatypes_and_datasets.ipynb for a full specification

    Returns
    -------
    X : an sktime time series data container of mtype return_mtype
        with n_instances instances, n_columns variables, n_timepoints time points
        generating distribution is all values i.i.d. normal with std 0.5
        if y is passed, i-th series values are additively shifted by y[i] * 100
    """
    # a passed target overrides n_instances: one generated series per label
    if y is not None:
        y = np.asarray(y)
        n_instances = len(y)

    rng = check_random_state(random_state)

    # i.i.d. normal draws as a 3D numpy panel
    X = rng.normal(scale=0.5, size=(n_instances, n_columns, n_timepoints))

    # shift each instance by 100 * its target value to create association
    if y is not None:
        X = X + (y * 100).reshape(-1, 1, 1)

    if all_positive:
        X = np.square(X)

    return convert(X, from_type="numpy3D", to_type=return_mtype)
def _transform(self, X, y=None):
    """Transform X and return a transformed version.

    private _transform containing core logic, called from transform

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_instances, n_features]
        each cell of X must contain pandas.Series
        Data to fit transform to
    y : ignored argument for interface compatibility
        Additional data, e.g., labels for transformation

    Returns
    -------
    Xt : nested pandas DataFrame of shape [n_instances, n_features]
        each cell of Xt contains pandas.Series
        transformed version of X
    """
    # Get information about the dataframe
    n_timepoints = len(X.iloc[0, 0])
    num_instances = X.shape[0]
    col_names = X.columns
    self._check_parameters(n_timepoints)

    Xt = pd.DataFrame()
    for x in col_names:
        # Convert one of the columns in the dataframe to numpy array
        arr = convert(
            pd.DataFrame(X[x]),
            from_type="nested_univ",
            to_type="numpyflat",
            as_scitype="Panel",
        )

        # Calculate gradients per instance
        # bug fix: the loop variable was named `y`, shadowing the `y` parameter
        transformedData = []
        for i in range(num_instances):
            transformedData.append(self._get_gradients_of_lines(arr[i]))

        # Convert to Numpy array
        transformedData = np.asarray(transformedData)

        # Wrap each instance's result back into a pandas.Series cell
        colToAdd = []
        for i in range(len(transformedData)):
            inst = transformedData[i]
            colToAdd.append(pd.Series(inst))
        Xt[x] = colToAdd
    return Xt
def _preprocess(self, X):
    """Apply the shared test/training transformations to X.

    Converts the 3D numpy panel to nested format, extracts subsequences
    via the sliding-window transformer, then applies the shape descriptor
    function to each subsequence.
    """
    nested = convert(X, from_type="numpy3D", to_type="nested_univ")
    windowed = self.sw.transform(nested)
    return self._generate_shape_descriptors(windowed)
def test_tsfresh_extractor(default_fc_parameters):
    """Test that mean feature of TSFreshFeatureExtract is identical with sample mean."""
    X, _ = make_classification_problem()
    transformer = TSFreshFeatureExtractor(
        default_fc_parameters=default_fc_parameters,
        disable_progressbar=True,
    )

    Xt = transformer.fit_transform(X)
    actual = Xt.filter(like="__mean", axis=1).values.ravel()

    # sample means computed directly from the wide representation
    wide = convert(X, from_type="nested_univ", to_type="pd-wide")
    expected = wide.mean(axis=1).values

    # sanity check on the first instance, then compare the full vectors
    assert expected[0] == X.iloc[0, 0].mean()
    np.testing.assert_allclose(actual, expected)
def _transform(self, X, y=None):
    """Transform X and return a transformed version.

    private _transform containing core logic, called from transform

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_instances, n_features]
        each cell of X must contain pandas.Series
        Data to fit transform to
    y : ignored argument for interface compatibility
        Additional data, e.g., labels for transformation

    Returns
    -------
    Xt : nested pandas DataFrame of shape [n_instances, n_features]
        each cell of Xt contains pandas.Series
        transformed version of X
    """
    self._check_parameters()

    Xt = pd.DataFrame()
    for col in X.columns:
        # flatten this nested column into a 2D numpy array
        arr = convert(
            pd.DataFrame(X[col]),
            from_type="nested_univ",
            to_type="numpyflat",
            as_scitype="Panel",
        )

        # compute wavelet coefficients per instance
        coeffs = np.asarray(self._extract_wavelet_coefficients(arr))

        # wrap each instance's coefficients back into a pandas.Series cell
        Xt[col] = [pd.Series(coeffs[i]) for i in range(len(coeffs))]

    return Xt
def _transform(self, X, y=None):
    """Transform input time series using random convolutional kernels.

    Parameters
    ----------
    X : 3D np.ndarray of shape = [n_instances, n_dimensions, series_length]
        panel of time series to transform
    y : ignored argument for interface compatibility

    Returns
    -------
    pandas DataFrame, transformed features
    """
    X = X.astype(np.float64)
    X = convert(X, from_type="numpy3D", to_type="numpyflat", as_scitype="Panel")
    if self.normalise:
        # z-normalise each series; epsilon avoids division by zero std
        X = (X - X.mean(axis=-1, keepdims=True)) / (
            X.std(axis=-1, keepdims=True) + 1e-8
        )

    # first difference, transformed alongside the raw series
    X1 = np.diff(X, 1)

    # clamp n_jobs to [1, cpu_count]; save numba thread count so it can be
    # restored after the numba-parallel transform (typo fixed: "dependent")
    prev_threads = get_num_threads()
    if self.n_jobs < 1 or self.n_jobs > multiprocessing.cpu_count():
        n_jobs = multiprocessing.cpu_count()
    else:
        n_jobs = self.n_jobs
    set_num_threads(n_jobs)

    X = _transform(
        X,
        X1,
        self.parameter,
        self.parameter1,
        self.n_features_per_kernel,
    )
    # kernels can produce NaN/inf on degenerate series; sanitize before return
    X = np.nan_to_num(X)

    set_num_threads(prev_threads)

    # cleanup: removed commented-out reshape dead code that was left here
    return pd.DataFrame(X)
def inverse_transform(self, X, y=None):
    """Transform 2D numpy array of primitives into a 3D numpy panel.

    Docstring fix: the previous docstring described a tabular-pandas to
    nested-dataframe conversion, but the code converts the "numpyflat"
    mtype (2D numpy) to the "numpy3D" mtype.

    Parameters
    ----------
    X : 2D np.ndarray of shape [n_instances, n_timepoints]
        flat ("numpyflat") Panel with primitives in cells.
    y : array-like, optional (default=None)
        ignored, present for interface compatibility.

    Returns
    -------
    Xt : 3D np.ndarray in "numpy3D" mtype
        panel reconstructed from X.
    """
    Xt = convert(X, from_type="numpyflat", to_type="numpy3D", as_scitype="Panel")
    return Xt
def test_vectorization_series_to_hier(mtype):
    """Test that forecaster vectorization works for Hierarchical data.

    This test passes Hierarchical data to the ARIMA forecaster which internally
    has an implementation for Series only, so the BaseForecaster has to vectorize.
    """
    hierarchy_levels = (2, 4)
    # total number of series instances in the hierarchy
    n_instances = reduce(mul, hierarchy_levels)

    y = _make_hierarchical(hierarchy_levels=hierarchy_levels, random_state=84)
    y = convert(y, from_type="pd_multiindex_hier", to_type=mtype)

    y_pred = ARIMA().fit(y).predict([1, 2, 3])

    valid, _, metadata = check_is_mtype(y_pred, mtype, return_metadata=True)
    assert valid, (
        f"vectorization of forecasters does not work for test example "
        f"of mtype {mtype}, using the ARIMA forecaster"
    )

    y_pred_instances = metadata["n_instances"]
    assert y_pred_instances == n_instances, (
        f"vectorization test produces wrong number of instances "
        f"expected {n_instances}, found {y_pred_instances}"
    )

    assert metadata["is_equal_length"], (
        "vectorization test produces non-equal length Panel forecast, should be "
        "equal length, and length equal to the forecasting horizon [1, 2, 3]"
    )
def _check_ys(self, y_true, y_pred, multioutput):
    """Validate inputs and coerce y_pred to the metric's internal mtype."""
    # fall back to the metric's configured multioutput setting
    if multioutput is None:
        multioutput = self.multioutput

    valid, msg, metadata = check_is_scitype(
        y_pred, scitype="Proba", return_metadata=True, var_name="y_pred"
    )
    if not valid:
        raise TypeError(msg)

    # convert y_pred to the mtype the inner implementation expects
    inner_mtype = self.get_tag("scitype:y_pred")
    y_pred_inner = convert(
        y_pred,
        from_type=metadata["mtype"],
        to_type=inner_mtype,
        as_scitype="Proba",
    )

    y_true, y_pred, multioutput = self._check_consistent_input(
        y_true, y_pred, multioutput
    )

    return y_true, y_pred_inner, multioutput