def test_is_nested_dataframe(n_instances, n_columns, n_timepoints):
    """Check which containers `is_nested_dataframe` accepts and rejects.

    A 3d numpy array and a multi-index frame must be rejected, whereas the
    frame returned by `make_classification_problem` and a column-wise
    concatenation of that frame with an all-zero frame must be accepted.
    """
    # Inputs are built in a fixed order, since the generators may draw from
    # shared random state.
    array_3d = np.random.normal(size=(n_instances, n_columns, n_timepoints))
    nested_frame, _ = make_classification_problem(
        n_instances, n_columns, n_timepoints
    )

    # Frame of zeros shaped like the nested frame, glued on as extra columns.
    zeros_frame = pd.DataFrame(np.zeros_like(nested_frame))
    mixed_frame = pd.concat([zeros_frame, nested_frame], axis=1)

    multi_index_frame = make_multi_index_dataframe(
        n_instances=n_instances, n_timepoints=n_timepoints, n_columns=n_columns
    )

    for rejected in (array_3d, multi_index_frame):
        assert not is_nested_dataframe(rejected)

    for accepted in (nested_frame, mixed_frame):
        assert is_nested_dataframe(accepted)
def test_from_2d_array_to_nested(n_instances, n_columns, n_timepoints):
    """Check that a 2d array converts to a single-column nested frame.

    `n_columns` is accepted but not used by this test.
    """
    shape_2d = (n_instances, n_timepoints)
    X_2d = np.random.default_rng().standard_normal(shape_2d)

    converted = from_2d_array_to_nested(X_2d)

    # One row per instance, and exactly one column.
    expected_shape = (n_instances, 1)
    assert is_nested_dataframe(converted)
    assert converted.shape == expected_shape
def test_from_long_to_nested(n_instances, n_columns, n_timepoints):
    """Check that a long-format table converts to a nested frame.

    The result must be nested and have one row per case and one column per
    dimension of the generated long table.
    """
    table_spec = {
        "num_cases": n_instances,
        "series_len": n_timepoints,
        "num_dims": n_columns,
    }
    long_table = generate_example_long_table(**table_spec)

    converted = from_long_to_nested(long_table)

    expected_shape = (n_instances, n_columns)
    assert is_nested_dataframe(converted)
    assert converted.shape == expected_shape
def check_panel_to_panel_transform_univariate(Estimator):
    """Check the output container of a transform fitted with 5 instances.

    The result of `_construct_fit_transform` must be a pd.DataFrame or a
    np.ndarray with 5 entries along its first axis. An array must be
    3-dimensional and a data frame must be nested.
    """
    expected_instances = 5
    transformed = _construct_fit_transform(
        Estimator, n_instances=expected_instances
    )

    is_frame = isinstance(transformed, pd.DataFrame)
    is_array = isinstance(transformed, np.ndarray)

    assert is_frame or is_array
    assert transformed.shape[0] == expected_instances

    # Container-specific structure checks.
    if is_array:
        assert transformed.ndim == 3
    if is_frame:
        assert is_nested_dataframe(transformed)
def test_from_3d_numpy_to_nested(n_instances, n_columns, n_timepoints):
    """Check conversion of a 3d numpy array into a nested frame.

    Verifies the container type, the outer shape, the length of the first
    cell, and that one cell holds exactly the values of the source array.
    """
    shape_3d = (n_instances, n_columns, n_timepoints)
    array = np.random.normal(size=shape_3d)

    nested = from_3d_numpy_to_nested(array)

    # Structure: nested frame, one row per instance, one column per variable.
    assert is_nested_dataframe(nested)
    assert nested.shape == (n_instances, n_columns)

    # Each cell is expected to carry the full series; probe the first one.
    first_cell = nested.iloc[0, 0]
    assert first_cell.shape[0] == n_timepoints

    # Values: spot-check the series of the second instance, first column.
    row, col = 1, 0
    np.testing.assert_array_equal(nested.iloc[row, col], array[row, col, :])
def test_from_multi_index_to_nested(n_instances, n_columns, n_timepoints):
    """Check conversion of a multi-index frame into a nested frame.

    The result must be nested, have shape (n_instances, n_columns), and
    keep the column labels of the multi-index input.
    """
    source = make_multi_index_dataframe(
        n_instances=n_instances, n_timepoints=n_timepoints, n_columns=n_columns
    )

    converted = from_multi_index_to_nested(
        source, instance_index="case_id", cells_as_numpy=False
    )

    assert is_nested_dataframe(converted)
    assert converted.shape == (n_instances, n_columns)

    # Column labels must survive the conversion unchanged.
    labels_match = converted.columns == source.columns
    assert labels_match.all()
def check_panel_to_panel_transform_multivariate(Estimator):
    """Check a transform on data with 5 instances and 3 columns.

    Estimators tagged "univariate-only" are handed to `_check_raises_error`
    and nothing else is checked. For all others, the result of
    `_construct_fit_transform` must be a pd.DataFrame or np.ndarray with 5
    entries along its first axis; an array must be 3-dimensional and a data
    frame must be nested.
    """
    n_instances = 5
    n_columns = 3

    # Univariate-only estimators: delegate to the error check and stop.
    if _has_tag(Estimator, "univariate-only"):
        _check_raises_error(Estimator, n_instances=n_instances, n_columns=n_columns)
        return

    transformed = _construct_fit_transform(
        Estimator, n_instances=n_instances, n_columns=n_columns
    )

    is_frame = isinstance(transformed, pd.DataFrame)
    is_array = isinstance(transformed, np.ndarray)

    assert is_frame or is_array
    assert transformed.shape[0] == n_instances

    # Container-specific structure checks.
    if is_array:
        assert transformed.ndim == 3
    if is_frame:
        assert is_nested_dataframe(transformed)
def _compare_nested_frame(func, x, y, **kwargs):
    """Compare two nested pd.DataFrames cell by cell.

    Parameters
    ----------
    func : function
        Function from np.testing for comparing arrays. It is called as
        ``func(x_cell, y_cell, **kwargs)`` for every pair of cells.
    x : pd.DataFrame
    y : pd.DataFrame
    kwargs : dict
        Keyword arguments forwarded to ``func``.

    Raises
    ------
    AssertionError
        If x is not a pd.DataFrame, or if x is empty and
        ``assert_frame_equal(x, y)`` fails. Mismatching cells surface as
        whatever ``func`` raises.
    ValueError
        If x is nested and x and y have different shapes.

    Notes
    -----
    If x is neither empty nor nested, no comparison is performed and the
    function returns silently.
    """
    assert isinstance(x, pd.DataFrame)

    # Empty frames (e.g. from TSFreshRelevantFeatureExtractor) have no cells
    # to compare, so only the frames themselves are checked for equality.
    if x.empty:
        assert_frame_equal(x, y)
        return

    # Only nested frames are compared cell-wise; anything else falls through.
    if not is_nested_dataframe(x):
        return

    if x.shape != y.shape:
        raise ValueError("Found inputs with different shapes")

    # Cells are compared one at a time, column by column. Tabularizing first
    # would be simpler, but does not work for unequal length series.
    for col in range(x.shape[1]):
        x_cells = x.iloc[:, col].tolist()
        y_cells = y.iloc[:, col].tolist()
        for x_cell, y_cell in zip(x_cells, y_cells):
            func(x_cell, y_cell, **kwargs)
def check_X(
    X,
    enforce_univariate=False,
    enforce_min_instances=1,
    enforce_min_columns=1,
    coerce_to_numpy=False,
    coerce_to_pandas=False,
):
    """Validate panel input data and optionally convert its container.

    Parameters
    ----------
    X : pd.DataFrame or np.array
        Input data. Arrays must be 3-dimensional, data frames must be nested.
    enforce_univariate : bool, optional (default=False)
        If True, reject X with more than one column.
    enforce_min_instances : int, optional (default=1)
        Minimum number of instances; only checked if greater than zero.
    enforce_min_columns : int, optional (default=1)
        Minimum number of columns (or time-series variables).
    coerce_to_numpy : bool, optional (default=False)
        If True, a data frame X is converted to a 3-dimensional numpy array.
    coerce_to_pandas : bool, optional (default=False)
        If True, an array X is converted to a nested pandas DataFrame.

    Returns
    -------
    X : pd.DataFrame or np.array
        Checked and possibly converted input data

    Raises
    ------
    ValueError
        If both coercion flags are set, or if X is invalid input data.
    """
    # The two coercion targets are mutually exclusive.
    if coerce_to_pandas and coerce_to_numpy:
        raise ValueError(
            "`coerce_to_pandas` and `coerce_to_numpy` cannot both be set to True"
        )

    if not isinstance(X, VALID_X_TYPES):
        raise ValueError(
            "X must be a pd.DataFrame or a np.array, "
            f"but found: {type(X)}"
        )

    # Arrays: the dimensionality has to be verified before the second axis
    # can safely be read as the number of columns.
    if isinstance(X, np.ndarray):
        if X.ndim != 3:
            raise ValueError(
                "If passed as a np.array, X must be a 3-dimensional "
                f"array, but found shape: {X.shape}"
            )
        if coerce_to_pandas:
            X = from_3d_numpy_to_nested(X)

    # Column-count constraints.
    n_columns = X.shape[1]
    if n_columns < enforce_min_columns:
        raise ValueError(
            f"X must contain at least: {enforce_min_columns} columns, "
            f"but found only: {n_columns}."
        )
    if enforce_univariate and n_columns > 1:
        raise ValueError(
            "X must be univariate with X.shape[1] == 1, but found: "
            f"X.shape[1] == {n_columns}."
        )

    # Instance-count constraint.
    if enforce_min_instances > 0:
        _enforce_min_instances(X, min_instances=enforce_min_instances)

    # Anything that is not a data frame at this point needs no further work.
    if not isinstance(X, pd.DataFrame):
        return X

    if not is_nested_dataframe(X):
        raise ValueError(
            "If passed as a pd.DataFrame, X must be a nested "
            "pd.DataFrame, with pd.Series or np.arrays inside cells."
        )

    return from_nested_to_3d_numpy(X) if coerce_to_numpy else X