def test_is_nested_dataframe(n_instances, n_columns, n_timepoints):
    """Check which containers is_nested_dataframe accepts and rejects.

    A 3D numpy array and a multi-index DataFrame must be rejected; a nested
    DataFrame and a nested DataFrame with extra zero-filled columns
    prepended must be accepted.
    """
    array = np.random.normal(size=(n_instances, n_columns, n_timepoints))
    nested, _ = make_classification_problem(n_instances, n_columns, n_timepoints)

    # Prepend zero-filled columns to the nested columns.
    zero_df = pd.DataFrame(np.zeros_like(nested))
    nested_heterogenous = pd.concat([zero_df, nested], axis=1)

    mi_df = make_multi_index_dataframe(
        n_instances=n_instances, n_timepoints=n_timepoints, n_columns=n_columns
    )

    for rejected in (array, mi_df):
        assert not is_nested_dataframe(rejected)
    for accepted in (nested, nested_heterogenous):
        assert is_nested_dataframe(accepted)
def test_from_2d_array_to_nested(n_instances, n_columns, n_timepoints):
    """Check that a 2D array converts to a single-column nested DataFrame.

    ``n_columns`` is accepted but not used by this test.
    """
    generator = np.random.default_rng()
    tabular = generator.standard_normal((n_instances, n_timepoints))

    converted = from_2d_array_to_nested(tabular)

    assert is_nested_dataframe(converted)
    expected_shape = (n_instances, 1)
    assert converted.shape == expected_shape
def check_panel_to_panel_transform_univariate(Estimator):
    """Check the output container of a panel-to-panel transform.

    Runs ``_construct_fit_transform`` for ``Estimator`` with 5 instances and
    asserts that the result is a pd.DataFrame or np.ndarray with 5 entries
    along the first axis; arrays must be 3-dimensional and DataFrames must
    be nested.

    Parameters
    ----------
    Estimator : class
        Estimator class passed through to ``_construct_fit_transform``.
    """
    expected_instances = 5
    transformed = _construct_fit_transform(Estimator, n_instances=expected_instances)

    assert isinstance(transformed, (pd.DataFrame, np.ndarray))
    assert transformed.shape[0] == expected_instances

    if isinstance(transformed, np.ndarray):
        assert transformed.ndim == 3
    if isinstance(transformed, pd.DataFrame):
        assert is_nested_dataframe(transformed)
def test_from_long_to_nested(n_instances, n_columns, n_timepoints):
    """Check that a long-format table converts to a nested DataFrame."""
    long_table = generate_example_long_table(
        num_cases=n_instances, series_len=n_timepoints, num_dims=n_columns
    )

    converted = from_long_to_nested(long_table)

    assert is_nested_dataframe(converted)
    expected_shape = (n_instances, n_columns)
    assert converted.shape == expected_shape
def test_from_3d_numpy_to_nested(n_instances, n_columns, n_timepoints):
    """Check that a 3D numpy array converts to a nested DataFrame.

    Verifies the container type, the outer shape, the length of the first
    cell, and that one cell holds the same values as the source array.
    """
    source = np.random.normal(size=(n_instances, n_columns, n_timepoints))
    converted = from_3d_numpy_to_nested(source)

    # Container type and shapes.
    assert is_nested_dataframe(converted)
    assert converted.shape == (n_instances, n_columns)
    first_cell = converted.iloc[0, 0]
    assert first_cell.shape[0] == n_timepoints

    # Values of one series: second instance, first column.
    np.testing.assert_array_equal(converted.iloc[1, 0], source[1, 0, :])
def test_from_multi_index_to_nested(n_instances, n_columns, n_timepoints):
    """Check that a multi-index DataFrame converts to a nested DataFrame.

    The result must be nested, have one row per instance and one column per
    variable, and keep the column labels of the multi-index input.
    """
    multi_indexed = make_multi_index_dataframe(
        n_instances=n_instances, n_timepoints=n_timepoints, n_columns=n_columns
    )

    converted = from_multi_index_to_nested(
        multi_indexed, instance_index="case_id", cells_as_numpy=False
    )

    assert is_nested_dataframe(converted)
    assert converted.shape == (n_instances, n_columns)
    columns_match = converted.columns == multi_indexed.columns
    assert columns_match.all()
def check_panel_to_panel_transform_multivariate(Estimator):
    """Check a panel-to-panel transform on input with 3 columns.

    If the estimator's "univariate-only" class tag is truthy, only
    ``_check_raises_error`` is run. Otherwise the transform output must be a
    pd.DataFrame or np.ndarray with 5 entries along the first axis; arrays
    must be 3-dimensional and DataFrames must be nested.

    Parameters
    ----------
    Estimator : class
        Estimator class providing ``get_class_tag``.
    """
    expected_instances = 5

    if Estimator.get_class_tag("univariate-only", False):
        _check_raises_error(Estimator, n_instances=expected_instances, n_columns=3)
        return

    transformed = _construct_fit_transform(
        Estimator, n_instances=expected_instances, n_columns=3
    )
    assert isinstance(transformed, (pd.DataFrame, np.ndarray))
    assert transformed.shape[0] == expected_instances

    if isinstance(transformed, np.ndarray):
        assert transformed.ndim == 3
    if isinstance(transformed, pd.DataFrame):
        assert is_nested_dataframe(transformed)
def from_nested_to_3d_numpy(X):
    """Convert nested Panel to 3D numpy Panel.

    Convert nested pandas DataFrame (with time series as pandas Series
    in cells) into NumPy ndarray with shape
    (n_instances, n_columns, n_timepoints).

    Parameters
    ----------
    X : pd.DataFrame
        Nested pandas DataFrame

    Returns
    -------
    X_3d : np.ndarray
        3-dimensional NumPy array

    Raises
    ------
    ValueError
        If ``is_nested_dataframe(X)`` is False.
    """
    if not is_nested_dataframe(X):
        raise ValueError("Input DataFrame is not a nested DataFrame")

    nested_col_mask = [*are_columns_nested(X)]

    if all(nested_col_mask):
        # Every column is nested: convert each cell to numpy, stack the cells
        # of a row, then stack the rows.
        # DataFrame.applymap is deprecated since pandas 2.1 in favour of
        # DataFrame.map. The check is made on the class rather than on X
        # because attribute access on an instance can resolve to a column
        # that happens to be named "map".
        elementwise = X.map if hasattr(pd.DataFrame, "map") else X.applymap
        X_3d = np.stack(
            elementwise(_convert_series_cell_to_numpy)
            .apply(lambda row: np.stack(row), axis=1)
            .to_numpy()
        )
    else:
        # Some columns are primitive (non-nested): first convert to a
        # multi-indexed DataFrame, where the primitive value is repeated for
        # each timepoint, and convert that to a 3D array.
        X_mi = from_nested_to_multi_index(X)
        X_3d = from_multi_index_to_3d_numpy(
            X_mi, instance_index="instance", time_index="timepoints"
        )

    return X_3d
def _compare_nested_frame(func, x, y, **kwargs): """Compare two nested pd.DataFrames. Parameters ---------- func : function Function from np.testing for comparing arrays. x : pd.DataFrame y : pd.DataFrame kwargs : dict Keyword argument for function Raises ------ AssertionError If x and y are not equal """ # We iterate over columns and rows to make cell-wise comparisons. # Tabularizing the data first would simplify this, but does not # work for unequal length data. # In rare cases, x and y may be empty (e.g. TSFreshRelevantFeatureExtractor) and # we cannot compare individual cells, so we simply check if everything else is # equal here. assert isinstance(x, pd.DataFrame) if x.empty: assert_frame_equal(x, y) elif is_nested_dataframe(x): # Check if both inputs have the same shape if not x.shape == y.shape: raise ValueError("Found inputs with different shapes") # Iterate over columns n_columns = x.shape[1] for i in range(n_columns): xc = x.iloc[:, i].tolist() yc = y.iloc[:, i].tolist() # Iterate over rows, checking if individual cells are equal for xci, yci in zip(xc, yc): func(xci, yci, **kwargs)
def from_nested_to_multi_index(X, instance_index=None, time_index=None):
    """Convert nested pandas Panel to multi-index pandas Panel.

    Converts nested pandas DataFrame (with time series as pandas Series
    or NumPy array in cells) into multi-indexed pandas DataFrame.

    Can convert mixed nested and primitive DataFrame to multi-index DataFrame.

    Parameters
    ----------
    X : pd.DataFrame
        The nested DataFrame to convert to a multi-indexed pandas DataFrame

    instance_index : str, optional (default=None)
        Name of the multi-index level corresponding to the DataFrame's
        instances. If None, the level is named "instance". If the name is
        not one of ``X.index.names``, instances are read from the last index
        level of X.

    time_index : str, optional (default=None)
        Name of multi-index level corresponding to DataFrame's timepoints.
        If None, the level is named "timepoints".

    Returns
    -------
    X_mi : pd.DataFrame
        The multi-indexed pandas DataFrame

    Raises
    ------
    ValueError
        If ``is_nested_dataframe(X)`` is False.
    """
    if not is_nested_dataframe(X):
        raise ValueError("Input DataFrame is not a nested DataFrame")

    time_index_name = "timepoints" if time_index is None else time_index
    instance_index_name = "instance" if instance_index is None else instance_index

    nested_col_mask = [*are_columns_nested(X)]

    # Read the instance labels from the named index level when it exists,
    # otherwise from the last index level.
    if instance_index is not None and instance_index in X.index.names:
        instance_idxs = X.index.get_level_values(instance_index).unique()
    else:
        instance_idxs = X.index.get_level_values(-1).unique()

    instances = []
    for instance_idx in instance_idxs:
        # .items() replaces .iteritems(), which was removed in pandas 2.0.
        # Materialised once so the (column, cell) pairs are not looked up twice.
        row_items = list(X.loc[instance_idx, :].items())

        # NOTE: this resets ``name`` on the cell objects themselves.
        # Presumably needed so the DataFrame constructor below does not match
        # the cell's own name against the requested column label - confirm.
        for _, cell in row_items:
            cell.name = None

        instance = pd.concat(
            [pd.DataFrame(cell, columns=[colname]) for colname, cell in row_items],
            axis=1,
        )

        # For primitive (non-nested column) assume the same
        # primitive value applies to every timepoint of the instance
        for col_idx, is_nested in enumerate(nested_col_mask):
            if not is_nested:
                instance.iloc[:, col_idx] = instance.iloc[:, col_idx].ffill()

        # Correctly assign multi-index
        instance.index = pd.MultiIndex.from_product(
            [[instance_idx], instance.index],
            names=[instance_index_name, time_index_name],
        )
        instances.append(instance)

    X_mi = pd.concat(instances)
    X_mi.columns = X.columns

    return X_mi
def check_X(
    X,
    enforce_univariate=False,
    enforce_min_instances=1,
    enforce_min_columns=1,
    coerce_to_numpy=False,
    coerce_to_pandas=False,
):
    """Validate panel input data and optionally convert its container.

    Parameters
    ----------
    X : pd.DataFrame or np.array
        Input data
    enforce_univariate : bool, optional (default=False)
        If True, reject X with more than one column.
    enforce_min_instances : int, optional (default=1)
        Minimum number of instances; only checked when greater than 0.
    enforce_min_columns : int, optional (default=1)
        Minimum number of columns (or time-series variables).
    coerce_to_numpy : bool, optional (default=False)
        If True, a nested pd.DataFrame X is converted to a 3-dimensional
        numpy array.
    coerce_to_pandas : bool, optional (default=False)
        If True, a 3-dimensional numpy array X is converted to a nested
        pandas DataFrame.

    Returns
    -------
    X : pd.DataFrame or np.array
        Checked and possibly converted input data

    Raises
    ------
    ValueError
        If both coercion flags are set, or if X is invalid input data
    """
    if coerce_to_numpy and coerce_to_pandas:
        raise ValueError(
            "`coerce_to_pandas` and `coerce_to_numpy` cannot both be set to True"
        )

    # Input type.
    if not isinstance(X, VALID_X_TYPES):
        raise ValueError(
            f"X must be a pd.DataFrame or a np.array, " f"but found: {type(X)}"
        )

    # Numpy input: the number of dimensions is checked first, otherwise the
    # second dimension read below may not exist.
    if isinstance(X, np.ndarray):
        if X.ndim != 3:
            raise ValueError(
                f"If passed as a np.array, X must be a 3-dimensional "
                f"array, but found shape: {X.shape}"
            )
        if coerce_to_pandas:
            X = from_3d_numpy_to_nested(X)

    # Minimum number of columns.
    n_cols = X.shape[1]
    if n_cols < enforce_min_columns:
        raise ValueError(
            f"X must contain at least: {enforce_min_columns} columns, "
            f"but found only: {n_cols}."
        )

    # Univariate data.
    if enforce_univariate and n_cols > 1:
        raise ValueError(
            f"X must be univariate with X.shape[1] == 1, but found: "
            f"X.shape[1] == {n_cols}."
        )

    # Minimum number of instances.
    if enforce_min_instances > 0:
        _enforce_min_instances(X, min_instances=enforce_min_instances)

    # DataFrame input: must be nested, then optionally converted.
    if isinstance(X, pd.DataFrame):
        if not is_nested_dataframe(X):
            raise ValueError(
                "If passed as a pd.DataFrame, X must be a nested "
                "pd.DataFrame, with pd.Series or np.arrays inside cells."
            )
        if coerce_to_numpy:
            X = from_nested_to_3d_numpy(X)

    return X