def test_rolling_after_pivot_with_multiple_metrics(): pivot_df = pp.pivot( df=multiple_metrics_df, index=["dttm"], columns=["country"], aggregates={ "sum_metric": { "operator": "sum" }, "count_metric": { "operator": "sum" }, }, flatten_columns=False, reset_index=False, ) """ count_metric sum_metric country UK US UK US dttm 2019-01-01 1 2 5 6 2019-01-02 3 4 7 8 """ rolling_df = pp.rolling( df=pivot_df, columns={ "count_metric": "count_metric", "sum_metric": "sum_metric", }, rolling_type="sum", window=2, min_periods=0, ) """ count_metric sum_metric country UK US UK US dttm 2019-01-01 1.0 2.0 5.0 6.0 2019-01-02 4.0 6.0 12.0 14.0 """ flat_df = pp.flatten(rolling_df) """ dttm count_metric, UK count_metric, US sum_metric, UK sum_metric, US 0 2019-01-01 1.0 2.0 5.0 6.0 1 2019-01-02 4.0 6.0 12.0 14.0 """ assert flat_df.equals( pd.DataFrame( data={ "dttm": pd.to_datetime(["2019-01-01", "2019-01-02"]), FLAT_COLUMN_SEPARATOR.join(["count_metric", "UK"]): [1.0, 4.0], FLAT_COLUMN_SEPARATOR.join(["count_metric", "US"]): [2.0, 6.0], FLAT_COLUMN_SEPARATOR.join(["sum_metric", "UK"]): [5.0, 12.0], FLAT_COLUMN_SEPARATOR.join(["sum_metric", "US"]): [6.0, 14.0], }))
def test_compare_after_pivot(): pivot_df = pp.pivot( df=multiple_metrics_df, index=["dttm"], columns=["country"], aggregates={ "sum_metric": { "operator": "sum" }, "count_metric": { "operator": "sum" }, }, flatten_columns=False, reset_index=False, ) """ count_metric sum_metric country UK US UK US dttm 2019-01-01 1 2 5 6 2019-01-02 3 4 7 8 """ compared_df = pp.compare( pivot_df, source_columns=["count_metric"], compare_columns=["sum_metric"], compare_type=PPC.DIFF, drop_original_columns=True, ) """ difference__count_metric__sum_metric country UK US dttm 2019-01-01 -4 -4 2019-01-02 -4 -4 """ flat_df = pp.flatten(compared_df) """ dttm difference__count_metric__sum_metric, UK difference__count_metric__sum_metric, US 0 2019-01-01 -4 -4 1 2019-01-02 -4 -4 """ assert flat_df.equals( pd.DataFrame( data={ "dttm": pd.to_datetime(["2019-01-01", "2019-01-02"]), FLAT_COLUMN_SEPARATOR.join([ "difference__count_metric__sum_metric", "UK" ]): [-4, -4], FLAT_COLUMN_SEPARATOR.join([ "difference__count_metric__sum_metric", "US" ]): [-4, -4], }))
def test_cum_after_pivot_with_multiple_metrics(): pivot_df = pp.pivot( df=multiple_metrics_df, index=["dttm"], columns=["country"], aggregates={ "sum_metric": { "operator": "sum" }, "count_metric": { "operator": "sum" }, }, flatten_columns=False, reset_index=False, ) """ count_metric sum_metric country UK US UK US dttm 2019-01-01 1 2 5 6 2019-01-02 3 4 7 8 """ cum_df = pp.cum( df=pivot_df, operator="sum", columns={ "sum_metric": "sum_metric", "count_metric": "count_metric" }, ) """ count_metric sum_metric country UK US UK US dttm 2019-01-01 1 2 5 6 2019-01-02 4 6 12 14 """ flat_df = pp.flatten(cum_df) """ dttm count_metric, UK count_metric, US sum_metric, UK sum_metric, US 0 2019-01-01 1 2 5 6 1 2019-01-02 4 6 12 14 """ assert flat_df.equals( pd.DataFrame({ "dttm": pd.to_datetime(["2019-01-01", "2019-01-02"]), FLAT_COLUMN_SEPARATOR.join(["count_metric", "UK"]): [1, 4], FLAT_COLUMN_SEPARATOR.join(["count_metric", "US"]): [2, 6], FLAT_COLUMN_SEPARATOR.join(["sum_metric", "UK"]): [5, 12], FLAT_COLUMN_SEPARATOR.join(["sum_metric", "US"]): [6, 14], }))
def test_flat_should_flat_multiple_index(): index = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"]) index.name = "__timestamp" iterables = [["foo", "bar"], [1, "two"]] columns = pd.MultiIndex.from_product(iterables, names=["level1", "level2"]) df = pd.DataFrame(index=index, columns=columns, data=1) assert pp.flatten(df).equals( pd.DataFrame({ "__timestamp": index, FLAT_COLUMN_SEPARATOR.join(["foo", "1"]): [1, 1, 1], FLAT_COLUMN_SEPARATOR.join(["foo", "two"]): [1, 1, 1], FLAT_COLUMN_SEPARATOR.join(["bar", "1"]): [1, 1, 1], FLAT_COLUMN_SEPARATOR.join(["bar", "two"]): [1, 1, 1], }))
def test_rolling_after_pivot_with_single_metric(): pivot_df = pp.pivot( df=single_metric_df, index=["dttm"], columns=["country"], aggregates={"sum_metric": { "operator": "sum" }}, ) """ sum_metric country UK US dttm 2019-01-01 5 6 2019-01-02 7 8 """ rolling_df = pp.rolling( df=pivot_df, columns={"sum_metric": "sum_metric"}, rolling_type="sum", window=2, min_periods=0, ) """ sum_metric country UK US dttm 2019-01-01 5.0 6.0 2019-01-02 12.0 14.0 """ flat_df = pp.flatten(rolling_df) """ dttm sum_metric, UK sum_metric, US 0 2019-01-01 5.0 6.0 1 2019-01-02 12.0 14.0 """ assert flat_df.equals( pd.DataFrame( data={ "dttm": pd.to_datetime(["2019-01-01", "2019-01-02"]), FLAT_COLUMN_SEPARATOR.join(["sum_metric", "UK"]): [5.0, 12.0], FLAT_COLUMN_SEPARATOR.join(["sum_metric", "US"]): [6.0, 14.0], }))
def test_cum_after_pivot_with_single_metric(): pivot_df = pp.pivot( df=single_metric_df, index=["dttm"], columns=["country"], aggregates={"sum_metric": { "operator": "sum" }}, flatten_columns=False, reset_index=False, ) """ sum_metric country UK US dttm 2019-01-01 5 6 2019-01-02 7 8 """ cum_df = pp.cum(df=pivot_df, operator="sum", columns={"sum_metric": "sum_metric"}) """ sum_metric country UK US dttm 2019-01-01 5 6 2019-01-02 12 14 """ cum_and_flat_df = pp.flatten(cum_df) """ dttm sum_metric, UK sum_metric, US 0 2019-01-01 5 6 1 2019-01-02 12 14 """ assert cum_and_flat_df.equals( pd.DataFrame({ "dttm": pd.to_datetime(["2019-01-01", "2019-01-02"]), FLAT_COLUMN_SEPARATOR.join(["sum_metric", "UK"]): [5, 12], FLAT_COLUMN_SEPARATOR.join(["sum_metric", "US"]): [6, 14], }))
def test_flat_should_drop_index_level(): index = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"]) index.name = "__timestamp" columns = pd.MultiIndex.from_arrays( [["a"] * 3, ["b"] * 3, ["c", "d", "e"], ["ff", "ii", "gg"]], names=["level1", "level2", "level3", "level4"], ) df = pd.DataFrame(index=index, columns=columns, data=1) # drop level by index assert pp.flatten(df.copy(), drop_levels=( 0, 1, )).equals( pd.DataFrame({ "__timestamp": index, FLAT_COLUMN_SEPARATOR.join(["c", "ff"]): [1, 1, 1], FLAT_COLUMN_SEPARATOR.join(["d", "ii"]): [1, 1, 1], FLAT_COLUMN_SEPARATOR.join(["e", "gg"]): [1, 1, 1], })) # drop level by name assert pp.flatten(df.copy(), drop_levels=("level1", "level2")).equals( pd.DataFrame({ "__timestamp": index, FLAT_COLUMN_SEPARATOR.join(["c", "ff"]): [1, 1, 1], FLAT_COLUMN_SEPARATOR.join(["d", "ii"]): [1, 1, 1], FLAT_COLUMN_SEPARATOR.join(["e", "gg"]): [1, 1, 1], })) # only leave 1 level assert pp.flatten(df.copy(), drop_levels=(0, 1, 2)).equals( pd.DataFrame({ "__timestamp": index, FLAT_COLUMN_SEPARATOR.join(["ff"]): [1, 1, 1], FLAT_COLUMN_SEPARATOR.join(["ii"]): [1, 1, 1], FLAT_COLUMN_SEPARATOR.join(["gg"]): [1, 1, 1], }))
def flatten( df: pd.DataFrame, reset_index: bool = True, drop_levels: Union[Sequence[int], Sequence[str]] = (), ) -> pd.DataFrame: """ Convert N-dimensional DataFrame to a flat DataFrame :param df: N-dimensional DataFrame. :param reset_index: Convert index to column when df.index isn't RangeIndex :param drop_levels: index of level or names of level might be dropped if df is N-dimensional :return: a flat DataFrame Examples ----------- Convert DatetimeIndex into columns. >>> index = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03",]) >>> index.name = "__timestamp" >>> df = pd.DataFrame(index=index, data={"metric": [1, 2, 3]}) >>> df metric __timestamp 2021-01-01 1 2021-01-02 2 2021-01-03 3 >>> df = flatten(df) >>> df __timestamp metric 0 2021-01-01 1 1 2021-01-02 2 2 2021-01-03 3 Convert DatetimeIndex and MultipleIndex into columns >>> iterables = [["foo", "bar"], ["one", "two"]] >>> columns = pd.MultiIndex.from_product(iterables, names=["level1", "level2"]) >>> df = pd.DataFrame(index=index, columns=columns, data=1) >>> df level1 foo bar level2 one two one two __timestamp 2021-01-01 1 1 1 1 2021-01-02 1 1 1 1 2021-01-03 1 1 1 1 >>> flatten(df) __timestamp foo, one foo, two bar, one bar, two 0 2021-01-01 1 1 1 1 1 2021-01-02 1 1 1 1 2 2021-01-03 1 1 1 1 """ if _is_multi_index_on_columns(df): df.columns = df.columns.droplevel(drop_levels) _columns = [] for series in df.columns.to_flat_index(): _cells = [] for cell in series if is_sequence(series) else [series]: if pd.notnull(cell): # every cell should be converted to string _cells.append(str(cell)) _columns.append(FLAT_COLUMN_SEPARATOR.join(_cells)) df.columns = _columns if reset_index and not isinstance(df.index, pd.RangeIndex): df = df.reset_index(level=0) return df