示例#1
0
def test_rolling_after_pivot_with_multiple_metrics():
    pivot_df = pp.pivot(
        df=multiple_metrics_df,
        index=["dttm"],
        columns=["country"],
        aggregates={
            "sum_metric": {
                "operator": "sum"
            },
            "count_metric": {
                "operator": "sum"
            },
        },
        flatten_columns=False,
        reset_index=False,
    )
    """
               count_metric    sum_metric
    country              UK US         UK US
    dttm
    2019-01-01            1  2          5  6
    2019-01-02            3  4          7  8
    """
    rolling_df = pp.rolling(
        df=pivot_df,
        columns={
            "count_metric": "count_metric",
            "sum_metric": "sum_metric",
        },
        rolling_type="sum",
        window=2,
        min_periods=0,
    )
    """
               count_metric      sum_metric
    country              UK   US         UK    US
    dttm
    2019-01-01          1.0  2.0        5.0   6.0
    2019-01-02          4.0  6.0       12.0  14.0
    """
    flat_df = pp.flatten(rolling_df)
    """
            dttm  count_metric, UK  count_metric, US  sum_metric, UK  sum_metric, US
    0 2019-01-01               1.0               2.0             5.0             6.0
    1 2019-01-02               4.0               6.0            12.0            14.0
    """
    assert flat_df.equals(
        pd.DataFrame(
            data={
                "dttm": pd.to_datetime(["2019-01-01", "2019-01-02"]),
                FLAT_COLUMN_SEPARATOR.join(["count_metric", "UK"]): [1.0, 4.0],
                FLAT_COLUMN_SEPARATOR.join(["count_metric", "US"]): [2.0, 6.0],
                FLAT_COLUMN_SEPARATOR.join(["sum_metric", "UK"]): [5.0, 12.0],
                FLAT_COLUMN_SEPARATOR.join(["sum_metric", "US"]): [6.0, 14.0],
            }))
示例#2
0
def test_compare_after_pivot():
    pivot_df = pp.pivot(
        df=multiple_metrics_df,
        index=["dttm"],
        columns=["country"],
        aggregates={
            "sum_metric": {
                "operator": "sum"
            },
            "count_metric": {
                "operator": "sum"
            },
        },
        flatten_columns=False,
        reset_index=False,
    )
    """
                   count_metric    sum_metric
    country              UK US         UK US
    dttm
    2019-01-01            1  2          5  6
    2019-01-02            3  4          7  8
    """
    compared_df = pp.compare(
        pivot_df,
        source_columns=["count_metric"],
        compare_columns=["sum_metric"],
        compare_type=PPC.DIFF,
        drop_original_columns=True,
    )
    """
               difference__count_metric__sum_metric
    country                                      UK US
    dttm
    2019-01-01                                   -4 -4
    2019-01-02                                   -4 -4
    """
    flat_df = pp.flatten(compared_df)
    """
            dttm  difference__count_metric__sum_metric, UK  difference__count_metric__sum_metric, US
    0 2019-01-01                                        -4                                        -4
    1 2019-01-02                                        -4                                        -4
    """
    assert flat_df.equals(
        pd.DataFrame(
            data={
                "dttm":
                pd.to_datetime(["2019-01-01", "2019-01-02"]),
                FLAT_COLUMN_SEPARATOR.join([
                    "difference__count_metric__sum_metric", "UK"
                ]): [-4, -4],
                FLAT_COLUMN_SEPARATOR.join([
                    "difference__count_metric__sum_metric", "US"
                ]): [-4, -4],
            }))
示例#3
0
def test_cum_after_pivot_with_multiple_metrics():
    pivot_df = pp.pivot(
        df=multiple_metrics_df,
        index=["dttm"],
        columns=["country"],
        aggregates={
            "sum_metric": {
                "operator": "sum"
            },
            "count_metric": {
                "operator": "sum"
            },
        },
        flatten_columns=False,
        reset_index=False,
    )
    """
                   count_metric    sum_metric
    country              UK US         UK US
    dttm
    2019-01-01            1  2          5  6
    2019-01-02            3  4          7  8
    """
    cum_df = pp.cum(
        df=pivot_df,
        operator="sum",
        columns={
            "sum_metric": "sum_metric",
            "count_metric": "count_metric"
        },
    )
    """
               count_metric    sum_metric
    country              UK US         UK  US
    dttm
    2019-01-01            1  2          5   6
    2019-01-02            4  6         12  14
    """
    flat_df = pp.flatten(cum_df)
    """
            dttm  count_metric, UK  count_metric, US  sum_metric, UK  sum_metric, US
    0 2019-01-01                 1                 2               5               6
    1 2019-01-02                 4                 6              12              14
    """
    assert flat_df.equals(
        pd.DataFrame({
            "dttm": pd.to_datetime(["2019-01-01", "2019-01-02"]),
            FLAT_COLUMN_SEPARATOR.join(["count_metric", "UK"]): [1, 4],
            FLAT_COLUMN_SEPARATOR.join(["count_metric", "US"]): [2, 6],
            FLAT_COLUMN_SEPARATOR.join(["sum_metric", "UK"]): [5, 12],
            FLAT_COLUMN_SEPARATOR.join(["sum_metric", "US"]): [6, 14],
        }))
示例#4
0
def test_flat_should_flat_multiple_index():
    index = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"])
    index.name = "__timestamp"
    iterables = [["foo", "bar"], [1, "two"]]
    columns = pd.MultiIndex.from_product(iterables, names=["level1", "level2"])
    df = pd.DataFrame(index=index, columns=columns, data=1)

    assert pp.flatten(df).equals(
        pd.DataFrame({
            "__timestamp": index,
            FLAT_COLUMN_SEPARATOR.join(["foo", "1"]): [1, 1, 1],
            FLAT_COLUMN_SEPARATOR.join(["foo", "two"]): [1, 1, 1],
            FLAT_COLUMN_SEPARATOR.join(["bar", "1"]): [1, 1, 1],
            FLAT_COLUMN_SEPARATOR.join(["bar", "two"]): [1, 1, 1],
        }))
示例#5
0
def test_rolling_after_pivot_with_single_metric():
    pivot_df = pp.pivot(
        df=single_metric_df,
        index=["dttm"],
        columns=["country"],
        aggregates={"sum_metric": {
            "operator": "sum"
        }},
    )
    """
                   sum_metric
    country            UK US
    dttm
    2019-01-01          5  6
    2019-01-02          7  8
    """
    rolling_df = pp.rolling(
        df=pivot_df,
        columns={"sum_metric": "sum_metric"},
        rolling_type="sum",
        window=2,
        min_periods=0,
    )
    """
               sum_metric
    country            UK    US
    dttm
    2019-01-01        5.0   6.0
    2019-01-02       12.0  14.0
    """
    flat_df = pp.flatten(rolling_df)
    """
            dttm  sum_metric, UK  sum_metric, US
    0 2019-01-01             5.0             6.0
    1 2019-01-02            12.0            14.0
    """
    assert flat_df.equals(
        pd.DataFrame(
            data={
                "dttm": pd.to_datetime(["2019-01-01", "2019-01-02"]),
                FLAT_COLUMN_SEPARATOR.join(["sum_metric", "UK"]): [5.0, 12.0],
                FLAT_COLUMN_SEPARATOR.join(["sum_metric", "US"]): [6.0, 14.0],
            }))
示例#6
0
def test_cum_after_pivot_with_single_metric():
    pivot_df = pp.pivot(
        df=single_metric_df,
        index=["dttm"],
        columns=["country"],
        aggregates={"sum_metric": {
            "operator": "sum"
        }},
        flatten_columns=False,
        reset_index=False,
    )
    """
               sum_metric
    country            UK US
    dttm
    2019-01-01          5  6
    2019-01-02          7  8
    """
    cum_df = pp.cum(df=pivot_df,
                    operator="sum",
                    columns={"sum_metric": "sum_metric"})
    """
               sum_metric
    country            UK  US
    dttm
    2019-01-01          5   6
    2019-01-02         12  14
    """
    cum_and_flat_df = pp.flatten(cum_df)
    """
                dttm  sum_metric, UK  sum_metric, US
    0 2019-01-01               5               6
    1 2019-01-02              12              14
    """
    assert cum_and_flat_df.equals(
        pd.DataFrame({
            "dttm": pd.to_datetime(["2019-01-01", "2019-01-02"]),
            FLAT_COLUMN_SEPARATOR.join(["sum_metric", "UK"]): [5, 12],
            FLAT_COLUMN_SEPARATOR.join(["sum_metric", "US"]): [6, 14],
        }))
示例#7
0
def test_flat_should_drop_index_level():
    index = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"])
    index.name = "__timestamp"
    columns = pd.MultiIndex.from_arrays(
        [["a"] * 3, ["b"] * 3, ["c", "d", "e"], ["ff", "ii", "gg"]],
        names=["level1", "level2", "level3", "level4"],
    )
    df = pd.DataFrame(index=index, columns=columns, data=1)

    # drop level by index
    assert pp.flatten(df.copy(), drop_levels=(
        0,
        1,
    )).equals(
        pd.DataFrame({
            "__timestamp": index,
            FLAT_COLUMN_SEPARATOR.join(["c", "ff"]): [1, 1, 1],
            FLAT_COLUMN_SEPARATOR.join(["d", "ii"]): [1, 1, 1],
            FLAT_COLUMN_SEPARATOR.join(["e", "gg"]): [1, 1, 1],
        }))

    # drop level by name
    assert pp.flatten(df.copy(), drop_levels=("level1", "level2")).equals(
        pd.DataFrame({
            "__timestamp": index,
            FLAT_COLUMN_SEPARATOR.join(["c", "ff"]): [1, 1, 1],
            FLAT_COLUMN_SEPARATOR.join(["d", "ii"]): [1, 1, 1],
            FLAT_COLUMN_SEPARATOR.join(["e", "gg"]): [1, 1, 1],
        }))

    # only leave 1 level
    assert pp.flatten(df.copy(), drop_levels=(0, 1, 2)).equals(
        pd.DataFrame({
            "__timestamp": index,
            FLAT_COLUMN_SEPARATOR.join(["ff"]): [1, 1, 1],
            FLAT_COLUMN_SEPARATOR.join(["ii"]): [1, 1, 1],
            FLAT_COLUMN_SEPARATOR.join(["gg"]): [1, 1, 1],
        }))
示例#8
0
def flatten(
    df: pd.DataFrame,
    reset_index: bool = True,
    drop_levels: Union[Sequence[int], Sequence[str]] = (),
) -> pd.DataFrame:
    """
    Convert N-dimensional DataFrame to a flat DataFrame

    :param df: N-dimensional DataFrame.
    :param reset_index: Convert index to column when df.index isn't RangeIndex
    :param drop_levels: index of level or names of level might be dropped
                        if df is N-dimensional
    :return: a flat DataFrame

    Examples
    -----------

    Convert DatetimeIndex into columns.

    >>> index = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03",])
    >>> index.name = "__timestamp"
    >>> df = pd.DataFrame(index=index, data={"metric": [1, 2, 3]})
    >>> df
                 metric
    __timestamp
    2021-01-01        1
    2021-01-02        2
    2021-01-03        3
    >>> df = flatten(df)
    >>> df
      __timestamp  metric
    0  2021-01-01       1
    1  2021-01-02       2
    2  2021-01-03       3

    Convert DatetimeIndex and MultipleIndex into columns

    >>> iterables = [["foo", "bar"], ["one", "two"]]
    >>> columns = pd.MultiIndex.from_product(iterables, names=["level1", "level2"])
    >>> df = pd.DataFrame(index=index, columns=columns, data=1)
    >>> df
    level1      foo     bar
    level2      one two one two
    __timestamp
    2021-01-01    1   1   1   1
    2021-01-02    1   1   1   1
    2021-01-03    1   1   1   1
    >>> flatten(df)
      __timestamp foo, one foo, two bar, one bar, two
    0  2021-01-01        1        1        1        1
    1  2021-01-02        1        1        1        1
    2  2021-01-03        1        1        1        1
    """
    if _is_multi_index_on_columns(df):
        df.columns = df.columns.droplevel(drop_levels)
        _columns = []
        for series in df.columns.to_flat_index():
            _cells = []
            for cell in series if is_sequence(series) else [series]:
                if pd.notnull(cell):
                    # every cell should be converted to string
                    _cells.append(str(cell))
            _columns.append(FLAT_COLUMN_SEPARATOR.join(_cells))

        df.columns = _columns

    if reset_index and not isinstance(df.index, pd.RangeIndex):
        df = df.reset_index(level=0)
    return df