Пример #1
0
def test_compare_should_not_side_effect():
    """Ensure ``pp.compare`` does not modify the DataFrame passed to it."""
    # Operate on a private copy so it can be checked against the fixture.
    working_copy = timeseries_df2.copy()
    compare_kwargs = {
        "df": working_copy,
        "source_columns": ["y"],
        "compare_columns": ["z"],
        "compare_type": PPC.DIFF,
    }
    pp.compare(**compare_kwargs)
    # After the call the copy must still be equal to the fixture.
    still_identical = working_copy.equals(timeseries_df2)
    assert still_identical
    def test_compare(self):
        """Check ``proc.compare`` columns and values for each comparison type."""

        def run_compare(compare_type, **extra):
            # Every scenario compares column "y" against column "z" of the
            # same fixture; only the comparison type and extras vary.
            return proc.compare(
                df=timeseries_df2,
                source_columns=["y"],
                compare_columns=["z"],
                compare_type=compare_type,
                **extra,
            )

        # `difference` comparison
        difference_df = run_compare("difference")
        difference_columns = ["label", "y", "z", "difference__y__z"]
        difference_values = [0.0, -2.0, -8.0, -6.0]
        self.assertListEqual(difference_df.columns.tolist(), difference_columns)
        self.assertListEqual(
            series_to_list(difference_df["difference__y__z"]), difference_values
        )

        # drop original columns
        dropped_df = run_compare("difference", drop_original_columns=True)
        dropped_columns = ["label", "difference__y__z"]
        self.assertListEqual(dropped_df.columns.tolist(), dropped_columns)

        # `percentage` comparison
        percentage_df = run_compare("percentage")
        percentage_columns = ["label", "y", "z", "percentage__y__z"]
        percentage_values = [0.0, -0.5, -0.8, -0.75]
        self.assertListEqual(percentage_df.columns.tolist(), percentage_columns)
        self.assertListEqual(
            series_to_list(percentage_df["percentage__y__z"]), percentage_values
        )

        # `ratio` comparison
        ratio_df = run_compare("ratio")
        ratio_columns = ["label", "y", "z", "ratio__y__z"]
        ratio_values = [1.0, 0.5, 0.2, 0.25]
        self.assertListEqual(ratio_df.columns.tolist(), ratio_columns)
        self.assertListEqual(series_to_list(ratio_df["ratio__y__z"]), ratio_values)
Пример #3
0
def test_compare_percentage():
    """Check the full frame returned by ``pp.compare`` with ``PPC.PCT``."""
    # `percentage` comparison
    result = pp.compare(
        df=timeseries_df2,
        source_columns=["y"],
        compare_columns=["z"],
        compare_type=PPC.PCT,
    )
    """
               label    y     z  percentage__y__z
    2019-01-01     x  2.0   2.0              0.0
    2019-01-02     y  2.0   4.0              -0.50
    2019-01-05     z  2.0  10.0              -0.80
    2019-01-07     q  2.0   8.0              -0.75
    """
    # Original columns are kept, followed by the new comparison column.
    expected_data = {
        "label": ["x", "y", "z", "q"],
        "y": [2.0, 2.0, 2.0, 2.0],
        "z": [2.0, 4.0, 10.0, 8.0],
        "percentage__y__z": [0.0, -0.50, -0.80, -0.75],
    }
    expected = pd.DataFrame(index=timeseries_df2.index, data=expected_data)
    assert result.equals(expected)
Пример #4
0
def test_compare_ratio():
    """Check the full frame returned by ``pp.compare`` with ``PPC.RAT``."""
    # `ratio` comparison
    result = pp.compare(
        df=timeseries_df2,
        source_columns=["y"],
        compare_columns=["z"],
        compare_type=PPC.RAT,
    )
    """
               label    y     z  ratio__y__z
    2019-01-01     x  2.0   2.0         1.00
    2019-01-02     y  2.0   4.0         0.50
    2019-01-05     z  2.0  10.0         0.20
    2019-01-07     q  2.0   8.0         0.25
    """
    # Original columns are kept, followed by the new comparison column.
    expected_data = {
        "label": ["x", "y", "z", "q"],
        "y": [2.0, 2.0, 2.0, 2.0],
        "z": [2.0, 4.0, 10.0, 8.0],
        "ratio__y__z": [1.00, 0.50, 0.20, 0.25],
    }
    expected = pd.DataFrame(index=timeseries_df2.index, data=expected_data)
    assert result.equals(expected)
Пример #5
0
def test_compare_after_pivot():
    """Pivot by country, diff the two metrics, flatten, and check the result."""
    pivoted = pp.pivot(
        df=multiple_metrics_df,
        index=["dttm"],
        columns=["country"],
        aggregates={
            "sum_metric": {"operator": "sum"},
            "count_metric": {"operator": "sum"},
        },
        flatten_columns=False,
        reset_index=False,
    )
    """
                   count_metric    sum_metric
    country              UK US         UK US
    dttm
    2019-01-01            1  2          5  6
    2019-01-02            3  4          7  8
    """
    compared = pp.compare(
        pivoted,
        source_columns=["count_metric"],
        compare_columns=["sum_metric"],
        compare_type=PPC.DIFF,
        drop_original_columns=True,
    )
    """
               difference__count_metric__sum_metric
    country                                      UK US
    dttm
    2019-01-01                                   -4 -4
    2019-01-02                                   -4 -4
    """
    flattened = pp.flatten(compared)
    """
            dttm  difference__count_metric__sum_metric, UK  difference__count_metric__sum_metric, US
    0 2019-01-01                                        -4                                        -4
    1 2019-01-02                                        -4                                        -4
    """
    # Expected flat labels: the comparison column name joined with each
    # country using the shared separator constant.
    uk_label = FLAT_COLUMN_SEPARATOR.join(
        ["difference__count_metric__sum_metric", "UK"]
    )
    us_label = FLAT_COLUMN_SEPARATOR.join(
        ["difference__count_metric__sum_metric", "US"]
    )
    expected = pd.DataFrame(
        data={
            "dttm": pd.to_datetime(["2019-01-01", "2019-01-02"]),
            uk_label: [-4, -4],
            us_label: [-4, -4],
        }
    )
    assert flattened.equals(expected)
Пример #6
0
def test_compare():
    """Check ``compare`` columns and values for each comparison type."""

    def run_compare(compare_type, **extra):
        # Every scenario compares column "y" against column "z" of the same
        # fixture; only the comparison type and extras vary.
        return compare(
            df=timeseries_df2,
            source_columns=["y"],
            compare_columns=["z"],
            compare_type=compare_type,
            **extra,
        )

    # `difference` comparison
    difference_df = run_compare("difference")
    difference_columns = difference_df.columns.tolist()
    assert difference_columns == ["label", "y", "z", "difference__y__z"]
    difference_values = series_to_list(difference_df["difference__y__z"])
    assert difference_values == [0.0, -2.0, -8.0, -6.0]

    # drop original columns
    dropped_df = run_compare("difference", drop_original_columns=True)
    dropped_columns = dropped_df.columns.tolist()
    assert dropped_columns == ["label", "difference__y__z"]

    # `percentage` comparison
    percentage_df = run_compare("percentage")
    percentage_columns = percentage_df.columns.tolist()
    assert percentage_columns == ["label", "y", "z", "percentage__y__z"]
    percentage_values = series_to_list(percentage_df["percentage__y__z"])
    assert percentage_values == [0.0, -0.5, -0.8, -0.75]

    # `ratio` comparison
    ratio_df = run_compare("ratio")
    ratio_columns = ratio_df.columns.tolist()
    assert ratio_columns == ["label", "y", "z", "ratio__y__z"]
    ratio_values = series_to_list(ratio_df["ratio__y__z"])
    assert ratio_values == [1.0, 0.5, 0.2, 0.25]
Пример #7
0
def test_compare_diff():
    """Check ``pp.compare`` with ``PPC.DIFF``, with and without original columns."""
    # `difference` comparison
    kept_df = pp.compare(
        df=timeseries_df2,
        source_columns=["y"],
        compare_columns=["z"],
        compare_type=PPC.DIFF,
    )
    """
               label    y     z  difference__y__z
    2019-01-01     x  2.0   2.0               0.0
    2019-01-02     y  2.0   4.0              -2.0
    2019-01-05     z  2.0  10.0              -8.0
    2019-01-07     q  2.0   8.0              -6.0
    """
    expected_kept = pd.DataFrame(
        index=timeseries_df2.index,
        data={
            "label": ["x", "y", "z", "q"],
            "y": [2.0, 2.0, 2.0, 2.0],
            "z": [2.0, 4.0, 10.0, 8.0],
            "difference__y__z": [0.0, -2.0, -8.0, -6.0],
        },
    )
    assert kept_df.equals(expected_kept)

    # drop original columns
    dropped_df = pp.compare(
        df=timeseries_df2,
        source_columns=["y"],
        compare_columns=["z"],
        compare_type=PPC.DIFF,
        drop_original_columns=True,
    )
    # Only the label and the computed difference column are expected here.
    expected_dropped = pd.DataFrame(
        index=timeseries_df2.index,
        data={
            "label": ["x", "y", "z", "q"],
            "difference__y__z": [0.0, -2.0, -8.0, -6.0],
        },
    )
    assert dropped_df.equals(expected_dropped)
Пример #8
0
def test_compare_multi_index_column():
    """Diff two metrics of a three-level column frame, flatten, and check it."""
    timestamps = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"])
    timestamps.name = "__timestamp"
    level_values = [["m1", "m2"], ["a", "b"], ["x", "y"]]
    level_names = [None, "level1", "level2"]
    multi_columns = pd.MultiIndex.from_product(level_values, names=level_names)
    # Every cell holds 1, so each m1 - m2 difference below is 0.
    source_df = pd.DataFrame(index=timestamps, columns=multi_columns, data=1)
    """
                m1          m2
    level1       a     b     a     b
    level2       x  y  x  y  x  y  x  y
    __timestamp
    2021-01-01   1  1  1  1  1  1  1  1
    2021-01-02   1  1  1  1  1  1  1  1
    2021-01-03   1  1  1  1  1  1  1  1
    """
    compared_df = pp.compare(
        source_df,
        source_columns=["m1"],
        compare_columns=["m2"],
        compare_type=PPC.DIFF,
        drop_original_columns=True,
    )
    flattened = pp.flatten(compared_df)
    """
      __timestamp  difference__m1__m2, a, x  difference__m1__m2, a, y  difference__m1__m2, b, x  difference__m1__m2, b, y
    0  2021-01-01                         0                         0                         0                         0
    1  2021-01-02                         0                         0                         0                         0
    2  2021-01-03                         0                         0                         0                         0
    """
    expected = pd.DataFrame(
        data={
            "__timestamp": pd.to_datetime(
                ["2021-01-01", "2021-01-02", "2021-01-03"]
            ),
            "difference__m1__m2, a, x": [0, 0, 0],
            "difference__m1__m2, a, y": [0, 0, 0],
            "difference__m1__m2, b, x": [0, 0, 0],
            "difference__m1__m2, b, y": [0, 0, 0],
        }
    )
    assert flattened.equals(expected)