示例#1
0
def test_safe_group_by_apply():
    """Exercise ``PD_UTILS.safe_groupby_apply`` with str, empty and double keys.

    The mapper stamps each row with the row count of the frame it receives,
    so the ``ct`` column reveals how the rows were grouped:

    * grouping on a string column keeps the ``None`` key as its own group;
    * an empty key list hands the whole frame to the mapper in one call;
    * grouping on a double column puts the null (NaN) keys in one group.
    """

    def _add_group_size(pdf):
        # Tag every row with the size of the frame handed to the mapper.
        PD_UTILS.ensure_compatible(pdf)
        pdf["ct"] = pdf.shape[0]
        return pdf

    str_keyed = DF([["a", 1], ["a", 2], [None, 3]], "b:str,c:long", True)

    # Group on the string column "b".
    grouped = PD_UTILS.safe_groupby_apply(
        str_keyed.native, ["b"], _add_group_size)
    PD_UTILS.ensure_compatible(grouped)
    assert grouped.shape == (3, 3)
    assert grouped.values.tolist() == [
        ["a", 1, 2],
        ["a", 2, 2],
        [None, 3, 1],
    ]

    # No grouping keys: the mapper sees all three rows at once.
    ungrouped = PD_UTILS.safe_groupby_apply(
        str_keyed.native, [], _add_group_size)
    PD_UTILS.ensure_compatible(ungrouped)
    assert ungrouped.shape == (3, 3)
    assert ungrouped.values.tolist() == [
        ["a", 1, 3],
        ["a", 2, 3],
        [None, 3, 3],
    ]

    # Group on a double column containing nulls.
    double_keyed = DF(
        [[1.0, "a"], [1.0, "b"], [None, "c"], [None, "d"]],
        "b:double,c:str",
        True,
    )
    by_double = PD_UTILS.safe_groupby_apply(
        double_keyed.native, ["b"], _add_group_size)
    expected = [
        [1.0, "a", 2],
        [1.0, "b", 2],
        [float("nan"), "c", 2],
        [float("nan"), "d", 2],
    ]
    # NaN never equals itself, so compare the textual form of both lists.
    assert repr(expected) == repr(by_double.values.tolist())
示例#2
0
def test_safe_group_by_apply_special_types():
    """Check ``PD_UTILS.safe_groupby_apply`` on two keys that contain nulls.

    Each scenario groups a four-row frame on both columns ``a`` and ``b``,
    where ``b`` is in turn a double, a datetime and a date column. The mapper
    appends the size of the group it was given as ``ct``, and the result is
    compared against an explicitly spelled-out expected frame.
    """

    def _add_group_size(pdf):
        # Tag every row with the size of the frame handed to the mapper.
        PD_UTILS.ensure_compatible(pdf)
        pdf["ct"] = pdf.shape[0]
        return pdf

    def _check(rows, schema, expected_rows, expected_schema):
        # Run one scenario: build the input, group on (a, b), verify result.
        source = DF(rows, schema, True)
        res = PD_UTILS.safe_groupby_apply(
            source.native, ["a", "b"], _add_group_size)
        PD_UTILS.ensure_compatible(res)
        assert res.shape == (4, 3)
        DF(expected_rows, expected_schema, True).assert_eq(res)

    # Double values, including a row where both keys are null.
    _check(
        [["a", 1.0], [None, 3.0], [None, 3.0], [None, None]],
        "a:str,b:double",
        [["a", 1.0, 1], [None, 3.0, 2], [None, 3.0, 2], [None, None, 1]],
        "a:str,b:double,ct:int",
    )

    # Datetime values.
    moment = datetime.now()
    _check(
        [["a", moment], [None, moment], [None, moment], [None, None]],
        "a:str,b:datetime",
        [["a", moment, 1], [None, moment, 2], [None, moment, 2],
         [None, None, 1]],
        "a:str,b:datetime,ct:int",
    )

    # Date values with null first keys.
    day = date(2020, 1, 1)
    _check(
        [["a", day], [None, day], [None, day], [None, None]],
        "a:str,b:date",
        [["a", day, 1], [None, day, 2], [None, day, 2], [None, None, 1]],
        "a:str,b:date,ct:int",
    )

    # Date values where only the second key is ever null.
    _check(
        [["a", day], ["b", day], ["b", day], ["b", None]],
        "a:str,b:date",
        [["a", day, 1], ["b", day, 2], ["b", day, 2], ["b", None, 1]],
        "a:str,b:date,ct:int",
    )
    def map(
        self,
        df: DataFrame,
        map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
        output_schema: Any,
        partition_spec: PartitionSpec,
        metadata: Any = None,
        on_init: Optional[Callable[[int, DataFrame], Any]] = None,
    ) -> DataFrame:
        """Apply ``map_func`` to ``df``, as a whole or per partition group.

        A warning is logged when ``partition_spec.num_partitions`` is not
        ``"0"``. If ``on_init`` is given it is called once with ``(0, df)``
        before any mapping happens.

        When ``partition_spec.partition_by`` is empty, ``df`` is converted
        with ``to_local_df`` and passed to ``map_func`` in a single call; the
        output schema is checked against ``output_schema`` and ``metadata`` is
        attached as a read-only ``ParamDict``.

        Otherwise the pandas form of ``df`` is grouped by the partition keys
        through ``PD_UTILS.safe_groupby_apply``. Each group is sorted by the
        presort keys (if any), re-indexed and passed to ``map_func``.

        :param df: input dataframe
        :param map_func: function taking the partition cursor and a local
            dataframe, returning a local dataframe
        :param output_schema: expected schema of the mapped output
        :param partition_spec: supplies ``num_partitions``, ``partition_by``,
            ``presort`` and the cursor
        :param metadata: metadata for the returned dataframe, defaults to None
        :param on_init: optional callback invoked with ``(0, df)``
        :return: ``self.to_df`` of the mapped output when unpartitioned,
            otherwise a ``PandasDataFrame`` built from the grouped result
        """
        if partition_spec.num_partitions != "0":
            self.log.warning(
                f"{self} doesn't respect num_partitions {partition_spec.num_partitions}"
            )
        cursor = partition_spec.get_cursor(df.schema, 0)
        if on_init is not None:
            on_init(0, df)

        group_keys = partition_spec.partition_by
        if len(group_keys) == 0:
            # Unpartitioned: the whole dataframe is one logical partition.
            local_df = to_local_df(df)
            cursor.set(local_df.peek_array(), 0, 0)
            output_df = map_func(cursor, local_df)
            # presumably raises when the condition is false -- confirm
            # against assert_or_throw
            assert_or_throw(
                output_df.schema == output_schema,
                f"map output {output_df.schema} mismatches given {output_schema}",
            )
            readonly_meta = ParamDict(metadata, deep=True)
            output_df._metadata = readonly_meta
            readonly_meta.set_readonly()
            return self.to_df(output_df)

        presort = partition_spec.presort
        sort_columns = list(presort.keys())
        sort_ascending = list(presort.values())
        # Built before grouping so schema construction precedes any call to
        # map_func.
        result_schema = Schema(output_schema)

        def _map_group(group_pdf: pd.DataFrame) -> pd.DataFrame:
            # Sort the group when presort keys exist, then hand it to map_func.
            if sort_columns:
                group_pdf = group_pdf.sort_values(
                    sort_columns, ascending=sort_ascending)
            group_df = PandasDataFrame(
                group_pdf.reset_index(drop=True),
                df.schema,
                pandas_df_wrapper=True,
            )
            # Advance the shared cursor's partition number for this group.
            cursor.set(group_df.peek_array(), cursor.partition_no + 1, 0)
            return map_func(cursor, group_df).as_pandas()

        # NOTE(review): unlike the unpartitioned branch, the per-group output
        # schema is not compared with output_schema here.
        grouped = PD_UTILS.safe_groupby_apply(
            df.as_pandas(), group_keys, _map_group)
        return PandasDataFrame(grouped, result_schema, metadata)