def test_safe_group_by_apply():
    """Exercise ``PD_UTILS.safe_groupby_apply`` with null keys and no keys.

    Three scenarios are checked in order:

    1. grouping by a string column that contains ``None``;
    2. an empty key list, where the whole frame is handed to the function;
    3. grouping by a double column whose nulls surface as NaN in the output.
    """

    def _add_count(part):
        # Each partition must be a compatible frame; tag every row with the
        # number of rows in the partition it belongs to.
        PD_UTILS.ensure_compatible(part)
        part["ct"] = part.shape[0]
        return part

    # --- string key containing a null -------------------------------------
    source = DF([["a", 1], ["a", 2], [None, 3]], "b:str,c:long", True)

    grouped = PD_UTILS.safe_groupby_apply(source.native, ["b"], _add_count)
    PD_UTILS.ensure_compatible(grouped)
    assert (3, 3) == grouped.shape
    assert grouped.values.tolist() == [["a", 1, 2], ["a", 2, 2], [None, 3, 1]]

    # --- no keys: a single partition covering all rows --------------------
    ungrouped = PD_UTILS.safe_groupby_apply(source.native, [], _add_count)
    PD_UTILS.ensure_compatible(ungrouped)
    assert (3, 3) == ungrouped.shape
    assert ungrouped.values.tolist() == [["a", 1, 3], ["a", 2, 3], [None, 3, 3]]

    # --- double key containing nulls --------------------------------------
    source = DF(
        [[1.0, "a"], [1.0, "b"], [None, "c"], [None, "d"]],
        "b:double,c:str",
        True,
    )
    by_double = PD_UTILS.safe_groupby_apply(source.native, ["b"], _add_count)
    expected_rows = [
        [1.0, "a", 2],
        [1.0, "b", 2],
        [float("nan"), "c", 2],
        [float("nan"), "d", 2],
    ]
    # NaN never compares equal to itself, so compare textual representations.
    assert repr(expected_rows) == repr(by_double.values.tolist())
def test_safe_group_by_apply_special_types():
    """Check ``PD_UTILS.safe_groupby_apply`` on two keys with nulls.

    The second key column is, in turn, a double, a datetime and a date; the
    first key column is a string that may be null. Every scenario expects four
    rows and three columns, with ``ct`` holding the size of the row's group.
    """

    def _add_count(part):
        # Tag each row with the size of the partition it came from.
        PD_UTILS.ensure_compatible(part)
        part["ct"] = part.shape[0]
        return part

    def _check(rows, in_schema, expected_rows, out_schema):
        # Build the input, group by both columns, then compare to the expected
        # frame built from ``expected_rows``.
        frame = DF(rows, in_schema, True)
        outcome = PD_UTILS.safe_groupby_apply(frame.native, ["a", "b"], _add_count)
        PD_UTILS.ensure_compatible(outcome)
        assert 4 == outcome.shape[0]
        assert 3 == outcome.shape[1]
        DF(expected_rows, out_schema, True).assert_eq(outcome)

    # double values, nulls in both key columns
    _check(
        [["a", 1.0], [None, 3.0], [None, 3.0], [None, None]],
        "a:str,b:double",
        [["a", 1.0, 1], [None, 3.0, 2], [None, 3.0, 2], [None, None, 1]],
        "a:str,b:double,ct:int",
    )

    # datetime values, nulls in both key columns
    stamp = datetime.now()
    _check(
        [["a", stamp], [None, stamp], [None, stamp], [None, None]],
        "a:str,b:datetime",
        [["a", stamp, 1], [None, stamp, 2], [None, stamp, 2], [None, None, 1]],
        "a:str,b:datetime,ct:int",
    )

    # date values, nulls in both key columns
    day = date(2020, 1, 1)
    _check(
        [["a", day], [None, day], [None, day], [None, None]],
        "a:str,b:date",
        [["a", day, 1], [None, day, 2], [None, day, 2], [None, None, 1]],
        "a:str,b:date,ct:int",
    )

    # date values, null only in the second key column
    day = date(2020, 1, 1)
    _check(
        [["a", day], ["b", day], ["b", day], ["b", None]],
        "a:str,b:date",
        [["a", day, 1], ["b", day, 2], ["b", day, 2], ["b", None, 1]],
        "a:str,b:date,ct:int",
    )
def map(
    self,
    df: DataFrame,
    map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
    output_schema: Any,
    partition_spec: PartitionSpec,
    metadata: Any = None,
    on_init: Optional[Callable[[int, DataFrame], Any]] = None,
) -> DataFrame:
    """Apply ``map_func`` to ``df`` as a whole or once per partition-key group.

    :param df: input dataframe
    :param map_func: called with the partition cursor and a local dataframe,
      returns the mapped local dataframe
    :param output_schema: schema expected from ``map_func``
    :param partition_spec: supplies the partition keys, presort and cursor.
      A warning is logged when its ``num_partitions`` is not ``"0"``
    :param metadata: metadata attached to the returned dataframe
    :param on_init: if given, called once as ``on_init(0, df)`` before mapping
    :return: the mapped dataframe

    With no partition keys, ``df`` is converted by ``to_local_df`` and
    ``map_func`` runs once; its output schema is verified against
    ``output_schema`` via ``assert_or_throw``. With partition keys,
    ``map_func`` runs per group through ``PD_UTILS.safe_groupby_apply``; the
    per-group output schema is not checked in this method.
    """
    if partition_spec.num_partitions != "0":
        self.log.warning(
            f"{self} doesn't respect num_partitions {partition_spec.num_partitions}"
        )
    cursor = partition_spec.get_cursor(df.schema, 0)
    if on_init is not None:
        on_init(0, df)

    if len(partition_spec.partition_by) > 0:
        presort = partition_spec.presort
        sort_columns = list(presort.keys())
        sort_ascending = list(presort.values())
        result_schema = Schema(output_schema)

        def _apply_to_group(group: pd.DataFrame) -> pd.DataFrame:
            # Order rows inside the group only when a presort was requested.
            if sort_columns:
                group = group.sort_values(sort_columns, ascending=sort_ascending)
            reindexed = group.reset_index(drop=True)
            wrapped = PandasDataFrame(reindexed, df.schema, pandas_df_wrapper=True)
            # Move the cursor to the next partition number, starting at its
            # first row.
            first_row = wrapped.peek_array()
            cursor.set(first_row, cursor.partition_no + 1, 0)
            return map_func(cursor, wrapped).as_pandas()

        combined = PD_UTILS.safe_groupby_apply(
            df.as_pandas(), partition_spec.partition_by, _apply_to_group
        )
        return PandasDataFrame(combined, result_schema, metadata)

    # No partition keys: the whole dataframe is one partition.
    local_df = to_local_df(df)
    cursor.set(local_df.peek_array(), 0, 0)
    mapped = map_func(cursor, local_df)
    assert_or_throw(
        mapped.schema == output_schema,
        f"map output {mapped.schema} mismatches given {output_schema}",
    )
    # Attach the metadata wrapped in a ParamDict and mark it read-only.
    mapped._metadata = ParamDict(metadata, deep=True)
    mapped._metadata.set_readonly()
    return self.to_df(mapped)