def _map(pdf: pd.DataFrame) -> pd.DataFrame:
    """Presort one pandas partition, run ``map_func`` on it, and return
    the result as pandas.

    Relies on enclosing-scope variables: ``presort_keys``, ``presort_asc``,
    ``df``, ``cursor`` and ``map_func``.
    """
    if presort_keys:
        pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
    partition = PandasDataFrame(
        pdf.reset_index(drop=True), df.schema, pandas_df_wrapper=True
    )
    # advance the cursor to the next partition before mapping
    cursor.set(partition.peek_array(), cursor.partition_no + 1, 0)
    return map_func(cursor, partition).as_pandas()
def test_nested():
    """Type-safe conversion of a nested list-of-struct column, including a
    JSON-encoded struct string with a missing field and a castable value."""
    # NOTE: the flat struct-column variant of this test is currently disabled
    # in the original file.
    raw = [[[json.dumps(dict(b=[30, "40"]))]]]
    df = PandasDataFrame(raw, "a:[{a:str,b:[int]}]")
    result = df.as_array(type_safe=True)
    # missing key "a" becomes None; "40" is coerced to int
    assert result == [[[dict(a=None, b=[30, 40])]]]
def head(self, n: int, columns: Optional[List[str]] = None) -> List[Any]:
    """Get first n rows of the dataframe as 2-dimensional array

    :param n: number of rows
    :param columns: selected columns, defaults to None (all columns)
    :return: 2-dimensional array
    """
    # Materialize the first n rows across all dask partitions, then delegate
    # column selection/conversion to the local pandas implementation.
    local = PandasDataFrame(
        self.native.head(n, compute=True, npartitions=-1), schema=self.schema
    )
    return local.head(n, columns=columns)
def map(
    self,
    df: DataFrame,
    map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
    output_schema: Any,
    partition_spec: PartitionSpec,
    metadata: Any = None,
    on_init: Optional[Callable[[int, DataFrame], Any]] = None,
) -> DataFrame:
    """Apply ``map_func`` to each partition of ``df`` and combine the results.

    :param df: input dataframe
    :param map_func: callback receiving the partition cursor and one
        partition as a local dataframe, returning the mapped local dataframe
    :param output_schema: schema the mapped output must conform to
    :param partition_spec: how to partition and presort ``df`` before mapping
    :param metadata: metadata of the resulting dataframe
    :param on_init: optional callback invoked once before mapping starts
    :return: the mapped dataframe
    """
    # This engine is single-machine, so a requested partition count can't
    # be honored — warn instead of failing.
    if partition_spec.num_partitions != "0":
        self.log.warning(
            "%s doesn't respect num_partitions %s",
            self,
            partition_spec.num_partitions,
        )
    cursor = partition_spec.get_cursor(df.schema, 0)
    if on_init is not None:
        on_init(0, df)
    if len(partition_spec.partition_by) == 0:
        # no partition
        df = to_local_df(df)
        cursor.set(df.peek_array(), 0, 0)
        output_df = map_func(cursor, df)
        if (
            isinstance(output_df, PandasDataFrame)
            and output_df.schema != output_schema
        ):
            # rewrap so the declared output schema is applied to the result
            output_df = PandasDataFrame(output_df.native, output_schema)
        assert_or_throw(
            output_df.schema == output_schema,
            lambda: f"map output {output_df.schema} "
            f"mismatches given {output_schema}",
        )
        output_df._metadata = ParamDict(metadata, deep=True)
        output_df._metadata.set_readonly()
        return self.to_df(output_df)
    presort = partition_spec.presort
    presort_keys = list(presort.keys())
    presort_asc = list(presort.values())
    output_schema = Schema(output_schema)

    def _map(pdf: pd.DataFrame) -> pd.DataFrame:
        # sort within the partition, wrap it, advance the cursor, then map
        if len(presort_keys) > 0:
            pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
        input_df = PandasDataFrame(
            pdf.reset_index(drop=True), df.schema, pandas_df_wrapper=True
        )
        cursor.set(input_df.peek_array(), cursor.partition_no + 1, 0)
        output_df = map_func(cursor, input_df)
        return output_df.as_pandas()

    result = self.pl_utils.safe_groupby_apply(
        df.as_pandas(), partition_spec.partition_by, _map
    )
    return PandasDataFrame(result, output_schema, metadata)
def test_simple_methods():
    """Basic accessors on empty and non-empty PandasDataFrames."""
    # empty frame: still typed, local, and backed by the native pandas object
    empty = PandasDataFrame([], "a:str,b:int")
    assert empty.native is empty.as_pandas()
    assert empty.empty
    assert empty.count() == 0
    assert empty.is_local

    # non-empty frame with value coercion ("2" -> 2.0 via the double column)
    df = PandasDataFrame([["a", 1], ["b", "2"]], "x:str,y:double")
    assert df.native is df.as_pandas()
    assert not df.empty
    assert df.count() == 2
    assert df.peek_array() == ["a", 1.0]
    assert df.peek_dict() == dict(x="a", y=1.0)
def _map(pdf: Any) -> pd.DataFrame:
    """Run ``map_func`` over a single pandas partition.

    Relies on enclosing-scope variables: ``presort_keys``, ``presort_asc``,
    ``input_schema``, ``output_schema``, ``on_init_once``, ``partition_spec``
    and ``map_func``.
    """
    if pdf.shape[0] == 0:
        # empty partition: emit an empty frame with the output schema
        return PandasDataFrame([], output_schema).as_pandas()
    if presort_keys:
        pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
    partition = PandasDataFrame(
        pdf.reset_index(drop=True), input_schema, pandas_df_wrapper=True
    )
    if on_init_once is not None:
        on_init_once(0, partition)
    cur = partition_spec.get_cursor(input_schema, 0)
    cur.set(partition.peek_array(), 0, 0)
    return map_func(cur, partition).as_pandas()
def fillna(
    self,
    df: DataFrame,
    value: Any,
    subset: List[str] = None,
    metadata: Any = None,
) -> DataFrame:
    """Fill NULL/NaN values in ``df``.

    :param df: input dataframe
    :param value: either a scalar applied to ``subset`` (or all columns),
        or a dict mapping column name -> fill value
    :param subset: columns to fill (ignored when ``value`` is a dict),
        defaults to all columns
    :param metadata: metadata of the resulting dataframe
    :return: a new dataframe with NULLs filled
    :raises ValueError: if ``value`` is None or a list, or a dict that is
        empty or contains a None value
    """
    assert_or_throw(
        (not isinstance(value, list)) and (value is not None),
        ValueError("fillna value can not None or a list"),
    )
    if isinstance(value, dict):
        # BUGFIX: the previous check `any(value.values())` rejected valid
        # falsy fill values such as 0, False or "". Only None values (and
        # an empty dict) are actually invalid.
        assert_or_throw(
            (None not in value.values()) and (len(value) > 0),
            ValueError(
                "fillna dict can not contain None and needs at least one value"
            ),
        )
        mapping = value
    else:
        # If subset is none, apply to all columns
        subset = subset or df.schema.names
        mapping = {col: value for col in subset}
    d = df.as_pandas().fillna(mapping, inplace=False)
    return PandasDataFrame(d.reset_index(drop=True), df.schema, metadata)
def distinct(
    self,
    df: DataFrame,
    metadata: Any = None,
) -> DataFrame:
    """Return ``df`` with duplicate rows removed.

    :param df: input dataframe
    :param metadata: metadata of the resulting dataframe
    :return: the deduplicated dataframe
    """
    deduped = self.pl_utils.drop_duplicates(df.as_pandas())
    return PandasDataFrame(deduped.reset_index(drop=True), df.schema, metadata)
def test_map_in_pandas(self):
    """Engine map() over an unpartitioned frame using Spark's mapInPandas."""
    # Skip when the installed pyspark predates mapInPandas (Spark 3.0+).
    if not hasattr(ps.DataFrame, "mapInPandas"):
        return

    def add(cursor, data):
        # the engine should hand each partition over as an iterable of
        # local dataframes
        assert isinstance(data, LocalDataFrameIterableDataFrame)

        def get_dfs():
            # lazily append zz = xx + yy to every chunk of the partition
            for df in data.native:
                pdf = df.as_pandas()
                pdf["zz"] = pdf["xx"] + pdf["yy"]
                yield PandasDataFrame(pdf)

        return LocalDataFrameIterableDataFrame(get_dfs())

    e = self.engine
    np.random.seed(0)  # deterministic input data
    df = pd.DataFrame(np.random.randint(0, 5, (100000, 2)), columns=["xx", "yy"])
    expected = PandasDataFrame(df.assign(zz=df.xx + df.yy), "xx:int,yy:int,zz:int")
    a = e.to_df(df)
    # no partition
    c = e.map(a, add, "xx:int,yy:int,zz:int", PartitionSpec(num=16))
    df_eq(c, expected, throw=True)
def _udf(
    dfs: Iterable[pd.DataFrame],
) -> Iterable[pd.DataFrame]:  # pragma: no cover
    """Per-partition UDF: wrap incoming pandas chunks as an iterable local
    dataframe, run ``map_func`` once over the whole partition, and stream
    the mapped result back as pandas dataframes.

    Relies on enclosing-scope variables: ``input_schema``, ``output_schema``,
    ``on_init_once``, ``partition_spec`` and ``map_func``.
    """

    def get_dfs() -> Iterable[LocalDataFrame]:
        # skip empty chunks so downstream never sees zero-row frames
        for df in dfs:
            if df.shape[0] > 0:
                yield PandasDataFrame(
                    df.reset_index(drop=True),
                    input_schema,
                    pandas_df_wrapper=True,
                )

    input_df = LocalDataFrameIterableDataFrame(get_dfs(), input_schema)
    if input_df.empty:
        # NOTE(review): `_udf` is a generator, so this `return <value>` only
        # stops iteration — the empty dataframe built here is discarded (it
        # becomes StopIteration.value, invisible to the consumer). If an
        # empty output frame must actually be emitted for empty partitions,
        # this should be `yield ...` followed by `return` — confirm intent.
        return PandasDataFrame([], output_schema).as_pandas()
    if on_init_once is not None:
        on_init_once(0, input_df)
    cursor = partition_spec.get_cursor(input_schema, 0)
    cursor.set(input_df.peek_array(), 0, 0)
    output_df = map_func(cursor, input_df)
    if isinstance(output_df, LocalDataFrameIterableDataFrame):
        # stream each mapped chunk back without materializing the whole result
        for res in output_df.native:
            yield res.as_pandas()
    else:
        yield output_df.as_pandas()
def as_local(self) -> LocalDataFrame:
    """Collect this distributed dataframe into a local one.

    Nested (struct/list) columns are converted row by row via
    ``to_type_safe_input`` instead of ``toPandas``.
    """
    # TODO: does it make sense to also include the metadata?
    has_nested = any(pa.types.is_nested(t) for t in self.schema.types)
    if has_nested:
        rows = list(to_type_safe_input(self.native.collect(), self.schema))
        return ArrayDataFrame(rows, self.schema, self.metadata)
    return PandasDataFrame(self.native.toPandas(), self.schema, self.metadata)
def as_array(
    self, columns: Optional[List[str]] = None, type_safe: bool = False
) -> List[Any]:
    """Convert this dataframe to a 2-dimensional python array.

    :param columns: columns to include, defaults to None (all columns)
    :param type_safe: whether to convert values to the schema's types
    :return: 2-dimensional array
    """
    source: DataFrame = self if columns is None else self[columns]
    pandas_view = PandasDataFrame(source.as_pandas(), schema=source.schema)
    return pandas_view.as_array(type_safe=type_safe)
def transform(self, df: LocalDataFrame) -> LocalDataFrame:
    """Verify init/config/metadata wiring, then append `p` and `ct` columns."""
    assert self.on_init_called == 1
    assert "test" in self.workflow_conf
    assert "x" in df.metadata
    pdf = df.as_pandas()
    # p comes from params (default 1); ct is the partition row count
    pdf["p"] = self.params.get("p", 1)
    pdf["ct"] = pdf.shape[0]
    return PandasDataFrame(pdf, self.output_schema)
def get_dfs() -> Iterable[LocalDataFrame]:
    """Yield each non-empty pandas chunk wrapped as a PandasDataFrame;
    empty chunks are skipped (closure over ``dfs`` and ``input_schema``)."""
    for chunk in dfs:
        if chunk.shape[0] > 0:
            yield PandasDataFrame(
                chunk.reset_index(drop=True),
                input_schema,
                pandas_df_wrapper=True,
            )
def test_nan_none():
    """Round-tripping missing values: str keeps None, double becomes NaN,
    and type_safe conversion maps int/bool NAs back to None."""
    # str column keeps None; double column surfaces NaN
    src = ArrayDataFrame([[None, None]], "b:str,c:double")
    assert src.as_pandas().iloc[0, 0] is None
    row = PandasDataFrame(src.as_pandas(), src.schema).as_array()[0]
    assert row[0] is None
    assert math.isnan(row[1])

    # type-safe conversion maps missing int/bool back to None
    src = ArrayDataFrame([[None, None]], "b:int,c:bool")
    row = PandasDataFrame(src.as_pandas(), src.schema).as_array(type_safe=True)[0]
    assert row[0] is None
    assert row[1] is None

    # mixed frame: the all-missing second row round-trips to None/None
    src = ArrayDataFrame([["a", 1.1], [None, None]], "b:str,c:double")
    row = PandasDataFrame(src.as_pandas(), src.schema).as_array(type_safe=True)[1]
    assert row[0] is None
    assert row[1] is None
def select(self, dfs: DataFrames, statement: str) -> DataFrame:
    """Run a SQL ``statement`` against ``dfs`` using an in-memory SQLite db.

    Each dataframe is written to SQLite under its key name, the statement
    is executed, and the result is returned as a local dataframe.

    :param dfs: the dataframes referenced by the statement, keyed by name
    :param statement: the SQL statement to execute
    :return: the query result
    """
    sql_engine = create_engine("sqlite:///:memory:")
    try:
        for name, table in dfs.items():
            table.as_pandas().to_sql(
                name, sql_engine, if_exists="replace", index=False
            )
        df = pd.read_sql_query(statement, sql_engine)
        return PandasDataFrame(df)
    finally:
        # BUGFIX: release the SQLAlchemy connection pool; the previous
        # version leaked the engine on every call.
        sql_engine.dispose()
def _test_as_array_perf():
    """Ad-hoc benchmark: as_array with vs. without type_safe on a wide frame
    (100 int + 100 double + 100 str columns, 5000 rows)."""
    schema = Schema()
    row = []
    for i in range(100):
        schema.append(f"a{i}:int")
        row.append(i)
    for i in range(100):
        schema.append(f"b{i}:int")
        row.append(float(i))
    for i in range(100):
        schema.append(f"c{i}:str")
        row.append(str(i))
    data = [list(row) for _ in range(5000)]
    df = PandasDataFrame(data, schema)
    # warm up both code paths before timing
    df.as_array()
    df.as_array(type_safe=True)
    plain_total, safe_total = 0.0, 0.0
    for _ in range(10):
        start = datetime.now()
        df.as_array()
        plain_total += (datetime.now() - start).total_seconds()
        start = datetime.now()
        df.as_array(type_safe=True)
        safe_total += (datetime.now() - start).total_seconds()
    print(plain_total, safe_total)
def union(
    self,
    df1: DataFrame,
    df2: DataFrame,
    distinct: bool = True,
    metadata: Any = None,
) -> DataFrame:
    """Union two dataframes with identical schemas.

    :param df1: first dataframe
    :param df2: second dataframe
    :param distinct: whether to deduplicate the result (UNION vs UNION ALL)
    :param metadata: metadata of the resulting dataframe
    :return: the unioned dataframe
    :raises ValueError: if the two schemas differ
    """
    assert_or_throw(
        df1.schema == df2.schema, ValueError(f"{df1.schema} != {df2.schema}")
    )
    merged = self.pl_utils.union(df1.as_pandas(), df2.as_pandas(), unique=distinct)
    return PandasDataFrame(merged.reset_index(drop=True), df1.schema, metadata)
def dropna(
    self,
    df: DataFrame,
    how: str = "any",
    thresh: Optional[int] = None,
    subset: Optional[List[str]] = None,
    metadata: Any = None,
) -> DataFrame:
    """Drop rows containing NA values.

    :param df: input dataframe
    :param how: "any" or "all" — drop a row if any/all values are NA
    :param thresh: keep rows with at least this many non-NA values; when
        set, ``how`` is ignored
    :param subset: columns to consider, defaults to all columns
    :param metadata: metadata of the resulting dataframe
    :return: the dataframe without the dropped rows
    """
    # BUGFIX: newer pandas raises when both `how` and `thresh` are supplied
    # to dropna, and the old code always passed both. Forward only one.
    kw = {"thresh": thresh} if thresh is not None else {"how": how}
    d = df.as_pandas().dropna(axis=0, subset=subset, inplace=False, **kw)
    return PandasDataFrame(d.reset_index(drop=True), df.schema, metadata)
def join(
    self,
    df1: DataFrame,
    df2: DataFrame,
    how: str,
    on: List[str] = _DEFAULT_JOIN_KEYS,
    metadata: Any = None,
) -> DataFrame:
    """Join two dataframes.

    :param df1: left dataframe
    :param df2: right dataframe
    :param how: join type (e.g. inner, left_outer, cross, ...)
    :param on: join keys; defaults are derived from the two schemas
    :param metadata: metadata of the resulting dataframe
    :return: the joined dataframe
    """
    key_schema, output_schema = get_join_schemas(df1, df2, how=how, on=on)
    joined = self.pl_utils.join(
        df1.as_pandas(), df2.as_pandas(), join_type=how, on=key_schema.names
    )
    return PandasDataFrame(joined.reset_index(drop=True), output_schema, metadata)
def intersect(
    self,
    df1: DataFrame,
    df2: DataFrame,
    distinct: bool = True,
    metadata: Any = None,
) -> DataFrame:
    """Return distinct rows present in both dataframes (SQL INTERSECT).

    :param df1: first dataframe
    :param df2: second dataframe
    :param distinct: must be True; INTERSECT ALL is not implemented
    :param metadata: metadata of the resulting dataframe
    :return: the intersected dataframe
    :raises NotImplementedError: if distinct is False
    :raises ValueError: if the two schemas differ
    """
    assert_or_throw(
        distinct, NotImplementedError("INTERSECT ALL for NativeExecutionEngine")
    )
    assert_or_throw(
        df1.schema == df2.schema, ValueError(f"{df1.schema} != {df2.schema}")
    )
    common = self.pl_utils.intersect(
        df1.as_pandas(), df2.as_pandas(), unique=distinct
    )
    return PandasDataFrame(common.reset_index(drop=True), df1.schema, metadata)
def load_df(
    uri: Union[str, List[str]],
    format_hint: Optional[str] = None,
    columns: Any = None,
    fs: Optional[FileSystem] = None,
    **kwargs: Any,
) -> LocalBoundedDataFrame:
    """Load one or more files into a single local pandas-backed dataframe.

    :param uri: a single file path or a list of file paths
    :param format_hint: file format override; when None the format is
        inferred by FileParser (presumably from the file extension —
        confirm against FileParser)
    :param columns: columns to load, passed through to the format loader
    :param fs: filesystem object; unused in this code path
    :param kwargs: extra arguments forwarded to the format-specific loader
    :return: the concatenated dataframe
    """
    if isinstance(uri, str):
        fp = [FileParser(uri, format_hint)]
    else:
        fp = [FileParser(u, format_hint) for u in uri]
    dfs: List[pd.DataFrame] = []
    schema: Any = None
    for f in fp:
        # each loader returns (pandas df, schema); the schema from the last
        # file wins — assumes all files share one schema (TODO confirm)
        df, schema = _FORMAT_LOAD[f.file_format](f, columns, **kwargs)
        dfs.append(df)
    return PandasDataFrame(pd.concat(dfs), schema)
def test_init():
    """IterableDataFrame construction from raw data, from other dataframes,
    and with schema coercion/selection."""
    # empty, typed, unbounded
    df = IterableDataFrame(schema="a:str,b:int")
    assert df.empty
    assert df.schema == "a:str,b:int"
    assert not df.is_bounded

    data = [["a", 1], ["b", 2]]
    # type coercion driven by the declared schema
    df = IterableDataFrame(data, "a:str,b:str")
    assert [["a", "1"], ["b", "2"]] == df.as_array(type_safe=True)
    assert df.empty  # after iterating all items
    df = IterableDataFrame(data, "a:str,b:int")
    assert [["a", 1], ["b", 2]] == df.as_array(type_safe=True)
    df = IterableDataFrame(data, "a:str,b:double")
    assert [["a", 1.0], ["b", 2.0]] == df.as_array(type_safe=True)

    # construct from another IterableDataFrame, with and without a new schema
    df = IterableDataFrame(data, "a:str,b:double")
    ddf = IterableDataFrame(df)
    assert [["a", 1.0], ["b", 2.0]] == ddf.as_array(type_safe=True)
    df = IterableDataFrame(data, "a:str,b:double")
    ddf = IterableDataFrame(df, "a:str,b:float64")
    assert [["a", 1.0], ["b", 2.0]] == ddf.as_array(type_safe=True)
    # column reorder via a schema with swapped columns
    df = IterableDataFrame(data, "a:str,b:double")
    ddf = IterableDataFrame(df, "b:str,a:str")
    assert [["1", "a"], ["2", "b"]] == ddf.as_array(type_safe=True)
    # column selection via a list of names
    df = IterableDataFrame(data, "a:str,b:double")
    ddf = IterableDataFrame(df, ["b"])
    assert ddf.schema == "b:double"
    assert [[1.0], [2.0]] == ddf.as_array(type_safe=True)
    # schema expressions inside a list
    df = IterableDataFrame(data, "a:str,b:double")
    ddf = IterableDataFrame(df, ["a:str,b:str"])
    assert [["a", "1"], ["b", "2"]] == ddf.as_array(type_safe=True)
    df = IterableDataFrame(data, "a:str,b:double")
    ddf = IterableDataFrame(df, ["b:str"])
    assert [["1"], ["2"]] == ddf.as_array(type_safe=True)

    # construct from a PandasDataFrame
    pdf = PandasDataFrame(data, "a:str,b:double")
    df = IterableDataFrame(pdf, "a:str,b:double")
    assert [["a", 1.0], ["b", 2.0]] == df.as_array(type_safe=True)
    df = IterableDataFrame(pdf, "b:str,a:str")
    assert [["1.0", "a"], ["2.0", "b"]] == df.as_array(type_safe=True)

    # empty with schema is still local
    df = IterableDataFrame([], "x:str,y:double")
    assert df.empty
    assert df.is_local

    # invalid input type raises
    raises(FugueDataFrameInitError, lambda: IterableDataFrame(123))
def __init__(  # noqa: C901
    self,
    df: Any = None,
    schema: Any = None,
    metadata: Any = None,
    num_partitions: int = 0,
    type_safe=True,
):
    """Construct a DaskDataFrame from another DaskDataFrame, a dask or
    pandas DataFrame/Series, an iterable of rows, or nothing (empty frame).

    NOTE(review): in this module ``pd`` appears to be dask.dataframe
    (``pd.from_pandas`` is called on it) while ``pandas`` is real pandas —
    confirm against the file's imports.

    :param df: source data; None means an empty dataframe (schema required)
    :param schema: schema of the dataframe; required for iterable/None input
    :param metadata: metadata of the dataframe
    :param num_partitions: dask partition count; <=0 uses the configured
        default
    :param type_safe: whether to enforce schema types when applying the
        schema; forced to False for iterable input because PandasDataFrame
        already coerced the values
    :raises FugueDataFrameInitError: if construction fails for any reason
    """
    try:
        if num_partitions <= 0:
            # fall back to the configured default partition count
            num_partitions = FUGUE_DASK_DEFAULT_CONF[
                FUGUE_DASK_CONF_DATAFRAME_DEFAULT_PARTITIONS
            ]
        if df is None:
            schema = _input_schema(schema).assert_not_empty()
            df = []
        if isinstance(df, DaskDataFrame):
            # share the underlying dask dataframe; optionally override metadata
            super().__init__(
                df.schema, df.metadata if metadata is None else metadata
            )
            self._native: pd.DataFrame = df._native
            return
        elif isinstance(df, (pd.DataFrame, pd.Series)):
            if isinstance(df, pd.Series):
                df = df.to_frame()
            pdf = df
            schema = None if schema is None else _input_schema(schema)
        elif isinstance(df, (pandas.DataFrame, pandas.Series)):
            # real pandas input: distribute it across num_partitions
            if isinstance(df, pandas.Series):
                df = df.to_frame()
            pdf = pd.from_pandas(df, npartitions=num_partitions, sort=False)
            schema = None if schema is None else _input_schema(schema)
        elif isinstance(df, Iterable):
            # materialize through PandasDataFrame so schema coercion happens
            # eagerly; type_safe is then unnecessary downstream
            schema = _input_schema(schema).assert_not_empty()
            t = PandasDataFrame(df, schema)
            pdf = pd.from_pandas(t.native, npartitions=num_partitions, sort=False)
            type_safe = False
        else:
            raise ValueError(f"{df} is incompatible with DaskDataFrame")
        pdf, schema = self._apply_schema(pdf, schema, type_safe)
        super().__init__(schema, metadata)
        self._native = pdf
    except Exception as e:
        # original exception is chained as the cause via `from e`
        raise FugueDataFrameInitError from e
def subtract(
    self,
    df1: DataFrame,
    df2: DataFrame,
    distinct: bool = True,
    metadata: Any = None,
) -> DataFrame:
    """Return rows of df1 not present in df2 (SQL EXCEPT DISTINCT).

    :param df1: first dataframe
    :param df2: second dataframe
    :param distinct: must be True; EXCEPT ALL is not implemented
    :param metadata: metadata of the resulting dataframe
    :return: the remaining rows of df1
    :raises NotImplementedError: if distinct is False
    :raises ValueError: if the two schemas differ
    """
    assert_or_throw(
        distinct, NotImplementedError("EXCEPT ALL for NativeExecutionEngine")
    )
    assert_or_throw(
        df1.schema == df2.schema,
        lambda: ValueError(f"{df1.schema} != {df2.schema}"),
    )
    remaining = self.pl_utils.except_df(
        df1.as_pandas(), df2.as_pandas(), unique=distinct
    )
    return PandasDataFrame(remaining.reset_index(drop=True), df1.schema, metadata)
def sample(
    self,
    df: DataFrame,
    n: Optional[int] = None,
    frac: Optional[float] = None,
    replace: bool = False,
    seed: Optional[int] = None,
    metadata: Any = None,
) -> DataFrame:
    """Randomly sample rows, either by count (n) or by fraction (frac).

    :param df: input dataframe
    :param n: number of rows to sample (mutually exclusive with frac)
    :param frac: fraction of rows to sample (mutually exclusive with n)
    :param replace: whether to sample with replacement
    :param seed: random seed for reproducibility
    :param metadata: metadata of the resulting dataframe
    :return: the sampled dataframe
    :raises ValueError: unless exactly one of n and frac is set
    """
    # exactly one of n / frac must be provided
    assert_or_throw(
        (n is None) != (frac is None),
        ValueError("one and only one of n and frac should be set"),
    )
    sampled = df.as_pandas().sample(
        n=n, frac=frac, replace=replace, random_state=seed
    )
    return PandasDataFrame(sampled.reset_index(drop=True), df.schema, metadata)
def test_to_df(self):
    """to_df should accept Fugue dataframes, raw arrays, and nested types,
    always producing a new engine-native dataframe."""
    e = self.engine
    # double column containing a None; the engine may reorder rows, so both
    # orders are accepted below
    o = ArrayDataFrame(
        [[1, 2], [None, 3]],
        "a:double,b:int",
        dict(a=1),
    )
    a = e.to_df(o)
    assert a is not o  # must create a new dataframe, not alias the input
    res = a.native.collect()
    assert res[0][0] == 1.0 or res[0][0] is None
    assert res[1][0] == 1.0 or res[1][0] is None
    df_eq(a, o, throw=True)
    # same data via an ArrowDataFrame source
    o = ArrowDataFrame(
        [[1, 2], [None, 3]],
        "a:double,b:int",
        dict(a=1),
    )
    a = e.to_df(o)
    assert a is not o
    res = a.native.collect()
    assert res[0][0] == 1.0 or res[0][0] is None
    assert res[1][0] == 1.0 or res[1][0] is None
    # raw array input with schema and metadata
    a = e.to_df([[1, None]], "a:int,b:int", dict(a=1))
    df_eq(a, [[1, None]], "a:int,b:int", dict(a=1), throw=True)
    # nested struct column survives the conversion type-safely
    o = PandasDataFrame(
        [[{
            "a": "b"
        }, 2]],
        "a:{a:str},b:int",
        dict(a=1),
    )
    a = e.to_df(o)
    assert a is not o
    res = a.as_array(type_safe=True)
    assert res[0][0] == {"a": "b"}
def take(
    self,
    df: DataFrame,
    n: int,
    presort: str,
    na_position: str = "last",
    partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
    metadata: Any = None,
) -> DataFrame:
    """Get the first n rows of df, optionally per partition and after
    sorting by a presort expression.

    :param df: input dataframe
    :param n: number of rows to take (per partition when partition_by is set)
    :param presort: presort expression (e.g. ``"a asc, b desc"``); when
        non-empty it overrides ``partition_spec.presort``
    :param na_position: where NAs sort, "first" or "last"
    :param partition_spec: partitioning spec; rows are taken per partition
        when ``partition_by`` is non-empty
    :param metadata: metadata of the resulting dataframe
    :return: n rows of df (per partition)
    :raises ValueError: if n is not an int
    """
    assert_or_throw(
        isinstance(n, int),
        ValueError("n needs to be an integer"),
    )
    d = df.as_pandas()
    # Use presort over partition_spec.presort if possible
    if presort:
        # re-bind the str parameter to its parsed ordered-dict form
        presort = parse_presort_exp(presort)
    _presort: IndexedOrderedDict = presort or partition_spec.presort
    if len(_presort.keys()) > 0:
        d = d.sort_values(
            list(_presort.keys()),
            ascending=list(_presort.values()),
            na_position=na_position,
        )
    if len(partition_spec.partition_by) == 0:
        d = d.head(n)
    else:
        # dropna=False keeps NULL partition keys as their own group
        # (pandas >= 1.1 feature — presumably guaranteed by setup deps)
        d = d.groupby(by=partition_spec.partition_by, dropna=False).head(n)
    return PandasDataFrame(
        d.reset_index(drop=True), df.schema, metadata, pandas_df_wrapper=True
    )
def dfs():
    """Yield each raw pandas frame in ``output`` wrapped with the shared
    ``schema`` (both from the enclosing scope)."""
    for pdf in output:
        yield PandasDataFrame(pdf, schema)
def to_output_df(self, output: pd.DataFrame, schema: Any) -> DataFrame:
    """Wrap a raw pandas result into a schema-aware local dataframe."""
    return PandasDataFrame(output, schema=schema)