def append(self, obj: Any) -> "Schema": # noqa: C901 """Append schema like object to the current schema. Only new columns are allowed. :raises SchemaError: if a column exists or is invalid or obj is not convertible :return: the Schema object itself """ try: if obj is None: return self elif isinstance(obj, pa.Field): self[obj.name] = obj.type elif isinstance(obj, str): self._append_pa_schema(expression_to_schema(obj)) elif isinstance(obj, Dict): for k, v in obj.items(): self[k] = v elif isinstance(obj, pa.Schema): self._append_pa_schema(obj) elif isinstance(obj, pd.DataFrame): self._append_pa_schema(PD_UTILS.to_schema(obj)) elif isinstance(obj, Tuple): # type: ignore self[obj[0]] = obj[1] elif isinstance(obj, List): for x in obj: self.append(x) else: raise SchemaError(f"Invalid schema to add {obj}") return self except SchemaError: raise except Exception as e: raise SchemaError(str(e))
def as_array(self, cols=None, type_safe=False, null_schema=False):
    """Materialize the underlying pandas data as a list of rows.

    :param cols: subset of columns to output, defaults to None (all)
    :param type_safe: whether to enforce types per row, defaults to False
    :param null_schema: when True, pass no schema to the iterable helper
    :return: list of row arrays
    """
    effective_schema = None if null_schema else self.schema
    rows = PD_UTILS.as_array_iterable(
        self.native, schema=effective_schema, columns=cols, type_safe=type_safe
    )
    return list(rows)
def _apply_schema(
    self, pdf: pd.DataFrame, schema: Optional[Schema]
) -> Tuple[pd.DataFrame, Schema]:
    """Reconcile a pandas DataFrame with an optional schema.

    If ``pdf`` already has named (object-dtype) columns, the schema is either
    inferred from it or used to select/reorder columns; otherwise ``schema``
    is required and its names are assigned positionally before type
    enforcement.

    :param pdf: input pandas DataFrame
    :param schema: optional schema to apply
    :raises ValueError: if a positional frame's column count mismatches schema
    :return: tuple of (possibly modified DataFrame, resolved Schema)
    """
    PD_UTILS.ensure_compatible(pdf)
    if pdf.columns.dtype == "object":  # pdf has named schema
        pschema = _input_schema(pdf)
        if schema is None or pschema == schema:
            return pdf, pschema.assert_not_empty()
        # select columns in the order the explicit schema requires
        pdf = pdf[schema.assert_not_empty().names]
    else:  # pdf has no named schema
        schema = _input_schema(schema).assert_not_empty()
        assert_or_throw(
            pdf.shape[1] == len(schema),
            # fixed typo: "datafame" -> "dataframe"
            ValueError(f"Pandas dataframe column count doesn't match {schema}"),
        )
        pdf.columns = schema.names
    return _enforce_type(pdf, schema), schema
def test_safe_group_by_apply():
    """safe_groupby_apply must keep null keys and work with empty key lists."""
    df = DF([["a", 1], ["a", 2], [None, 3]], "b:str,c:long", True)

    def _add_count(pdf):
        PD_UTILS.ensure_compatible(pdf)
        pdf["ct"] = pdf.shape[0]
        return pdf

    # grouping on a string column containing None
    res = PD_UTILS.safe_groupby_apply(df.native, ["b"], _add_count)
    PD_UTILS.ensure_compatible(res)
    assert 3 == res.shape[0]
    assert 3 == res.shape[1]
    assert [["a", 1, 2], ["a", 2, 2], [None, 3, 1]] == res.values.tolist()

    # no keys: the function is applied to the whole frame
    res = PD_UTILS.safe_groupby_apply(df.native, [], _add_count)
    PD_UTILS.ensure_compatible(res)
    assert 3 == res.shape[0]
    assert 3 == res.shape[1]
    assert [["a", 1, 3], ["a", 2, 3], [None, 3, 3]] == res.values.tolist()

    # grouping on a float column where None becomes NaN
    df = DF(
        [[1.0, "a"], [1.0, "b"], [None, "c"], [None, "d"]], "b:double,c:str", True
    )
    res = PD_UTILS.safe_groupby_apply(df.native, ["b"], _add_count)
    expected = [
        [1.0, "a", 2],
        [1.0, "b", 2],
        [float("nan"), "c", 2],
        [float("nan"), "d", 2],
    ]
    # compare via repr because NaN != NaN under ==
    assert repr(expected) == repr(res.values.tolist())
def as_array_iterable(
    self, columns: Optional[List[str]] = None, type_safe: bool = False
) -> Iterable[Any]:
    """Iterate the underlying pandas data as row arrays.

    :param columns: subset of columns to output, defaults to None (all)
    :param type_safe: whether to enforce types per row, defaults to False
    :return: iterable of rows
    """
    yield from PD_UTILS.as_array_iterable(
        self.native,
        schema=self.schema.pa_schema,
        columns=columns,
        type_safe=type_safe,
    )
def __init__(  # noqa: C901
    self,
    df: Any = None,
    schema: Any = None,
    metadata: Any = None,
    pandas_df_wrapper: bool = False,
):
    """Construct the dataframe from a variety of inputs.

    :param df: None (empty frame), another PandasDataFrame, a pandas
        DataFrame/Series, or an iterable of rows
    :param schema: schema-like object; required for None/iterable inputs
    :param metadata: metadata-like object passed to the base class
    :param pandas_df_wrapper: when True and a pandas DataFrame with an
        explicit schema is given, skip schema re-application (fast path)
    :raises FugueDataFrameInitError: if construction fails for any reason
    """
    try:
        apply_schema = True
        if df is None:
            # empty frame: schema must be provided and non-empty
            schema = _input_schema(schema).assert_not_empty()
            df = []
        if isinstance(df, PandasDataFrame):
            # TODO: This is useless if in this way and wrong
            pdf = df.native
            schema = None
        elif isinstance(df, (pd.DataFrame, pd.Series)):
            if isinstance(df, pd.Series):
                df = df.to_frame()
            pdf = df
            schema = None if schema is None else _input_schema(schema)
            if pandas_df_wrapper and schema is not None:
                # caller guarantees the frame already matches the schema
                apply_schema = False
        elif isinstance(df, Iterable):
            # row iterable: schema is mandatory to name and type the columns
            schema = _input_schema(schema).assert_not_empty()
            pdf = pd.DataFrame(df, columns=schema.names)
            pdf = PD_UTILS.enforce_type(pdf, schema.pa_schema, null_safe=True)
            if PD_UTILS.empty(pdf):
                # enforce dtypes explicitly since there are no rows to infer from
                for k, v in schema.items():
                    pdf[k] = pdf[k].astype(v.type.to_pandas_dtype())
            apply_schema = False
        else:
            raise ValueError(f"{df} is incompatible with PandasDataFrame")
        if apply_schema:
            pdf, schema = self._apply_schema(pdf, schema)
        super().__init__(schema, metadata)
        self._native = pdf
    except Exception as e:
        raise FugueDataFrameInitError from e
def test_fillna_default():
    """fillna_default must pick a type-appropriate default for nulls."""
    df = pd.DataFrame([["a"], [None]], columns=["x"])
    s = PD_UTILS.fillna_default(df["x"])
    assert ["a", 0] == s.tolist()

    df = pd.DataFrame([["a"], ["b"]], columns=["x"])
    # np.str was removed in NumPy 1.24; it was always an alias of builtin str
    s = PD_UTILS.fillna_default(df["x"].astype(str))
    assert ["a", "b"] == s.tolist()

    dt = datetime.now()
    df = pd.DataFrame([[dt], [None]], columns=["x"])
    s = PD_UTILS.fillna_default(df["x"])
    assert [dt, _DEFAULT_DATETIME] == s.tolist()

    df = pd.DataFrame([[True], [None]], columns=["x"])
    s = PD_UTILS.fillna_default(df["x"])
    assert [True, 0] == s.tolist()

    df = pd.DataFrame([[True], [False]], columns=["x"])
    s = PD_UTILS.fillna_default(df["x"].astype(bool))
    assert [True, False] == s.tolist()
def map(
    self,
    df: DataFrame,
    map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
    output_schema: Any,
    partition_spec: PartitionSpec,
    metadata: Any = None,
    on_init: Optional[Callable[[int, DataFrame], Any]] = None,
) -> DataFrame:
    """Apply ``map_func`` to each partition of ``df``.

    :param df: input dataframe
    :param map_func: function applied per partition, driven by a cursor
    :param output_schema: schema the mapped output must have
    :param partition_spec: partitioning (keys, presort, num_partitions)
    :param metadata: metadata attached to the output dataframe
    :param on_init: optional callback invoked once before mapping
    :return: mapped dataframe with ``output_schema``
    """
    # this engine is single-machine; it cannot honor num_partitions
    if partition_spec.num_partitions != "0":
        self.log.warning(
            f"{self} doesn't respect num_partitions {partition_spec.num_partitions}"
        )
    cursor = partition_spec.get_cursor(df.schema, 0)
    if on_init is not None:
        on_init(0, df)
    if len(partition_spec.partition_by) == 0:
        # no partition keys: treat the whole dataframe as one partition
        df = to_local_df(df)
        cursor.set(df.peek_array(), 0, 0)
        output_df = map_func(cursor, df)
        assert_or_throw(
            output_df.schema == output_schema,
            f"map output {output_df.schema} mismatches given {output_schema}",
        )
        # attach metadata and freeze it so downstream code can't mutate it
        output_df._metadata = ParamDict(metadata, deep=True)
        output_df._metadata.set_readonly()
        return self.to_df(output_df)
    presort = partition_spec.presort
    presort_keys = list(presort.keys())
    presort_asc = list(presort.values())
    output_schema = Schema(output_schema)

    def _map(pdf: pd.DataFrame) -> pd.DataFrame:
        # sort within the partition before handing it to map_func
        if len(presort_keys) > 0:
            pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
        input_df = PandasDataFrame(
            pdf.reset_index(drop=True), df.schema, pandas_df_wrapper=True
        )
        # advance the cursor to the next partition, row 0
        cursor.set(input_df.peek_array(), cursor.partition_no + 1, 0)
        output_df = map_func(cursor, input_df)
        return output_df.as_pandas()

    result = PD_UTILS.safe_groupby_apply(
        df.as_pandas(), partition_spec.partition_by, _map
    )
    return PandasDataFrame(result, output_schema, metadata)
def as_pandas(self) -> pd.DataFrame:
    """Convert to a pandas DataFrame with schema-enforced column types."""
    raw = pd.DataFrame(self.as_array(), columns=self.schema.names)
    # enforce_type makes the pandas dtypes agree with the pyarrow schema
    return PD_UTILS.enforce_type(raw, self.schema.pa_schema, null_safe=True)
def to_df(self, df: Any, schema: Any = None, metadata: Any = None) -> SparkDataFrame:
    """Convert a data structure to :class:`~fugue_spark.dataframe.SparkDataFrame`

    :param data: :class:`~fugue.dataframe.dataframe.DataFrame`,
      :class:`spark:pyspark.sql.DataFrame`, :class:`spark:pyspark.RDD`,
      pandas DataFrame or list or iterable of arrays
    :param schema: |SchemaLikeObject| or :class:`spark:pyspark.sql.types.StructType`
      defaults to None.
    :param metadata: |ParamsLikeObject|, defaults to None
    :return: engine compatible dataframe

    :Notice:

    * if the input is already :class:`~fugue_spark.dataframe.SparkDataFrame`,
      it should return itself
    * For :class:`~spark:pyspark.RDD`, list or iterable of arrays,
      ``schema`` must be specified
    * When ``schema`` is not None, a potential type cast may happen to ensure
      the dataframe's schema.
    * all other methods in the engine can take arbitrary dataframes and
      call this method to convert before doing anything
    """
    if isinstance(df, DataFrame):
        # fugue DataFrames carry their own schema/metadata; overriding is an error
        assert_or_throw(
            schema is None and metadata is None,
            ValueError("schema and metadata must be None when df is a DataFrame"),
        )
        if isinstance(df, SparkDataFrame):
            return df
        if isinstance(df, ArrowDataFrame):
            sdf = self.spark_session.createDataFrame(
                df.as_array(), to_spark_schema(df.schema)
            )
            return SparkDataFrame(sdf, df.schema, df.metadata)
        if isinstance(df, (ArrayDataFrame, IterableDataFrame)):
            # round-trip through arrow to normalize types (e.g. nulls in int cols)
            adf = ArrowDataFrame(df.as_array(type_safe=False), df.schema)
            sdf = self.spark_session.createDataFrame(
                adf.as_array(), to_spark_schema(df.schema)
            )
            return SparkDataFrame(sdf, df.schema, df.metadata)
        if any(pa.types.is_struct(t) for t in df.schema.types):
            # struct columns don't survive the pandas path; use arrays instead
            sdf = self.spark_session.createDataFrame(
                df.as_array(type_safe=True), to_spark_schema(df.schema)
            )
        else:
            sdf = self.spark_session.createDataFrame(
                df.as_pandas(), to_spark_schema(df.schema)
            )
        return SparkDataFrame(sdf, df.schema, df.metadata)
    if isinstance(df, ps.DataFrame):
        # native pyspark DataFrame: wrap it directly
        return SparkDataFrame(
            df, None if schema is None else to_schema(schema), metadata
        )
    if isinstance(df, RDD):
        # RDDs carry no schema, so one must be supplied
        assert_arg_not_none(schema, "schema")
        sdf = self.spark_session.createDataFrame(df, to_spark_schema(schema))
        return SparkDataFrame(sdf, to_schema(schema), metadata)
    if isinstance(df, pd.DataFrame):
        if PD_UTILS.empty(df):
            # createDataFrame can't infer a schema from zero rows
            temp_schema = to_spark_schema(PD_UTILS.to_schema(df))
            sdf = self.spark_session.createDataFrame([], temp_schema)
        else:
            sdf = self.spark_session.createDataFrame(df)
        return SparkDataFrame(sdf, schema, metadata)

    # use arrow dataframe here to handle nulls in int cols
    assert_or_throw(schema is not None, FugueDataFrameInitError("schema can't be None"))
    adf = ArrowDataFrame(df, to_schema(schema))
    sdf = self.spark_session.createDataFrame(
        adf.as_array(), to_spark_schema(adf.schema)
    )
    return SparkDataFrame(sdf, adf.schema, metadata)
def __init__(self, data, schema, enforce=False):
    """Test helper: build a typed pandas frame from rows and a schema expression.

    :param data: list of rows
    :param schema: schema expression string, e.g. ``"a:str,b:int"``
    :param enforce: whether to enforce the schema types on the data
    """
    parsed = expression_to_schema(schema)
    frame = pd.DataFrame(data, columns=parsed.names)
    self.native = PD_UTILS.enforce_type(frame, parsed, enforce)
    self.schema = parsed
def _m1(df):
    """Add a ``ct`` column holding the row count of the incoming frame."""
    PD_UTILS.ensure_compatible(df)
    row_count = df.shape[0]
    df["ct"] = row_count
    return df
def test_safe_group_by_apply_special_types():
    """safe_groupby_apply must handle null keys of str/double/datetime/date types."""

    def _add_count(pdf):
        PD_UTILS.ensure_compatible(pdf)
        pdf["ct"] = pdf.shape[0]
        return pdf

    ts = datetime.now()
    d = date(2020, 1, 1)
    # (input rows, input schema, expected rows, expected schema)
    cases = [
        (
            [["a", 1.0], [None, 3.0], [None, 3.0], [None, None]],
            "a:str,b:double",
            [["a", 1.0, 1], [None, 3.0, 2], [None, 3.0, 2], [None, None, 1]],
            "a:str,b:double,ct:int",
        ),
        (
            [["a", ts], [None, ts], [None, ts], [None, None]],
            "a:str,b:datetime",
            [["a", ts, 1], [None, ts, 2], [None, ts, 2], [None, None, 1]],
            "a:str,b:datetime,ct:int",
        ),
        (
            [["a", d], [None, d], [None, d], [None, None]],
            "a:str,b:date",
            [["a", d, 1], [None, d, 2], [None, d, 2], [None, None, 1]],
            "a:str,b:date,ct:int",
        ),
        (
            [["a", d], ["b", d], ["b", d], ["b", None]],
            "a:str,b:date",
            [["a", d, 1], ["b", d, 2], ["b", d, 2], ["b", None, 1]],
            "a:str,b:date,ct:int",
        ),
    ]
    for data, schema, expected, expected_schema in cases:
        df = DF(data, schema, True)
        res = PD_UTILS.safe_groupby_apply(df.native, ["a", "b"], _add_count)
        PD_UTILS.ensure_compatible(res)
        assert 4 == res.shape[0]
        assert 3 == res.shape[1]
        DF(expected, expected_schema, True).assert_eq(res)
def test_to_schema():
    """to_schema must match pyarrow inference and reject named/custom indexes."""

    def _assert_schema_match(pdf):
        assert list(pa.Schema.from_pandas(pdf)) == list(PD_UTILS.to_schema(pdf))

    # unnamed (non-object) columns are rejected
    df = pd.DataFrame([[1.0, 2], [2.0, 3]])
    raises(ValueError, lambda: PD_UTILS.to_schema(df))

    df = pd.DataFrame([[1.0, 2], [2.0, 3]], columns=["x", "y"])
    _assert_schema_match(df)
    df = pd.DataFrame([["a", 2], ["b", 3]], columns=["x", "y"])
    _assert_schema_match(df)

    # empty frame: dtypes are taken from the declared types, object -> string
    df = pd.DataFrame([], columns=["x", "y"])
    df = df.astype(dtype={"x": np.int32, "y": np.dtype("object")})
    assert [pa.field("x", pa.int32()), pa.field("y", pa.string())] == list(
        PD_UTILS.to_schema(df)
    )

    # different ways of declaring string dtype must all agree with pyarrow
    for y_dtype in (np.dtype("object"), np.dtype(str), np.dtype("str")):
        df = pd.DataFrame([[1, "x"], [2, "y"]], columns=["x", "y"])
        df = df.astype(dtype={"x": np.int32, "y": y_dtype})
        _assert_schema_match(df)

    # test index
    df = pd.DataFrame([[3.0, 2], [2.0, 3]], columns=["x", "y"])
    df = df.sort_values(["x"])
    _assert_schema_match(df)
    df.index.name = "x"
    raises(ValueError, lambda: PD_UTILS.to_schema(df))
    df = df.reset_index(drop=True)
    _assert_schema_match(df)
    df["p"] = "p"
    df = df.set_index(["p"])
    df.index.name = None
    raises(ValueError, lambda: PD_UTILS.to_schema(df))