def _parse_schema(self, obj: Any, dfs: DataFrames) -> Schema:
    if callable(obj):
        return obj(dfs, **self.params)
    if isinstance(obj, str):
        return Schema(obj)
    if isinstance(obj, List):
        s = Schema()
        for x in obj:
            s += self._parse_schema(x, dfs)
        return s
    return Schema(obj)
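# Sketch of the input forms handled above (hypothetical call sites, not from the
# source; `parser` stands for an instance of the owning extension class and `dfs`
# for a DataFrames object):
#   parser._parse_schema("a:int,b:str", dfs)                        # schema expression string
#   parser._parse_schema(["a:int", "b:str"], dfs)                   # list items are concatenated
#   parser._parse_schema(lambda dfs, **kwargs: Schema("a:int"), dfs)  # callable gets dfs and params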
def to_validation_rules(data: Dict[str, Any]) -> Dict[str, Any]:
    res: Dict[str, Any] = {}
    for k, v in data.items():
        if k in ["partitionby_has", "partitionby_is"]:
            if isinstance(v, str):
                v = [x.strip() for x in v.split(",")]
            res[k] = PartitionSpec(by=v).partition_by
        elif k in ["presort_has", "presort_is"]:
            res[k] = list(parse_presort_exp(v).items())
        elif k in ["input_has"]:
            if isinstance(v, str):
                res[k] = v.replace(" ", "").split(",")
            else:
                assert_or_throw(
                    isinstance(v, list),
                    lambda: SyntaxError(f"{v} is neither a string nor a list"),
                )
                res[k] = [x.replace(" ", "") for x in v]
        elif k in ["input_is"]:
            try:
                res[k] = str(Schema(v))
            except SyntaxError:
                raise SyntaxError(  # pylint: disable=W0707
                    f"for input_is, the input must be a schema expression {v}"
                )
        else:
            raise NotImplementedError(k)
    return res
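# A minimal usage sketch (the input dict below is made up, not from the source).
# It shows how each supported validation key is normalized by to_validation_rules:
rules = to_validation_rules(
    {
        "partitionby_has": "a, b",  # comma-separated string -> ["a", "b"]
        "presort_has": "a desc",    # presort expression -> [("a", False)]
        "input_has": "a , b",       # spaces stripped -> ["a", "b"]
        "input_is": "a:int,b:str",  # validated and normalized schema expression
    }
)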
def _serialize_by_partition(
    self,
    df: DataFrame,
    partition_spec: PartitionSpec,
    df_name: str,
    temp_path: Optional[str] = None,
    to_file_threshold: Any = -1,
    has_name: bool = False,
) -> DataFrame:
    to_file_threshold = _get_file_threshold(to_file_threshold)
    # keep only the partition keys and presort columns that exist in the dataframe
    on = list(filter(lambda k: k in df.schema, partition_spec.partition_by))
    presort = list(
        filter(lambda p: p[0] in df.schema, partition_spec.presort.items())
    )
    col_name = _df_name_to_serialize_col(df_name)
    if len(on) == 0:
        # no usable partition keys: serialize everything into a single partition
        partition_spec = PartitionSpec(
            partition_spec, num=1, by=[], presort=presort
        )
        output_schema = Schema(f"{col_name}:str")
    else:
        partition_spec = PartitionSpec(partition_spec, by=on, presort=presort)
        output_schema = partition_spec.get_key_schema(df.schema) + f"{col_name}:str"
    s = _PartitionSerializer(output_schema, temp_path, to_file_threshold)
    metadata = dict(
        serialized=True,
        serialized_cols={df_name: col_name},
        schemas={df_name: str(df.schema)},
        serialized_has_name=has_name,
    )
    return self.map(df, s.run, output_schema, partition_spec, metadata)
def __init__(
    self,
    data: Any,
    schema: Any = None,
    metadata: Any = None,
    deterministic: bool = True,
    data_determiner: Optional[Callable[[Any], str]] = None,
    lazy: bool = True,
):
    self._validate_data(data, schema, metadata)
    self._data = data
    self._schema = None if schema is None else Schema(schema)
    self._metadata = None if metadata is None else ParamDict(metadata)
    did = "" if data_determiner is None else data_determiner(data)
    super().__init__(
        params=dict(
            schema=self._schema,
            metadata=self._metadata,
            determinism_id=did,
        ),
        input_n=0,
        output_n=1,
        deterministic=deterministic,
        lazy=lazy,
    )
def correct_select_schema(
    self, input_schema: Schema, select: SelectColumns, output_schema: Schema
) -> Optional[Schema]:
    """Do partial schema inference from ``input_schema`` and ``select`` columns,
    then compare with the SQL output dataframe schema, and return the
    differing part as a new schema, or None if there is no difference

    :param input_schema: input dataframe schema for the select statement
    :param select: the collection of select columns
    :param output_schema: schema of the output dataframe after executing the SQL
    :return: the difference as a new schema, or None if there is no difference

    .. tip::

        This is particularly useful when the SQL engine messed up the schema of
        the output. For example, ``SELECT *`` should return a dataframe with the
        same schema as the input. However, a column ``a:int`` could become
        ``a:long`` in the output dataframe because of information loss. This
        function is designed to make corrections on column types when they can
        be inferred. This may not be perfect, but it can resolve major
        discrepancies.
    """
    cols = select.replace_wildcard(input_schema).assert_all_with_names()
    fields: List[pa.Field] = []
    for c in cols.all_cols:
        tp = c.infer_type(input_schema)
        if tp is not None and tp != output_schema[c.output_name].type:
            fields.append(pa.field(c.output_name, tp))
    if len(fields) == 0:
        return None
    return Schema(fields)
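# A small illustration of the tip above (a sketch; the column names are arbitrary):
# if the engine returned a:long for a SELECT * over a:int, the inferred difference
# comes back as a one-column schema that can be used to cast the output back.
gen = SQLExpressionGenerator()
diff = gen.correct_select_schema(
    Schema("a:int,b:str"),
    SelectColumns(col("*")),
    Schema("a:long,b:str"),
)
assert diff == "a:int"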
def __init__(  # noqa: C901
    self, df: Any = None, schema: Any = None, metadata: Any = None
):
    try:
        if isinstance(df, Iterable):
            self._native = make_empty_aware(self._dfs_wrapper(df))
            orig_schema: Optional[Schema] = None
            if not self._native.empty:
                orig_schema = self._native.peek().schema
        else:
            raise ValueError(
                f"{df} is incompatible with LocalDataFrameIterableDataFrame"
            )
        if orig_schema is None and schema is None:
            raise FugueDataFrameInitError(
                "schema is not provided and the input is empty"
            )
        elif orig_schema is None and schema is not None:
            pass
        elif orig_schema is not None and schema is None:
            schema = orig_schema
        else:
            schema = Schema(schema) if not isinstance(schema, Schema) else schema
            assert_or_throw(
                orig_schema == schema,
                lambda: f"iterable schema {orig_schema} is different from {schema}",
            )
        super().__init__(schema, metadata)
    except FugueDataFrameError:
        raise
    except Exception as e:
        raise FugueDataFrameInitError from e
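# A minimal construction sketch (assumption: ArrayDataFrame comes from fugue's
# dataframe module; the data is made up). When no schema is passed, it is inferred
# by peeking at the first dataframe in the iterable:
def _make_dfs():
    yield ArrayDataFrame([[0]], "a:int")
    yield ArrayDataFrame([[1]], "a:int")

idf = LocalDataFrameIterableDataFrame(_make_dfs())
assert idf.schema == "a:int"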
def test_correct_select_schema():
    schema = Schema("a:double,b:str")
    gen = SQLExpressionGenerator()

    sc = SelectColumns(col("*"), col("c"))
    output = Schema("a:double,b:str,c:str")
    c = gen.correct_select_schema(schema, sc, output)
    assert c is None
    output = Schema("a:int,b:int,c:str")
    c = gen.correct_select_schema(schema, sc, output)
    assert c == "a:double,b:str"

    sc = SelectColumns(f.count(col("*")).alias("t"), col("c").alias("a"))
    output = Schema("t:int,a:str")
    c = gen.correct_select_schema(schema, sc, output)
    assert c is None

    sc = SelectColumns((col("a") + col("b")).cast(str).alias("a"), lit(1, "c"))
    output = Schema("a:int,c:str")
    c = gen.correct_select_schema(schema, sc, output)
    assert c == "a:str,c:long"
def _map_by_pandas_udf(
    self,
    df: DataFrame,
    map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
    output_schema: Any,
    partition_spec: PartitionSpec,
    metadata: Any = None,
    on_init: Optional[Callable[[int, DataFrame], Any]] = None,
) -> DataFrame:
    df = self.to_df(self.repartition(df, partition_spec))
    output_schema = Schema(output_schema)
    input_schema = df.schema
    on_init_once: Any = (
        None
        if on_init is None
        else RunOnce(
            on_init, lambda *args, **kwargs: to_uuid(id(on_init), id(args[0]))
        )
    )

    def _udf(
        dfs: Iterable[pd.DataFrame],
    ) -> Iterable[pd.DataFrame]:  # pragma: no cover
        def get_dfs() -> Iterable[LocalDataFrame]:
            # wrap each non-empty pandas chunk as a Fugue local dataframe
            for df in dfs:
                if df.shape[0] > 0:
                    yield PandasDataFrame(
                        df.reset_index(drop=True),
                        input_schema,
                        pandas_df_wrapper=True,
                    )

        input_df = LocalDataFrameIterableDataFrame(get_dfs(), input_schema)
        if input_df.empty:
            return PandasDataFrame([], output_schema).as_pandas()
        if on_init_once is not None:
            on_init_once(0, input_df)
        cursor = partition_spec.get_cursor(input_schema, 0)
        cursor.set(input_df.peek_array(), 0, 0)
        output_df = map_func(cursor, input_df)
        if isinstance(output_df, LocalDataFrameIterableDataFrame):
            for res in output_df.native:
                yield res.as_pandas()
        else:
            yield output_df.as_pandas()

    df = self.to_df(df)
    sdf = df.native.mapInPandas(_udf, schema=to_spark_schema(output_schema))
    return SparkDataFrame(sdf, metadata=metadata)
def test_schema():
    a = Schema(
        "a:bool,b:int8,c:uint8,d:int16,e:uint16,f:int32,g:uint32,h:int64,i:uint64"
    )
    b = ibis.schema(
        [
            ("a", "boolean"),
            ("b", "int8"),
            ("c", "uint8"),
            ("d", "int16"),
            ("e", "uint16"),
            ("f", "int32"),
            ("g", "uint32"),
            ("h", "int64"),
            ("i", "uint64"),
        ]
    )
    assert to_ibis_schema(a) == b
    assert a == to_schema(b)

    a = Schema("a:float32,b:float64,c:datetime,d:date,e:binary,f:string")
    b = ibis.schema(
        [
            ("a", "float32"),
            ("b", "float64"),
            ("c", "timestamp"),
            ("d", "date"),
            ("e", "binary"),
            ("f", "string"),
        ]
    )
    assert to_ibis_schema(a) == b
    assert a == to_schema(b)

    a = Schema("a:[int],b:[{a:str}],c:{a:str},d:{a:[int]}")
    assert to_schema(to_ibis_schema(a)) == a
def __init__(
    self,
    df: DataFrame,
    map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
    output_schema: Any,
    partition_spec: PartitionSpec,
    on_init: Optional[Callable[[int, DataFrame], Any]],
):
    super().__init__()
    self.schema = df.schema
    self.output_schema = Schema(output_schema)
    self.metadata = df.metadata
    self.partition_spec = partition_spec
    self.map_func = map_func
    self.on_init = on_init
def _group_map_by_pandas_udf(
    self,
    df: DataFrame,
    map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
    output_schema: Any,
    partition_spec: PartitionSpec,
    metadata: Any = None,
    on_init: Optional[Callable[[int, DataFrame], Any]] = None,
) -> DataFrame:
    presort = partition_spec.presort
    presort_keys = list(presort.keys())
    presort_asc = list(presort.values())
    output_schema = Schema(output_schema)
    input_schema = df.schema
    on_init_once: Any = (
        None
        if on_init is None
        else RunOnce(
            on_init, lambda *args, **kwargs: to_uuid(id(on_init), id(args[0]))
        )
    )

    def _udf(pdf: Any) -> pd.DataFrame:  # pragma: no cover
        if pdf.shape[0] == 0:
            return PandasDataFrame([], output_schema).as_pandas()
        if len(presort_keys) > 0:
            # apply the presort within each group before mapping
            pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
        input_df = PandasDataFrame(
            pdf.reset_index(drop=True), input_schema, pandas_df_wrapper=True
        )
        if on_init_once is not None:
            on_init_once(0, input_df)
        cursor = partition_spec.get_cursor(input_schema, 0)
        cursor.set(input_df.peek_array(), 0, 0)
        output_df = map_func(cursor, input_df)
        return output_df.as_pandas()

    df = self.to_df(df)
    gdf = df.native.groupBy(*partition_spec.partition_by)
    sdf = gdf.applyInPandas(_udf, schema=to_spark_schema(output_schema))
    return SparkDataFrame(sdf, metadata=metadata)
def test_schema_inference():
    schema = Schema("a:int,b:str,c:bool,d:double")
    assert pa.int32() == col("a").infer_type(schema)
    assert pa.int32() == (-col("a")).infer_type(schema)
    assert pa.int64() == (-col("a")).cast(int).infer_type(schema)
    assert pa.int64() == (-col("a").cast(int)).infer_type(schema)
    assert pa.string() == col("b").infer_type(schema)
    assert (-col("b")).infer_type(schema) is None
    assert (~col("b")).infer_type(schema) is None
    assert pa.bool_() == col("c").infer_type(schema)
    assert pa.bool_() == (~col("c")).alias("x").infer_type(schema)
    assert pa.float64() == col("d").infer_type(schema)
    assert pa.float64() == (-col("d").alias("x")).infer_type(schema)

    assert col("x").infer_type(schema) is None
    assert pa.string() == col("x").cast(str).infer_type(schema)
    assert col("*").infer_type(schema) is None

    assert pa.bool_() == (col("a") < col("d")).infer_type(schema)
    assert pa.bool_() == (col("a") > col("d")).infer_type(schema)
    assert pa.bool_() == (col("a") <= col("d")).infer_type(schema)
    assert pa.bool_() == (col("a") >= col("d")).infer_type(schema)
    assert pa.bool_() == (col("a") == col("d")).infer_type(schema)
    assert pa.bool_() == (col("a") != col("d")).infer_type(schema)
    assert pa.bool_() == (~(col("a") != col("d"))).infer_type(schema)
    assert pa.int64() == (~(col("a") != col("d"))).cast(int).infer_type(schema)
    assert (col("a") - col("d")).infer_type(schema) is None

    assert pa.int64() == lit(1).infer_type(schema)
    assert pa.string() == lit("a").infer_type(schema)
    assert pa.bool_() == lit(False).infer_type(schema)
    assert pa.string() == lit(False).cast(str).infer_type(schema)
    assert pa.float64() == lit(2.2).infer_type(schema)
    assert null().infer_type(schema) is None
    assert pa.string() == null().cast(str).infer_type(schema)

    assert function("a", col("a").cast("int")).infer_type(schema) is None
    assert pa.string() == function("a", col("a").cast("int")).cast(str).infer_type(
        schema
    )
def map(
    self,
    df: DataFrame,
    map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
    output_schema: Any,
    partition_spec: PartitionSpec,
    metadata: Any = None,
    on_init: Optional[Callable[[int, DataFrame], Any]] = None,
) -> DataFrame:
    if (
        self.conf.get_or_throw(FUGUE_SPARK_CONF_USE_PANDAS_UDF, bool)
        and hasattr(ps.DataFrame, "mapInPandas")  # new pyspark
        and not any(pa.types.is_nested(t) for t in Schema(output_schema).types)
    ):
        # pandas udf can only be used for pyspark > 3
        if (
            len(partition_spec.partition_by) > 0
            and partition_spec.algo != "even"
        ):
            return self._group_map_by_pandas_udf(
                df,
                map_func=map_func,
                output_schema=output_schema,
                partition_spec=partition_spec,
                metadata=metadata,
                on_init=on_init,
            )
        elif len(partition_spec.partition_by) == 0:
            return self._map_by_pandas_udf(
                df,
                map_func=map_func,
                output_schema=output_schema,
                partition_spec=partition_spec,
                metadata=metadata,
                on_init=on_init,
            )
    df = self.to_df(self.repartition(df, partition_spec))
    mapper = _Mapper(df, map_func, output_schema, partition_spec, on_init)
    sdf = df.native.rdd.mapPartitionsWithIndex(mapper.run, True)
    return self.to_df(sdf, output_schema, metadata)
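# Sketch of a map_func this dispatcher expects (an illustration, not taken from the
# source): it receives the partition cursor and a local dataframe, and must return
# a local dataframe matching output_schema.
def _double_a(cursor: PartitionCursor, df: LocalDataFrame) -> LocalDataFrame:
    pdf = df.as_pandas()
    pdf["a"] = pdf["a"] * 2  # assumes an "a" column exists in the input schema
    return PandasDataFrame(pdf, df.schema)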
def to_schema(schema: ibis.Schema) -> Schema:
    fields = [(n, _ibis_to_pa_type(t)) for n, t in zip(schema.names, schema.types)]
    return Schema(fields)
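# A round-trip sketch mirroring the schema test above (the column names and types
# are arbitrary; "long" is the Fugue alias for int64):
ibis_schema = ibis.schema([("a", "int64"), ("b", "string")])
assert to_schema(ibis_schema) == Schema("a:long,b:str")
assert to_ibis_schema(Schema("a:long,b:str")) == ibis_schema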
def test_select_columns():
    # not all with names
    cols = SelectColumns(
        col("a"), lit(1, "b"), col("bb") + col("cc"), f.first(col("c"))
    )
    assert to_uuid(cols) == to_uuid(cols)
    raises(ValueError, lambda: cols.assert_all_with_names())

    # distinct
    cols2 = SelectColumns(
        col("a"),
        lit(1, "b"),
        col("bb") + col("cc"),
        f.first(col("c")),
        arg_distinct=True,
    )
    assert to_uuid(cols) != to_uuid(cols2)

    # duplicated names
    cols = SelectColumns(col("a").alias("b"), lit(1, "b"))
    assert to_uuid(cols) != to_uuid(SelectColumns(col("a").alias("b"), lit(1, "c")))
    raises(ValueError, lambda: cols.assert_all_with_names())

    # with *, all cols must have alias
    cols = SelectColumns(col("*"), col("a")).assert_no_agg()
    raises(ValueError, lambda: cols.assert_all_with_names())

    # * can be used at most once
    raises(ValueError, lambda: SelectColumns(col("*"), col("*"), col("a").alias("p")))

    # * can't be used with aggregation
    raises(ValueError, lambda: SelectColumns(col("*"), f.first(col("a")).alias("x")))

    cols = SelectColumns(
        col("aa").alias("a").cast(int),
        lit(1, "b"),
        (col("bb") + col("cc")).alias("c"),
        f.first(col("c")).alias("d"),
    ).assert_all_with_names()
    raises(AssertionError, lambda: cols.assert_no_agg())
    assert not cols.simple
    assert 1 == len(cols.simple_cols)
    assert "CAST(aa AS long) AS a" == str(cols.simple_cols[0])
    assert cols.has_literals
    assert 1 == len(cols.literals)
    assert "1 AS b" == str(cols.literals[0])
    assert cols.has_agg
    assert 1 == len(cols.non_agg_funcs)
    assert "+(bb,cc) AS c" == str(cols.non_agg_funcs[0])
    assert 1 == len(cols.agg_funcs)
    assert "FIRST(c) AS d" == str(cols.agg_funcs[0])
    assert 2 == len(cols.group_keys)  # a, c
    assert "aa" == cols.group_keys[0].output_name
    assert "" == cols.group_keys[1].output_name
    assert isinstance(cols.group_keys[1], _BinaryOpExpr)

    cols = SelectColumns(col("a")).assert_no_wildcard()
    assert cols.simple
    assert not cols.has_literals
    assert not cols.has_agg

    cols = SelectColumns(col("x"), col("*"), col("y") + col("z"))
    cols = cols.replace_wildcard(Schema("a:int,b:int"))
    assert "x" == str(cols.all_cols[0])
def test_functions():
    schema = Schema("a:int,b:str,c:bool,d:double")

    expr = f.coalesce(col("a"), 1, None, col("b") + col("c"))
    assert "COALESCE(a,1,NULL,+(b,c))" == str(expr)
    assert expr.infer_type(schema) is None

    expr = f.min(col("a"))
    assert "MIN(a)" == str(expr)
    assert pa.int32() == expr.infer_type(schema)
    assert "MIN(a) AS a" == str(expr.infer_alias())
    assert "CAST(MIN(a) AS long) AS a" == str(expr.cast(int).infer_alias())
    assert "MIN(a) AS b" == str(expr.alias("b").infer_alias())
    assert "MIN(-(a)) AS a" == str(f.min(-col("a")).infer_alias())

    expr = f.min(lit(1.1))
    assert "MIN(1.1)" == str(expr)
    assert pa.float64() == expr.infer_type(schema)

    expr = f.max(col("a"))
    assert "MAX(a)" == str(expr)
    assert pa.int32() == expr.infer_type(schema)

    expr = f.max(lit(1.1))
    assert "MAX(1.1)" == str(expr)
    assert pa.float64() == expr.infer_type(schema)

    expr = f.first(col("a"))
    assert "FIRST(a)" == str(expr)
    assert pa.int32() == expr.infer_type(schema)

    expr = f.first(lit(1.1))
    assert "FIRST(1.1)" == str(expr)
    assert pa.float64() == expr.infer_type(schema)

    expr = f.last(col("a"))
    assert "LAST(a)" == str(expr)
    assert pa.int32() == expr.infer_type(schema)

    expr = f.last(lit(1.1))
    assert "LAST(1.1)" == str(expr)
    assert pa.float64() == expr.infer_type(schema)

    expr = f.avg(col("a"))
    assert "AVG(a)" == str(expr)
    assert expr.infer_type(schema) is None

    expr = f.sum(col("a"))
    assert "SUM(a)" == str(expr)
    assert expr.infer_type(schema) is None

    expr = f.count(col("a"))
    assert "COUNT(a)" == str(expr)
    assert expr.infer_type(schema) is None

    expr = f.count_distinct(col("a"))
    assert "COUNT(DISTINCT a)" == str(expr)
    assert expr.infer_type(schema) is None
    assert "COUNT(DISTINCT a) AS a" == str(expr.infer_alias())

    expr = f.count_distinct(col("*"))
    assert "COUNT(DISTINCT *)" == str(expr)
    assert expr.infer_type(schema) is None
    assert "COUNT(DISTINCT *)" == str(expr.infer_alias())