Example #1
def _parse_schema(self, obj: Any, dfs: DataFrames) -> Schema:
    if callable(obj):
        return obj(dfs, **self.params)
    if isinstance(obj, str):
        return Schema(obj)
    if isinstance(obj, List):
        s = Schema()
        for x in obj:
            s += self._parse_schema(x, dfs)
        return s
    return Schema(obj)
Example #2
def to_validation_rules(data: Dict[str, Any]) -> Dict[str, Any]:
    res: Dict[str, Any] = {}
    for k, v in data.items():
        if k in ["partitionby_has", "partitionby_is"]:
            if isinstance(v, str):
                v = [x.strip() for x in v.split(",")]
            res[k] = PartitionSpec(by=v).partition_by
        elif k in ["presort_has", "presort_is"]:
            res[k] = list(parse_presort_exp(v).items())
        elif k in ["input_has"]:
            if isinstance(v, str):
                res[k] = v.replace(" ", "").split(",")
            else:
                assert_or_throw(
                    isinstance(v, list),
                    lambda: SyntaxError(f"{v} is neither a string nor a list"),
                )
                res[k] = [x.replace(" ", "") for x in v]
        elif k in ["input_is"]:
            try:
                res[k] = str(Schema(v))
            except SyntaxError:
                raise SyntaxError(  # pylint: disable=W0707
                    f"for input_is, the input must be a schema expression {v}")
        else:
            raise NotImplementedError(k)
    return res
Example #3
def _serialize_by_partition(
    self,
    df: DataFrame,
    partition_spec: PartitionSpec,
    df_name: str,
    temp_path: Optional[str] = None,
    to_file_threshold: Any = -1,
    has_name: bool = False,
) -> DataFrame:
    to_file_threshold = _get_file_threshold(to_file_threshold)
    on = list(filter(lambda k: k in df.schema, partition_spec.partition_by))
    presort = list(
        filter(lambda p: p[0] in df.schema, partition_spec.presort.items())
    )
    col_name = _df_name_to_serialize_col(df_name)
    if len(on) == 0:
        partition_spec = PartitionSpec(
            partition_spec, num=1, by=[], presort=presort
        )
        output_schema = Schema(f"{col_name}:str")
    else:
        partition_spec = PartitionSpec(partition_spec, by=on, presort=presort)
        output_schema = partition_spec.get_key_schema(df.schema) + f"{col_name}:str"
    s = _PartitionSerializer(output_schema, temp_path, to_file_threshold)
    metadata = dict(
        serialized=True,
        serialized_cols={df_name: col_name},
        schemas={df_name: str(df.schema)},
        serialized_has_name=has_name,
    )
    return self.map(df, s.run, output_schema, partition_spec, metadata)
Example #4
File: _tasks.py Project: gityow/fugue
def __init__(
    self,
    data: Any,
    schema: Any = None,
    metadata: Any = None,
    deterministic: bool = True,
    data_determiner: Optional[Callable[[Any], str]] = None,
    lazy: bool = True,
):
    self._validate_data(data, schema, metadata)
    self._data = data
    self._schema = None if schema is None else Schema(schema)
    self._metadata = None if metadata is None else ParamDict(metadata)
    did = "" if data_determiner is None else data_determiner(data)
    super().__init__(
        params=dict(
            schema=self._schema,
            metadata=self._metadata,
            determinism_id=did,
        ),
        input_n=0,
        output_n=1,
        deterministic=deterministic,
        lazy=lazy,
    )
Example #5
File: sql.py Project: gityow/fugue
    def correct_select_schema(
        self, input_schema: Schema, select: SelectColumns, output_schema: Schema
    ) -> Optional[Schema]:
        """Do partial schema inference from ``input_schema`` and ``select`` columns,
        then compare with the SQL output dataframe schema, and return the different
        part as a new schema, or None if there is no difference

        :param input_schema: input dataframe schema for the select statement
        :param select: the collection of select columns
        :param output_schema: schema of the output dataframe after executing the SQL
        :return: the difference as a new schema or None if no difference

        .. tip::

            This is particularly useful when the SQL engine messes up the schema of
            the output. For example, ``SELECT *`` should return a dataframe with the
            same schema as the input, but a column ``a:int`` could become ``a:long``
            in the output dataframe because of information loss. This function is
            designed to correct column types when they can be inferred. It may not be
            perfect, but it can resolve major discrepancies.
        """
        cols = select.replace_wildcard(input_schema).assert_all_with_names()
        fields: List[pa.Field] = []
        for c in cols.all_cols:
            tp = c.infer_type(input_schema)
            if tp is not None and tp != output_schema[c.output_name].type:
                fields.append(pa.field(c.output_name, tp))
        if len(fields) == 0:
            return None
        return Schema(fields)
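
The schema returned above only contains the columns whose types need fixing, so the typical follow-up is to cast those columns on the SQL result. Below is a minimal sketch of that pattern; the import paths, the sample ArrayDataFrame, and the use of alter_columns are assumptions for illustration, not code taken from the examples on this page.

# Sketch only (assumed imports and API, not project source): use the partial
# schema returned by correct_select_schema to cast back drifted columns.
from fugue.column import SQLExpressionGenerator, SelectColumns, col
from fugue.dataframe import ArrayDataFrame
from triad import Schema

input_schema = Schema("a:double,b:str")
sc = SelectColumns(col("*"))  # SELECT * should keep the input schema as-is

# pretend the SQL engine returned column a as int instead of double
result = ArrayDataFrame([[1, "x"]], "a:int,b:str")

gen = SQLExpressionGenerator()
diff = gen.correct_select_schema(input_schema, sc, result.schema)
if diff is not None:  # e.g. Schema("a:double")
    result = result.alter_columns(diff)  # cast only the drifted columns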
Example #6
def __init__(  # noqa: C901
        self,
        df: Any = None,
        schema: Any = None,
        metadata: Any = None):
    try:
        if isinstance(df, Iterable):
            self._native = make_empty_aware(self._dfs_wrapper(df))
            orig_schema: Optional[Schema] = None
            if not self._native.empty:
                orig_schema = self._native.peek().schema
        else:
            raise ValueError(
                f"{df} is incompatible with LocalDataFrameIterableDataFrame"
            )
        if orig_schema is None and schema is None:
            raise FugueDataFrameInitError(
                "schema is not provided and the input is empty")
        elif orig_schema is None and schema is not None:
            pass
        elif orig_schema is not None and schema is None:
            schema = orig_schema
        else:
            schema = Schema(schema) if not isinstance(schema, Schema) else schema
            assert_or_throw(
                orig_schema == schema,
                lambda: f"iterable schema {orig_schema} is different from {schema}",
            )
        super().__init__(schema, metadata)
    except FugueDataFrameError:
        raise
    except Exception as e:
        raise FugueDataFrameInitError from e
Example #7
def test_correct_select_schema():
    schema = Schema("a:double,b:str")
    gen = SQLExpressionGenerator()

    sc = SelectColumns(col("*"), col("c"))
    output = Schema("a:double,b:str,c:str")
    c = gen.correct_select_schema(schema, sc, output)
    assert c is None

    output = Schema("a:int,b:int,c:str")
    c = gen.correct_select_schema(schema, sc, output)
    assert c == "a:double,b:str"

    sc = SelectColumns(f.count(col("*")).alias("t"), col("c").alias("a"))
    output = Schema("t:int,a:str")
    c = gen.correct_select_schema(schema, sc, output)
    assert c is None

    sc = SelectColumns((col("a") + col("b")).cast(str).alias("a"), lit(1, "c"))
    output = Schema("a:int,c:str")
    c = gen.correct_select_schema(schema, sc, output)
    assert c == "a:str,c:long"
Example #8
    def _map_by_pandas_udf(
        self,
        df: DataFrame,
        map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
        output_schema: Any,
        partition_spec: PartitionSpec,
        metadata: Any = None,
        on_init: Optional[Callable[[int, DataFrame], Any]] = None,
    ) -> DataFrame:
        df = self.to_df(self.repartition(df, partition_spec))
        output_schema = Schema(output_schema)
        input_schema = df.schema
        on_init_once: Any = (
            None
            if on_init is None
            else RunOnce(
                on_init, lambda *args, **kwargs: to_uuid(id(on_init), id(args[0]))
            )
        )

        def _udf(
            dfs: Iterable[pd.DataFrame],
        ) -> Iterable[pd.DataFrame]:  # pragma: no cover
            def get_dfs() -> Iterable[LocalDataFrame]:
                for df in dfs:
                    if df.shape[0] > 0:
                        yield PandasDataFrame(
                            df.reset_index(drop=True),
                            input_schema,
                            pandas_df_wrapper=True,
                        )

            input_df = LocalDataFrameIterableDataFrame(get_dfs(), input_schema)
            if input_df.empty:
                return PandasDataFrame([], output_schema).as_pandas()
            if on_init_once is not None:
                on_init_once(0, input_df)
            cursor = partition_spec.get_cursor(input_schema, 0)
            cursor.set(input_df.peek_array(), 0, 0)
            output_df = map_func(cursor, input_df)
            if isinstance(output_df, LocalDataFrameIterableDataFrame):
                for res in output_df.native:
                    yield res.as_pandas()
            else:
                yield output_df.as_pandas()

        df = self.to_df(df)
        sdf = df.native.mapInPandas(_udf, schema=to_spark_schema(output_schema))
        return SparkDataFrame(sdf, metadata=metadata)
Example #9
File: test_utils.py Project: gityow/fugue
def test_schema():
    a = Schema(
        "a:bool,b:int8,c:uint8,d:int16,e:uint16,f:int32,g:uint32,h:int64,i:uint64"
    )
    b = ibis.schema(
        [
            ("a", "boolean"),
            ("b", "int8"),
            ("c", "uint8"),
            ("d", "int16"),
            ("e", "uint16"),
            ("f", "int32"),
            ("g", "uint32"),
            ("h", "int64"),
            ("i", "uint64"),
        ]
    )
    assert to_ibis_schema(a) == b
    assert a == to_schema(b)

    a = Schema("a:float32,b:float64,c:datetime,d:date,e:binary,f:string")
    b = ibis.schema(
        [
            ("a", "float32"),
            ("b", "float64"),
            ("c", "timestamp"),
            ("d", "date"),
            ("e", "binary"),
            ("f", "string"),
        ]
    )
    assert to_ibis_schema(a) == b
    assert a == to_schema(b)

    a = Schema("a:[int],b:[{a:str}],c:{a:str},d:{a:[int]}")
    assert to_schema(to_ibis_schema(a)) == a
Example #10
def __init__(
    self,
    df: DataFrame,
    map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
    output_schema: Any,
    partition_spec: PartitionSpec,
    on_init: Optional[Callable[[int, DataFrame], Any]],
):
    super().__init__()
    self.schema = df.schema
    self.output_schema = Schema(output_schema)
    self.metadata = df.metadata
    self.partition_spec = partition_spec
    self.map_func = map_func
    self.on_init = on_init
Example #11
    def _group_map_by_pandas_udf(
        self,
        df: DataFrame,
        map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
        output_schema: Any,
        partition_spec: PartitionSpec,
        metadata: Any = None,
        on_init: Optional[Callable[[int, DataFrame], Any]] = None,
    ) -> DataFrame:
        presort = partition_spec.presort
        presort_keys = list(presort.keys())
        presort_asc = list(presort.values())
        output_schema = Schema(output_schema)
        input_schema = df.schema
        on_init_once: Any = (
            None
            if on_init is None
            else RunOnce(
                on_init, lambda *args, **kwargs: to_uuid(id(on_init), id(args[0]))
            )
        )

        def _udf(pdf: Any) -> pd.DataFrame:  # pragma: no cover
            if pdf.shape[0] == 0:
                return PandasDataFrame([], output_schema).as_pandas()
            if len(presort_keys) > 0:
                pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
            input_df = PandasDataFrame(
                pdf.reset_index(drop=True), input_schema, pandas_df_wrapper=True
            )
            if on_init_once is not None:
                on_init_once(0, input_df)
            cursor = partition_spec.get_cursor(input_schema, 0)
            cursor.set(input_df.peek_array(), 0, 0)
            output_df = map_func(cursor, input_df)
            return output_df.as_pandas()

        df = self.to_df(df)

        gdf = df.native.groupBy(*partition_spec.partition_by)
        sdf = gdf.applyInPandas(_udf, schema=to_spark_schema(output_schema))
        return SparkDataFrame(sdf, metadata=metadata)
Example #12
def test_schema_inference():
    schema = Schema("a:int,b:str,c:bool,d:double")
    assert pa.int32() == col("a").infer_type(schema)
    assert pa.int32() == (-col("a")).infer_type(schema)
    assert pa.int64() == (-col("a")).cast(int).infer_type(schema)
    assert pa.int64() == (-col("a").cast(int)).infer_type(schema)
    assert pa.string() == col("b").infer_type(schema)
    assert (-col("b")).infer_type(schema) is None
    assert (~col("b")).infer_type(schema) is None
    assert pa.bool_() == col("c").infer_type(schema)
    assert pa.bool_() == (~col("c")).alias("x").infer_type(schema)
    assert pa.float64() == col("d").infer_type(schema)
    assert pa.float64() == (-col("d").alias("x")).infer_type(schema)
    assert col("x").infer_type(schema) is None
    assert pa.string() == col("x").cast(str).infer_type(schema)
    assert col("*").infer_type(schema) is None

    assert pa.bool_() == (col("a") < col("d")).infer_type(schema)
    assert pa.bool_() == (col("a") > col("d")).infer_type(schema)
    assert pa.bool_() == (col("a") <= col("d")).infer_type(schema)
    assert pa.bool_() == (col("a") >= col("d")).infer_type(schema)
    assert pa.bool_() == (col("a") == col("d")).infer_type(schema)
    assert pa.bool_() == (col("a") != col("d")).infer_type(schema)
    assert pa.bool_() == (~(col("a") != col("d"))).infer_type(schema)
    assert pa.int64() == (~(col("a") != col("d"))).cast(int).infer_type(schema)

    assert (col("a") - col("d")).infer_type(schema) is None

    assert pa.int64() == lit(1).infer_type(schema)
    assert pa.string() == lit("a").infer_type(schema)
    assert pa.bool_() == lit(False).infer_type(schema)
    assert pa.string() == lit(False).cast(str).infer_type(schema)
    assert pa.float64() == lit(2.2).infer_type(schema)
    assert null().infer_type(schema) is None
    assert pa.string() == null().cast(str).infer_type(schema)

    assert function("a", col("a").cast("int")).infer_type(schema) is None
    assert pa.string() == function(
        "a",
        col("a").cast("int")).cast(str).infer_type(schema)
Example #13
def map(
    self,
    df: DataFrame,
    map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
    output_schema: Any,
    partition_spec: PartitionSpec,
    metadata: Any = None,
    on_init: Optional[Callable[[int, DataFrame], Any]] = None,
) -> DataFrame:
    if (
        self.conf.get_or_throw(FUGUE_SPARK_CONF_USE_PANDAS_UDF, bool)
        and hasattr(ps.DataFrame, "mapInPandas")  # new pyspark
        and not any(pa.types.is_nested(t) for t in Schema(output_schema).types)
    ):
        # pandas udf can only be used for pyspark > 3
        if len(partition_spec.partition_by) > 0 and partition_spec.algo != "even":
            return self._group_map_by_pandas_udf(
                df,
                map_func=map_func,
                output_schema=output_schema,
                partition_spec=partition_spec,
                metadata=metadata,
                on_init=on_init,
            )
        elif len(partition_spec.partition_by) == 0:
            return self._map_by_pandas_udf(
                df,
                map_func=map_func,
                output_schema=output_schema,
                partition_spec=partition_spec,
                metadata=metadata,
                on_init=on_init,
            )
    df = self.to_df(self.repartition(df, partition_spec))
    mapper = _Mapper(df, map_func, output_schema, partition_spec, on_init)
    sdf = df.native.rdd.mapPartitionsWithIndex(mapper.run, True)
    return self.to_df(sdf, output_schema, metadata)
Example #14
def to_schema(schema: ibis.Schema) -> Schema:
    fields = [(n, _ibis_to_pa_type(t))
              for n, t in zip(schema.names, schema.types)]
    return Schema(fields)
Example #15
def test_select_columns():
    # not all with names
    cols = SelectColumns(col("a"), lit(1, "b"),
                         col("bb") + col("cc"), f.first(col("c")))
    assert to_uuid(cols) == to_uuid(cols)
    raises(ValueError, lambda: cols.assert_all_with_names())

    # distinct
    cols2 = SelectColumns(
        col("a"),
        lit(1, "b"),
        col("bb") + col("cc"),
        f.first(col("c")),
        arg_distinct=True,
    )
    assert to_uuid(cols) != to_uuid(cols2)

    # duplicated names
    cols = SelectColumns(col("a").alias("b"), lit(1, "b"))
    assert to_uuid(cols) != to_uuid(
        SelectColumns(col("a").alias("b"), lit(1, "c")))
    raises(ValueError, lambda: cols.assert_all_with_names())

    # with *, all cols must have alias
    cols = SelectColumns(col("*"), col("a")).assert_no_agg()
    raises(ValueError, lambda: cols.assert_all_with_names())

    # * can be used at most once
    raises(ValueError, lambda: SelectColumns(col("*"), col("*"),
                                             col("a").alias("p")))

    # * can't be used with aggregation
    raises(ValueError, lambda: SelectColumns(col("*"),
                                             f.first(col("a")).alias("x")))

    cols = SelectColumns(
        col("aa").alias("a").cast(int),
        lit(1, "b"),
        (col("bb") + col("cc")).alias("c"),
        f.first(col("c")).alias("d"),
    ).assert_all_with_names()
    raises(AssertionError, lambda: cols.assert_no_agg())
    assert not cols.simple
    assert 1 == len(cols.simple_cols)
    assert "CAST(aa AS long) AS a" == str(cols.simple_cols[0])
    assert cols.has_literals
    assert 1 == len(cols.literals)
    assert "1 AS b" == str(cols.literals[0])
    assert cols.has_agg
    assert 1 == len(cols.non_agg_funcs)
    assert "+(bb,cc) AS c" == str(cols.non_agg_funcs[0])
    assert 1 == len(cols.agg_funcs)
    assert "FIRST(c) AS d" == str(cols.agg_funcs[0])
    assert 2 == len(cols.group_keys)  # a, c
    assert "aa" == cols.group_keys[0].output_name
    assert "" == cols.group_keys[1].output_name
    assert isinstance(cols.group_keys[1], _BinaryOpExpr)

    cols = SelectColumns(col("a")).assert_no_wildcard()
    assert cols.simple
    assert not cols.has_literals
    assert not cols.has_agg

    cols = SelectColumns(col("x"), col("*"), col("y") + col("z"))
    cols = cols.replace_wildcard(Schema("a:int,b:int"))
    assert "x" == str(cols.all_cols[0])
Example #16
def test_functions():
    schema = Schema("a:int,b:str,c:bool,d:double")

    expr = f.coalesce(col("a"), 1, None, col("b") + col("c"))
    assert "COALESCE(a,1,NULL,+(b,c))" == str(expr)
    assert expr.infer_type(schema) is None

    expr = f.min(col("a"))
    assert "MIN(a)" == str(expr)
    assert pa.int32() == expr.infer_type(schema)
    assert "MIN(a) AS a" == str(expr.infer_alias())
    assert "CAST(MIN(a) AS long) AS a" == str(expr.cast(int).infer_alias())
    assert "MIN(a) AS b" == str(expr.alias("b").infer_alias())

    assert "MIN(-(a)) AS a" == str(f.min(-col("a")).infer_alias())

    expr = f.min(lit(1.1))
    assert "MIN(1.1)" == str(expr)
    assert pa.float64() == expr.infer_type(schema)

    expr = f.max(col("a"))
    assert "MAX(a)" == str(expr)
    assert pa.int32() == expr.infer_type(schema)

    expr = f.max(lit(1.1))
    assert "MAX(1.1)" == str(expr)
    assert pa.float64() == expr.infer_type(schema)

    expr = f.first(col("a"))
    assert "FIRST(a)" == str(expr)
    assert pa.int32() == expr.infer_type(schema)

    expr = f.first(lit(1.1))
    assert "FIRST(1.1)" == str(expr)
    assert pa.float64() == expr.infer_type(schema)

    expr = f.last(col("a"))
    assert "LAST(a)" == str(expr)
    assert pa.int32() == expr.infer_type(schema)

    expr = f.last(lit(1.1))
    assert "LAST(1.1)" == str(expr)
    assert pa.float64() == expr.infer_type(schema)

    expr = f.avg(col("a"))
    assert "AVG(a)" == str(expr)
    assert expr.infer_type(schema) is None

    expr = f.sum(col("a"))
    assert "SUM(a)" == str(expr)
    assert expr.infer_type(schema) is None

    expr = f.count(col("a"))
    assert "COUNT(a)" == str(expr)
    assert expr.infer_type(schema) is None

    expr = f.count_distinct(col("a"))
    assert "COUNT(DISTINCT a)" == str(expr)
    assert expr.infer_type(schema) is None
    assert "COUNT(DISTINCT a) AS a" == str(expr.infer_alias())

    expr = f.count_distinct(col("*"))
    assert "COUNT(DISTINCT *)" == str(expr)
    assert expr.infer_type(schema) is None
    assert "COUNT(DISTINCT *)" == str(expr.infer_alias())