Пример #1
0
        def test_assign(self):
            """``assign`` replaces column ``b`` in place and appends ``x`` and ``c``."""
            engine = self.engine
            rows = [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]]
            source = engine.to_df(ArrayDataFrame(rows, "a:double,b:int", dict(a=1)))

            new_columns = [
                lit(1, "x"),  # constant column under a new name
                col("b").cast(str),  # same name as an existing column
                (col("b") + 1).alias("c").cast(int),  # derived column, new name
            ]
            result = engine.assign(source, new_columns)

            expected_rows = [
                [1, "2", 1, 3],
                [None, "2", 1, 3],
                [None, "1", 1, 2],
                [3, "4", 1, 5],
                [None, "4", 1, 5],
            ]
            df_eq(result, expected_rows, "a:double,b:str,x:long,c:long", throw=True)
Пример #2
0
def test_where():
    """WHERE generation for a compound, an aliased and an aggregated condition."""
    generator = SQLExpressionGenerator()

    compound = (col("a") < 5) & col("b").is_null()
    sql = generator.where(compound, "x")
    assert "SELECT * FROM x WHERE (a<5) AND b IS NULL" == sql

    # The alias on the condition does not show up in the expected SQL.
    aliased = (col("a") < 5).alias("x")
    sql = generator.where(aliased, "x")
    assert "SELECT * FROM x WHERE a<5" == sql

    # An aggregation used as the condition must be rejected.
    def where_on_aggregation():
        return generator.where(f.max(col("a")), "x")

    raises(ValueError, where_on_aggregation)
Пример #3
0
def test_select_exprs():
    """Arithmetic and aggregate expressions render to the expected SQL text."""
    generator = SQLExpressionGenerator()
    cases = [
        ("(a+2)*3", (col("a") + 2) * 3),
        ("(-a+2)*3", (-col("a") + 2) * 3),
        ("(a*2)/3 AS x", ((col("a") * 2) / 3).alias("x")),
        ("COUNT(DISTINCT a) AS x", (f.count_distinct(col("a"))).alias("x")),
    ]
    for expected_sql, expression in cases:
        assert expected_sql == generator.generate(expression)
Пример #4
0
def test_functions():
    """Function calls, keyword arguments and a custom handler in generated SQL."""
    generator = SQLExpressionGenerator()

    coalesced = f.coalesce(
        col("a"),
        col("b") + col("c"),
        col("d") + col("e") - 1,
        null(),
    )
    sql = generator.generate(coalesced.is_null())
    assert "COALESCE(a,b+c,(d+e)-1,NULL) IS NULL" == sql

    # Mix of positional aggregations, a plain value and keyword arguments.
    custom_call = function(
        "MY",
        f.min(col("x")),
        f.max(col("y") + 1),
        f.avg(col("z")),
        2,
        aa=f.first(col("a")),
        bb=f.last(lit("b")),
        cc=f.count_distinct(col("*")),
    )
    expected_sql = "MY(MIN(x),MAX(y+1),AVG(z),2,aa=FIRST(a),bb=LAST('b'),cc=COUNT(DISTINCT *)) AS x"
    assert expected_sql == generator.generate(custom_call.alias("x"))

    def distinct_aware_handler(expr):
        # Yields a fixed token, followed by " D" when the call is distinct.
        yield "DUMMY"
        if expr.is_distinct:
            yield " D"

    # After registration the handler's output is what appears for "MY" calls.
    generator.add_func_handler("MY", distinct_aware_handler)
    overridden = function("MY", 2, 3, arg_distinct=True).alias("x")
    assert "DUMMY D AS x" == generator.generate(overridden)
Пример #5
0
def test_basic():
    """Columns, literals, aliases and a cast render to their basic SQL forms."""
    generator = SQLExpressionGenerator()
    cases = (
        ("a", col("a")),
        ("a AS bc", col("a").alias("bc")),
        ("'a'", lit("a")),
        ("'a' AS bc", lit("a").alias("bc")),
        ("CAST(a AS long) AS a", col("a").cast(int)),
    )
    for expected_sql, expression in cases:
        assert expected_sql == generator.generate(expression)
Пример #6
0
def test_is_agg():
    """``f.is_agg`` is true whenever an aggregation appears anywhere in the tree."""
    aggregations = [
        f.first(col("a")),
        f.count_distinct(col("a")).alias("x"),
        f.first(col("a") + 1),
        f.first(col("a")) + 1,
        (f.first(col("a")) < 1).alias("x"),
        col("a") * f.first(col("a")) + 1,
    ]
    for expression in aggregations:
        assert f.is_agg(expression)

    non_aggregations = [
        col("a"),
        lit("a"),
        col("a") + col("b"),
        null(),
    ]
    for expression in non_aggregations:
        assert not f.is_agg(expression)
Пример #7
0
def test_conditions():
    """Boolean combinations, including a bare Python ``True`` on the left side."""
    generator = SQLExpressionGenerator()
    cases = [
        ("(a=-1) AND (b>=c)", (col("a") == -1) & (col("b") >= col("c"))),
        ("TRUE AND (b>=c)", True & (col("b") >= col("c"))),
        ("TRUE AND NOT (b>=c)", True & ~(col("b") >= col("c"))),
        (
            "TRUE OR (b>=c) IS NOT NULL",
            True | (col("b") >= col("c")).not_null(),
        ),
    ]
    for expected_sql, condition in cases:
        assert expected_sql == generator.generate(condition)
Пример #8
0
 def test_filter(self):
     """Filter by a null check, a conjunction and an arithmetic comparison."""
     engine = self.engine
     rows = [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]]
     source = engine.to_df(ArrayDataFrame(rows, "a:double,b:int", dict(a=1)))
     schema = "a:double,b:int"

     # only the rows where a is not null
     non_null = engine.filter(source, col("a").not_null())
     df_eq(non_null, [[1, 2], [3, 4]], schema, throw=True)

     # null check combined with a comparison on b
     small_b = engine.filter(source, col("a").not_null() & (col("b") < 3))
     df_eq(small_b, [[1, 2]], schema, throw=True)

     # comparison on an arithmetic expression
     sum_is_three = engine.filter(source, col("a") + col("b") == 3)
     df_eq(sum_is_three, [[1, 2]], schema, throw=True)
Пример #9
0
def test_lit_col():
    """String form and uuid identity of literal expressions."""
    rendered = [
        # null literal
        ("NULL", lit(None)),
        ("TRUE", null().is_null()),
        ("FALSE", null().not_null()),
        # string literals, including quote and backslash escaping
        ("'a'", lit("a")),
        ("'a\"\\'\\\\'", lit("a\"'\\")),
        ("'a' AS x", lit("a", "x")),
        ("TRUE", lit("a").not_null()),
        ("FALSE", lit("a").is_null()),
        # numbers and booleans
        ("1.1", lit(1.1)),
        ("11", lit(11)),
        ("TRUE", lit(True)),
        ("FALSE", lit(False)),
        # aliases
        ("1 AS xx", lit(1).alias("xx")),
        ("'ab' AS xx", lit("ab").alias("xx")),
    ]
    for expected_text, expression in rendered:
        assert expected_text == str(expression)

    # A list is not an accepted literal value.
    raises(NotImplementedError, lambda: lit([1, 2]))

    must_differ = [
        (lit("a"), col("a")),
        (lit(1), lit("1")),
        (null(), lit(1)),
        (lit("a"), lit("a").alias("v")),
        (lit("a"), lit("a").cast(int)),
    ]
    for left, right in must_differ:
        assert to_uuid(left) != to_uuid(right)

    assert to_uuid(null()) == to_uuid(null())
    # The order in which alias and cast are applied does not change the uuid.
    cast_then_alias = lit("a").cast(int).alias("v")
    alias_then_cast = lit("a").alias("v").cast(int)
    assert to_uuid(cast_then_alias) == to_uuid(alias_then_cast)
Пример #10
0
    def filter(
        self, df: DataFrame, condition: ColumnExpr, metadata: Any = None
    ) -> DataFrame:
        """Keep only the rows that satisfy ``condition``

        This is a select of all columns (``*``) with ``condition`` passed as
        the ``where`` clause.

        :param df: the dataframe to be filtered
        :param condition: (boolean) column expression
        :param metadata: dict-like object to add to the result dataframe,
            defaults to None. It's currently not used
        :return: the filtered dataframe

        .. admonition:: New Since
            :class: hint

            **0.6.0**

        .. seealso::

            Please find more expression examples in :mod:`fugue.column.sql` and
            :mod:`fugue.column.functions`

        .. admonition:: Examples

            .. code-block:: python

                import fugue.column.functions as f

                engine.filter(df, (col("a")>1) & (col("b")=="x"))
                engine.filter(df, f.coalesce(col("a"),col("b"))>1)
        """
        every_column = SelectColumns(col("*"))
        return self.select(
            df, cols=every_column, where=condition, metadata=metadata
        )
Пример #11
0
        def test_aggregate(self):
            """``aggregate`` with and without partition keys, plus invalid input."""
            engine = self.engine
            rows = [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]]
            source = engine.to_df(ArrayDataFrame(rows, "a:double,b:int", dict(a=1)))

            def max_and_doubled_max():
                # Build fresh expressions per call so the scenarios share nothing.
                return [
                    ff.max(col("b")),
                    (ff.max(col("b")) * 2).cast("int32").alias("c"),
                ]

            # no partition keys: one row for the whole frame
            overall = engine.aggregate(
                df=source,
                partition_spec=None,
                agg_cols=max_and_doubled_max(),
            )
            df_eq(overall, [[4, 8]], "b:int,c:int", throw=True)

            # partitioned by a: the key column comes first in the output
            per_key = engine.aggregate(
                df=source,
                partition_spec=PartitionSpec(by=["a"]),
                agg_cols=max_and_doubled_max(),
            )
            expected_rows = [[None, 4, 8], [1, 2, 4], [3, 4, 8]]
            df_eq(per_key, expected_rows, "a:double,b:int,c:int", throw=True)

            # a literal is not an aggregation
            with raises(ValueError):
                engine.aggregate(
                    df=source,
                    partition_spec=PartitionSpec(by=["a"]),
                    agg_cols=[ff.max(col("b")), lit(1)],
                )

            # an empty aggregation list is rejected
            with raises(ValueError):
                engine.aggregate(
                    df=source,
                    partition_spec=PartitionSpec(by=["a"]),
                    agg_cols=[],
                )
Пример #12
0
    def assign(
        self, df: DataFrame, columns: List[ColumnExpr], metadata: Any = None
    ) -> DataFrame:
        """Overwrite existing columns and append new ones

        Each expression is matched against ``df.schema`` by its output name:
        a match replaces the existing column at its current position, anything
        else is appended after the existing columns.

        :param df: the dataframe to set columns
        :param columns: column expressions; each must have a name, and
            wildcards and aggregations are not allowed
        :param metadata: dict-like object to add to the result dataframe,
            defaults to None. It's currently not used
        :return: the updated dataframe

        .. tip::

            This can be used to cast data types, alter column values or add new
            columns. But you can't use aggregation in columns.

        .. admonition:: New Since
            :class: hint

            **0.6.0**

        .. seealso::

            Please find more expression examples in :mod:`fugue.column.sql` and
            :mod:`fugue.column.functions`

        .. admonition:: Examples

            .. code-block:: python

                # assume df has schema: a:int,b:str

                # add constant column x
                engine.assign(df, [lit(1,"x")])

                # change column b to be a constant integer
                engine.assign(df, [lit(1,"b")])

                # add new x to be a+b
                engine.assign(df, [(col("a")+col("b")).alias("x")])

                # cast column a data type to double
                engine.assign(df, [col("a").cast(float)])
        """
        # Validate the requested expressions before building the projection.
        requested = SelectColumns(*columns)
        requested.assert_no_wildcard().assert_all_with_names().assert_no_agg()

        # Start from every existing column, in schema order.
        projection = [col(name) for name in df.schema.names]
        for expr in columns:
            if c_name_in_schema := (expr.output_name in df.schema):
                projection[df.schema.index_of_key(expr.output_name)] = expr
            if not c_name_in_schema:
                projection.append(expr)
        return self.select(df, SelectColumns(*projection), metadata=metadata)
Пример #13
0
    def aggregate(
        self,
        df: DataFrame,
        partition_spec: Optional[PartitionSpec],
        agg_cols: List[ColumnExpr],
        metadata: Any = None,
    ) -> DataFrame:
        """Aggregate on dataframe

        The partition keys (if any) are selected first, followed by the
        aggregation expressions, and the combined column list is handed to
        ``select``.

        :param df: the dataframe to aggregate on
        :param partition_spec: PartitionSpec to specify partition keys; when
            it is None or has no partition keys, only ``agg_cols`` are selected
        :param agg_cols: aggregation expressions
        :param metadata: dict-like object to add to the result dataframe,
            defaults to None. It's currently not used
        :return: the aggregated result as a dataframe
        :raises ValueError: if ``agg_cols`` is empty, or if any expression in
            it is not an aggregation

        .. admonition:: New Since
            :class: hint

            **0.6.0**

        .. seealso::

            Please find more expression examples in :mod:`fugue.column.sql` and
            :mod:`fugue.column.functions`

        .. admonition:: Examples

            .. code-block:: python

                import fugue.column.functions as f

                # SELECT MAX(b) AS b FROM df
                engine.aggregate(
                    df,
                    partition_spec=None,
                    agg_cols=[f.max(col("b"))])

                # SELECT a, MAX(b) AS x FROM df GROUP BY a
                engine.aggregate(
                    df,
                    partition_spec=PartitionSpec(by=["a"]),
                    agg_cols=[f.max(col("b")).alias("x")])
        """
        # Reject invalid input before any expression is built.
        assert_or_throw(len(agg_cols) > 0, ValueError("agg_cols can't be empty"))
        assert_or_throw(
            all(is_agg(x) for x in agg_cols),
            ValueError("all agg_cols must be aggregation functions"),
        )
        keys: List[ColumnExpr] = []
        if partition_spec is not None and len(partition_spec.partition_by) > 0:
            keys = [col(y) for y in partition_spec.partition_by]
        # Keys go first so they lead the output columns.
        cols = SelectColumns(*keys, *agg_cols)
        return self.select(df, cols=cols, metadata=metadata)
Пример #14
0
def test_correct_select_schema():
    """``correct_select_schema`` returns None or the columns needing correction."""
    input_schema = Schema("a:double,b:str")
    generator = SQLExpressionGenerator()

    def corrected(select_columns, output_expr):
        # Run the correction against an output schema given as a string.
        return generator.correct_select_schema(
            input_schema, select_columns, Schema(output_expr)
        )

    # wildcard plus a column that is not in the input schema
    star_and_c = SelectColumns(col("*"), col("c"))
    assert corrected(star_and_c, "a:double,b:str,c:str") is None
    assert corrected(star_and_c, "a:int,b:int,c:str") == "a:double,b:str"

    # aggregation plus a renamed column that is not in the input schema
    count_and_rename = SelectColumns(
        f.count(col("*")).alias("t"), col("c").alias("a")
    )
    assert corrected(count_and_rename, "t:int,a:str") is None

    # explicit cast and a literal
    cast_and_literal = SelectColumns(
        (col("a") + col("b")).cast(str).alias("a"), lit(1, "c")
    )
    assert corrected(cast_and_literal, "a:int,c:str") == "a:str,c:long"
Пример #15
0
        def test_select(self):
            """``engine.select``: projection, distinct, wildcard with ``where``,
            aggregation, ``having`` and literal columns."""
            engine = self.engine
            rows = [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]]
            source = engine.to_df(ArrayDataFrame(rows, "a:double,b:int", dict(a=1)))

            # simple
            projection = SelectColumns(
                col("b"), (col("b") + 1).alias("c").cast(str)
            )
            result = engine.select(source, projection)
            expected_rows = [[2, "3"], [2, "3"], [1, "2"], [4, "5"], [4, "5"]]
            df_eq(result, expected_rows, "b:int,c:str", throw=True)

            # with distinct
            distinct_projection = SelectColumns(
                col("b"),
                (col("b") + 1).alias("c").cast(str),
                arg_distinct=True,
            )
            result = engine.select(source, distinct_projection)
            expected_rows = [[2, "3"], [1, "2"], [4, "5"]]
            df_eq(result, expected_rows, "b:int,c:str", throw=True)

            # wildcard
            result = engine.select(
                source, SelectColumns(col("*")), where=col("a") + col("b") == 3
            )
            df_eq(result, [[1, 2]], "a:double,b:int", throw=True)

            # aggregation
            grouped = SelectColumns(
                col("a"), ff.sum(col("b")).cast(float).alias("b")
            )
            result = engine.select(source, grouped)
            expected_rows = [[1, 2], [3, 4], [None, 7]]
            df_eq(result, expected_rows, "a:double,b:double", throw=True)

            # having
            # https://github.com/fugue-project/fugue/issues/222
            # The same aggregation object is used in the columns and in having.
            total_b = ff.sum(col("b"))
            grouped = SelectColumns(col("a"), total_b.cast(float).alias("b"))
            result = engine.select(
                source,
                grouped,
                having=(total_b >= 7) | (col("a") == 1),
            )
            df_eq(result, [[1, 2], [None, 7]], "a:double,b:double", throw=True)

            # literal + alias inference
            # https://github.com/fugue-project/fugue/issues/222
            total_b = ff.sum(col("b"))
            grouped = SelectColumns(
                col("a"), lit(1, "o").cast(str), total_b.cast(float)
            )
            result = engine.select(
                source,
                grouped,
                having=(total_b >= 7) | (col("a") == 1),
            )
            expected_rows = [[1, "1", 2], [None, "1", 7]]
            df_eq(result, expected_rows, "a:double,o:str,b:double", throw=True)
Пример #16
0
def test_functions():
    """String form, inferred type and inferred alias of the built-in functions."""
    schema = Schema("a:int,b:str,c:bool,d:double")

    coalesced = f.coalesce(col("a"), 1, None, col("b") + col("c"))
    assert "COALESCE(a,1,NULL,+(b,c))" == str(coalesced)
    assert coalesced.infer_type(schema) is None

    # MIN additionally covers alias inference.
    minimum = f.min(col("a"))
    assert "MIN(a)" == str(minimum)
    assert pa.int32() == minimum.infer_type(schema)
    assert "MIN(a) AS a" == str(minimum.infer_alias())
    assert "CAST(MIN(a) AS long) AS a" == str(minimum.cast(int).infer_alias())
    assert "MIN(a) AS b" == str(minimum.alias("b").infer_alias())

    assert "MIN(-(a)) AS a" == str(f.min(-col("a")).infer_alias())

    minimum = f.min(lit(1.1))
    assert "MIN(1.1)" == str(minimum)
    assert pa.float64() == minimum.infer_type(schema)

    # These functions report the type of their argument.
    type_preserving = [
        (f.max, "MAX(a)", "MAX(1.1)"),
        (f.first, "FIRST(a)", "FIRST(1.1)"),
        (f.last, "LAST(a)", "LAST(1.1)"),
    ]
    for func, text_on_column, text_on_literal in type_preserving:
        on_column = func(col("a"))
        assert text_on_column == str(on_column)
        assert pa.int32() == on_column.infer_type(schema)

        on_literal = func(lit(1.1))
        assert text_on_literal == str(on_literal)
        assert pa.float64() == on_literal.infer_type(schema)

    # For these functions no type is inferred.
    untyped = [
        (f.avg, "AVG(a)"),
        (f.sum, "SUM(a)"),
        (f.count, "COUNT(a)"),
    ]
    for func, expected_text in untyped:
        call = func(col("a"))
        assert expected_text == str(call)
        assert call.infer_type(schema) is None

    distinct_count = f.count_distinct(col("a"))
    assert "COUNT(DISTINCT a)" == str(distinct_count)
    assert distinct_count.infer_type(schema) is None
    assert "COUNT(DISTINCT a) AS a" == str(distinct_count.infer_alias())

    # On the wildcard, alias inference leaves the expression without an alias.
    distinct_count = f.count_distinct(col("*"))
    assert "COUNT(DISTINCT *)" == str(distinct_count)
    assert distinct_count.infer_type(schema) is None
    assert "COUNT(DISTINCT *)" == str(distinct_count.infer_alias())
Пример #17
0
def test_select():
    """Full SELECT generation: plain, aggregated, with literals, casts and
    inferred aliases."""
    generator = SQLExpressionGenerator()

    # no aggregation
    columns = SelectColumns(col("*"))
    sql = generator.select(columns, "x")
    assert "SELECT * FROM x" == sql

    columns = SelectColumns(
        col("a"), lit(1).alias("b"), (col("b") + col("c")).alias("x")
    )
    where_clause = (col("a") > 5).alias("aa")
    sql = generator.select(columns, "t", where=where_clause)
    assert "SELECT a, 1 AS b, b+c AS x FROM t WHERE a>5" == sql

    # aggregation without literals
    columns = SelectColumns(
        f.max(col("c")).alias("c"), col("a", "aa"), col("b")
    )
    sql = generator.select(columns, "t")
    assert "SELECT MAX(c) AS c, a AS aa, b FROM t GROUP BY a, b" == sql

    # same columns again, now with where and having
    where_clause = col("a") < 10
    having_clause = (f.max(col("a")) > 5).alias("aaa")
    sql = generator.select(
        columns, "t", where=where_clause, having=having_clause
    )
    assert (
        "SELECT MAX(c) AS c, a AS aa, b FROM t WHERE a<10 GROUP BY a, b HAVING MAX(a)>5"
        == sql
    )

    columns = SelectColumns(
        f.min(col("c") + 1).alias("c"),
        f.avg(col("d") + col("e")).cast(int).alias("d"),
    )
    sql = generator.select(columns, "t")
    assert "SELECT MIN(c+1) AS c, CAST(AVG(d+e) AS long) AS d FROM t" == sql

    # aggregation with literals
    columns = SelectColumns(
        lit(1, "k"),
        f.max(col("c")).alias("c"),
        lit(2, "j"),
        col("a", "aa"),
        col("b"),
    )
    sql = generator.select(columns, "t")
    assert (
        "SELECT 1 AS k, c, 2 AS j, aa, b FROM (SELECT MAX(c) AS c, a AS aa, b FROM t GROUP BY a, b)"
        == sql
    )

    columns = SelectColumns(
        lit(1, "k"), f.max(col("c")).alias("c"), lit(2, "j")
    )
    sql = generator.select(columns, "t")
    assert "SELECT 1 AS k, c, 2 AS j FROM (SELECT MAX(c) AS c FROM t)" == sql

    columns = SelectColumns(
        lit(1, "k"), col("a"), f.max(col("c")).alias("c"), lit(2, "j")
    )
    sql = generator.select(columns, "t")
    assert (
        "SELECT 1 AS k, a, c, 2 AS j FROM (SELECT a, MAX(c) AS c FROM t GROUP BY a)"
        == sql
    )

    # cast
    columns = SelectColumns(
        col("c").cast(float),
        f.avg(col("d") + col("e")).cast(int).alias("d"),
    )
    sql = generator.select(columns, "t")
    assert (
        "SELECT CAST(c AS double) AS c, CAST(AVG(d+e) AS long) AS d FROM t GROUP BY c"
        == sql
    )

    # infer alias
    columns = SelectColumns(
        (-col("c")).cast(float),
        f.max(col("e")).cast(int),
        f.avg(col("d") + col("e")).cast(int).alias("d"),
    )
    sql = generator.select(columns, "t")
    assert ("SELECT CAST(-c AS double) AS c, CAST(MAX(e) AS long) AS e, "
            "CAST(AVG(d+e) AS long) AS d FROM t GROUP BY -c" == sql)
Пример #18
0
def test_binary_op():
    """String form of binary operators, with the column on either side."""
    cases = [
        # arithmetic
        ("+(ab,1)", col("ab") + 1),
        ("+(ab,x)", col("ab") + col("x")),
        ("+('x',a)", "x" + col("a")),
        ("+('x','a')", "x" + lit("a")),
        ("-(a,1)", col("a") - 1),
        ("-(1.1,a)", 1.1 - col("a")),
        ("*(a,1)", col("a") * 1),
        ("*(1.1,a)", 1.1 * col("a")),
        ("/(a,1)", col("a") / 1),
        ("/(1.1,a)", 1.1 / col("a")),
        # parenthesised and aliased
        ("+(ab,1)", (col("ab") + 1)),
        ("+(ab,1) AS xx", (col("ab") + 1).alias("xx")),
        ("+(ab,1) AS xx", (col("ab") + 1).alias("xx")),
        # logical and
        ("&(a,TRUE)", col("a") & True),
        ("&(TRUE,a)", True & col("a")),
        ("&(a,FALSE)", col("a") & False),
        ("&(FALSE,a)", False & col("a")),
        # logical or
        ("|(a,TRUE)", col("a") | True),
        ("|(TRUE,a)", True | col("a")),
        ("|(a,FALSE)", col("a") | False),
        ("|(FALSE,a)", False | col("a")),
        # ordering: a plain Python value on the left flips the operator
        ("<(a,1)", col("a") < 1),
        ("<(a,b)", col("a") < col("b")),
        (">(a,1.1)", 1.1 < col("a")),
        ("<(1.1,a)", lit(1.1) < col("a")),
        ("<=(a,1)", col("a") <= 1),
        (">=(a,1.1)", 1.1 <= col("a")),
        (">(a,1)", col("a") > 1),
        ("<(a,1.1)", 1.1 > col("a")),
        (">=(a,1)", col("a") >= 1),
        ("<=(a,1.1)", 1.1 >= col("a")),
        # equality
        ("==(a,1)", col("a") == 1),
        ("==(a,1.1)", 1.1 == col("a")),
        ("!=(a,1)", col("a") != 1),
        ("!=(a,1.1)", 1.1 != col("a")),
    ]
    for expected_text, expression in cases:
        assert expected_text == str(expression)
Пример #19
0
def test_named_col():
    """Named columns: wildcard rules, string form, uuid and alias inference."""
    # wildcard: no output name, and neither alias nor cast is allowed
    assert "*" == str(col("*"))
    assert col("*").wildcard
    assert "" == col("*").infer_alias().output_name
    raises(ValueError, lambda: col("*").alias("x"))
    raises(ValueError, lambda: col("*").cast("long"))

    assert "a" == str(col("a"))
    assert not col("a").wildcard

    rendered = [
        ("a", col(col("a"))),
        ("ab AS xx", col("ab").alias("xx")),
        ("ab AS xx", col("ab", "xx").cast(None)),
        ("CAST(ab AS long) AS xx", col("ab", "xx").cast("long")),
        ("ab AS xx", col("ab").alias("xx")),
        ("ab AS xx", col("ab").alias("xx")),
        ("CAST(ab AS long) AS xx", col("ab").alias("xx").cast(int)),
    ]
    for expected_text, expression in rendered:
        assert expected_text == str(expression)

    # A list is not an accepted column specification.
    raises(NotImplementedError, lambda: col([1, 2]))

    must_differ = [
        (col("a"), col("b")),
        (col("a"), col("a").alias("v")),
        (col("a"), col("a").cast(int)),
    ]
    for left, right in must_differ:
        assert to_uuid(left) != to_uuid(right)
    # The order in which alias and cast are applied does not change the uuid.
    cast_then_alias = col("a").cast(int).alias("v")
    alias_then_cast = col("a").alias("v").cast(int)
    assert to_uuid(cast_then_alias) == to_uuid(alias_then_cast)

    # alias inference
    assert "" == col("a").infer_alias().as_name
    assert "a" == str(col("a").infer_alias())
    assert "a" == col("a").cast(int).infer_alias().as_name
    inferred = col("a").cast(int).infer_alias()
    assert "CAST(a AS long) AS a" == str(inferred)
    inferred = col("a").cast(int).alias("x").infer_alias()
    assert "CAST(a AS long) AS x" == str(inferred)
Пример #20
0
def test_unary_op():
    """String form, inferred alias and uuid of unary operators."""
    assert "-(a)" == str(-col("a"))
    assert "a" == (-col("a")).infer_alias().output_name

    rendered = [
        ("a", +col("a")),
        ("~(a)", ~col("a")),
        ("IS_NULL(a)", col("a").is_null()),
        ("NOT_NULL(a)", col("a").not_null()),
        ("NOT_NULL(a) AS xx", col("a").not_null().alias("xx")),
        ("NOT_NULL(a)", col("a").not_null()),
        ("NOT_NULL(a) AS xx", col("a").not_null().alias("xx")),
    ]
    for expected_text, expression in rendered:
        assert expected_text == str(expression)

    # The inferred alias is the name of the wrapped column.
    assert "a" == col("a").not_null().infer_alias().output_name
    assert "NOT_NULL(a) AS a" == str(col("a").not_null().infer_alias())

    same_op = to_uuid(col("a").not_null())
    assert same_op == to_uuid(col("a").not_null())
    assert to_uuid(col("a").not_null()) != to_uuid(col("a").is_null())
Пример #21
0
def test_get_column_mentions():
    """Column names are collected from operands and from function arguments."""
    custom_call = function("x", col("b"), a=col("c"), b=lit(1))
    expression = (col("a") + col("b")) * custom_call
    mentioned = set(_get_column_mentions(expression))
    assert {"a", "b", "c"} == mentioned
Пример #22
0
def test_schema_inference():
    """``infer_type`` against a fixed schema, for typed and untyped expressions."""
    schema = Schema("a:int,b:str,c:bool,d:double")

    typed = [
        # named columns, unary operators and casts
        (pa.int32(), col("a")),
        (pa.int32(), (-col("a"))),
        (pa.int64(), (-col("a")).cast(int)),
        (pa.int64(), (-col("a").cast(int))),
        (pa.string(), col("b")),
        (pa.bool_(), col("c")),
        (pa.bool_(), (~col("c")).alias("x")),
        (pa.float64(), col("d")),
        (pa.float64(), (-col("d").alias("x"))),
        (pa.string(), col("x").cast(str)),
        # comparisons
        (pa.bool_(), (col("a") < col("d"))),
        (pa.bool_(), (col("a") > col("d"))),
        (pa.bool_(), (col("a") <= col("d"))),
        (pa.bool_(), (col("a") >= col("d"))),
        (pa.bool_(), (col("a") == col("d"))),
        (pa.bool_(), (col("a") != col("d"))),
        (pa.bool_(), (~(col("a") != col("d")))),
        (pa.int64(), (~(col("a") != col("d"))).cast(int)),
        # literals
        (pa.int64(), lit(1)),
        (pa.string(), lit("a")),
        (pa.bool_(), lit(False)),
        (pa.string(), lit(False).cast(str)),
        (pa.float64(), lit(2.2)),
        (pa.string(), null().cast(str)),
        # a function only gets a type through an explicit cast
        (pa.string(), function("a", col("a").cast("int")).cast(str)),
    ]
    for expected_type, expression in typed:
        assert expected_type == expression.infer_type(schema)

    untyped = [
        (-col("b")),
        (~col("b")),
        col("x"),
        col("*"),
        (col("a") - col("d")),
        null(),
        function("a", col("a").cast("int")),
    ]
    for expression in untyped:
        assert expression.infer_type(schema) is None
Пример #23
0
def test_coalesce():
    """``coalesce`` renders its mixed arguments, with and without an alias."""
    coalesced = coalesce(col("x") + col("z"), col("y"), 1, 1.1, False, "t")
    plain_text = str(coalesced)
    aliased_text = str(coalesced.alias("x"))
    assert "COALESCE(+(x,z),y,1,1.1,FALSE,'t')" == plain_text
    assert "COALESCE(+(x,z),y,1,1.1,FALSE,'t') AS x" == aliased_text
Пример #24
0
def test_function():
    """A generic function call renders its name and mixed arguments."""
    call = function("f", col("x") + col("z"), col("y"), 1, 1.1, False, "t")
    plain_text = str(call)
    aliased_text = str(call.alias("x"))
    assert "f(+(x,z),y,1,1.1,FALSE,'t')" == plain_text
    assert "f(+(x,z),y,1,1.1,FALSE,'t') AS x" == aliased_text
Пример #25
0
def test_comb():
    """Nested operators render according to Python operator precedence."""
    arithmetic = (col("a") + 10 * col("b")) - col("c") / col("d")
    assert "-(+(a,*(10,b)),/(c,d))" == str(arithmetic)

    # & binds tighter than |, so the right-hand side groups as (b & ~c) & True.
    logical = (1.1 == col("a")) | col("b") & ~col("c") & True
    assert "|(==(a,1.1),&(&(b,~(c)),TRUE))" == str(logical)
Пример #26
0
def test_no_cast():
    """With ``enable_cast=False`` the generated SQL contains no CAST."""
    generator = SQLExpressionGenerator(enable_cast=False)
    columns = SelectColumns(
        f.max(col("c")).cast("long").alias("c"),
        col("a", "aa"),
        col("b"),
    )
    sql = generator.select(columns, "t")
    assert "SELECT MAX(c) AS c, a AS aa, b FROM t GROUP BY a, b" == sql
Пример #27
0
def test_select_columns():
    # not all with names
    cols = SelectColumns(col("a"), lit(1, "b"),
                         col("bb") + col("cc"), f.first(col("c")))
    assert to_uuid(cols) == to_uuid(cols)
    raises(ValueError, lambda: cols.assert_all_with_names())

    # distinct
    cols2 = SelectColumns(
        col("a"),
        lit(1, "b"),
        col("bb") + col("cc"),
        f.first(col("c")),
        arg_distinct=True,
    )
    assert to_uuid(cols) != to_uuid(cols2)

    # duplicated names
    cols = SelectColumns(col("a").alias("b"), lit(1, "b"))
    assert to_uuid(cols) != to_uuid(
        SelectColumns(col("a").alias("b"), lit(1, "c")))
    raises(ValueError, lambda: cols.assert_all_with_names())

    # with *, all cols must have alias
    cols = SelectColumns(col("*"), col("a")).assert_no_agg()
    raises(ValueError, lambda: cols.assert_all_with_names())

    # * can be used at most once
    raises(ValueError, lambda: SelectColumns(col("*"), col("*"),
                                             col("a").alias("p")))

    # * can't be used with aggregation
    raises(ValueError, lambda: SelectColumns(col("*"),
                                             f.first(col("a")).alias("x")))

    cols = SelectColumns(
        col("aa").alias("a").cast(int),
        lit(1, "b"),
        (col("bb") + col("cc")).alias("c"),
        f.first(col("c")).alias("d"),
    ).assert_all_with_names()
    raises(AssertionError, lambda: cols.assert_no_agg())
    assert not cols.simple
    assert 1 == len(cols.simple_cols)
    assert "CAST(aa AS long) AS a" == str(cols.simple_cols[0])
    assert cols.has_literals
    assert 1 == len(cols.literals)
    assert "1 AS b" == str(cols.literals[0])
    assert cols.has_agg
    assert 1 == len(cols.non_agg_funcs)
    assert "+(bb,cc) AS c" == str(cols.non_agg_funcs[0])
    assert 1 == len(cols.agg_funcs)
    assert "FIRST(c) AS d" == str(cols.agg_funcs[0])
    assert 2 == len(cols.group_keys)  # a, c
    assert "aa" == cols.group_keys[0].output_name
    assert "" == cols.group_keys[1].output_name
    assert isinstance(cols.group_keys[1], _BinaryOpExpr)

    cols = SelectColumns(col("a")).assert_no_wildcard()
    assert cols.simple
    assert not cols.has_literals
    assert not cols.has_agg

    cols = SelectColumns(col("x"), col("*"), col("y") + col("z"))
    cols = cols.replace_wildcard(Schema("a:int,b:int"))
    assert "x" == str(cols.all_cols[0])