Пример #1
0
def test_basic():
    """Check the SQL text generated for columns, literals, aliases and a cast."""
    generator = SQLExpressionGenerator()
    cases = [
        # bare column, with and without an alias
        ("a", col("a")),
        ("a AS bc", col("a").alias("bc")),
        # string literal, with and without an alias
        ("'a'", lit("a")),
        ("'a' AS bc", lit("a").alias("bc")),
        # casting an unaliased column: the expected SQL carries the column
        # name as the alias
        ("CAST(a AS long) AS a", col("a").cast(int)),
    ]
    for expected_sql, expression in cases:
        assert expected_sql == generator.generate(expression)
Пример #2
0
def test_functions():
    """Check SQL generation for function expressions and a custom handler."""
    generator = SQLExpressionGenerator()

    # COALESCE over mixed arguments, wrapped in an IS NULL check
    coalesce_is_null = f.coalesce(
        col("a"),
        col("b") + col("c"),
        col("d") + col("e") - 1,
        null(),
    ).is_null()
    assert "COALESCE(a,b+c,(d+e)-1,NULL) IS NULL" == generator.generate(
        coalesce_is_null)

    # arbitrary function mixing positional and keyword arguments
    my_call = function(
        "MY",
        f.min(col("x")),
        f.max(col("y") + 1),
        f.avg(col("z")),
        2,
        aa=f.first(col("a")),
        bb=f.last(lit("b")),
        cc=f.count_distinct(col("*")),
    ).alias("x")
    expected_my_sql = (
        "MY(MIN(x),MAX(y+1),AVG(z),2,"
        "aa=FIRST(a),bb=LAST('b'),cc=COUNT(DISTINCT *)) AS x"
    )
    assert expected_my_sql == generator.generate(my_call)

    def dummy(expr):
        """Yield a fixed token, plus a marker when the expression is distinct."""
        yield "DUMMY"
        if expr.is_distinct:
            yield " D"

    # after registration, the expected SQL for "MY" is the handler's output
    generator.add_func_handler("MY", dummy)
    distinct_call = function("MY", 2, 3, arg_distinct=True).alias("x")
    assert "DUMMY D AS x" == generator.generate(distinct_call)
Пример #3
0
        def test_assign(self):
            """Assign a literal, a cast column and a derived column; check the result."""
            engine = self.engine
            source = engine.to_df(
                ArrayDataFrame(
                    [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]],
                    "a:double,b:int",
                    {"a": 1},
                )
            )

            new_columns = [
                lit(1, "x"),
                col("b").cast(str),
                (col("b") + 1).alias("c").cast(int),
            ]
            assigned = engine.assign(source, new_columns)

            # b is replaced by its string form; x and c are appended
            expected_rows = [
                [1, "2", 1, 3],
                [None, "2", 1, 3],
                [None, "1", 1, 2],
                [3, "4", 1, 5],
                [None, "4", 1, 5],
            ]
            df_eq(
                assigned,
                expected_rows,
                "a:double,b:str,x:long,c:long",
                throw=True,
            )
Пример #4
0
def test_binary_op():
    """Check the string form produced by binary operators on expressions."""
    cases = [
        # arithmetic, with the expression on either side of the operator
        ("+(ab,1)", col("ab") + 1),
        ("+(ab,x)", col("ab") + col("x")),
        ("+('x',a)", "x" + col("a")),
        ("+('x','a')", "x" + lit("a")),
        ("-(a,1)", col("a") - 1),
        ("-(1.1,a)", 1.1 - col("a")),
        ("*(a,1)", col("a") * 1),
        ("*(1.1,a)", 1.1 * col("a")),
        ("/(a,1)", col("a") / 1),
        ("/(1.1,a)", 1.1 / col("a")),
        # with and without an alias (the aliased case appears twice)
        ("+(ab,1)", col("ab") + 1),
        ("+(ab,1) AS xx", (col("ab") + 1).alias("xx")),
        ("+(ab,1) AS xx", (col("ab") + 1).alias("xx")),
        # logical AND with boolean constants
        ("&(a,TRUE)", col("a") & True),
        ("&(TRUE,a)", True & col("a")),
        ("&(a,FALSE)", col("a") & False),
        ("&(FALSE,a)", False & col("a")),
        # logical OR with boolean constants
        ("|(a,TRUE)", col("a") | True),
        ("|(TRUE,a)", True | col("a")),
        ("|(a,FALSE)", col("a") | False),
        ("|(FALSE,a)", False | col("a")),
        # ordering comparisons: a plain Python value on the left is expected
        # to come out as the mirrored operator with the column first
        ("<(a,1)", col("a") < 1),
        ("<(a,b)", col("a") < col("b")),
        (">(a,1.1)", 1.1 < col("a")),
        ("<(1.1,a)", lit(1.1) < col("a")),
        ("<=(a,1)", col("a") <= 1),
        (">=(a,1.1)", 1.1 <= col("a")),
        (">(a,1)", col("a") > 1),
        ("<(a,1.1)", 1.1 > col("a")),
        (">=(a,1)", col("a") >= 1),
        ("<=(a,1.1)", 1.1 >= col("a")),
        # equality and inequality
        ("==(a,1)", col("a") == 1),
        ("==(a,1.1)", 1.1 == col("a")),
        ("!=(a,1)", col("a") != 1),
        ("!=(a,1.1)", 1.1 != col("a")),
    ]
    for expected, expression in cases:
        assert expected == str(expression)
Пример #5
0
def test_is_agg():
    """Check which expressions f.is_agg reports as aggregations."""
    # an aggregation anywhere in the expression is expected to count
    aggregations = [
        f.first(col("a")),
        f.count_distinct(col("a")).alias("x"),
        f.first(col("a") + 1),
        f.first(col("a")) + 1,
        (f.first(col("a")) < 1).alias("x"),
        col("a") * f.first(col("a")) + 1,
    ]
    for expression in aggregations:
        assert f.is_agg(expression)

    # columns, literals, plain arithmetic and null are not aggregations
    non_aggregations = [
        col("a"),
        lit("a"),
        col("a") + col("b"),
        null(),
    ]
    for expression in non_aggregations:
        assert not f.is_agg(expression)
Пример #6
0
def test_schema_inference():
    """Check the types inferred for expressions against a fixed schema."""
    schema = Schema("a:int,b:str,c:bool,d:double")

    # (expected type, expression); None means no type is expected to be inferred
    cases = [
        # columns, unary operators and casts
        (pa.int32(), col("a")),
        (pa.int32(), -col("a")),
        (pa.int64(), (-col("a")).cast(int)),
        (pa.int64(), -col("a").cast(int)),
        (pa.string(), col("b")),
        (None, -col("b")),
        (None, ~col("b")),
        (pa.bool_(), col("c")),
        (pa.bool_(), (~col("c")).alias("x")),
        (pa.float64(), col("d")),
        (pa.float64(), -col("d").alias("x")),
        # x is not in the schema; the wildcard has no single type
        (None, col("x")),
        (pa.string(), col("x").cast(str)),
        (None, col("*")),
        # comparisons
        (pa.bool_(), col("a") < col("d")),
        (pa.bool_(), col("a") > col("d")),
        (pa.bool_(), col("a") <= col("d")),
        (pa.bool_(), col("a") >= col("d")),
        (pa.bool_(), col("a") == col("d")),
        (pa.bool_(), col("a") != col("d")),
        (pa.bool_(), ~(col("a") != col("d"))),
        (pa.int64(), (~(col("a") != col("d"))).cast(int)),
        # arithmetic between an int and a double column
        (None, col("a") - col("d")),
        # literals and null
        (pa.int64(), lit(1)),
        (pa.string(), lit("a")),
        (pa.bool_(), lit(False)),
        (pa.string(), lit(False).cast(str)),
        (pa.float64(), lit(2.2)),
        (None, null()),
        (pa.string(), null().cast(str)),
        # a generic function has no inferred type unless it is cast
        (None, function("a", col("a").cast("int"))),
        (pa.string(), function("a", col("a").cast("int")).cast(str)),
    ]
    for expected_type, expression in cases:
        inferred = expression.infer_type(schema)
        if expected_type is None:
            assert inferred is None
        else:
            assert expected_type == inferred
Пример #7
0
        def test_aggregate(self):
            """Aggregate with and without partitioning; check rejected inputs."""
            engine = self.engine
            frame = engine.to_df(
                ArrayDataFrame(
                    [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]],
                    "a:double,b:int",
                    {"a": 1},
                )
            )

            def max_b_columns():
                # fresh expressions on every call: MAX(b), and MAX(b) * 2
                # cast to int32 and named c
                return [
                    ff.max(col("b")),
                    (ff.max(col("b")) * 2).cast("int32").alias("c"),
                ]

            # no partition spec: a single output row is expected
            overall = engine.aggregate(
                df=frame,
                partition_spec=None,
                agg_cols=max_b_columns(),
            )
            df_eq(overall, [[4, 8]], "b:int,c:int", throw=True)

            # partitioned by a: one output row per distinct a, None included
            grouped = engine.aggregate(
                df=frame,
                partition_spec=PartitionSpec(by=["a"]),
                agg_cols=max_b_columns(),
            )
            df_eq(
                grouped,
                [[None, 4, 8], [1, 2, 4], [3, 4, 8]],
                "a:double,b:int,c:int",
                throw=True,
            )

            # a literal among the aggregation columns must raise
            with raises(ValueError):
                engine.aggregate(
                    df=frame,
                    partition_spec=PartitionSpec(by=["a"]),
                    agg_cols=[ff.max(col("b")), lit(1)],
                )

            # an empty aggregation column list must raise
            with raises(ValueError):
                engine.aggregate(
                    df=frame,
                    partition_spec=PartitionSpec(by=["a"]),
                    agg_cols=[],
                )
Пример #8
0
def test_correct_select_schema():
    """Check which schema corrections are reported for a select's output."""
    input_schema = Schema("a:double,b:str")
    generator = SQLExpressionGenerator()

    def corrected(select_cols, output_expr):
        # run the correction against the shared input schema
        return generator.correct_select_schema(
            input_schema, select_cols, Schema(output_expr))

    wildcard_and_c = SelectColumns(col("*"), col("c"))
    # a and b already have the input schema's types: no correction expected
    assert corrected(wildcard_and_c, "a:double,b:str,c:str") is None
    # a and b differ from the input schema: both are expected in the result
    assert corrected(wildcard_and_c, "a:int,b:int,c:str") == "a:double,b:str"

    count_and_rename = SelectColumns(
        f.count(col("*")).alias("t"),
        col("c").alias("a"),
    )
    assert corrected(count_and_rename, "t:int,a:str") is None

    cast_and_literal = SelectColumns(
        (col("a") + col("b")).cast(str).alias("a"),
        lit(1, "c"),
    )
    # the explicit cast and the literal fix the expected types
    assert corrected(cast_and_literal, "a:int,c:str") == "a:str,c:long"
Пример #9
0
def test_lit_col():
    """Check the string form, null checks and uuid identity of literals."""
    text_cases = [
        # null literal and its null checks
        ("NULL", lit(None)),
        ("TRUE", null().is_null()),
        ("FALSE", null().not_null()),
        # string literals, including quote and backslash escaping
        ("'a'", lit("a")),
        ("'a\"\\'\\\\'", lit("a\"'\\")),
        ("'a' AS x", lit("a", "x")),
        ("TRUE", lit("a").not_null()),
        ("FALSE", lit("a").is_null()),
        # numeric and boolean literals
        ("1.1", lit(1.1)),
        ("11", lit(11)),
        ("TRUE", lit(True)),
        ("FALSE", lit(False)),
        # aliased literals
        ("1 AS xx", lit(1).alias("xx")),
        ("'ab' AS xx", lit("ab").alias("xx")),
    ]
    for expected, expression in text_cases:
        assert expected == str(expression)

    # a list is not accepted as a literal value
    raises(NotImplementedError, lambda: lit([1, 2]))

    # uuids distinguish kind, value type, alias and cast
    assert to_uuid(lit("a")) != to_uuid(col("a"))
    assert to_uuid(lit(1)) != to_uuid(lit("1"))
    assert to_uuid(null()) == to_uuid(null())
    assert to_uuid(null()) != to_uuid(lit(1))
    assert to_uuid(lit("a")) != to_uuid(lit("a").alias("v"))
    assert to_uuid(lit("a")) != to_uuid(lit("a").cast(int))

    # the order in which cast and alias are applied does not change the uuid
    cast_then_alias = lit("a").cast(int).alias("v")
    alias_then_cast = lit("a").alias("v").cast(int)
    assert to_uuid(cast_then_alias) == to_uuid(alias_then_cast)
Пример #10
0
def test_get_column_mentions():
    """Check that column names are collected from nested expressions."""
    arithmetic = col("a") + col("b")
    # b appears twice and the literal keyword argument contributes no column
    call = function("x", col("b"), a=col("c"), b=lit(1))
    expression = arithmetic * call
    assert {"a", "b", "c"} == set(_get_column_mentions(expression))
Пример #11
0
def test_select_columns():
    """Exercise SelectColumns validation, classification and wildcard handling."""
    # not every column has a name
    unnamed = SelectColumns(
        col("a"),
        lit(1, "b"),
        col("bb") + col("cc"),
        f.first(col("c")),
    )
    assert to_uuid(unnamed) == to_uuid(unnamed)
    raises(ValueError, unnamed.assert_all_with_names)

    # the distinct flag changes the uuid
    unnamed_distinct = SelectColumns(
        col("a"),
        lit(1, "b"),
        col("bb") + col("cc"),
        f.first(col("c")),
        arg_distinct=True,
    )
    assert to_uuid(unnamed) != to_uuid(unnamed_distinct)

    # duplicated output names
    duplicated = SelectColumns(col("a").alias("b"), lit(1, "b"))
    renamed = SelectColumns(col("a").alias("b"), lit(1, "c"))
    assert to_uuid(duplicated) != to_uuid(renamed)
    raises(ValueError, duplicated.assert_all_with_names)

    # with *, all cols must have alias
    with_wildcard = SelectColumns(col("*"), col("a")).assert_no_agg()
    raises(ValueError, with_wildcard.assert_all_with_names)

    # * can be used at most once
    raises(
        ValueError,
        lambda: SelectColumns(col("*"), col("*"), col("a").alias("p")),
    )

    # * can't be used with aggregation
    raises(
        ValueError,
        lambda: SelectColumns(col("*"), f.first(col("a")).alias("x")),
    )

    # one simple column, one literal, one non-aggregate function and one
    # aggregate function, all named
    mixed = SelectColumns(
        col("aa").alias("a").cast(int),
        lit(1, "b"),
        (col("bb") + col("cc")).alias("c"),
        f.first(col("c")).alias("d"),
    ).assert_all_with_names()
    raises(AssertionError, mixed.assert_no_agg)
    assert not mixed.simple

    assert len(mixed.simple_cols) == 1
    assert str(mixed.simple_cols[0]) == "CAST(aa AS long) AS a"

    assert mixed.has_literals
    assert len(mixed.literals) == 1
    assert str(mixed.literals[0]) == "1 AS b"

    assert mixed.has_agg
    assert len(mixed.non_agg_funcs) == 1
    assert str(mixed.non_agg_funcs[0]) == "+(bb,cc) AS c"
    assert len(mixed.agg_funcs) == 1
    assert str(mixed.agg_funcs[0]) == "FIRST(c) AS d"

    # group keys come from a and c
    keys = mixed.group_keys
    assert len(keys) == 2
    assert keys[0].output_name == "aa"
    assert keys[1].output_name == ""
    assert isinstance(keys[1], _BinaryOpExpr)

    # a single plain column
    single = SelectColumns(col("a")).assert_no_wildcard()
    assert single.simple
    assert not single.has_literals
    assert not single.has_agg

    # wildcard replacement keeps the column that precedes the wildcard first
    expanded = SelectColumns(col("x"), col("*"), col("y") + col("z"))
    expanded = expanded.replace_wildcard(Schema("a:int,b:int"))
    assert str(expanded.all_cols[0]) == "x"
Пример #12
0
def test_select():
    """Check the SELECT statements generated for a range of column sets."""
    generator = SQLExpressionGenerator()

    # no aggregation
    everything = SelectColumns(col("*"))
    assert "SELECT * FROM x" == generator.select(everything, "x")

    projected = SelectColumns(
        col("a"),
        lit(1).alias("b"),
        (col("b") + col("c")).alias("x"),
    )
    # the alias on the filter does not appear in the expected SQL
    row_filter = (col("a") > 5).alias("aa")
    expected = "SELECT a, 1 AS b, b+c AS x FROM t WHERE a>5"
    assert expected == generator.select(projected, "t", where=row_filter)

    # aggregation without literals
    grouped = SelectColumns(
        f.max(col("c")).alias("c"),
        col("a", "aa"),
        col("b"),
    )
    expected = "SELECT MAX(c) AS c, a AS aa, b FROM t GROUP BY a, b"
    assert expected == generator.select(grouped, "t")

    row_filter = col("a") < 10
    group_filter = (f.max(col("a")) > 5).alias("aaa")
    expected = (
        "SELECT MAX(c) AS c, a AS aa, b FROM t "
        "WHERE a<10 GROUP BY a, b HAVING MAX(a)>5"
    )
    assert expected == generator.select(
        grouped, "t", where=row_filter, having=group_filter)

    aggregates_only = SelectColumns(
        f.min(col("c") + 1).alias("c"),
        f.avg(col("d") + col("e")).cast(int).alias("d"),
    )
    expected = "SELECT MIN(c+1) AS c, CAST(AVG(d+e) AS long) AS d FROM t"
    assert expected == generator.select(aggregates_only, "t")

    # aggregation with literals: the expected SQL wraps the aggregation in a
    # subquery and adds the literals in the outer SELECT
    with_literals = SelectColumns(
        lit(1, "k"),
        f.max(col("c")).alias("c"),
        lit(2, "j"),
        col("a", "aa"),
        col("b"),
    )
    expected = (
        "SELECT 1 AS k, c, 2 AS j, aa, b FROM "
        "(SELECT MAX(c) AS c, a AS aa, b FROM t GROUP BY a, b)"
    )
    assert expected == generator.select(with_literals, "t")

    with_literals = SelectColumns(
        lit(1, "k"),
        f.max(col("c")).alias("c"),
        lit(2, "j"),
    )
    expected = "SELECT 1 AS k, c, 2 AS j FROM (SELECT MAX(c) AS c FROM t)"
    assert expected == generator.select(with_literals, "t")

    with_literals = SelectColumns(
        lit(1, "k"),
        col("a"),
        f.max(col("c")).alias("c"),
        lit(2, "j"),
    )
    expected = (
        "SELECT 1 AS k, a, c, 2 AS j FROM "
        "(SELECT a, MAX(c) AS c FROM t GROUP BY a)"
    )
    assert expected == generator.select(with_literals, "t")

    # cast
    with_cast = SelectColumns(
        col("c").cast(float),
        f.avg(col("d") + col("e")).cast(int).alias("d"),
    )
    expected = (
        "SELECT CAST(c AS double) AS c, CAST(AVG(d+e) AS long) AS d "
        "FROM t GROUP BY c"
    )
    assert expected == generator.select(with_cast, "t")

    # infer alias
    inferred_alias = SelectColumns(
        (-col("c")).cast(float),
        f.max(col("e")).cast(int),
        f.avg(col("d") + col("e")).cast(int).alias("d"),
    )
    expected = (
        "SELECT CAST(-c AS double) AS c, CAST(MAX(e) AS long) AS e, "
        "CAST(AVG(d+e) AS long) AS d FROM t GROUP BY -c"
    )
    assert expected == generator.select(inferred_alias, "t")
Пример #13
0
def test_functions():
    """Check string form, inferred type and inferred alias of functions."""
    schema = Schema("a:int,b:str,c:bool,d:double")

    def check(expression, text, dtype=None):
        # dtype=None means no type is expected to be inferred
        assert text == str(expression)
        inferred = expression.infer_type(schema)
        if dtype is None:
            assert inferred is None
        else:
            assert dtype == inferred

    check(
        f.coalesce(col("a"), 1, None, col("b") + col("c")),
        "COALESCE(a,1,NULL,+(b,c))",
    )

    # MIN, plus alias inference with and without cast or explicit alias
    min_a = f.min(col("a"))
    check(min_a, "MIN(a)", pa.int32())
    assert "MIN(a) AS a" == str(min_a.infer_alias())
    assert "CAST(MIN(a) AS long) AS a" == str(min_a.cast(int).infer_alias())
    assert "MIN(a) AS b" == str(min_a.alias("b").infer_alias())

    assert "MIN(-(a)) AS a" == str(f.min(-col("a")).infer_alias())

    check(f.min(lit(1.1)), "MIN(1.1)", pa.float64())

    # MAX, FIRST and LAST are expected to take the type of their argument
    check(f.max(col("a")), "MAX(a)", pa.int32())
    check(f.max(lit(1.1)), "MAX(1.1)", pa.float64())

    check(f.first(col("a")), "FIRST(a)", pa.int32())
    check(f.first(lit(1.1)), "FIRST(1.1)", pa.float64())

    check(f.last(col("a")), "LAST(a)", pa.int32())
    check(f.last(lit(1.1)), "LAST(1.1)", pa.float64())

    # no type is expected to be inferred for AVG, SUM and COUNT
    check(f.avg(col("a")), "AVG(a)")
    check(f.sum(col("a")), "SUM(a)")
    check(f.count(col("a")), "COUNT(a)")

    distinct_a = f.count_distinct(col("a"))
    check(distinct_a, "COUNT(DISTINCT a)")
    assert "COUNT(DISTINCT a) AS a" == str(distinct_a.infer_alias())

    # with a wildcard argument no alias is expected to be inferred
    distinct_all = f.count_distinct(col("*"))
    check(distinct_all, "COUNT(DISTINCT *)")
    assert "COUNT(DISTINCT *)" == str(distinct_all.infer_alias())
Пример #14
0
        def test_select(self):
            """Run select with plain, distinct, wildcard, aggregate and having inputs."""
            engine = self.engine
            frame = engine.to_df(
                ArrayDataFrame(
                    [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]],
                    "a:double,b:int",
                    {"a": 1},
                )
            )

            # simple
            plain = engine.select(
                frame,
                SelectColumns(
                    col("b"),
                    (col("b") + 1).alias("c").cast(str),
                ),
            )
            df_eq(
                plain,
                [[2, "3"], [2, "3"], [1, "2"], [4, "5"], [4, "5"]],
                "b:int,c:str",
                throw=True,
            )

            # with distinct
            deduplicated = engine.select(
                frame,
                SelectColumns(
                    col("b"),
                    (col("b") + 1).alias("c").cast(str),
                    arg_distinct=True,
                ),
            )
            df_eq(
                deduplicated,
                [[2, "3"], [1, "2"], [4, "5"]],
                "b:int,c:str",
                throw=True,
            )

            # wildcard
            filtered = engine.select(
                frame,
                SelectColumns(col("*")),
                where=col("a") + col("b") == 3,
            )
            df_eq(filtered, [[1, 2]], "a:double,b:int", throw=True)

            # aggregation
            summed = engine.select(
                frame,
                SelectColumns(
                    col("a"),
                    ff.sum(col("b")).cast(float).alias("b"),
                ),
            )
            df_eq(
                summed,
                [[1, 2], [3, 4], [None, 7]],
                "a:double,b:double",
                throw=True,
            )

            # having
            # https://github.com/fugue-project/fugue/issues/222
            sum_b = ff.sum(col("b"))
            with_having = engine.select(
                frame,
                SelectColumns(col("a"), sum_b.cast(float).alias("b")),
                having=(sum_b >= 7) | (col("a") == 1),
            )
            df_eq(
                with_having,
                [[1, 2], [None, 7]],
                "a:double,b:double",
                throw=True,
            )

            # literal + alias inference
            # https://github.com/fugue-project/fugue/issues/222
            sum_b = ff.sum(col("b"))
            with_literal = engine.select(
                frame,
                SelectColumns(
                    col("a"),
                    lit(1, "o").cast(str),
                    sum_b.cast(float),
                ),
                having=(sum_b >= 7) | (col("a") == 1),
            )
            df_eq(
                with_literal,
                [[1, "1", 2], [None, "1", 7]],
                "a:double,o:str,b:double",
                throw=True,
            )