Пример #1
0
    def test_kudu_schema_convert(self):
        """Convert a built Kudu schema to ibis and compare with expected."""
        # Each row: (name, ibis type, kudu type, is_nullable, is_primary_key)
        columns = [
            ('a', dt.Int8(False), 'int8', False, True),
            ('b', dt.Int16(False), 'int16', False, True),
            ('c', dt.Int32(False), 'int32', False, False),
            ('d', dt.Int64(True), 'int64', True, False),
            ('e', dt.String(True), 'string', True, False),
            ('f', dt.Boolean(False), 'bool', False, False),
            ('g', dt.Float(False), 'float', False, False),
            ('h', dt.Double(True), 'double', True, False),
            # TODO: add a case for a binary column 'i'
            ('j', dt.Timestamp(True), 'timestamp', True, False),
        ]

        builder = kudu.schema_builder()
        for col_name, _, kudu_type, nullable, _ in columns:
            builder.add_column(col_name, kudu_type, nullable=nullable)

        # Primary keys are registered once all columns have been added
        builder.set_primary_keys([row[0] for row in columns if row[4]])
        kschema = builder.build()

        ischema = ksupport.schema_kudu_to_ibis(kschema)
        expected = ibis.schema([(row[0], row[1]) for row in columns])

        assert_equal(ischema, expected)
Пример #2
0
def test_database_layer(con, alltypes):
    """Check table access and listing through ``con.database()``."""
    database = con.database()

    # Attribute access on the database object yields the table expression
    assert_equal(database.functional_alltypes, alltypes)

    assert database.list_tables() == con.list_tables()
Пример #3
0
def test_window_bind_to_table(t):
    """A window built from column names, once bound, equals one built
    from the columns of ``t`` directly."""
    unbound = ibis.window(group_by="g", order_by=ibis.desc("f"))
    bound = unbound.bind(t)

    by_columns = ibis.window(group_by=t.g, order_by=ibis.desc(t.f))
    assert_equal(bound, by_columns)
Пример #4
0
    def test_sqla_schema_conversion(self):
        """Derive an ibis schema from a SQLAlchemy table and compare it."""
        # Each row: (column name, SQLAlchemy type, nullable, ibis type)
        typespec = [
            ("smallint", sat.SmallInteger, False, dt.int16),
            ("int", sat.Integer, True, dt.int32),
            ("integer", sat.INTEGER(), True, dt.int64),
            ("bigint", sat.BigInteger, False, dt.int64),
            ("real", sat.REAL, True, dt.double),
            ("bool", sat.Boolean, True, dt.boolean),
            ("timestamp", sat.DateTime, True, dt.timestamp),
        ]

        sqla_columns = [
            sa.Column(col_name, sqla_type, nullable=nullable)
            for col_name, sqla_type, nullable, _ in typespec
        ]
        ibis_pairs = [
            (col_name, ibis_type(nullable))
            for col_name, _, nullable, ibis_type in typespec
        ]

        table = sa.Table("tname", self.meta, *sqla_columns)

        schema = alch.schema_from_table(table)
        assert_equal(schema, ibis.schema(ibis_pairs))
Пример #5
0
def test_mutate(table):
    """``mutate`` equals projecting the table plus the new named columns."""
    doubled = table.f * 2
    summed = (table.a + table.b).name('foo')

    result = table.mutate(summed, one=doubled, two=2)

    # Keyword arguments become columns named after the keyword
    named_one = doubled.name('one')
    named_two = ibis.literal(2).name('two')
    expected = table[table, summed, named_one, named_two]

    assert_equal(result, expected)
Пример #6
0
def test_create_table_with_partition_column(con, temp_table_db):
    """Create a partitioned table, then check its table and partition
    schemas."""
    partition_cols = [('year', 'int32'), ('month', 'string')]
    data_cols = [('day', 'int8'), ('value', 'double')]

    tmp_db, name = temp_table_db
    con.create_table(
        name,
        schema=ibis.schema(partition_cols + data_cols),
        database=tmp_db,
        partition=['year', 'month'],
    )

    # The partition columns are expected at the end of the table schema
    table_schema = con.get_schema(name, database=tmp_db)
    assert_equal(table_schema, ibis.schema(data_cols + partition_cols))

    partition_schema = con.database(tmp_db).table(name).partition_schema()
    assert_equal(partition_schema, ibis.schema(partition_cols))
Пример #7
0
def test_group_by_kwargs(table):
    """Keyword keys passed to ``group_by`` equal explicitly named keys."""
    t = table

    via_kwargs = t.group_by(['f', t.h], z='g', z2=t.d)
    expr = via_kwargs.aggregate(t.d.mean().name('foo'))

    via_list = t.group_by(['f', t.h, t.g.name('z'), t.d.name('z2')])
    expected = via_list.aggregate(t.d.mean().name('foo'))

    assert_equal(expr, expected)
Пример #8
0
    def test_set_column(self):
        """``set_column`` with a callable equals passing the expression."""
        table = self.table

        def double_f(tbl):
            return tbl.f * 2

        from_callable = table.set_column('f', double_f)
        from_expr = table.set_column('f', table.f * 2)
        assert_equal(from_callable, from_expr)
Пример #9
0
    def test_add_column(self):
        """``add_column`` with a callable equals ``mutate`` with the same
        callable under the given name."""
        table = self.table

        def double_f(tbl):
            return tbl.f * 2

        added = table.add_column(double_f, name='foo')
        mutated = table.mutate(foo=double_f)
        assert_equal(added, mutated)
Пример #10
0
    def test_summary_expand_list(self):
        """A summary inside an aggregate list equals its ``exprs()``
        spliced into that list."""
        table = self.table
        summary = table.f.summary()
        concat = table.g.group_concat().name('bar')

        result = table.aggregate([concat, summary])
        expected = table.aggregate([concat] + summary.exprs())
        assert_equal(result, expected)
Пример #11
0
    def test_rewrite_join_projection_without_other_ops(self):
        """Check ``L.substitute_parents`` on a projection over joins whose
        left root is a filtered table."""
        # Filters and other commutative table operations are expected to
        # drop out, with join predicates "lifted" so that they reference
        # the base, unmodified join roots.

        # Star schema with fact table
        star1 = self.con.table('star1')
        star2 = self.con.table('star2')
        star3 = self.con.table('star3')

        star1_filtered = star1[star1['f'] > 0]

        foo_pred = star1['foo_id'] == star2['foo_id']
        bar_pred = star1_filtered['bar_id'] == star3['bar_id']

        joined = star1_filtered.left_join(star2, [foo_pred])
        joined = joined.inner_join(star3, [bar_pred])

        # Project out the desired fields
        view = joined[[star1_filtered, star2['value1'], star3['value2']]]

        # The expected join is expressed purely against the base tables
        base_bar_pred = star1['bar_id'] == star3['bar_id']
        expected_join = star1.left_join(star2, [foo_pred])
        expected_join = expected_join.inner_join(star3, [base_bar_pred])

        rewritten_op = L.substitute_parents(view).op()
        assert_equal(rewritten_op.table, expected_join)

        # The filtered table must have been replaced by the base table
        assert rewritten_op.selections[0] is star1
Пример #12
0
    def test_value_counts_convenience(self):
        """#152: ``value_counts`` equals a group-by with a named count."""
        table = self.table
        result = table.g.value_counts()

        count_metric = table.count().name('count')
        expected = table.group_by('g').aggregate(count_metric)

        assert_equal(result, expected)
Пример #13
0
def test_unravel_compound_equijoin(table):
    """A join on ``p1 & p2 & p3`` equals a join on ``[p1, p2, p3]``."""

    def three_key_table(value_column, table_name):
        # Three string keys followed by one double-valued column
        columns = [
            ('key1', 'string'),
            ('key2', 'string'),
            ('key3', 'string'),
            (value_column, 'double'),
        ]
        return ibis.table(columns, table_name)

    left = three_key_table('value1', 'foo_table')
    right = three_key_table('value2', 'bar_table')

    on_key1 = left.key1 == right.key1
    on_key2 = left.key2 == right.key2
    on_key3 = left.key3 == right.key3

    compound = left.inner_join(right, [on_key1 & on_key2 & on_key3])
    unraveled = left.inner_join(right, [on_key1, on_key2, on_key3])
    assert_equal(compound, unraveled)
Пример #14
0
    def test_create_table_with_partition_column(self):
        """Create a partitioned table, then check its table and partition
        schemas."""
        partition_cols = [('year', 'int32'), ('month', 'int8')]
        data_cols = [('day', 'int8'), ('value', 'double')]
        schema = ibis.schema(partition_cols + data_cols)

        name = _tmp_name()
        self.con.create_table(
            name,
            schema=schema,
            database=self.tmp_db,
            partition=['year', 'month'],
            location=self._temp_location(),
        )
        # Record the table name in the test's temp_tables list
        self.temp_tables.append(name)

        # The partition columns are expected at the end of the table schema
        table_schema = self.con.get_schema(name, database=self.tmp_db)
        assert_equal(table_schema, ibis.schema(data_cols + partition_cols))

        partition_schema = self.db.table(name).partition_schema()
        assert_equal(partition_schema, ibis.schema(partition_cols))
Пример #15
0
def test_having(table):
    """``having`` with a callable equals ``having`` with the expression."""
    mutated = table.mutate(foo=table.f * 2, bar=table.e / 2)
    grouped = mutated.group_by('foo')

    from_callable = grouped.having(lambda x: x.foo.sum() > 10).size()
    from_expr = mutated.group_by('foo').having(mutated.foo.sum() > 10).size()

    assert_equal(from_callable, from_expr)
Пример #16
0
def test_contains(table):
    """``contains`` equals ``find(...) >= 0``; ``in`` raises TypeError."""
    needle = 'foo'
    strings = table.g

    assert_equal(strings.contains(needle), strings.find(needle) >= 0)

    # The bare ``in`` expression is intentional: evaluating it must raise
    with pytest.raises(TypeError):
        needle in strings
Пример #17
0
    def test_self_join(self):
        """Join a table with a view of itself, then project and aggregate."""
        # Self-joins are awkward in this design: a column expression may
        # refer to either the left or the right copy of the table, e.g.
        #
        # SELECT left.key, sum(left.value - right.value) as total_deltas
        # FROM table left
        #  INNER JOIN table right
        #    ON left.current_period = right.previous_period + 1
        # GROUP BY 1
        #
        # Forcing the user to prefix the joined fields and project with
        # those would sidestep the problem, but is not very satisfying.
        lhs = self.table
        rhs = self.table.view()
        delta_mean = (lhs['a'] - rhs['b']).mean().name('metric')

        joined = lhs.inner_join(rhs, [rhs['g'] == lhs['g']])

        # Basic check that there are no referential problems
        text = repr(joined)
        assert 'ref_0' in text
        assert 'ref_1' in text

        # Cannot be immediately materialized because of the schema overlap
        self.assertRaises(RelationError, joined.materialize)

        # Projecting the left table alone yields the left table's schema
        left_only = joined[[lhs]]
        assert_equal(left_only.schema(), lhs.schema())

        # Aggregating on top of the join
        aggregated = joined.aggregate([delta_mean], by=[lhs['g']])
        expected_schema = api.Schema(['g', 'metric'], ['string', 'double'])
        assert_equal(aggregated.schema(), expected_schema)
Пример #18
0
    def test_database_layer(self):
        """Check table access and listing through ``con.database()``."""
        con = self.con
        database = con.database()

        # Attribute access on the database object yields the table
        assert_equal(database.functional_alltypes, self.alltypes)

        assert database.list_tables() == con.list_tables()
Пример #19
0
def test_set_column(table):
    """``set_column`` with a callable equals passing the expression."""

    def double_f(tbl):
        return tbl.f * 2

    from_callable = table.set_column('f', double_f)
    from_expr = table.set_column('f', table.f * 2)
    assert_equal(from_callable, from_expr)
Пример #20
0
    def test_null(self):
        """``ibis.literal(None)`` is a null scalar equal to ``ibis.null()``."""
        from_literal = ibis.literal(None)
        assert isinstance(from_literal, ir.NullScalar)
        assert isinstance(from_literal.op(), ir.NullLiteral)

        assert_equal(from_literal, ibis.null())
Пример #21
0
def test_null():
    """``ibis.literal(None)`` is a null scalar equal to ``ibis.null()``."""
    from_literal = ibis.literal(None)

    assert isinstance(from_literal, ir.NullScalar)
    assert isinstance(from_literal.op(), ir.NullLiteral)
    # The wrapped argument carries ``None`` as its value
    assert from_literal._arg.value is None

    assert_equal(from_literal, ibis.null())
Пример #22
0
def test_groupby_mutate(table):
    """Grouped ``mutate`` with callables equals passing the expressions."""
    t = table
    grouped = t.group_by('g').order_by('f')

    from_callables = grouped.mutate(
        foo=lambda x: x.f.lag(),
        bar=lambda x: x.f.rank(),
    )
    from_exprs = grouped.mutate(foo=t.f.lag(), bar=t.f.rank())

    assert_equal(from_callables, from_exprs)
Пример #23
0
def test_replace_column(table):
    """``set_column`` on an existing name keeps that column's position."""
    # The ``table`` fixture is not used; a small local table is built
    source = api.table([('a', 'int32'), ('b', 'double'), ('c', 'string')])

    b_as_int = source.b.cast('int32')
    replaced = source.set_column('b', b_as_int)

    expected = source[source.a, b_as_int.name('b'), source.c]
    assert_equal(replaced, expected)
Пример #24
0
    def test_coalesce_instance_method(self):
        """The ``coalesce`` method equals the ``ibis.coalesce`` function."""
        table = self.table
        first = table.v7
        second = table.v5.cast('string')
        third = table.v8.cast('string')

        via_method = first.coalesce(second, third, 'foo')
        via_function = ibis.coalesce(first, second, third, 'foo')
        assert_equal(via_method, via_function)
Пример #25
0
    def test_sql_with_limit(self):
        """A raw ``SELECT * ... LIMIT`` query has the source table's schema."""
        # Three-line query text, assembled from adjacent string literals
        query = (
            'SELECT *\n'
            'FROM functional_alltypes\n'
            'LIMIT 10'
        )
        result = self.con.sql(query)
        expected_schema = self.con.get_schema('functional_alltypes')
        assert_equal(result.schema(), expected_schema)
Пример #26
0
    def test_query_parquet_file_like_table(self):
        """``parquet_file`` with ``like_table`` yields the expected schema."""
        hdfs_path = pjoin(self.test_data_dir, "parquet/tpch_region")

        expected_schema = ibis.schema(
            [
                ("r_regionkey", "int16"),
                ("r_name", "string"),
                ("r_comment", "string"),
            ]
        )

        region = self.con.parquet_file(hdfs_path, like_table="tpch_region")

        assert_equal(region.schema(), expected_schema)
Пример #27
0
    def test_mutate(self):
        """``mutate`` equals projecting the table plus the new columns."""
        table = self.table
        doubled = table.f * 2
        summed = (table.a + table.b).name('foo')

        result = table.mutate(summed, one=doubled, two=2)

        # Keyword arguments become columns named after the keyword
        named_one = doubled.name('one')
        named_two = ibis.literal(2).name('two')
        expected = table[table, summed, named_one, named_two]

        assert_equal(result, expected)
Пример #28
0
    def test_join_no_predicate_list(self):
        """A bare join predicate equals the same predicate in a list."""
        con = self.con
        region = con.table('tpch_region')
        nation = con.table('tpch_nation')

        on_region = region.r_regionkey == nation.n_regionkey

        bare = region.inner_join(nation, on_region)
        listed = region.inner_join(nation, [on_region])
        assert_equal(bare, listed)
Пример #29
0
    def test_window_bind_to_table(self):
        """A window built from column names, once bound, equals one built
        from the columns of ``self.t`` directly."""
        unbound = ibis.window(group_by='g', order_by=ibis.desc('f'))
        bound = unbound.bind(self.t)

        by_columns = ibis.window(
            group_by=self.t.g, order_by=ibis.desc(self.t.f)
        )

        assert_equal(bound, by_columns)
Пример #30
0
    def test_getitem_slice(self):
        """Slicing a string column equals the matching ``substr`` call."""
        table = self.table

        # ``[:3]`` versus substr(0, 3); ``[2:6]`` versus substr(2, 4)
        sliced = [table.g[:3], table.g[2:6]]
        substrings = [table.g.substr(0, 3), table.g.substr(2, 4)]

        for got, want in zip(sliced, substrings):
            assert_equal(got, want)
Пример #31
0
 def test_projection_array_expr(self):
     """Indexing with one column equals indexing with a one-item list."""
     table = self.table
     bare = table[table.a]
     listed = table[[table.a]]
     assert_equal(bare, listed)
Пример #32
0
def test_sql_with_limit(con):
    """A raw ``SELECT * ... LIMIT`` query has the source table's schema."""
    query = "SELECT * FROM functional_alltypes LIMIT 10"
    result = con.sql(query)

    expected_schema = con.get_schema('functional_alltypes')
    assert_equal(result.schema(), expected_schema)
Пример #33
0
def test_add_column_proxies_to_mutate(table):
    """``add_column(expr, name=...)`` equals ``mutate(name=expr)``."""
    added = table.add_column(ibis.now().cast('date'), name='date')

    # A separately built but equivalent expression is used for ``mutate``
    mutated = table.mutate(date=ibis.now().cast('date'))

    assert_equal(added, mutated)
Пример #34
0
def test_get_schema(con, test_data_db):
    """``get_schema`` equals the schema of the table expression."""
    table_name = 'tpch_lineitem'
    lineitem = con.table(table_name)

    fetched = con.get_schema(table_name, database=test_data_db)
    assert_equal(lineitem.schema(), fetched)
Пример #35
0
 def test_projection_convenient_syntax(self):
     """Comma-separated indexing equals indexing with an explicit list."""
     table = self.table
     with_commas = table[table, table['a'].name('foo')]
     with_list = table[[table, table['a'].name('foo')]]
     assert_equal(with_commas, with_list)
Пример #36
0
def test_projection_self(table):
    """Indexing a table by itself equals ``projection`` of itself."""
    indexed = table[table]
    projected = table.projection(table)

    assert_equal(indexed, projected)
Пример #37
0
def test_topk_function_late_bind(airlines):
    """GH #520: ``topk`` with a callable ``by`` equals the expression."""
    by_callable = airlines.dest.topk(5, by=lambda x: x.arrdelay.mean())
    by_expr = airlines.dest.topk(5, by=airlines.arrdelay.mean())

    # The comparison is made on the aggregation form of each top-k
    left = by_callable.to_aggregation()
    right = by_expr.to_aggregation()
    assert_equal(left, right)
Пример #38
0
def test_distinct_count(dtable):
    """``distinct().count()`` equals ``nunique()`` named ``'count'``."""
    result = dtable.string_col.distinct().count()

    assert_equal(result, dtable.string_col.nunique().name('count'))
    # The resulting operation must be a CountDistinct
    assert isinstance(result.op(), ops.CountDistinct)
Пример #39
0
def test_projection_convenient_syntax(table):
    """Comma-separated indexing equals indexing with an explicit list."""
    with_commas = table[table, table['a'].name('foo')]
    with_list = table[[table, table['a'].name('foo')]]

    assert_equal(with_commas, with_list)
Пример #40
0
def test_projection_no_list(table):
    """``select`` with a bare expression equals ``projection`` of a list."""
    doubled = (table.f * 2).name('bar')

    selected = table.select(doubled)
    projected = table.projection([doubled])

    assert_equal(selected, projected)
Пример #41
0
 def test_get_schema(self):
     """``get_schema`` equals the schema of the table expression."""
     table_name = 'tpch_lineitem'
     lineitem = self.con.table(table_name)

     fetched = self.con.get_schema(table_name, database=self.test_data_db)
     assert_equal(lineitem.schema(), fetched)
Пример #42
0
def test_value_counts_unnamed_expr(con):
    """``value_counts`` on an unnamed expression equals naming it
    ``'unnamed'`` first."""
    nation = con.table('tpch_nation')

    implicit = nation.n_name.lower().value_counts()
    explicit = nation.n_name.lower().name('unnamed').value_counts()

    assert_equal(implicit, explicit)
Пример #43
0
def test_value_counts_convenience(table):
    """#152: ``value_counts`` equals a group-by with a named count."""
    result = table.g.value_counts()

    count_metric = table.count().name('count')
    expected = table.group_by('g').aggregate(count_metric)

    assert_equal(result, expected)
Пример #44
0
def test_lineage(companies):
    """Compare ``lin.lineage`` of the ``bucket`` column, taken at several
    stages of a table pipeline, with hand-written expected sequences."""

    def check(expr, expected, same_length=True):
        # Compare lineage entries pairwise; ``zip`` stops at the shorter
        # sequence, so the length is asserted separately when requested.
        results = list(lin.lineage(expr))
        if same_length:
            assert len(results) == len(expected)
        for got, want in zip(results, expected):
            assert_equal(got, want)

    # single table dependency
    funding_buckets = [
        0,
        1000000,
        10000000,
        50000000,
        100000000,
        500000000,
        1000000000,
    ]

    funding = companies.funding_total_usd
    bucket = funding.bucket(funding_buckets, include_over=True)

    status = companies.status.fillna('Unknown')
    mutated = companies.mutate(bucket=bucket, status=status)

    recent = companies.founded_at > '2010-01-01'
    filtered = mutated[recent | companies.founded_at.isnull()]

    grouped = filtered.group_by(['bucket', 'status']).size()

    # This first comparison does not assert on the number of entries
    check(
        bucket,
        [bucket, companies.funding_total_usd, companies],
        same_length=False,
    )

    check(
        mutated.bucket,
        [
            mutated.bucket,
            mutated,
            bucket.name('bucket'),
            bucket,
            companies.funding_total_usd,
            companies,
        ],
    )

    check(
        filtered.bucket,
        [
            filtered.bucket,
            filtered,
            bucket.name('bucket'),
            bucket,
            companies.funding_total_usd,
            companies,
        ],
    )

    check(
        grouped.bucket,
        [
            grouped.bucket,
            grouped,
            filtered.bucket,
            filtered,
            bucket.name('bucket'),
            bucket,
            companies.funding_total_usd,
            companies,
        ],
    )
Пример #45
0
    def test_projection_self(self):
        """Indexing a table by itself equals ``projection`` of itself."""
        table = self.table
        indexed = table[table]
        projected = table.projection(table)

        assert_equal(indexed, projected)
Пример #46
0
    def test_contains(self):
        """``contains`` equals ``like('%foo%')``; ``in`` raises."""
        result = self.table.g.contains('foo')
        pattern_match = self.table.g.like('%foo%')
        assert_equal(result, pattern_match)

        def use_in_operator():
            # Evaluated lazily so that assertRaises sees the exception
            return 'foo' in self.table.g

        self.assertRaises(Exception, use_in_operator)
Пример #47
0
def test_projection_array_expr(table):
    """Indexing with one column equals indexing with a one-item list."""
    bare = table[table.a]
    listed = table[[table.a]]

    assert_equal(bare, listed)
Пример #48
0
def test_groupby_alias(table):
    """``groupby`` gives the same result as ``group_by``."""
    via_alias = table.groupby('g').size()
    via_group_by = table.group_by('g').size()

    assert_equal(via_alias, via_group_by)
Пример #49
0
def test_lineage(companies):
    """Compare ``lin.lineage`` of the ``bucket`` column, taken at several
    stages of a table pipeline, with hand-written expected sequences."""

    def check(expr, expected):
        # Entries are compared pairwise; ``zip`` stops at the shorter
        # sequence and no length assertion is made here.
        results = list(lin.lineage(expr))
        for got, want in zip(results, expected):
            assert_equal(got, want)

    # single table dependency
    funding_buckets = [
        0,
        1000000,
        10000000,
        50000000,
        100000000,
        500000000,
        1000000000,
    ]

    bucket_names = [
        '0 to 1m',
        '1m to 10m',
        '10m to 50m',
        '50m to 100m',
        '100m to 500m',
        '500m to 1b',
        'Over 1b',
    ]

    funding = companies.funding_total_usd
    bucket = funding.bucket(funding_buckets, include_over=True)

    status = companies.status.fillna('Unknown')
    mutated = companies.mutate(bucket=bucket, status=status)

    recent = companies.founded_at > '2010-01-01'
    filtered = mutated[recent | companies.founded_at.isnull()]

    grouped = filtered.group_by(['bucket', 'status']).size()

    # TODO(cpcloud): Should this be used?
    # The result is never read, but the expression is still built.
    joined = grouped.mutate(  # noqa
        bucket_name=lambda x: x.bucket.label(bucket_names).fillna('Unknown')
    )

    check(bucket, [bucket, companies.funding_total_usd, companies])

    check(
        mutated.bucket,
        [
            mutated.bucket,
            mutated,
            bucket.name('bucket'),
            companies.funding_total_usd,
            companies,
        ],
    )

    check(
        filtered.bucket,
        [
            filtered.bucket,
            filtered,
            bucket.name('bucket'),
            companies.funding_total_usd,
            companies,
        ],
    )

    check(
        grouped.bucket,
        [
            grouped.bucket,
            grouped,
            filtered.bucket,
            filtered,
            bucket.name('bucket'),
            companies.funding_total_usd,
            companies,
        ],
    )
Пример #50
0
def test_filter_no_list(table):
    """``filter`` with a bare predicate equals indexing by it."""
    a_above_five = table.a > 5

    filtered = table.filter(a_above_five)
    indexed = table[a_above_five]

    assert_equal(filtered, indexed)
Пример #51
0
def test_distinct_count(functional_alltypes):
    """``distinct().count()`` equals ``nunique()`` named ``'count'``."""
    t = functional_alltypes
    result = t.string_col.distinct().count()

    assert_equal(result, t.string_col.nunique().name('count'))
    # The resulting operation must be a CountDistinct
    assert isinstance(result.op(), ops.CountDistinct)