예제 #1
0
    def test_kudu_schema_convert(self):
        spec = [
            # name, type, is_nullable, is_primary_key
            ('a', dt.Int8(False), 'int8', False, True),
            ('b', dt.Int16(False), 'int16', False, True),
            ('c', dt.Int32(False), 'int32', False, False),
            ('d', dt.Int64(True), 'int64', True, False),
            ('e', dt.String(True), 'string', True, False),
            ('f', dt.Boolean(False), 'bool', False, False),
            ('g', dt.Float(False), 'float', False, False),
            ('h', dt.Double(True), 'double', True, False),
            # TODO
            # ('i', 'binary', False, False),
            ('j', dt.Timestamp(True), 'timestamp', True, False),
        ]

        builder = kudu.schema_builder()
        primary_keys = []
        ibis_types = []
        for name, itype, type_, is_nullable, is_primary_key in spec:
            builder.add_column(name, type_, nullable=is_nullable)

            if is_primary_key:
                primary_keys.append(name)

            ibis_types.append((name, itype))

        builder.set_primary_keys(primary_keys)
        kschema = builder.build()

        ischema = ksupport.schema_kudu_to_ibis(kschema)
        expected = ibis.schema(ibis_types)

        assert_equal(ischema, expected)
예제 #2
0
def test_database_layer(con, alltypes):
    db = con.database()
    t = db.functional_alltypes

    assert_equal(t, alltypes)

    assert db.list_tables() == con.list_tables()
예제 #3
0
def test_window_bind_to_table(t):
    w = ibis.window(group_by="g", order_by=ibis.desc("f"))

    w2 = w.bind(t)
    expected = ibis.window(group_by=t.g, order_by=ibis.desc(t.f))

    assert_equal(w2, expected)
예제 #4
0
    def test_sqla_schema_conversion(self):
        typespec = [
            # name, type, nullable
            ("smallint", sat.SmallInteger, False, dt.int16),
            ("int", sat.Integer, True, dt.int32),
            ("integer", sat.INTEGER(), True, dt.int64),
            ("bigint", sat.BigInteger, False, dt.int64),
            ("real", sat.REAL, True, dt.double),
            ("bool", sat.Boolean, True, dt.boolean),
            ("timestamp", sat.DateTime, True, dt.timestamp),
        ]

        sqla_types = []
        ibis_types = []
        for name, t, nullable, ibis_type in typespec:
            sqla_type = sa.Column(name, t, nullable=nullable)
            sqla_types.append(sqla_type)
            ibis_types.append((name, ibis_type(nullable)))

        table = sa.Table("tname", self.meta, *sqla_types)

        schema = alch.schema_from_table(table)
        expected = ibis.schema(ibis_types)

        assert_equal(schema, expected)
예제 #5
0
파일: test_table.py 프로젝트: cloudera/ibis
def test_mutate(table):
    one = table.f * 2
    foo = (table.a + table.b).name('foo')

    expr = table.mutate(foo, one=one, two=2)
    expected = table[table, foo, one.name('one'), ibis.literal(2).name('two')]
    assert_equal(expr, expected)
예제 #6
0
def test_create_table_with_partition_column(con, temp_table_db):
    schema = ibis.schema(
        [
            ('year', 'int32'),
            ('month', 'string'),
            ('day', 'int8'),
            ('value', 'double'),
        ]
    )

    tmp_db, name = temp_table_db
    con.create_table(
        name, schema=schema, database=tmp_db, partition=['year', 'month']
    )

    # the partition column get put at the end of the table
    ex_schema = ibis.schema(
        [
            ('day', 'int8'),
            ('value', 'double'),
            ('year', 'int32'),
            ('month', 'string'),
        ]
    )
    table_schema = con.get_schema(name, database=tmp_db)
    assert_equal(table_schema, ex_schema)

    partition_schema = con.database(tmp_db).table(name).partition_schema()

    expected = ibis.schema([('year', 'int32'), ('month', 'string')])
    assert_equal(partition_schema, expected)
예제 #7
0
def test_group_by_kwargs(table):
    t = table
    expr = (t.group_by(['f', t.h], z='g', z2=t.d)
             .aggregate(t.d.mean().name('foo')))
    expected = (t.group_by(['f', t.h, t.g.name('z'), t.d.name('z2')])
                .aggregate(t.d.mean().name('foo')))
    assert_equal(expr, expected)
예제 #8
0
    def test_set_column(self):
        def g(x):
            return x.f * 2

        result = self.table.set_column('f', g)
        expected = self.table.set_column('f', self.table.f * 2)
        assert_equal(result, expected)
예제 #9
0
    def test_add_column(self):
        def g(x):
            return x.f * 2

        result = self.table.add_column(g, name='foo')
        expected = self.table.mutate(foo=g)
        assert_equal(result, expected)
예제 #10
0
    def test_summary_expand_list(self):
        summ = self.table.f.summary()

        metric = self.table.g.group_concat().name('bar')
        result = self.table.aggregate([metric, summ])
        expected = self.table.aggregate([metric] + summ.exprs())
        assert_equal(result, expected)
예제 #11
0
    def test_rewrite_join_projection_without_other_ops(self):
        # Drop out filters and other commutative table operations. Join
        # predicates are "lifted" to reference the base, unmodified join roots

        # Star schema with fact table
        table = self.con.table('star1')
        table2 = self.con.table('star2')
        table3 = self.con.table('star3')

        filtered = table[table['f'] > 0]

        pred1 = table['foo_id'] == table2['foo_id']
        pred2 = filtered['bar_id'] == table3['bar_id']

        j1 = filtered.left_join(table2, [pred1])
        j2 = j1.inner_join(table3, [pred2])

        # Project out the desired fields
        view = j2[[filtered, table2['value1'], table3['value2']]]

        # Construct the thing we expect to obtain
        ex_pred2 = table['bar_id'] == table3['bar_id']
        ex_expr = (table.left_join(table2, [pred1])
                   .inner_join(table3, [ex_pred2]))

        rewritten_proj = L.substitute_parents(view)
        op = rewritten_proj.op()
        assert_equal(op.table, ex_expr)

        # Ensure that filtered table has been substituted with the base table
        assert op.selections[0] is table
예제 #12
0
    def test_value_counts_convenience(self):
        # #152
        result = self.table.g.value_counts()
        expected = (self.table.group_by('g')
                    .aggregate(self.table.count().name('count')))

        assert_equal(result, expected)
예제 #13
0
파일: test_table.py 프로젝트: cloudera/ibis
def test_unravel_compound_equijoin(table):
    t1 = ibis.table(
        [
            ('key1', 'string'),
            ('key2', 'string'),
            ('key3', 'string'),
            ('value1', 'double'),
        ],
        'foo_table',
    )

    t2 = ibis.table(
        [
            ('key1', 'string'),
            ('key2', 'string'),
            ('key3', 'string'),
            ('value2', 'double'),
        ],
        'bar_table',
    )

    p1 = t1.key1 == t2.key1
    p2 = t1.key2 == t2.key2
    p3 = t1.key3 == t2.key3

    joined = t1.inner_join(t2, [p1 & p2 & p3])
    expected = t1.inner_join(t2, [p1, p2, p3])
    assert_equal(joined, expected)
예제 #14
0
    def test_create_table_with_partition_column(self):
        schema = ibis.schema([('year', 'int32'),
                              ('month', 'int8'),
                              ('day', 'int8'),
                              ('value', 'double')])

        name = _tmp_name()
        self.con.create_table(name, schema=schema,
                              database=self.tmp_db,
                              partition=['year', 'month'],
                              location=self._temp_location())
        self.temp_tables.append(name)

        # the partition column get put at the end of the table
        ex_schema = ibis.schema([('day', 'int8'),
                                 ('value', 'double'),
                                 ('year', 'int32'),
                                 ('month', 'int8')])
        table_schema = self.con.get_schema(name, database=self.tmp_db)
        assert_equal(table_schema, ex_schema)

        partition_schema = self.db.table(name).partition_schema()

        expected = ibis.schema([('year', 'int32'),
                                ('month', 'int8')])
        assert_equal(partition_schema, expected)
예제 #15
0
파일: test_table.py 프로젝트: cloudera/ibis
def test_having(table):
    m = table.mutate(foo=table.f * 2, bar=table.e / 2)

    expr = m.group_by('foo').having(lambda x: x.foo.sum() > 10).size()
    expected = m.group_by('foo').having(m.foo.sum() > 10).size()

    assert_equal(expr, expected)
예제 #16
0
def test_contains(table):
    expr = table.g.contains('foo')
    expected = table.g.find('foo') >= 0
    assert_equal(expr, expected)

    with pytest.raises(TypeError):
        'foo' in table.g
예제 #17
0
    def test_self_join(self):
        # Self-joins are problematic with this design because column
        # expressions may reference either the left or right self. For example:
        #
        # SELECT left.key, sum(left.value - right.value) as total_deltas
        # FROM table left
        #  INNER JOIN table right
        #    ON left.current_period = right.previous_period + 1
        # GROUP BY 1
        #
        # One way around the self-join issue is to force the user to add
        # prefixes to the joined fields, then project using those. Not that
        # satisfying, though.
        left = self.table
        right = self.table.view()
        metric = (left['a'] - right['b']).mean().name('metric')

        joined = left.inner_join(right, [right['g'] == left['g']])
        # basic check there's no referential problems
        result_repr = repr(joined)
        assert 'ref_0' in result_repr
        assert 'ref_1' in result_repr

        # Cannot be immediately materialized because of the schema overlap
        self.assertRaises(RelationError, joined.materialize)

        # Project out left table schema
        proj = joined[[left]]
        assert_equal(proj.schema(), left.schema())

        # Try aggregating on top of joined
        aggregated = joined.aggregate([metric], by=[left['g']])
        ex_schema = api.Schema(['g', 'metric'], ['string', 'double'])
        assert_equal(aggregated.schema(), ex_schema)
예제 #18
0
    def test_database_layer(self):
        db = self.con.database()

        t = db.functional_alltypes
        assert_equal(t, self.alltypes)

        assert db.list_tables() == self.con.list_tables()
예제 #19
0
def test_set_column(table):
    def g(x):
        return x.f * 2

    result = table.set_column('f', g)
    expected = table.set_column('f', table.f * 2)
    assert_equal(result, expected)
예제 #20
0
    def test_null(self):
        expr = ibis.literal(None)
        assert isinstance(expr, ir.NullScalar)
        assert isinstance(expr.op(), ir.NullLiteral)

        expr2 = ibis.null()
        assert_equal(expr, expr2)
예제 #21
0
def test_null():
    expr = ibis.literal(None)
    assert isinstance(expr, ir.NullScalar)
    assert isinstance(expr.op(), ir.NullLiteral)
    assert expr._arg.value is None

    expr2 = ibis.null()
    assert_equal(expr, expr2)
예제 #22
0
파일: test_table.py 프로젝트: cloudera/ibis
def test_groupby_mutate(table):
    t = table

    g = t.group_by('g').order_by('f')
    expr = g.mutate(foo=lambda x: x.f.lag(), bar=lambda x: x.f.rank())
    expected = g.mutate(foo=t.f.lag(), bar=t.f.rank())

    assert_equal(expr, expected)
예제 #23
0
파일: test_table.py 프로젝트: cloudera/ibis
def test_replace_column(table):
    tb = api.table([('a', 'int32'), ('b', 'double'), ('c', 'string')])

    expr = tb.b.cast('int32')
    tb2 = tb.set_column('b', expr)
    expected = tb[tb.a, expr.name('b'), tb.c]

    assert_equal(tb2, expected)
예제 #24
0
    def test_coalesce_instance_method(self):
        v7 = self.table.v7
        v5 = self.table.v5.cast('string')
        v8 = self.table.v8.cast('string')

        result = v7.coalesce(v5, v8, 'foo')
        expected = ibis.coalesce(v7, v5, v8, 'foo')
        assert_equal(result, expected)
예제 #25
0
    def test_sql_with_limit(self):
        query = """\
SELECT *
FROM functional_alltypes
LIMIT 10"""
        table = self.con.sql(query)
        ex_schema = self.con.get_schema('functional_alltypes')
        assert_equal(table.schema(), ex_schema)
예제 #26
0
    def test_query_parquet_file_like_table(self):
        hdfs_path = pjoin(self.test_data_dir, "parquet/tpch_region")

        ex_schema = ibis.schema([("r_regionkey", "int16"), ("r_name", "string"), ("r_comment", "string")])

        table = self.con.parquet_file(hdfs_path, like_table="tpch_region")

        assert_equal(table.schema(), ex_schema)
예제 #27
0
    def test_mutate(self):
        one = self.table.f * 2
        foo = (self.table.a + self.table.b).name('foo')

        expr = self.table.mutate(foo, one=one, two=2)
        expected = self.table[self.table, foo, one.name('one'),
                              ibis.literal(2).name('two')]
        assert_equal(expr, expected)
예제 #28
0
    def test_join_no_predicate_list(self):
        region = self.con.table('tpch_region')
        nation = self.con.table('tpch_nation')

        pred = region.r_regionkey == nation.n_regionkey
        joined = region.inner_join(nation, pred)
        expected = region.inner_join(nation, [pred])
        assert_equal(joined, expected)
예제 #29
0
    def test_window_bind_to_table(self):
        w = ibis.window(group_by='g', order_by=ibis.desc('f'))

        w2 = w.bind(self.t)
        expected = ibis.window(group_by=self.t.g,
                               order_by=ibis.desc(self.t.f))

        assert_equal(w2, expected)
예제 #30
0
    def test_getitem_slice(self):
        cases = [
            (self.table.g[:3], self.table.g.substr(0, 3)),
            (self.table.g[2:6], self.table.g.substr(2, 4)),
        ]

        for case, expected in cases:
            assert_equal(case, expected)
예제 #31
0
파일: test_table.py 프로젝트: qwshy/ibis
 def test_projection_array_expr(self):
     result = self.table[self.table.a]
     expected = self.table[[self.table.a]]
     assert_equal(result, expected)
예제 #32
0
def test_sql_with_limit(con):
    table = con.sql("SELECT * FROM functional_alltypes LIMIT 10")
    ex_schema = con.get_schema('functional_alltypes')
    assert_equal(table.schema(), ex_schema)
예제 #33
0
파일: test_table.py 프로젝트: tonyfast/ibis
def test_add_column_proxies_to_mutate(table):
    result = table.add_column(ibis.now().cast('date'), name='date')
    expected = table.mutate(date=ibis.now().cast('date'))
    assert_equal(result, expected)
예제 #34
0
def test_get_schema(con, test_data_db):
    t = con.table('tpch_lineitem')
    schema = con.get_schema('tpch_lineitem', database=test_data_db)
    assert_equal(t.schema(), schema)
예제 #35
0
파일: test_table.py 프로젝트: qwshy/ibis
 def test_projection_convenient_syntax(self):
     proj = self.table[self.table, self.table['a'].name('foo')]
     proj2 = self.table[[self.table, self.table['a'].name('foo')]]
     assert_equal(proj, proj2)
예제 #36
0
def test_projection_self(table):
    result = table[table]
    expected = table.projection(table)

    assert_equal(result, expected)
예제 #37
0
def test_topk_function_late_bind(airlines):
    # GH #520
    expr1 = airlines.dest.topk(5, by=lambda x: x.arrdelay.mean())
    expr2 = airlines.dest.topk(5, by=airlines.arrdelay.mean())

    assert_equal(expr1.to_aggregation(), expr2.to_aggregation())
예제 #38
0
def test_distinct_count(dtable):
    result = dtable.string_col.distinct().count()
    expected = dtable.string_col.nunique().name('count')
    assert_equal(result, expected)
    assert isinstance(result.op(), ops.CountDistinct)
예제 #39
0
def test_projection_convenient_syntax(table):
    proj = table[table, table['a'].name('foo')]
    proj2 = table[[table, table['a'].name('foo')]]
    assert_equal(proj, proj2)
예제 #40
0
def test_projection_no_list(table):
    expr = (table.f * 2).name('bar')
    result = table.select(expr)
    expected = table.projection([expr])
    assert_equal(result, expected)
예제 #41
0
 def test_get_schema(self):
     t = self.con.table('tpch_lineitem')
     schema = self.con.get_schema('tpch_lineitem',
                                  database=self.test_data_db)
     assert_equal(t.schema(), schema)
예제 #42
0
def test_value_counts_unnamed_expr(con):
    nation = con.table('tpch_nation')

    expr = nation.n_name.lower().value_counts()
    expected = nation.n_name.lower().name('unnamed').value_counts()
    assert_equal(expr, expected)
예제 #43
0
def test_value_counts_convenience(table):
    # #152
    result = table.g.value_counts()
    expected = table.group_by('g').aggregate(table.count().name('count'))

    assert_equal(result, expected)
예제 #44
0
def test_lineage(companies):
    # single table dependency
    funding_buckets = [
        0,
        1000000,
        10000000,
        50000000,
        100000000,
        500000000,
        1000000000,
    ]

    bucket = companies.funding_total_usd.bucket(
        funding_buckets, include_over=True
    )

    mutated = companies.mutate(
        bucket=bucket, status=companies.status.fillna('Unknown')
    )

    filtered = mutated[
        (companies.founded_at > '2010-01-01') | companies.founded_at.isnull()
    ]

    grouped = filtered.group_by(['bucket', 'status']).size()

    results = list(lin.lineage(bucket))
    expected = [bucket, companies.funding_total_usd, companies]
    for r, e in zip(results, expected):
        assert_equal(r, e)

    results = list(lin.lineage(mutated.bucket))
    expected = [
        mutated.bucket,
        mutated,
        bucket.name('bucket'),
        bucket,
        companies.funding_total_usd,
        companies,
    ]
    assert len(results) == len(expected)
    for r, e in zip(results, expected):
        assert_equal(r, e)

    results = list(lin.lineage(filtered.bucket))
    expected = [
        filtered.bucket,
        filtered,
        bucket.name('bucket'),
        bucket,
        companies.funding_total_usd,
        companies,
    ]
    assert len(results) == len(expected)
    for r, e in zip(results, expected):
        assert_equal(r, e)

    results = list(lin.lineage(grouped.bucket))
    expected = [
        grouped.bucket,
        grouped,
        filtered.bucket,
        filtered,
        bucket.name('bucket'),
        bucket,
        companies.funding_total_usd,
        companies,
    ]
    assert len(results) == len(expected)
    for r, e in zip(results, expected):
        assert_equal(r, e)
예제 #45
0
파일: test_table.py 프로젝트: qwshy/ibis
    def test_projection_self(self):
        result = self.table[self.table]
        expected = self.table.projection(self.table)

        assert_equal(result, expected)
예제 #46
0
    def test_contains(self):
        expr = self.table.g.contains('foo')
        expected = self.table.g.like('%foo%')
        assert_equal(expr, expected)

        self.assertRaises(Exception, lambda: 'foo' in self.table.g)
예제 #47
0
def test_projection_array_expr(table):
    result = table[table.a]
    expected = table[[table.a]]
    assert_equal(result, expected)
예제 #48
0
def test_groupby_alias(table):
    t = table

    result = t.groupby('g').size()
    expected = t.group_by('g').size()
    assert_equal(result, expected)
예제 #49
0
def test_lineage(companies):
    # single table dependency
    funding_buckets = [
        0,
        1000000,
        10000000,
        50000000,
        100000000,
        500000000,
        1000000000,
    ]

    bucket_names = [
        '0 to 1m',
        '1m to 10m',
        '10m to 50m',
        '50m to 100m',
        '100m to 500m',
        '500m to 1b',
        'Over 1b',
    ]

    bucket = companies.funding_total_usd.bucket(
        funding_buckets, include_over=True
    )

    mutated = companies.mutate(
        bucket=bucket, status=companies.status.fillna('Unknown')
    )

    filtered = mutated[
        (companies.founded_at > '2010-01-01') | companies.founded_at.isnull()
    ]

    grouped = filtered.group_by(['bucket', 'status']).size()

    # TODO(cpcloud): Should this be used?
    joined = grouped.mutate(  # noqa
        bucket_name=lambda x: x.bucket.label(bucket_names).fillna('Unknown')
    )

    results = list(lin.lineage(bucket))
    expected = [bucket, companies.funding_total_usd, companies]
    for r, e in zip(results, expected):
        assert_equal(r, e)

    results = list(lin.lineage(mutated.bucket))
    expected = [
        mutated.bucket,
        mutated,
        bucket.name('bucket'),
        companies.funding_total_usd,
        companies,
    ]
    for r, e in zip(results, expected):
        assert_equal(r, e)

    results = list(lin.lineage(filtered.bucket))
    expected = [
        filtered.bucket,
        filtered,
        bucket.name('bucket'),
        companies.funding_total_usd,
        companies,
    ]
    for r, e in zip(results, expected):
        assert_equal(r, e)

    results = list(lin.lineage(grouped.bucket))
    expected = [
        grouped.bucket,
        grouped,
        filtered.bucket,
        filtered,
        bucket.name('bucket'),
        companies.funding_total_usd,
        companies,
    ]
    for r, e in zip(results, expected):
        assert_equal(r, e)
예제 #50
0
def test_filter_no_list(table):
    pred = table.a > 5

    result = table.filter(pred)
    expected = table[pred]
    assert_equal(result, expected)
예제 #51
0
def test_distinct_count(functional_alltypes):
    result = functional_alltypes.string_col.distinct().count()
    expected = functional_alltypes.string_col.nunique().name('count')
    assert_equal(result, expected)
    assert isinstance(result.op(), ops.CountDistinct)