Пример #1
0
def test_create_table_with_partition_column(con, temp_table_db):
    """Create a table partitioned by ``year``/``month`` and check its schemas.

    Verifies both the full table schema reported by ``con.get_schema`` and
    the partition schema reported by the table expression.
    """
    tmp_db, name = temp_table_db

    columns = [
        ('year', 'int32'),
        ('month', 'string'),
        ('day', 'int8'),
        ('value', 'double'),
    ]
    con.create_table(
        name,
        schema=ibis.schema(columns),
        database=tmp_db,
        partition=['year', 'month'],
    )

    # The partition columns are expected at the end of the table schema.
    expected_table_schema = ibis.schema(
        [
            ('day', 'int8'),
            ('value', 'double'),
            ('year', 'int32'),
            ('month', 'string'),
        ]
    )
    assert_equal(con.get_schema(name, database=tmp_db), expected_table_schema)

    actual_partition_schema = (
        con.database(tmp_db).table(name).partition_schema()
    )
    expected_partition_schema = ibis.schema(
        [('year', 'int32'), ('month', 'string')]
    )
    assert_equal(actual_partition_schema, expected_partition_schema)
Пример #2
0
def create_parquet_tables(con):
    """Persist one table per entry listed under the test ``parquet`` directory.

    Entries named in the local schema mapping are created with that schema;
    for every other entry ``schema=None`` is passed to ``con.parquet_file``.

    Returns the list of objects returned by ``con.parquet_file``.
    """
    listing = con.hdfs.ls(pjoin(ENV.test_data_dir, 'parquet'))

    known_schemas = {
        'functional_alltypes': ibis.schema(
            [
                ('id', 'int32'),
                ('bool_col', 'boolean'),
                ('tinyint_col', 'int8'),
                ('smallint_col', 'int16'),
                ('int_col', 'int32'),
                ('bigint_col', 'int64'),
                ('float_col', 'float'),
                ('double_col', 'double'),
                ('date_string_col', 'string'),
                ('string_col', 'string'),
                ('timestamp_col', 'timestamp'),
                ('year', 'int32'),
                ('month', 'int32'),
            ]
        ),
        'tpch_region': ibis.schema(
            [
                ('r_regionkey', 'int16'),
                ('r_name', 'string'),
                ('r_comment', 'string'),
            ]
        ),
    }

    created = []
    for hdfs_path in listing:
        # The last path component doubles as the table name.
        table_name = osp.split(hdfs_path)[1]
        print('Creating {0}'.format(table_name))
        # No predefined schema -> None is passed (original note: "infer").
        created.append(
            con.parquet_file(
                hdfs_path,
                schema=known_schemas.get(table_name),
                name=table_name,
                database=ENV.test_data_db,
                persist=True,
            )
        )
    return created
Пример #3
0
    def test_create_table_with_partition_column(self):
        """Create a partitioned table at a temp location and check its schemas."""
        full_schema = ibis.schema(
            [
                ('year', 'int32'),
                ('month', 'int8'),
                ('day', 'int8'),
                ('value', 'double'),
            ]
        )

        table_name = _tmp_name()
        self.con.create_table(
            table_name,
            schema=full_schema,
            database=self.tmp_db,
            partition=['year', 'month'],
            location=self._temp_location(),
        )
        # Recorded in temp_tables (presumably for cleanup by the fixture).
        self.temp_tables.append(table_name)

        # The partition columns are expected at the end of the table schema.
        expected_table_schema = ibis.schema(
            [
                ('day', 'int8'),
                ('value', 'double'),
                ('year', 'int32'),
                ('month', 'int8'),
            ]
        )
        assert_equal(
            self.con.get_schema(table_name, database=self.tmp_db),
            expected_table_schema,
        )

        actual_partition_schema = self.db.table(table_name).partition_schema()
        expected_partition_schema = ibis.schema(
            [('year', 'int32'), ('month', 'int8')]
        )
        assert_equal(actual_partition_schema, expected_partition_schema)
Пример #4
0
def create_parquet_tables(con):
    """Persist one table per entry listed under the test ``parquet`` directory.

    A predefined schema is used when the entry name is known; otherwise
    ``schema=None`` is handed to ``con.parquet_file``.

    Returns the list of objects returned by ``con.parquet_file``.
    """
    listing = con.hdfs.ls(pjoin(ENV.test_data_dir, "parquet"))

    alltypes_columns = [
        ("id", "int32"),
        ("bool_col", "boolean"),
        ("tinyint_col", "int8"),
        ("smallint_col", "int16"),
        ("int_col", "int32"),
        ("bigint_col", "int64"),
        ("float_col", "float"),
        ("double_col", "double"),
        ("date_string_col", "string"),
        ("string_col", "string"),
        ("timestamp_col", "timestamp"),
        ("year", "int32"),
        ("month", "int32"),
    ]
    region_columns = [
        ("r_regionkey", "int16"),
        ("r_name", "string"),
        ("r_comment", "string"),
    ]
    known_schemas = {
        "functional_alltypes": ibis.schema(alltypes_columns),
        "tpch_region": ibis.schema(region_columns),
    }

    created = []
    for hdfs_path in listing:
        # The last path component doubles as the table name.
        table_name = posixpath.split(hdfs_path)[1]
        print("Creating {0}".format(table_name))
        # No predefined schema -> None is passed (original note: "infer").
        created.append(
            con.parquet_file(
                hdfs_path,
                schema=known_schemas.get(table_name),
                name=table_name,
                database=ENV.test_data_db,
                persist=True,
            )
        )

    return created
Пример #5
0
def test_schema_subset():
    """A schema orders above another schema holding a subset of its columns."""
    superset = ibis.schema(
        [('a', dt.int64), ('b', dt.int32), ('c', dt.string)]
    )
    subset = ibis.schema([('a', dt.int64), ('c', dt.string)])

    # Strict comparisons, in both directions.
    assert superset > subset
    assert subset < superset

    # Non-strict comparisons, in both directions.
    assert superset >= subset
    assert subset <= superset
Пример #6
0
    def test_create_table_with_partition_column(self):
        """Create a table partitioned by year/month and check both schemas."""
        full_schema = ibis.schema(
            [
                ("year", "int32"),
                ("month", "int8"),
                ("day", "int8"),
                ("value", "double"),
            ]
        )

        table_name = util.guid()
        self.con.create_table(
            table_name, schema=full_schema, partition=["year", "month"]
        )
        # Recorded in temp_tables (presumably for cleanup by the fixture).
        self.temp_tables.append(table_name)

        # The partition columns are expected at the end of the table schema.
        expected_table_schema = ibis.schema(
            [
                ("day", "int8"),
                ("value", "double"),
                ("year", "int32"),
                ("month", "int8"),
            ]
        )
        assert_equal(self.con.get_schema(table_name), expected_table_schema)

        actual_partition_schema = self.con.get_partition_schema(table_name)
        expected_partition_schema = ibis.schema(
            [("year", "int32"), ("month", "int8")]
        )
        assert_equal(actual_partition_schema, expected_partition_schema)
Пример #7
0
def pandas_to_ibis_schema(frame):
    """Build an ibis schema from the columns of ``frame``.

    Iterating ``frame`` yields the column names; each column is mapped to an
    ibis type via ``pandas_col_to_ibis_type``.
    """
    # no analog for decimal in pandas
    return ibis.schema(
        [
            (col_name, pandas_col_to_ibis_type(frame[col_name]))
            for col_name in frame
        ]
    )
Пример #8
0
def test_create_table_schema(con):
    """Create ``mytable`` from a schema and check each column's expression class.

    The table is dropped up front (``force=True``) and again afterwards.
    """
    table_name = 'mytable'
    con.drop_table(table_name, force=True)

    columns = [
        ('a', 'float'),
        ('b', 'double'),
        ('c', 'int32'),
        ('d', 'int64'),
        ('x', 'point'),
        ('y', 'linestring'),
        ('z', 'polygon'),
        ('w', 'multipolygon'),
    ]
    con.create_table(table_name, schema=ibis.schema(columns))

    try:
        table = con.table(table_name)

        # Numeric columns.
        assert isinstance(table.a, ir.FloatingColumn)
        assert isinstance(table.b, ir.FloatingColumn)
        assert isinstance(table.c, ir.IntegerColumn)
        assert isinstance(table.d, ir.IntegerColumn)
        # Geospatial columns.
        assert isinstance(table.x, ir.PointColumn)
        assert isinstance(table.y, ir.LineStringColumn)
        assert isinstance(table.z, ir.PolygonColumn)
        assert isinstance(table.w, ir.MultiPolygonColumn)
    finally:
        con.drop_table(table_name)
Пример #9
0
def test_create_table_parquet_with_schema():
    """Compile an external Parquet CREATE TABLE with an explicit schema."""
    location = '/path/to/'
    columns = ibis.schema(
        [('foo', 'string'), ('bar', 'int8'), ('baz', 'int16')]
    )

    create_stmt = ddl.CreateTableParquet(
        'new_table',
        location,
        schema=columns,
        external=True,
        can_exist=True,
        database='foo',
    )
    compiled = create_stmt.compile()

    expected_sql = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
(`foo` string,
 `bar` tinyint,
 `baz` smallint)
STORED AS PARQUET
LOCATION '{0}'""".format(location)

    assert compiled == expected_sql
Пример #10
0
    def test_create_external_ddl(self):
        schema = ibis.schema(
            [('key1', 'int32'), ('key2', 'int64'), ('value1', 'double')]
        )

        stmt = ksupport.CreateTableKudu(
            'impala_name',
            'kudu_name',
            ['master1.d.com:7051', 'master2.d.com:7051'],
            schema,
            ['key1', 'key2'],
        )

        result = stmt.compile()
        expected = """\
CREATE EXTERNAL TABLE `impala_name`
(`key1` int,
 `key2` bigint,
 `value1` double)
TBLPROPERTIES (
  'kudu.key_columns'='key1, key2',
  'kudu.master_addresses'='master1.d.com:7051, master2.d.com:7051',
  'kudu.table_name'='kudu_name',
  'storage_handler'='com.cloudera.kudu.hive.KuduStorageHandler'
)"""
        assert result == expected
Пример #11
0
def test_add_partition_string_key():
    """A string partition value is rendered double-quoted in ADD PARTITION."""
    partition_schema = ibis.schema([('foo', 'int32'), ('bar', 'string')])
    partition_spec = {'foo': 5, 'bar': 'qux'}

    compiled = ddl.AddPartition(
        'tbl', partition_spec, partition_schema
    ).compile()

    assert compiled == 'ALTER TABLE tbl ADD PARTITION (foo=5, bar="qux")'
Пример #12
0
def test_create_table_with_location_compile():
    """Compile CREATE TABLE with a schema, Parquet format and a location."""
    location = '/path/to/table'
    columns = ibis.schema(
        [('foo', 'string'), ('bar', 'int8'), ('baz', 'int16')]
    )

    create_stmt = ddl.CreateTableWithSchema(
        'another_table',
        columns,
        can_exist=False,
        format='parquet',
        path=location,
        database='foo',
    )
    compiled = create_stmt.compile()

    expected_sql = """\
CREATE TABLE foo.`another_table`
(`foo` string,
 `bar` tinyint,
 `baz` smallint)
STORED AS PARQUET
LOCATION '{0}'""".format(location)
    assert compiled == expected_sql
Пример #13
0
    def create_table(self, name, expr=None, schema=None, database=None):
        """Create table ``name`` from a schema and/or an expression.

        Parameters
        ----------
        name : table name
        expr : optional expression; when given, its rows are inserted into
            the new table and, absent ``schema``, its schema is used
        schema : optional schema (anything ``ibis.schema`` accepts when it
            is compared against ``expr``)
        database : optional database; must be ``None`` or equal to the
            engine URL's database

        Raises
        ------
        NotImplementedError
            If ``database`` names a database other than the engine's.
        ValueError
            If neither ``expr`` nor ``schema`` is given.
        TypeError
            If both are given and their schemas differ.
        """
        if database is not None and database != self.engine.url.database:
            raise NotImplementedError(
                'Creating tables from a different database is not yet '
                'implemented'
            )

        has_expr = expr is not None
        has_schema = schema is not None

        if not (has_expr or has_schema):
            raise ValueError('You must pass either an expression or a schema')

        if (
            has_expr
            and has_schema
            and not expr.schema().equals(ibis.schema(schema))
        ):
            raise TypeError(
                'Expression schema is not equal to passed schema. '
                'Try passing the expression without the schema'
            )

        if not has_schema:
            schema = expr.schema()

        # The schema is recorded under the qualified name before the table
        # is actually created.
        qualified_name = self._fully_qualified_name(name, database)
        self._schemas[qualified_name] = schema
        sa_table = table_from_schema(name, self.meta, schema)

        with self.begin() as bind:
            sa_table.create(bind=bind)
            if has_expr:
                # Populate the new table from the compiled expression.
                bind.execute(
                    sa_table.insert().from_select(
                        list(expr.columns), expr.compile()
                    )
                )
Пример #14
0
    def test_kudu_schema_convert(self):
        """Convert a Kudu schema built from ``spec`` and compare to ibis types."""
        spec = [
            # name, ibis type, kudu type, is_nullable, is_primary_key
            ('a', dt.Int8(False), 'int8', False, True),
            ('b', dt.Int16(False), 'int16', False, True),
            ('c', dt.Int32(False), 'int32', False, False),
            ('d', dt.Int64(True), 'int64', True, False),
            ('e', dt.String(True), 'string', True, False),
            ('f', dt.Boolean(False), 'bool', False, False),
            ('g', dt.Float(False), 'float', False, False),
            ('h', dt.Double(True), 'double', True, False),
            # TODO
            # ('i', 'binary', False, False),
            ('j', dt.Timestamp(True), 'timestamp', True, False),
        ]

        builder = kudu.schema_builder()
        for col_name, _, kudu_type, nullable, _ in spec:
            builder.add_column(col_name, kudu_type, nullable=nullable)

        key_names = [row[0] for row in spec if row[4]]
        builder.set_primary_keys(key_names)
        kudu_schema = builder.build()

        expected_pairs = [(row[0], row[1]) for row in spec]

        converted = ksupport.schema_kudu_to_ibis(kudu_schema)
        assert_equal(converted, ibis.schema(expected_pairs))
Пример #15
0
def test_add_drop_partition_owned_by_impala(hdfs, con, temp_table):
    """Add then drop a partition located under a directory chowned to impala."""
    columns = [('foo', 'string'), ('year', 'int32'), ('month', 'int16')]
    con.create_table(
        temp_table,
        schema=ibis.schema(columns),
        partition=['year', 'month'],
    )
    table = con.table(temp_table)

    partition_spec = {'year': 2007, 'month': 4}

    parent_dir = util.guid()
    leaf_dir = util.guid()
    partition_path = '/tmp/{}/{}'.format(parent_dir, leaf_dir)

    # Only the parent directory is created; ownership is handed to impala.
    hdfs.mkdir('/tmp/{}'.format(parent_dir))
    hdfs.chown(
        '/tmp/{}'.format(parent_dir), owner='impala', group='supergroup'
    )

    table.add_partition(partition_spec, location=partition_path)
    assert len(table.partitions()) == 2

    table.drop_partition(partition_spec)
    assert len(table.partitions()) == 1
Пример #16
0
def test_is_partitioned(con, temp_table):
    """A table created with partition columns reports ``is_partitioned``."""
    columns = [('foo', 'string'), ('year', 'int32'), ('month', 'string')]
    con.create_table(
        temp_table,
        schema=ibis.schema(columns),
        partition=['year', 'month'],
    )
    assert con.table(temp_table).is_partitioned
Пример #17
0
def test_apply_to_schema_with_timezone():
    """Applying a ``timestamp("EST")`` schema matches casting via ``astype``."""
    hourly = pd.date_range('2018-01-01', '2018-01-02', freq='H')
    df = pd.DataFrame({'time': hourly})

    expected = df.assign(time=df.time.astype('datetime64[ns, EST]'))

    est_schema = ibis.schema([('time', 'timestamp("EST")')])
    # A copy is passed so the source frame itself is not handed to apply_to.
    result = est_schema.apply_to(df.copy())

    tm.assert_frame_equal(expected, result)
Пример #18
0
    def test_create_table_delimited(self):
        path = '/path/to/files/'
        schema = ibis.schema([('a', 'string'),
                              ('b', 'int32'),
                              ('c', 'double'),
                              ('d', 'decimal(12,2)')])

        stmt = ddl.CreateTableDelimited('new_table', path, schema,
                                        delimiter='|',
                                        escapechar='\\',
                                        lineterminator='\0',
                                        database='foo',
                                        can_exist=True)

        result = stmt.compile()
        expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
(`a` string,
 `b` int,
 `c` double,
 `d` decimal(12,2))
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
ESCAPED BY '\\'
LINES TERMINATED BY '\0'
LOCATION '{0}'""".format(path)
        assert result == expected
Пример #19
0
    def test_create_table_delimited(self):
        path = "/path/to/files/"
        schema = ibis.schema([("a", "string"), ("b", "int32"), ("c", "double"), ("d", "decimal(12,2)")])

        stmt = ddl.CreateTableDelimited(
            "new_table",
            path,
            schema,
            delimiter="|",
            escapechar="\\",
            lineterminator="\0",
            database="foo",
            can_exist=True,
        )

        result = stmt.compile()
        expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
(`a` string,
 `b` int,
 `c` double,
 `d` decimal(12,2))
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
ESCAPED BY '\\'
LINES TERMINATED BY '\0'
LOCATION '{0}'""".format(
            path
        )
        assert result == expected
Пример #20
0
def test_filter_with_analytic():
    """Compile a null-filter followed by a window count to Impala SQL.

    Each step is re-projected over its own columns, which yields the nested
    subqueries in the expected statement.
    """
    base = ibis.table(ibis.schema([('col', 'int32')]), 'x')

    # Add an always-NULL column and keep only rows where it is NULL.
    projected = base[base.columns + [ibis.null().name('filter')]]
    only_null = projected[projected['filter'].isnull()]
    inner = only_null[only_null.columns]

    # Attach an analytic count and re-project once more.
    counted = inner[['col', inner.count().name('analytic')]]
    expr = counted[counted.columns]

    compiled = ibis.impala.compile(expr)
    expected_sql = """\
SELECT `col`, `analytic`
FROM (
  SELECT `col`, count(*) OVER () AS `analytic`
  FROM (
    SELECT `col`, `filter`
    FROM (
      SELECT *
      FROM (
        SELECT `col`, NULL AS `filter`
        FROM x
      ) t3
      WHERE `filter` IS NULL
    ) t2
  ) t1
) t0"""

    assert compiled == expected_sql
Пример #21
0
    def test_sqla_schema_conversion(self):
        """Convert a SQLAlchemy table to an ibis schema and compare types."""
        typespec = [
            # name, SQLAlchemy type, nullable, ibis type
            ("smallint", sat.SmallInteger, False, dt.int16),
            ("int", sat.Integer, True, dt.int32),
            ("integer", sat.INTEGER(), True, dt.int64),
            ("bigint", sat.BigInteger, False, dt.int64),
            ("real", sat.REAL, True, dt.double),
            ("bool", sat.Boolean, True, dt.boolean),
            ("timestamp", sat.DateTime, True, dt.timestamp),
        ]

        sqla_columns = [
            sa.Column(col_name, sqla_type, nullable=nullable)
            for col_name, sqla_type, nullable, _ in typespec
        ]
        # Each ibis type is called with the nullable flag to build the
        # expected type.
        expected_pairs = [
            (col_name, ibis_type(nullable))
            for col_name, _, nullable, ibis_type in typespec
        ]

        sqla_table = sa.Table("tname", self.meta, *sqla_columns)

        converted = alch.schema_from_table(sqla_table)
        assert_equal(converted, ibis.schema(expected_pairs))
Пример #22
0
def test_timestamp_with_timezone():
    """Schema inference on a tz-aware column yields a timestamp with that zone."""
    eastern_dates = pd.date_range('20130101', periods=3, tz='US/Eastern')
    inferred = sch.infer(pd.DataFrame({'A': eastern_dates}))

    expected = ibis.schema([('A', "timestamp('US/Eastern')")])
    assert inferred.equals(expected)
    assert inferred.types[0].equals(dt.Timestamp('US/Eastern'))
Пример #23
0
 def test_dtype_datetime64(self):
     """A column of ``pd.Timestamp`` values is inferred as ``timestamp``."""
     stamps = [
         pd.Timestamp('2010-11-01 00:01:00'),
         pd.Timestamp('2010-11-01 00:02:00.1000'),
         pd.Timestamp('2010-11-01 00:03:00.300000'),
     ]
     frame = pd.DataFrame({'col': stamps})

     inferred = pandas_to_ibis_schema(frame)
     assert inferred == ibis.schema([('col', 'timestamp')])
Пример #24
0
 def test_dtype_timedelta64(self):
     """A column of ``pd.Timedelta`` values is inferred as ``int64``."""
     deltas = [
         pd.Timedelta('1 days'),
         pd.Timedelta('-1 days 2 min 3us'),
         pd.Timedelta('-2 days +23:57:59.999997'),
     ]
     frame = pd.DataFrame({'col': deltas})

     inferred = pandas_to_ibis_schema(frame)
     assert inferred == ibis.schema([('col', 'int64')])
Пример #25
0
    def test_query_parquet_file_like_table(self):
        """``parquet_file(..., like_table=...)`` adopts the named table's schema."""
        region_path = pjoin(self.test_data_dir, "parquet/tpch_region")

        expected_schema = ibis.schema(
            [
                ("r_regionkey", "int16"),
                ("r_name", "string"),
                ("r_comment", "string"),
            ]
        )

        region = self.con.parquet_file(region_path, like_table="tpch_region")
        assert_equal(region.schema(), expected_schema)
Пример #26
0
 def test_is_partitioned(self):
     """A table created with partition columns reports ``is_partitioned``."""
     columns = [('foo', 'string'), ('year', 'int32'), ('month', 'int16')]
     table_name = _tmp_name()
     self.db.create_table(
         table_name,
         schema=ibis.schema(columns),
         partition=['year', 'month'],
     )
     assert self.db.table(table_name).is_partitioned
Пример #27
0
def create_parquet_tables(con, executor):
    """Schedule creation of one table per entry of the test ``parquet`` directory.

    Returns a generator expression: each ``executor.submit`` call only happens
    as the generator is consumed, and each yielded item is what ``submit``
    returns.
    """
    entries = con.hdfs.ls(os.path.join(ENV.test_data_dir, 'parquet'))

    alltypes_columns = [
        ('id', 'int32'),
        ('bool_col', 'boolean'),
        ('tinyint_col', 'int8'),
        ('smallint_col', 'int16'),
        ('int_col', 'int32'),
        ('bigint_col', 'int64'),
        ('float_col', 'float'),
        ('double_col', 'double'),
        ('date_string_col', 'string'),
        ('string_col', 'string'),
        ('timestamp_col', 'timestamp'),
        ('year', 'int32'),
        ('month', 'int32'),
    ]
    region_columns = [
        ('r_regionkey', 'int16'),
        ('r_name', 'string'),
        ('r_comment', 'string'),
    ]
    known_schemas = {
        'functional_alltypes': ibis.schema(alltypes_columns),
        'tpch_region': ibis.schema(region_columns),
    }

    def create_table(table_name):
        # Runs inside the executor; unknown names get schema=None.
        logger.info('Creating %s', table_name)
        return con.parquet_file(
            os.path.join(ENV.test_data_dir, 'parquet', table_name),
            schema=known_schemas.get(table_name),
            name=table_name,
            database=ENV.test_data_db,
            persist=True,
        )

    return (executor.submit(create_table, entry) for entry in entries)
Пример #28
0
    def test_query_parquet_file_like_table(self):
        """``parquet_file(..., like_table=...)`` adopts the named table's schema."""
        region_path = pjoin(self.test_data_dir, 'parquet/tpch_region')

        expected_schema = ibis.schema(
            [
                ('r_regionkey', 'int16'),
                ('r_name', 'string'),
                ('r_comment', 'string'),
            ]
        )

        region = self.con.parquet_file(region_path, like_table='tpch_region')
        assert_equal(region.schema(), expected_schema)
Пример #29
0
    def test_query_parquet_infer_schema(self):
        """``parquet_file`` without a schema infers one from the file."""
        region_path = pjoin(self.test_data_dir, "parquet/tpch_region")
        region = self.con.parquet_file(region_path)

        # NOTE: the actual schema should have an int16, but because it is
        # inferred from a parquet file, which has no notion of int16, the
        # inferred schema has an int32 instead.
        expected_schema = ibis.schema(
            [
                ("r_regionkey", "int32"),
                ("r_name", "string"),
                ("r_comment", "string"),
            ]
        )

        assert_equal(region.schema(), expected_schema)
Пример #30
0
    def test_create_partitioned_separate_schema(self):
        """Create a table whose partition columns come from a separate schema."""
        data_schema = ibis.schema([('day', 'int8'), ('value', 'double')])
        part_schema = ibis.schema([('year', 'int32'), ('month', 'int8')])

        table_name = _tmp_name()
        self.con.create_table(
            table_name, schema=data_schema, partition=part_schema
        )
        # Recorded in temp_tables (presumably for cleanup by the fixture).
        self.temp_tables.append(table_name)

        # The partition columns are expected at the end of the table schema.
        expected_table_schema = ibis.schema(
            [
                ('day', 'int8'),
                ('value', 'double'),
                ('year', 'int32'),
                ('month', 'int8'),
            ]
        )
        assert_equal(self.con.get_schema(table_name), expected_table_schema)

        assert_equal(
            self.con.table(table_name).partition_schema(), part_schema
        )
Пример #31
0
    def test_create_table_with_location(self):
        path = '/path/to/table'
        schema = ibis.schema([('foo', 'string'),
                              ('bar', 'int8'),
                              ('baz', 'int16')])
        statement = ddl.CreateTableWithSchema('another_table', schema,
                                              can_exist=False,
                                              format='parquet',
                                              path=path, database='foo')
        result = statement.compile()

        expected = """\
CREATE TABLE foo.`another_table`
(`foo` string,
 `bar` tinyint,
 `baz` smallint)
STORED AS PARQUET
LOCATION '{0}'""".format(path)
        assert result == expected
Пример #32
0
def test_load_data_sqlalchemy(backend, con, temp_table):
    """Load a DataFrame into a freshly created table and read it back."""
    table_schema = ibis.schema(
        [
            ('first_name', 'string'),
            ('last_name', 'string'),
            ('department_name', 'string'),
            ('salary', 'float64'),
        ]
    )
    employees = pd.DataFrame(
        {
            'first_name': ['A', 'B', 'C'],
            'last_name': ['D', 'E', 'F'],
            'department_name': ['AA', 'BB', 'CC'],
            'salary': [100.0, 200.0, 300.0],
        }
    )

    con.create_table(temp_table, schema=table_schema)
    con.load_data(temp_table, employees, if_exists='append')

    round_tripped = con.table(temp_table).execute()
    backend.assert_frame_equal(employees, round_tripped)
Пример #33
0
 def batting(self) -> ir.TableExpr:
     """Return the ``batting`` table, requested with an explicit schema.

     ``lgID`` is a string column; every other listed column is ``float64``.
     """
     float_columns = [
         'G',
         'AB',
         'R',
         'H',
         'X2B',
         'X3B',
         'HR',
         'RBI',
         'SB',
         'CS',
         'BB',
         'SO',
     ]
     pairs = [('lgID', dt.string)]
     pairs.extend((column, dt.float64) for column in float_columns)
     return self.connection.table('batting', schema=ibis.schema(pairs))
Пример #34
0
def test_nullable_input_output(con, backend, temp_table):
    """Check that column nullability survives a create_table round trip.

    The test is marked xfail when ``con`` lacks ``create_table`` or
    ``drop_table``. Otherwise a three-column table is created and the
    nullability of each column type in the read-back schema is asserted.
    """
    # - Impala, PySpark and Spark non-nullable issues #2138 and #2137
    if not hasattr(con, 'create_table') or not hasattr(con, 'drop_table'):
        # Fill the placeholder so the xfail reason names the connection
        # class instead of showing a literal '{}'.
        pytest.xfail(
            '{} backend doesn\'t have create_table or drop_table methods.'
            .format(type(con).__name__)
        )

    sch = ibis.schema([
        ('foo', 'int64'),
        ('bar', ibis.expr.datatypes.int64(nullable=False)),
        ('baz', 'boolean*'),
    ])

    con.create_table(temp_table, schema=sch)

    t = con.table(temp_table)
    types = t.schema().types

    # 'foo' and 'baz' are expected nullable; 'bar' was declared
    # nullable=False.
    assert types[0].nullable
    assert not types[1].nullable
    assert types[2].nullable
Пример #35
0
    def test_add_drop_partition_no_location(self):
        """Add then drop a partition without an explicit location."""
        columns = [('foo', 'string'), ('year', 'int32'), ('month', 'int16')]
        table_name = _tmp_name()
        self.db.create_table(
            table_name,
            schema=ibis.schema(columns),
            partition=['year', 'month'],
        )
        table = self.db.table(table_name)

        partition_spec = {'year': 2007, 'month': 4}

        table.add_partition(partition_spec)
        assert len(table.partitions()) == 2

        table.drop_partition(partition_spec)
        assert len(table.partitions()) == 1

        # Only reached when every assertion above passed.
        table.drop()
Пример #36
0
    def test_query_delimited_file_directory(self):
        """Query a directory of comma-delimited files through a named table.

        The table is dropped afterwards whether or not the query succeeds.
        """
        csv_dir = pjoin(self.test_data_dir, 'csv')
        columns = ibis.schema(
            [('foo', 'string'), ('bar', 'double'), ('baz', 'int8')]
        )
        table_name = 'delimited_table_test1'

        table = self.con.delimited_file(
            csv_dir,
            columns,
            name=table_name,
            database=self.tmp_db,
            delimiter=',',
        )
        try:
            positive_rows = table[table.bar > 0]
            metrics = [
                table.bar.sum().name('sum(bar)'),
                table.baz.sum().name('mean(baz)'),
            ]
            query = positive_rows.group_by('foo').aggregate(metrics)
            # Only successful execution is checked; the result is discarded.
            query.execute()
        finally:
            self.con.drop_table(table_name, database=self.tmp_db)
Пример #37
0
    def test_add_drop_partition(self):
        """Add then drop a partition at an explicit location (currently skipped).

        ``pytest.skip`` is called first with reason 'HIVE-12613'; the
        statements below it describe the intended scenario.
        """
        pytest.skip('HIVE-12613')

        columns = [('foo', 'string'), ('year', 'int32'), ('month', 'int16')]
        table_name = _tmp_name()
        self.db.create_table(
            table_name,
            schema=ibis.schema(columns),
            partition=['year', 'month'],
        )
        table = self.db.table(table_name)

        partition_spec = {'year': 2007, 'month': 4}
        partition_path = '/tmp/tmp-{0}'.format(util.guid())

        table.add_partition(partition_spec, location=partition_path)
        assert len(table.partitions()) == 2

        table.drop_partition(partition_spec)
        assert len(table.partitions()) == 1
Пример #38
0
def test_add_drop_partition_hive_bug(con, temp_table):
    """Add then drop a partition stored at a fresh ``/tmp`` location."""
    columns = [('foo', 'string'), ('year', 'int32'), ('month', 'int16')]
    con.create_table(
        temp_table,
        schema=ibis.schema(columns),
        partition=['year', 'month'],
    )
    table = con.table(temp_table)

    partition_spec = {'year': 2007, 'month': 4}
    partition_path = '/tmp/{}'.format(util.guid())

    table.add_partition(partition_spec, location=partition_path)
    assert len(table.partitions()) == 2

    table.drop_partition(partition_spec)
    assert len(table.partitions()) == 1
Пример #39
0
def test_query_parquet_file_with_schema(con, test_data_dir):
    """Open a parquet directory with an explicit schema and query it."""
    region_path = pjoin(test_data_dir, 'parquet/tpch_region')
    region_schema = ibis.schema(
        [
            ('r_regionkey', 'int16'),
            ('r_name', 'string'),
            ('r_comment', 'string'),
        ]
    )

    region = con.parquet_file(region_path, schema=region_schema)

    # The table must be reachable by name (the lookup raising would fail
    # the test).
    con.table(region.op().name)

    # Only successful execution is checked here.
    region.r_name.value_counts().execute()

    assert region.count().execute() == 5
Пример #40
0
def test_persist_parquet_file_with_name(con, test_data_dir, temp_table_db):
    """A persisted parquet table remains available after garbage collection."""
    import gc

    region_path = pjoin(test_data_dir, 'parquet/tpch_region')
    tmp_db, name = temp_table_db

    region_schema = ibis.schema(
        [
            ('r_regionkey', 'int16'),
            ('r_name', 'string'),
            ('r_comment', 'string'),
        ]
    )
    # The returned table expression is deliberately not kept.
    con.parquet_file(
        region_path,
        schema=region_schema,
        name=name,
        database=tmp_db,
        persist=True,
    )
    gc.collect()

    # The table can still be looked up after the collection.
    con.table(name, database=tmp_db)
Пример #41
0
def test_mutation_fusion_no_overwrite():
    """Test fusion with chained mutation that doesn't overwrite existing
    columns.

    Three chained ``mutate`` calls are expected to end up as a single
    operation holding four selections.
    """
    t = ibis.table(ibis.schema([('col', 'int32')]), 't')
    base_col = t['col']

    fused = (
        t.mutate(col1=base_col + 1)
        .mutate(col2=base_col + 2)
        .mutate(col3=base_col + 3)
    )

    selections = fused.op().selections
    assert len(selections) == 4

    # Selections 1..3 are the added columns, in the order they were added.
    added = [(1, 'col1'), (2, 'col2'), (3, 'col3')]
    for position, (offset, col_name) in enumerate(added, start=1):
        derived = (t['col'] + offset).name(col_name)
        assert fused.op().selections[position].equals(derived)
Пример #42
0
    def create_table(self, name, expr=None, schema=None, database=None):
        """Create table ``name`` from a schema and/or an expression.

        Parameters
        ----------
        name : table name
        expr : optional expression; when given, its rows are inserted into
            the new table
        schema : optional schema; when falsy, ``expr.schema()`` is used
        database : optional database; must be ``None`` or equal to the
            engine URL's database

        Raises
        ------
        NotImplementedError
            If ``database`` names a database other than the engine's.
        ValueError
            If neither ``expr`` nor ``schema`` is given.
        TypeError
            If both are given and their schemas differ.
        """
        if database is not None and database != self.engine.url.database:
            raise NotImplementedError(
                'Creating tables from a different database is not yet '
                'implemented')

        has_expr = expr is not None
        has_schema = schema is not None

        if not (has_expr or has_schema):
            raise ValueError('You must pass either an expression or a schema')

        if (
            has_expr
            and has_schema
            and not expr.schema().equals(ibis.schema(schema))
        ):
            raise TypeError(
                'Expression schema is not equal to passed schema. '
                'Try passing the expression without the schema')

        # Truthiness (not ``is None``) decides which schema is used.
        sa_table = table_from_schema(
            name, self.meta, schema or expr.schema()
        )
        with self.con.begin() as bind:
            sa_table.create(bind=bind)
            if has_expr:
                # Populate the new table from the compiled expression.
                bind.execute(
                    sa_table.insert().from_select(
                        list(expr.columns), expr.compile()
                    )
                )
Пример #43
0
def impala_create_test_database(con, env):
    """Recreate the test database and add an empty ``alltypes`` table to it."""
    test_db = env.test_data_db

    # Start from a clean database: drop (forced), then create.
    con.drop_database(test_db, force=True)
    con.create_database(test_db)

    alltypes_schema = ibis.schema(
        [
            ('a', 'int8'),
            ('b', 'int16'),
            ('c', 'int32'),
            ('d', 'int64'),
            ('e', 'float'),
            ('f', 'double'),
            ('g', 'string'),
            ('h', 'boolean'),
            ('i', 'timestamp'),
        ]
    )
    con.create_table(
        'alltypes', schema=alltypes_schema, database=env.test_data_db
    )
Пример #44
0
def test_create_table_schema(con, temp_table, properties):
    """Create a table from a schema and compare the read-back column types.

    ``properties`` is forwarded to ``con.create_table`` as keyword arguments.
    """
    columns = [
        ('a', 'float'),
        ('b', 'double'),
        ('c', 'int8'),
        ('d', 'int16'),
        ('e', 'int32'),
        ('f', 'int64'),
        ('x', 'point'),
        ('y', 'linestring'),
        ('z', 'polygon'),
        ('w', 'multipolygon'),
    ]
    schema = ibis.schema(columns)

    con.create_table(temp_table, schema=schema, **properties)

    created = con.table(temp_table)

    # Every column reported by the backend must match the requested type.
    for column, actual_type in created.schema().items():
        assert schema[column] == actual_type
Пример #45
0
def get_type(expr):
    """Return a display string describing the type of ``expr``.

    Resolution order:

    1. ``str(expr.type())`` when available.
    2. Otherwise a markup fragment listing every ``name: type`` pair of
       ``expr.schema()``.
    3. When ``expr`` has no usable schema, the class name of
       ``expr.op().output_type()``, or the empty-set character if that
       fails as well.

    If ``expr.schema()`` raises ``com.IbisError``, the operation is asserted
    to be a join and a schema is built from both sides, with column names
    prefixed by their table name.
    """
    try:
        return str(expr.type())
    except (AttributeError, NotImplementedError):
        pass

    try:
        schema = expr.schema()
    except (AttributeError, NotImplementedError):
        try:
            # As a last resort try to get the name of the output_type class
            return expr.op().output_type().__name__
        except (AttributeError, NotImplementedError):
            return '\u2205'  # empty set character
    except com.IbisError:
        join_op = expr.op()
        assert isinstance(join_op, ops.Join)

        # Tables without a name get a generated one.
        left_name = getattr(join_op.left.op(), 'name', None) or ops.genname()
        left_schema = join_op.left.schema()
        right_name = (
            getattr(join_op.right.op(), 'name', None) or ops.genname()
        )
        right_schema = join_op.right.schema()

        qualified = []
        sides = ((left_name, left_schema), (right_name, right_schema))
        for table_name, side_schema in sides:
            for column, dtype in side_schema.items():
                qualified.append(
                    ('{}.{}'.format(table_name, column), dtype)
                )
        schema = ibis.schema(qualified)

    rows = [
        '<BR ALIGN="LEFT" />  <I>{}</I>: {}'.format(
            escape(column), escape(str(dtype))
        )
        for column, dtype in zip(schema.names, schema.types)
    ]
    return ''.join(rows) + '<BR ALIGN="LEFT" />'
Пример #46
0
    def test_load_data_partitioned(self):
        path = '/path/to/data'
        part = {'year': 2007, 'month': 7}
        part_schema = ibis.schema([('year', 'int32'), ('month', 'int32')])
        stmt = ddl.LoadData('functional_alltypes', path,
                            database='foo',
                            partition=part,
                            partition_schema=part_schema)

        result = stmt.compile()
        expected = """\
LOAD DATA INPATH '/path/to/data' INTO TABLE foo.`functional_alltypes`
PARTITION (year=2007, month=7)"""
        assert result == expected

        stmt.overwrite = True
        result = stmt.compile()
        expected = """\
LOAD DATA INPATH '/path/to/data' OVERWRITE INTO TABLE foo.`functional_alltypes`
PARTITION (year=2007, month=7)"""
        assert result == expected
Пример #47
0
def test_convert_parquet(parquet_schema):
    """Check that ``ibis.infer_schema`` maps a parquet schema as expected.

    The expected schema is spelled out as explicit ``(name, type)`` pairs.
    The previous version zipped a 16-element name list against a
    17-element type list, so the trailing ``dt.int64`` was silently
    discarded by ``zip``; the pairs below are exactly the 16 that
    were actually compared.
    """
    # uint32, int8, int16 stored as upcasted types
    expected = ibis.schema(
        [
            ('uint8', dt.uint8),
            ('uint16', dt.uint16),
            ('uint32', dt.int64),
            ('uint64', dt.uint64),
            ('int8', dt.int16),
            ('int16', dt.int16),
            ('int32', dt.int32),
            ('int64', dt.int64),
            ('float32', dt.float32),
            ('float64', dt.float64),
            ('bool', dt.boolean),
            ('datetime', dt.timestamp),
            ('str', dt.string),
            ('str_with_nulls', dt.string),
            ('empty_str', dt.string),
            ('bytes', dt.binary),
        ]
    )

    result = ibis.infer_schema(parquet_schema)
    assert result == expected
Пример #48
0
def create_test_database(con):
    """Recreate the test database and add an empty ``alltypes`` table.

    Any existing database named ``ENV.test_data_db`` is dropped (with
    ``force=True``) before a fresh one is created.
    """
    db_name = ENV.test_data_db

    if con.exists_database(db_name):
        con.drop_database(db_name, force=True)
    con.create_database(db_name)
    logger.info('Created database %s', db_name)

    alltypes_columns = [
        ('a', 'int8'),
        ('b', 'int16'),
        ('c', 'int32'),
        ('d', 'int64'),
        ('e', 'float'),
        ('f', 'double'),
        ('g', 'string'),
        ('h', 'boolean'),
        ('i', 'timestamp'),
    ]
    con.create_table(
        'alltypes', schema=ibis.schema(alltypes_columns), database=db_name
    )
    logger.info('Created empty table %s.`alltypes`', db_name)
Пример #49
0
def test_query_delimited_file_directory(con, test_data_dir, tmp_db):
    """Run a filtered, grouped aggregation over a comma-delimited directory.

    The table is registered from the ``csv`` directory under
    ``test_data_dir``; the test only requires that execution returns
    something other than ``None``.
    """
    columns = [('foo', 'string'), ('bar', 'double'), ('baz', 'int8')]
    table = con.delimited_file(
        pjoin(test_data_dir, 'csv'),
        ibis.schema(columns),
        name='delimited_table_test1',
        database=tmp_db,
        delimiter=',',
    )

    grouped = table[table.bar > 0].group_by('foo')
    metrics = [
        table.bar.sum().name('sum(bar)'),
        # NOTE(review): this column is labelled 'mean(baz)' although it is
        # computed with sum(); the label is left untouched here.
        table.baz.sum().name('mean(baz)'),
    ]
    expr = grouped.aggregate(metrics)

    assert expr.execute() is not None
Пример #50
0
def test_load_data_sqlalchemy(backend, con, temp_table):
    """Load a small DataFrame with ``con.load_data`` and read it back.

    Skipped when the connection's dialect is not an ``AlchemyDialect``.
    """
    if not isinstance(con.dialect(), ibis.sql.alchemy.AlchemyDialect):
        pytest.skip('{} is not a SQL Alchemy Client.'.format(backend.name))

    column_types = [
        ('first_name', 'string'),
        ('last_name', 'string'),
        ('department_name', 'string'),
        ('salary', 'float64'),
    ]
    sch = ibis.schema(column_types)

    employees = pd.DataFrame(
        {
            'first_name': ['A', 'B', 'C'],
            'last_name': ['D', 'E', 'F'],
            'department_name': ['AA', 'BB', 'CC'],
            'salary': [100.0, 200.0, 300.0],
        }
    )

    # Create the empty table first, then append the frame's rows to it.
    con.create_table(temp_table, schema=sch)
    con.load_data(temp_table, employees, if_exists='append')
    loaded = con.table(temp_table).execute()

    backend.assert_frame_equal(employees, loaded)
Пример #51
0
    def test_create_external_ddl(self):
        """Check the DDL compiled by ``ksupport.CreateTableKudu``."""
        schema = ibis.schema(
            [('key1', 'int32'), ('key2', 'int64'), ('value1', 'double')]
        )
        master_addresses = ['master1.d.com:7051', 'master2.d.com:7051']
        key_columns = ['key1', 'key2']

        stmt = ksupport.CreateTableKudu(
            'impala_name', 'kudu_name', master_addresses, schema, key_columns
        )

        expected = '\n'.join(
            [
                'CREATE EXTERNAL TABLE `impala_name`',
                '(`key1` int,',
                ' `key2` bigint,',
                ' `value1` double)',
                'TBLPROPERTIES (',
                "  'kudu.key_columns'='key1, key2',",
                "  'kudu.master_addresses'="
                "'master1.d.com:7051, master2.d.com:7051',",
                "  'kudu.table_name'='kudu_name',",
                "  'storage_handler'="
                "'com.cloudera.kudu.hive.KuduStorageHandler'",
                ')',
            ]
        )
        assert stmt.compile() == expected
Пример #52
0
def test_convert_parquet(parquet_schema):
    """Check ``ibis.infer_schema`` against the expected parquet mapping.

    The three string-like columns are expected as ``dt.binary`` when
    ``PY2`` is true and as ``dt.string`` otherwise.
    """
    # TODO(jreback)
    # not entirely sure this is correct
    # should these be strings in py2?
    text_type = dt.binary if PY2 else dt.string

    # uint32, int8, int16 stored as upcasted types
    pairs = [
        ('uint8', dt.uint8),
        ('uint16', dt.uint16),
        ('uint32', dt.int64),
        ('uint64', dt.uint64),
        ('int8', dt.int16),
        ('int16', dt.int16),
        ('int32', dt.int32),
        ('int64', dt.int64),
        ('float32', dt.float32),
        ('float64', dt.float64),
        ('bool', dt.boolean),
        ('datetime', dt.timestamp),
        ('str', text_type),
        ('str_with_nulls', text_type),
        ('empty_str', text_type),
        ('bytes', dt.binary),
        ('__index_level_0__', dt.int64),
    ]
    expected = ibis.schema(pairs)

    inferred = ibis.infer_schema(parquet_schema)
    assert inferred == expected
Пример #53
0
    def test_create_table_parquet_with_schema(self):
        """Check the DDL compiled by ``ddl.CreateTableParquet`` with a schema."""
        directory = '/path/to/'
        columns = [('foo', 'string'), ('bar', 'int8'), ('baz', 'int16')]

        statement = ddl.CreateTableParquet(
            'new_table',
            directory,
            schema=ibis.schema(columns),
            external=True,
            can_exist=True,
            database='foo',
        )
        result = statement.compile()

        expected = '\n'.join(
            [
                'CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`',
                '(`foo` string,',
                ' `bar` tinyint,',
                ' `baz` smallint)',
                'STORED AS PARQUET',
                "LOCATION '{0}'".format(directory),
            ]
        )
        assert result == expected
Пример #54
0
def test_create_table_delimited():
    """Check the DDL compiled by ``ddl.CreateTableDelimited``.

    The escape character is a single backslash and the line terminator
    is a NUL character; both appear literally in the expected text.
    """
    path = '/path/to/files/'
    columns = [
        ('a', 'string'),
        ('b', 'int32'),
        ('c', 'double'),
        ('d', 'decimal(12, 2)'),
    ]

    stmt = ddl.CreateTableDelimited(
        'new_table',
        path,
        ibis.schema(columns),
        delimiter='|',
        escapechar='\\',
        lineterminator='\0',
        database='foo',
        can_exist=True,
    )
    result = stmt.compile()

    expected = '\n'.join(
        [
            'CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`',
            '(`a` string,',
            ' `b` int,',
            ' `c` double,',
            ' `d` decimal(12, 2))',
            'ROW FORMAT DELIMITED',
            "FIELDS TERMINATED BY '|'",
            "ESCAPED BY '\\'",
            "LINES TERMINATED BY '\0'",
            "LOCATION '{0}'".format(path),
        ]
    )
    assert result == expected
Пример #55
0
def test_sa_default_numeric_precision_and_scale(
    con, backend, dialects, default_precisions, default_scales
):
    """Check NUMERIC columns map to Decimal with backend default precision/scale.

    Four NUMERIC variants (no arguments, precision only, scale only,
    both) are declared on a SQLAlchemy table and converted with
    ``schema_from_table``; the per-backend defaults come from the
    ``default_precisions`` / ``default_scales`` lookups.
    """
    # TODO: find a better way to access ibis.sql.alchemy
    import ibis.sql.alchemy as alch

    backend_key = backend.name
    dialect = dialects[backend_key]
    precision = default_precisions[backend_key]
    scale = default_scales[backend_key]

    # (column name, sqlalchemy type, expected ibis type)
    cases = [
        ('n1', dialect.NUMERIC, dt.Decimal(precision, scale)),
        ('n2', dialect.NUMERIC(5), dt.Decimal(5, scale)),
        ('n3', dialect.NUMERIC(None, 4), dt.Decimal(precision, 4)),
        ('n4', dialect.NUMERIC(10, 2), dt.Decimal(10, 2)),
    ]
    columns = [
        sa.Column(col_name, sa_type, nullable=True)
        for col_name, sa_type, _ in cases
    ]
    expected_fields = [
        (col_name, ibis_type(nullable=True)) for col_name, _, ibis_type in cases
    ]

    # Create a table with the numeric types.
    table_name = 'test_sa_default_param_decimal'
    table = sa.Table(table_name, sa.MetaData(bind=con.con), *columns)

    # Check that we can correctly recover the default precision and scale.
    schema = alch.schema_from_table(table)
    expected = ibis.schema(expected_fields)

    assert_equal(schema, expected)
    con.drop_table(table_name, force=True)
Пример #56
0
def test_add_drop_partition_owned_by_impala(hdfs, con, temp_table):
    """Add and drop a partition whose parent directory is chowned to impala."""
    columns = [('foo', 'string'), ('year', 'int32'), ('month', 'int16')]
    con.create_table(
        temp_table, schema=ibis.schema(columns), partition=['year', 'month']
    )
    table = con.table(temp_table)

    partition_spec = {'year': 2007, 'month': 4}

    # Two fresh guids: one for the parent directory, one for the leaf.
    parent_dir = f'/tmp/{util.guid()}'
    location = f'{parent_dir}/{util.guid()}'

    hdfs.mkdir(parent_dir)
    hdfs.chown(parent_dir, owner='impala', group='supergroup')

    # NOTE(review): the expected counts are one higher than the number of
    # partitions added; presumably partitions() includes an extra summary
    # row - confirm against the backend.
    table.add_partition(partition_spec, location=location)
    assert len(table.partitions()) == 2

    table.drop_partition(partition_spec)
    assert len(table.partitions()) == 1
Пример #57
0
def test_create_table_schema(con):
    """Create a table from a schema and check each column's expression type.

    The table is dropped up front (``force=True``) and again in a
    ``finally`` clause once the column checks have run.
    """
    t_name = 'mytable'
    con.drop_table(t_name, force=True)

    columns = [
        ('a', 'float'),
        ('b', 'double'),
        ('c', 'int32'),
        ('d', 'int64'),
        ('x', 'point'),
        ('y', 'linestring'),
        ('z', 'polygon'),
        ('w', 'multipolygon'),
    ]
    con.create_table(t_name, schema=ibis.schema(columns))

    try:
        t = con.table(t_name)

        # Column name -> expression class it must be an instance of.
        expected_kinds = [
            ('a', ir.FloatingColumn),
            ('b', ir.FloatingColumn),
            ('c', ir.IntegerColumn),
            ('d', ir.IntegerColumn),
            ('x', ir.PointColumn),
            ('y', ir.LineStringColumn),
            ('z', ir.PolygonColumn),
            ('w', ir.MultiPolygonColumn),
        ]
        for column_name, column_kind in expected_kinds:
            assert isinstance(getattr(t, column_name), column_kind)
    finally:
        con.drop_table(t_name)
Пример #58
0
def test_read_csv(con, temp_table, filename):
    """Export ``functional_alltypes`` to CSV and load it back via ``read_csv``.

    The frame read from the freshly loaded table must equal the frame
    read from ``functional_alltypes`` itself.
    """
    columns = [
        ('index', 'int64'),
        ('Unnamed__0', 'int64'),
        ('id', 'int32'),
        ('bool_col', 'bool'),
        ('tinyint_col', 'int16'),
        ('smallint_col', 'int16'),
        ('int_col', 'int32'),
        ('bigint_col', 'int64'),
        ('float_col', 'float32'),
        ('double_col', 'double'),
        ('date_string_col', 'string'),
        ('string_col', 'string'),
        ('timestamp_col', 'timestamp'),
        ('year_', 'int32'),
        ('month_', 'int32'),
    ]
    con.create_table(temp_table, schema=ibis.schema(columns))

    # prepare csv file inside omnisci docker container
    # if the file exists, then it will be overwritten
    export_sql = "COPY (SELECT * FROM functional_alltypes) TO '{}'".format(
        filename
    )
    con._execute(export_sql)

    db = con.database()
    target = db.table(temp_table)
    target.read_csv(filename, header=False, quotechar='"', delimiter=",")

    df_read_csv = target.execute()
    df_expected = db.table("functional_alltypes").execute()

    pd.testing.assert_frame_equal(df_expected, df_read_csv)
Пример #59
0
def test_sqla_schema_conversion(con):
    """Check ``schema_from_table`` maps SQLAlchemy columns to ibis types.

    Nullability declared on each SQLAlchemy column must carry over to
    the corresponding ibis type.
    """
    # (name, sqlalchemy type, nullable, expected ibis type)
    cases = [
        ('smallint', sat.SmallInteger, False, dt.int16),
        ('int', sat.Integer, True, dt.int32),
        ('integer', sat.INTEGER(), True, dt.int32),
        ('bigint', sat.BigInteger, False, dt.int64),
        ('real', sat.REAL, True, dt.float32),
        ('bool', sat.Boolean, True, dt.bool),
        ('timestamp', sat.DateTime, True, dt.timestamp),
    ]

    columns = [
        sa.Column(col_name, sa_type, nullable=is_nullable)
        for col_name, sa_type, is_nullable, _ in cases
    ]
    expected_fields = [
        (col_name, ibis_type(nullable=is_nullable))
        for col_name, _, is_nullable, ibis_type in cases
    ]

    table = sa.Table('tname', con.meta, *columns)

    schema = schema_from_table(table)
    expected = ibis.schema(expected_fields)

    assert_equal(schema, expected)
Пример #60
0
def test_query_schema(backend, con, alltypes, expr_fn, expected):
    """Check the schema reported by a compiled query against ``expected``.

    ``expected`` is iterated as ``(name, dtype)`` pairs. Nullability is
    copied from the schema the query actually reports, so only names
    and base types are compared. Skipped when the connection has no
    ``_build_ast`` method.
    """
    if not hasattr(con, '_build_ast'):
        backend_name = type(backend).__name__
        pytest.skip(
            '{} backend has no _build_ast method'.format(backend_name)
        )

    expr = expr_fn(alltypes)

    # we might need a public API for it
    ast = con._build_ast(expr, backend.make_context())
    query = con.query_class(con, ast)
    reported = query.schema()

    # clickhouse columns has been defined as non-nullable
    # whereas other backends don't support non-nullable columns yet
    fields = []
    for name, dtype in expected:
        fields.append((name, dtype(nullable=reported[name].nullable)))
    expected_schema = ibis.schema(fields)

    assert query.schema().equals(expected_schema)