def test_create_table_with_partition_column(con, temp_table_db):
    schema = ibis.schema(
        [
            ('year', 'int32'),
            ('month', 'string'),
            ('day', 'int8'),
            ('value', 'double'),
        ]
    )

    tmp_db, name = temp_table_db
    con.create_table(
        name, schema=schema, database=tmp_db, partition=['year', 'month']
    )

    # the partition columns get put at the end of the table
    ex_schema = ibis.schema(
        [
            ('day', 'int8'),
            ('value', 'double'),
            ('year', 'int32'),
            ('month', 'string'),
        ]
    )
    table_schema = con.get_schema(name, database=tmp_db)
    assert_equal(table_schema, ex_schema)

    partition_schema = con.database(tmp_db).table(name).partition_schema()
    expected = ibis.schema([('year', 'int32'), ('month', 'string')])
    assert_equal(partition_schema, expected)

def create_parquet_tables(con):
    parquet_files = con.hdfs.ls(pjoin(ENV.test_data_dir, 'parquet'))

    schemas = {
        'functional_alltypes': ibis.schema(
            [('id', 'int32'),
             ('bool_col', 'boolean'),
             ('tinyint_col', 'int8'),
             ('smallint_col', 'int16'),
             ('int_col', 'int32'),
             ('bigint_col', 'int64'),
             ('float_col', 'float'),
             ('double_col', 'double'),
             ('date_string_col', 'string'),
             ('string_col', 'string'),
             ('timestamp_col', 'timestamp'),
             ('year', 'int32'),
             ('month', 'int32')]),
        'tpch_region': ibis.schema(
            [('r_regionkey', 'int16'),
             ('r_name', 'string'),
             ('r_comment', 'string')])}

    tables = []
    for path in parquet_files:
        head, table_name = osp.split(path)
        print('Creating {0}'.format(table_name))

        # if no schema is given, let the backend infer it from the file
        schema = schemas.get(table_name)
        table = con.parquet_file(path, schema=schema, name=table_name,
                                 database=ENV.test_data_db, persist=True)
        tables.append(table)

    return tables

def test_create_table_with_partition_column(self):
    schema = ibis.schema([('year', 'int32'),
                          ('month', 'int8'),
                          ('day', 'int8'),
                          ('value', 'double')])

    name = _tmp_name()
    self.con.create_table(name, schema=schema,
                          database=self.tmp_db,
                          partition=['year', 'month'],
                          location=self._temp_location())
    self.temp_tables.append(name)

    # the partition columns get put at the end of the table
    ex_schema = ibis.schema([('day', 'int8'),
                             ('value', 'double'),
                             ('year', 'int32'),
                             ('month', 'int8')])
    table_schema = self.con.get_schema(name, database=self.tmp_db)
    assert_equal(table_schema, ex_schema)

    partition_schema = self.db.table(name).partition_schema()
    expected = ibis.schema([('year', 'int32'), ('month', 'int8')])
    assert_equal(partition_schema, expected)

def create_parquet_tables(con): parquet_files = con.hdfs.ls(pjoin(ENV.test_data_dir, "parquet")) schemas = { "functional_alltypes": ibis.schema( [ ("id", "int32"), ("bool_col", "boolean"), ("tinyint_col", "int8"), ("smallint_col", "int16"), ("int_col", "int32"), ("bigint_col", "int64"), ("float_col", "float"), ("double_col", "double"), ("date_string_col", "string"), ("string_col", "string"), ("timestamp_col", "timestamp"), ("year", "int32"), ("month", "int32"), ] ), "tpch_region": ibis.schema([("r_regionkey", "int16"), ("r_name", "string"), ("r_comment", "string")]), } tables = [] for path in parquet_files: head, table_name = posixpath.split(path) print("Creating {0}".format(table_name)) # if no schema infer! schema = schemas.get(table_name) t = con.parquet_file(path, schema=schema, name=table_name, database=ENV.test_data_db, persist=True) tables.append(t) return tables
def test_schema_subset():
    s1 = ibis.schema([('a', dt.int64), ('b', dt.int32), ('c', dt.string)])
    s2 = ibis.schema([('a', dt.int64), ('c', dt.string)])

    assert s1 > s2
    assert s2 < s1
    assert s1 >= s2
    assert s2 <= s1

def test_create_table_with_partition_column(self): schema = ibis.schema([("year", "int32"), ("month", "int8"), ("day", "int8"), ("value", "double")]) name = util.guid() self.con.create_table(name, schema=schema, partition=["year", "month"]) self.temp_tables.append(name) # the partition column get put at the end of the table ex_schema = ibis.schema([("day", "int8"), ("value", "double"), ("year", "int32"), ("month", "int8")]) table_schema = self.con.get_schema(name) assert_equal(table_schema, ex_schema) partition_schema = self.con.get_partition_schema(name) expected = ibis.schema([("year", "int32"), ("month", "int8")]) assert_equal(partition_schema, expected)
def pandas_to_ibis_schema(frame):
    # no analog for decimal in pandas
    pairs = []
    for col_name in frame:
        ibis_type = pandas_col_to_ibis_type(frame[col_name])
        pairs.append((col_name, ibis_type))
    return ibis.schema(pairs)

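# A minimal usage sketch for pandas_to_ibis_schema above (illustrative, not
# part of the suite): it assumes pandas_col_to_ibis_type from the same module
# is in scope and that the default pandas dtypes map to int64 and string.
#
#   df = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})
#   assert pandas_to_ibis_schema(df) == ibis.schema(
#       [('a', 'int64'), ('b', 'string')]
#   )
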
def test_create_table_schema(con):
    t_name = 'mytable'

    con.drop_table(t_name, force=True)

    schema = ibis.schema(
        [
            ('a', 'float'),
            ('b', 'double'),
            ('c', 'int32'),
            ('d', 'int64'),
            ('x', 'point'),
            ('y', 'linestring'),
            ('z', 'polygon'),
            ('w', 'multipolygon'),
        ]
    )

    con.create_table(t_name, schema=schema)

    try:
        t = con.table(t_name)

        assert isinstance(t.a, ir.FloatingColumn)
        assert isinstance(t.b, ir.FloatingColumn)
        assert isinstance(t.c, ir.IntegerColumn)
        assert isinstance(t.d, ir.IntegerColumn)
        assert isinstance(t.x, ir.PointColumn)
        assert isinstance(t.y, ir.LineStringColumn)
        assert isinstance(t.z, ir.PolygonColumn)
        assert isinstance(t.w, ir.MultiPolygonColumn)
    finally:
        con.drop_table(t_name)

def test_create_table_parquet_with_schema():
    directory = '/path/to/'

    schema = ibis.schema(
        [('foo', 'string'), ('bar', 'int8'), ('baz', 'int16')]
    )

    statement = ddl.CreateTableParquet(
        'new_table',
        directory,
        schema=schema,
        external=True,
        can_exist=True,
        database='foo',
    )

    result = statement.compile()
    expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
(`foo` string,
 `bar` tinyint,
 `baz` smallint)
STORED AS PARQUET
LOCATION '{0}'""".format(
        directory
    )
    assert result == expected

def test_create_external_ddl(self):
    schema = ibis.schema(
        [('key1', 'int32'), ('key2', 'int64'), ('value1', 'double')]
    )

    stmt = ksupport.CreateTableKudu(
        'impala_name',
        'kudu_name',
        ['master1.d.com:7051', 'master2.d.com:7051'],
        schema,
        ['key1', 'key2'],
    )

    result = stmt.compile()
    expected = """\
CREATE EXTERNAL TABLE `impala_name`
(`key1` int,
 `key2` bigint,
 `value1` double)
TBLPROPERTIES (
  'kudu.key_columns'='key1, key2',
  'kudu.master_addresses'='master1.d.com:7051, master2.d.com:7051',
  'kudu.table_name'='kudu_name',
  'storage_handler'='com.cloudera.kudu.hive.KuduStorageHandler'
)"""
    assert result == expected

def test_add_partition_string_key():
    part_schema = ibis.schema([('foo', 'int32'), ('bar', 'string')])

    stmt = ddl.AddPartition('tbl', {'foo': 5, 'bar': 'qux'}, part_schema)
    result = stmt.compile()

    expected = 'ALTER TABLE tbl ADD PARTITION (foo=5, bar="qux")'
    assert result == expected

def test_create_table_with_location_compile():
    path = '/path/to/table'
    schema = ibis.schema(
        [('foo', 'string'), ('bar', 'int8'), ('baz', 'int16')]
    )
    statement = ddl.CreateTableWithSchema(
        'another_table',
        schema,
        can_exist=False,
        format='parquet',
        path=path,
        database='foo',
    )
    result = statement.compile()

    expected = """\
CREATE TABLE foo.`another_table`
(`foo` string,
 `bar` tinyint,
 `baz` smallint)
STORED AS PARQUET
LOCATION '{0}'""".format(
        path
    )
    assert result == expected

def create_table(self, name, expr=None, schema=None, database=None):
    if database is not None and database != self.engine.url.database:
        raise NotImplementedError(
            'Creating tables from a different database is not yet '
            'implemented'
        )

    if expr is None and schema is None:
        raise ValueError('You must pass either an expression or a schema')

    if expr is not None and schema is not None:
        if not expr.schema().equals(ibis.schema(schema)):
            raise TypeError(
                'Expression schema is not equal to passed schema. '
                'Try passing the expression without the schema'
            )
    if schema is None:
        schema = expr.schema()

    self._schemas[self._fully_qualified_name(name, database)] = schema
    t = table_from_schema(name, self.meta, schema)

    with self.begin() as bind:
        t.create(bind=bind)
        if expr is not None:
            bind.execute(
                t.insert().from_select(list(expr.columns), expr.compile())
            )

def test_kudu_schema_convert(self):
    spec = [
        # name, ibis type, kudu type, is_nullable, is_primary_key
        ('a', dt.Int8(False), 'int8', False, True),
        ('b', dt.Int16(False), 'int16', False, True),
        ('c', dt.Int32(False), 'int32', False, False),
        ('d', dt.Int64(True), 'int64', True, False),
        ('e', dt.String(True), 'string', True, False),
        ('f', dt.Boolean(False), 'bool', False, False),
        ('g', dt.Float(False), 'float', False, False),
        ('h', dt.Double(True), 'double', True, False),
        # TODO
        # ('i', 'binary', False, False),
        ('j', dt.Timestamp(True), 'timestamp', True, False),
    ]

    builder = kudu.schema_builder()
    primary_keys = []
    ibis_types = []
    for name, itype, type_, is_nullable, is_primary_key in spec:
        builder.add_column(name, type_, nullable=is_nullable)

        if is_primary_key:
            primary_keys.append(name)

        ibis_types.append((name, itype))

    builder.set_primary_keys(primary_keys)
    kschema = builder.build()

    ischema = ksupport.schema_kudu_to_ibis(kschema)
    expected = ibis.schema(ibis_types)

    assert_equal(ischema, expected)

def test_add_drop_partition_owned_by_impala(hdfs, con, temp_table):
    schema = ibis.schema(
        [('foo', 'string'), ('year', 'int32'), ('month', 'int16')]
    )
    name = temp_table
    con.create_table(name, schema=schema, partition=['year', 'month'])

    table = con.table(name)

    part = {'year': 2007, 'month': 4}

    subdir = util.guid()
    basename = util.guid()
    path = '/tmp/{}/{}'.format(subdir, basename)

    hdfs.mkdir('/tmp/{}'.format(subdir))
    hdfs.chown('/tmp/{}'.format(subdir), owner='impala', group='supergroup')

    table.add_partition(part, location=path)

    assert len(table.partitions()) == 2

    table.drop_partition(part)

    assert len(table.partitions()) == 1

def test_is_partitioned(con, temp_table):
    schema = ibis.schema(
        [('foo', 'string'), ('year', 'int32'), ('month', 'string')]
    )
    name = temp_table
    con.create_table(name, schema=schema, partition=['year', 'month'])
    assert con.table(name).is_partitioned

def test_apply_to_schema_with_timezone():
    data = {'time': pd.date_range('2018-01-01', '2018-01-02', freq='H')}
    df = pd.DataFrame(data)
    expected = df.assign(time=df.time.astype('datetime64[ns, EST]'))
    desired_schema = ibis.schema([('time', 'timestamp("EST")')])
    result = desired_schema.apply_to(df.copy())
    tm.assert_frame_equal(expected, result)

def test_create_table_delimited(self):
    path = '/path/to/files/'
    schema = ibis.schema([('a', 'string'),
                          ('b', 'int32'),
                          ('c', 'double'),
                          ('d', 'decimal(12,2)')])

    stmt = ddl.CreateTableDelimited('new_table', path, schema,
                                    delimiter='|',
                                    escapechar='\\',
                                    lineterminator='\0',
                                    database='foo',
                                    can_exist=True)

    result = stmt.compile()
    expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
(`a` string,
 `b` int,
 `c` double,
 `d` decimal(12,2))
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
ESCAPED BY '\\'
LINES TERMINATED BY '\0'
LOCATION '{0}'""".format(path)
    assert result == expected

def test_create_table_delimited(self): path = "/path/to/files/" schema = ibis.schema([("a", "string"), ("b", "int32"), ("c", "double"), ("d", "decimal(12,2)")]) stmt = ddl.CreateTableDelimited( "new_table", path, schema, delimiter="|", escapechar="\\", lineterminator="\0", database="foo", can_exist=True, ) result = stmt.compile() expected = """\ CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table` (`a` string, `b` int, `c` double, `d` decimal(12,2)) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' ESCAPED BY '\\' LINES TERMINATED BY '\0' LOCATION '{0}'""".format( path ) assert result == expected
def test_filter_with_analytic():
    x = ibis.table(ibis.schema([('col', 'int32')]), 'x')
    with_filter_col = x[x.columns + [ibis.null().name('filter')]]
    filtered = with_filter_col[with_filter_col['filter'].isnull()]
    subquery = filtered[filtered.columns]

    with_analytic = subquery[['col', subquery.count().name('analytic')]]
    expr = with_analytic[with_analytic.columns]

    result = ibis.impala.compile(expr)
    expected = """\
SELECT `col`, `analytic`
FROM (
  SELECT `col`, count(*) OVER () AS `analytic`
  FROM (
    SELECT `col`, `filter`
    FROM (
      SELECT *
      FROM (
        SELECT `col`, NULL AS `filter`
        FROM x
      ) t3
      WHERE `filter` IS NULL
    ) t2
  ) t1
) t0"""

    assert result == expected

def test_sqla_schema_conversion(self):
    typespec = [
        # name, sqlalchemy type, nullable, ibis type
        ("smallint", sat.SmallInteger, False, dt.int16),
        ("int", sat.Integer, True, dt.int32),
        ("integer", sat.INTEGER(), True, dt.int64),
        ("bigint", sat.BigInteger, False, dt.int64),
        ("real", sat.REAL, True, dt.double),
        ("bool", sat.Boolean, True, dt.boolean),
        ("timestamp", sat.DateTime, True, dt.timestamp),
    ]

    sqla_types = []
    ibis_types = []
    for name, t, nullable, ibis_type in typespec:
        sqla_type = sa.Column(name, t, nullable=nullable)
        sqla_types.append(sqla_type)
        ibis_types.append((name, ibis_type(nullable)))

    table = sa.Table("tname", self.meta, *sqla_types)

    schema = alch.schema_from_table(table)
    expected = ibis.schema(ibis_types)

    assert_equal(schema, expected)

def test_timestamp_with_timezone():
    df = pd.DataFrame(
        {'A': pd.date_range('20130101', periods=3, tz='US/Eastern')}
    )
    schema = sch.infer(df)
    expected = ibis.schema([('A', "timestamp('US/Eastern')")])
    assert schema.equals(expected)
    assert schema.types[0].equals(dt.Timestamp('US/Eastern'))

def test_dtype_datetime64(self):
    df = pd.DataFrame({
        'col': [pd.Timestamp('2010-11-01 00:01:00'),
                pd.Timestamp('2010-11-01 00:02:00.1000'),
                pd.Timestamp('2010-11-01 00:03:00.300000')]})
    inferred = pandas_to_ibis_schema(df)
    expected = ibis.schema([('col', 'timestamp')])
    assert inferred == expected

def test_dtype_timedelta64(self):
    df = pd.DataFrame({
        'col': [pd.Timedelta('1 days'),
                pd.Timedelta('-1 days 2 min 3us'),
                pd.Timedelta('-2 days +23:57:59.999997')]})
    inferred = pandas_to_ibis_schema(df)
    expected = ibis.schema([('col', 'int64')])
    assert inferred == expected

def test_query_parquet_file_like_table(self): hdfs_path = pjoin(self.test_data_dir, "parquet/tpch_region") ex_schema = ibis.schema([("r_regionkey", "int16"), ("r_name", "string"), ("r_comment", "string")]) table = self.con.parquet_file(hdfs_path, like_table="tpch_region") assert_equal(table.schema(), ex_schema)
def test_is_partitioned(self):
    schema = ibis.schema([('foo', 'string'),
                          ('year', 'int32'),
                          ('month', 'int16')])
    name = _tmp_name()
    self.db.create_table(name, schema=schema, partition=['year', 'month'])
    assert self.db.table(name).is_partitioned

def create_parquet_tables(con, executor):
    def create_table(table_name):
        logger.info('Creating %s', table_name)
        schema = schemas.get(table_name)
        path = os.path.join(ENV.test_data_dir, 'parquet', table_name)
        table = con.parquet_file(
            path,
            schema=schema,
            name=table_name,
            database=ENV.test_data_db,
            persist=True,
        )
        return table

    parquet_files = con.hdfs.ls(os.path.join(ENV.test_data_dir, 'parquet'))
    schemas = {
        'functional_alltypes': ibis.schema(
            [
                ('id', 'int32'),
                ('bool_col', 'boolean'),
                ('tinyint_col', 'int8'),
                ('smallint_col', 'int16'),
                ('int_col', 'int32'),
                ('bigint_col', 'int64'),
                ('float_col', 'float'),
                ('double_col', 'double'),
                ('date_string_col', 'string'),
                ('string_col', 'string'),
                ('timestamp_col', 'timestamp'),
                ('year', 'int32'),
                ('month', 'int32'),
            ]
        ),
        'tpch_region': ibis.schema(
            [
                ('r_regionkey', 'int16'),
                ('r_name', 'string'),
                ('r_comment', 'string'),
            ]
        ),
    }
    return (
        executor.submit(create_table, table_name)
        for table_name in parquet_files
    )

def test_query_parquet_file_like_table(self):
    hdfs_path = pjoin(self.test_data_dir, 'parquet/tpch_region')

    ex_schema = ibis.schema([('r_regionkey', 'int16'),
                             ('r_name', 'string'),
                             ('r_comment', 'string')])

    table = self.con.parquet_file(hdfs_path, like_table='tpch_region')

    assert_equal(table.schema(), ex_schema)

def test_query_parquet_infer_schema(self):
    hdfs_path = pjoin(self.test_data_dir, "parquet/tpch_region")
    table = self.con.parquet_file(hdfs_path)

    # NOTE: the actual schema should have an int16, but because this is
    # being inferred from a parquet file, which has no notion of int16,
    # the inferred schema will have an int32 instead.
    ex_schema = ibis.schema(
        [("r_regionkey", "int32"), ("r_name", "string"), ("r_comment", "string")]
    )

    assert_equal(table.schema(), ex_schema)

def test_create_partitioned_separate_schema(self):
    schema = ibis.schema([('day', 'int8'), ('value', 'double')])
    part_schema = ibis.schema([('year', 'int32'), ('month', 'int8')])

    name = _tmp_name()
    self.con.create_table(name, schema=schema, partition=part_schema)
    self.temp_tables.append(name)

    # the partition columns get put at the end of the table
    ex_schema = ibis.schema([('day', 'int8'),
                             ('value', 'double'),
                             ('year', 'int32'),
                             ('month', 'int8')])
    table_schema = self.con.get_schema(name)
    assert_equal(table_schema, ex_schema)

    partition_schema = self.con.table(name).partition_schema()
    assert_equal(partition_schema, part_schema)

def test_create_table_with_location(self):
    path = '/path/to/table'
    schema = ibis.schema([('foo', 'string'),
                          ('bar', 'int8'),
                          ('baz', 'int16')])
    statement = ddl.CreateTableWithSchema('another_table', schema,
                                          can_exist=False,
                                          format='parquet',
                                          path=path,
                                          database='foo')
    result = statement.compile()

    expected = """\
CREATE TABLE foo.`another_table`
(`foo` string,
 `bar` tinyint,
 `baz` smallint)
STORED AS PARQUET
LOCATION '{0}'""".format(path)
    assert result == expected

def test_load_data_sqlalchemy(backend, con, temp_table):
    sch = ibis.schema([
        ('first_name', 'string'),
        ('last_name', 'string'),
        ('department_name', 'string'),
        ('salary', 'float64'),
    ])

    df = pd.DataFrame({
        'first_name': ['A', 'B', 'C'],
        'last_name': ['D', 'E', 'F'],
        'department_name': ['AA', 'BB', 'CC'],
        'salary': [100.0, 200.0, 300.0],
    })
    con.create_table(temp_table, schema=sch)
    con.load_data(temp_table, df, if_exists='append')
    result = con.table(temp_table).execute()

    backend.assert_frame_equal(df, result)

def batting(self) -> ir.TableExpr: schema = ibis.schema( [ ('lgID', dt.string), ('G', dt.float64), ('AB', dt.float64), ('R', dt.float64), ('H', dt.float64), ('X2B', dt.float64), ('X3B', dt.float64), ('HR', dt.float64), ('RBI', dt.float64), ('SB', dt.float64), ('CS', dt.float64), ('BB', dt.float64), ('SO', dt.float64), ] ) return self.connection.table('batting', schema=schema)
def test_nullable_input_output(con, backend, temp_table):
    # - Impala, PySpark and Spark non-nullable issues #2138 and #2137
    if not hasattr(con, 'create_table') or not hasattr(con, 'drop_table'):
        pytest.xfail(
            '{} backend doesn\'t have create_table or drop_table '
            'methods.'.format(backend.name)
        )
    sch = ibis.schema([
        ('foo', 'int64'),
        ('bar', ibis.expr.datatypes.int64(nullable=False)),
        ('baz', 'boolean*'),
    ])

    con.create_table(temp_table, schema=sch)

    t = con.table(temp_table)

    assert t.schema().types[0].nullable
    assert not t.schema().types[1].nullable
    assert t.schema().types[2].nullable

def test_add_drop_partition_no_location(self):
    schema = ibis.schema([('foo', 'string'),
                          ('year', 'int32'),
                          ('month', 'int16')])
    name = _tmp_name()
    self.db.create_table(name, schema=schema, partition=['year', 'month'])

    table = self.db.table(name)

    part = {'year': 2007, 'month': 4}

    table.add_partition(part)

    assert len(table.partitions()) == 2

    table.drop_partition(part)

    assert len(table.partitions()) == 1

    table.drop()

def test_query_delimited_file_directory(self):
    hdfs_path = pjoin(self.test_data_dir, 'csv')

    schema = ibis.schema([('foo', 'string'),
                          ('bar', 'double'),
                          ('baz', 'int8')])
    name = 'delimited_table_test1'
    table = self.con.delimited_file(hdfs_path, schema, name=name,
                                    database=self.tmp_db,
                                    delimiter=',')
    try:
        expr = (table[table.bar > 0]
                .group_by('foo')
                .aggregate([table.bar.sum().name('sum(bar)'),
                            table.baz.sum().name('mean(baz)')]))
        expr.execute()
    finally:
        self.con.drop_table(name, database=self.tmp_db)

def test_add_drop_partition(self):
    pytest.skip('HIVE-12613')

    schema = ibis.schema([('foo', 'string'),
                          ('year', 'int32'),
                          ('month', 'int16')])
    name = _tmp_name()
    self.db.create_table(name, schema=schema, partition=['year', 'month'])

    table = self.db.table(name)

    part = {'year': 2007, 'month': 4}

    path = '/tmp/tmp-{0}'.format(util.guid())
    table.add_partition(part, location=path)

    assert len(table.partitions()) == 2

    table.drop_partition(part)

    assert len(table.partitions()) == 1

def test_add_drop_partition_hive_bug(con, temp_table):
    schema = ibis.schema([('foo', 'string'),
                          ('year', 'int32'),
                          ('month', 'int16')])
    name = temp_table
    con.create_table(name, schema=schema, partition=['year', 'month'])

    table = con.table(name)

    part = {'year': 2007, 'month': 4}

    path = '/tmp/{}'.format(util.guid())
    table.add_partition(part, location=path)

    assert len(table.partitions()) == 2

    table.drop_partition(part)

    assert len(table.partitions()) == 1

def test_query_parquet_file_with_schema(con, test_data_dir):
    hdfs_path = pjoin(test_data_dir, 'parquet/tpch_region')
    schema = ibis.schema([
        ('r_regionkey', 'int16'),
        ('r_name', 'string'),
        ('r_comment', 'string'),
    ])

    table = con.parquet_file(hdfs_path, schema=schema)

    name = table.op().name

    # table exists
    con.table(name)

    expr = table.r_name.value_counts()
    expr.execute()

    assert table.count().execute() == 5

def test_persist_parquet_file_with_name(con, test_data_dir, temp_table_db):
    import gc

    hdfs_path = pjoin(test_data_dir, 'parquet/tpch_region')

    tmp_db, name = temp_table_db
    schema = ibis.schema([
        ('r_regionkey', 'int16'),
        ('r_name', 'string'),
        ('r_comment', 'string'),
    ])
    con.parquet_file(hdfs_path, schema=schema, name=name,
                     database=tmp_db, persist=True)
    gc.collect()

    # table still exists
    con.table(name, database=tmp_db)

def test_mutation_fusion_no_overwrite():
    """Test fusion with chained mutation that doesn't overwrite existing
    columns.
    """
    t = ibis.table(ibis.schema([('col', 'int32')]), 't')

    result = t
    result = result.mutate(col1=t['col'] + 1)
    result = result.mutate(col2=t['col'] + 2)
    result = result.mutate(col3=t['col'] + 3)

    first_selection = result

    assert len(result.op().selections) == 4
    assert first_selection.op().selections[1].equals(
        (t['col'] + 1).name('col1')
    )
    assert first_selection.op().selections[2].equals(
        (t['col'] + 2).name('col2')
    )
    assert first_selection.op().selections[3].equals(
        (t['col'] + 3).name('col3')
    )

def create_table(self, name, expr=None, schema=None, database=None):
    if database is not None and database != self.engine.url.database:
        raise NotImplementedError(
            'Creating tables from a different database is not yet '
            'implemented')

    if expr is None and schema is None:
        raise ValueError('You must pass either an expression or a schema')

    if expr is not None and schema is not None:
        if not expr.schema().equals(ibis.schema(schema)):
            raise TypeError(
                'Expression schema is not equal to passed schema. '
                'Try passing the expression without the schema')

    t = table_from_schema(name, self.meta, schema or expr.schema())
    with self.con.begin() as bind:
        t.create(bind=bind)
        if expr is not None:
            bind.execute(t.insert().from_select(list(expr.columns),
                                                expr.compile()))

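# Hedged usage sketch for the create_table method above (table names are
# illustrative): create an empty table from an explicit schema, or a
# populated one from an expression, whose own schema is used when no
# schema argument is passed.
#
#   schema = ibis.schema([('key', 'string'), ('value', 'double')])
#   con.create_table('t1', schema=schema)         # empty table
#   con.create_table('t2', expr=con.table('t1'))  # insert-from-select
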
def impala_create_test_database(con, env):
    con.drop_database(env.test_data_db, force=True)
    con.create_database(env.test_data_db)
    con.create_table(
        'alltypes',
        schema=ibis.schema(
            [
                ('a', 'int8'),
                ('b', 'int16'),
                ('c', 'int32'),
                ('d', 'int64'),
                ('e', 'float'),
                ('f', 'double'),
                ('g', 'string'),
                ('h', 'boolean'),
                ('i', 'timestamp'),
            ]
        ),
        database=env.test_data_db,
    )

def test_create_table_schema(con, temp_table, properties):
    schema = ibis.schema([
        ('a', 'float'),
        ('b', 'double'),
        ('c', 'int8'),
        ('d', 'int16'),
        ('e', 'int32'),
        ('f', 'int64'),
        ('x', 'point'),
        ('y', 'linestring'),
        ('z', 'polygon'),
        ('w', 'multipolygon'),
    ])

    con.create_table(temp_table, schema=schema, **properties)

    t = con.table(temp_table)

    for k, i_type in t.schema().items():
        assert schema[k] == i_type

def get_type(expr):
    try:
        return str(expr.type())
    except (AttributeError, NotImplementedError):
        pass

    try:
        schema = expr.schema()
    except (AttributeError, NotImplementedError):
        try:
            # As a last resort try to get the name of the output_type class
            return expr.op().output_type().__name__
        except (AttributeError, NotImplementedError):
            return '\u2205'  # empty set character
    except com.IbisError:
        op = expr.op()
        assert isinstance(op, ops.Join)
        left_table_name = getattr(op.left.op(), 'name', None) or ops.genname()
        left_schema = op.left.schema()
        right_table_name = (
            getattr(op.right.op(), 'name', None) or ops.genname()
        )
        right_schema = op.right.schema()
        pairs = [
            ('{}.{}'.format(left_table_name, left_column), type)
            for left_column, type in left_schema.items()
        ] + [
            ('{}.{}'.format(right_table_name, right_column), type)
            for right_column, type in right_schema.items()
        ]
        schema = ibis.schema(pairs)

    return (
        ''.join(
            '<BR ALIGN="LEFT" />  <I>{}</I>: {}'.format(
                escape(name), escape(str(type))
            )
            for name, type in zip(schema.names, schema.types)
        )
        + '<BR ALIGN="LEFT" />'
    )

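# Hedged usage sketch for get_type above (illustrative): a column expression
# yields its type name, while a table expression falls through to the
# HTML-ish schema label used for graphviz rendering.
#
#   t = ibis.table(ibis.schema([('a', 'int64')]), 't')
#   assert get_type(t.a) == 'int64'
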
def test_load_data_partitioned(self):
    path = '/path/to/data'
    part = {'year': 2007, 'month': 7}
    part_schema = ibis.schema([('year', 'int32'), ('month', 'int32')])
    stmt = ddl.LoadData('functional_alltypes', path,
                        database='foo',
                        partition=part,
                        partition_schema=part_schema)

    result = stmt.compile()
    expected = """\
LOAD DATA INPATH '/path/to/data' INTO TABLE foo.`functional_alltypes`
PARTITION (year=2007, month=7)"""
    assert result == expected

    stmt.overwrite = True
    result = stmt.compile()
    expected = """\
LOAD DATA INPATH '/path/to/data' OVERWRITE INTO TABLE foo.`functional_alltypes`
PARTITION (year=2007, month=7)"""
    assert result == expected

def test_convert_parquet(parquet_schema):
    strings = [dt.string, dt.string, dt.string]

    # uint32, int8, int16 stored as upcasted types
    types = (
        [
            dt.uint8,
            dt.uint16,
            dt.int64,
            dt.uint64,
            dt.int16,
            dt.int16,
            dt.int32,
            dt.int64,
            dt.float32,
            dt.float64,
            dt.boolean,
            dt.timestamp,
        ]
        + strings
        + [dt.binary, dt.int64]
    )

    names = [
        'uint8', 'uint16', 'uint32', 'uint64', 'int8', 'int16',
        'int32', 'int64', 'float32', 'float64', 'bool', 'datetime',
        'str', 'str_with_nulls', 'empty_str', 'bytes',
    ]
    expected = ibis.schema(zip(names, types))

    result = ibis.infer_schema(parquet_schema)
    assert result == expected

def create_test_database(con):
    if con.exists_database(ENV.test_data_db):
        con.drop_database(ENV.test_data_db, force=True)
    con.create_database(ENV.test_data_db)
    logger.info('Created database %s', ENV.test_data_db)

    con.create_table(
        'alltypes',
        schema=ibis.schema([
            ('a', 'int8'),
            ('b', 'int16'),
            ('c', 'int32'),
            ('d', 'int64'),
            ('e', 'float'),
            ('f', 'double'),
            ('g', 'string'),
            ('h', 'boolean'),
            ('i', 'timestamp'),
        ]),
        database=ENV.test_data_db,
    )
    logger.info('Created empty table %s.`alltypes`', ENV.test_data_db)

def test_query_delimited_file_directory(con, test_data_dir, tmp_db):
    hdfs_path = pjoin(test_data_dir, 'csv')

    schema = ibis.schema(
        [('foo', 'string'), ('bar', 'double'), ('baz', 'int8')]
    )
    name = 'delimited_table_test1'
    table = con.delimited_file(
        hdfs_path, schema, name=name, database=tmp_db, delimiter=','
    )

    expr = (
        table[table.bar > 0]
        .group_by('foo')
        .aggregate(
            [
                table.bar.sum().name('sum(bar)'),
                table.baz.sum().name('mean(baz)'),
            ]
        )
    )
    assert expr.execute() is not None

def test_load_data_sqlalchemy(backend, con, temp_table):
    if not isinstance(con.dialect(), ibis.sql.alchemy.AlchemyDialect):
        pytest.skip('{} is not a SQL Alchemy Client.'.format(backend.name))

    sch = ibis.schema([
        ('first_name', 'string'),
        ('last_name', 'string'),
        ('department_name', 'string'),
        ('salary', 'float64'),
    ])

    df = pd.DataFrame({
        'first_name': ['A', 'B', 'C'],
        'last_name': ['D', 'E', 'F'],
        'department_name': ['AA', 'BB', 'CC'],
        'salary': [100.0, 200.0, 300.0],
    })
    con.create_table(temp_table, schema=sch)
    con.load_data(temp_table, df, if_exists='append')
    result = con.table(temp_table).execute()

    backend.assert_frame_equal(df, result)

def test_create_external_ddl(self):
    schema = ibis.schema([('key1', 'int32'),
                          ('key2', 'int64'),
                          ('value1', 'double')])

    stmt = ksupport.CreateTableKudu(
        'impala_name', 'kudu_name',
        ['master1.d.com:7051', 'master2.d.com:7051'],
        schema, ['key1', 'key2'])

    result = stmt.compile()
    expected = """\
CREATE EXTERNAL TABLE `impala_name`
(`key1` int,
 `key2` bigint,
 `value1` double)
TBLPROPERTIES (
  'kudu.key_columns'='key1, key2',
  'kudu.master_addresses'='master1.d.com:7051, master2.d.com:7051',
  'kudu.table_name'='kudu_name',
  'storage_handler'='com.cloudera.kudu.hive.KuduStorageHandler'
)"""
    assert result == expected

def test_convert_parquet(parquet_schema):
    # TODO(jreback)
    # not entirely sure this is correct
    # should these be strings in py2?
    if PY2:
        strings = [dt.binary, dt.binary, dt.binary]
    else:
        strings = [dt.string, dt.string, dt.string]

    # uint32, int8, int16 stored as upcasted types
    types = [
        dt.uint8, dt.uint16, dt.int64, dt.uint64,
        dt.int16, dt.int16, dt.int32, dt.int64,
        dt.float32, dt.float64, dt.boolean, dt.timestamp
    ] + strings + [dt.binary, dt.int64]

    names = [
        'uint8', 'uint16', 'uint32', 'uint64', 'int8', 'int16',
        'int32', 'int64', 'float32', 'float64', 'bool', 'datetime',
        'str', 'str_with_nulls', 'empty_str', 'bytes',
        '__index_level_0__'
    ]
    expected = ibis.schema(zip(names, types))

    result = ibis.infer_schema(parquet_schema)
    assert result == expected

def test_create_table_parquet_with_schema(self):
    directory = '/path/to/'

    schema = ibis.schema([('foo', 'string'),
                          ('bar', 'int8'),
                          ('baz', 'int16')])

    statement = ddl.CreateTableParquet('new_table',
                                       directory,
                                       schema=schema,
                                       external=True,
                                       can_exist=True,
                                       database='foo')

    result = statement.compile()
    expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
(`foo` string,
 `bar` tinyint,
 `baz` smallint)
STORED AS PARQUET
LOCATION '{0}'""".format(directory)
    assert result == expected

def test_create_table_delimited():
    path = '/path/to/files/'
    schema = ibis.schema(
        [
            ('a', 'string'),
            ('b', 'int32'),
            ('c', 'double'),
            ('d', 'decimal(12, 2)'),
        ]
    )

    stmt = ddl.CreateTableDelimited(
        'new_table',
        path,
        schema,
        delimiter='|',
        escapechar='\\',
        lineterminator='\0',
        database='foo',
        can_exist=True,
    )

    result = stmt.compile()
    expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
(`a` string,
 `b` int,
 `c` double,
 `d` decimal(12, 2))
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
ESCAPED BY '\\'
LINES TERMINATED BY '\0'
LOCATION '{0}'""".format(
        path
    )
    assert result == expected

def test_sa_default_numeric_precision_and_scale(
    con, backend, dialects, default_precisions, default_scales
):
    # TODO: find a better way to access ibis.sql.alchemy
    import ibis.sql.alchemy as alch

    dialect = dialects[backend.name]
    default_precision = default_precisions[backend.name]
    default_scale = default_scales[backend.name]

    typespec = [
        # name, sqlalchemy type, ibis type
        ('n1', dialect.NUMERIC, dt.Decimal(default_precision, default_scale)),
        ('n2', dialect.NUMERIC(5), dt.Decimal(5, default_scale)),
        ('n3', dialect.NUMERIC(None, 4), dt.Decimal(default_precision, 4)),
        ('n4', dialect.NUMERIC(10, 2), dt.Decimal(10, 2)),
    ]

    sqla_types = []
    ibis_types = []
    for name, t, ibis_type in typespec:
        sqla_type = sa.Column(name, t, nullable=True)
        sqla_types.append(sqla_type)
        ibis_types.append((name, ibis_type(nullable=True)))

    # Create a table with the numeric types.
    table_name = 'test_sa_default_param_decimal'
    engine = con.con
    table = sa.Table(table_name, sa.MetaData(bind=engine), *sqla_types)

    # Check that we can correctly recover the default precision and scale.
    schema = alch.schema_from_table(table)
    expected = ibis.schema(ibis_types)
    assert_equal(schema, expected)
    con.drop_table(table_name, force=True)

def test_add_drop_partition_owned_by_impala(hdfs, con, temp_table):
    schema = ibis.schema([('foo', 'string'),
                          ('year', 'int32'),
                          ('month', 'int16')])
    name = temp_table
    con.create_table(name, schema=schema, partition=['year', 'month'])

    table = con.table(name)

    part = {'year': 2007, 'month': 4}

    subdir = util.guid()
    basename = util.guid()
    path = f'/tmp/{subdir}/{basename}'

    hdfs.mkdir(f'/tmp/{subdir}')
    hdfs.chown(f'/tmp/{subdir}', owner='impala', group='supergroup')

    table.add_partition(part, location=path)

    assert len(table.partitions()) == 2

    table.drop_partition(part)

    assert len(table.partitions()) == 1

def test_create_table_schema(con):
    t_name = 'mytable'

    con.drop_table(t_name, force=True)

    schema = ibis.schema([('a', 'float'),
                          ('b', 'double'),
                          ('c', 'int32'),
                          ('d', 'int64'),
                          ('x', 'point'),
                          ('y', 'linestring'),
                          ('z', 'polygon'),
                          ('w', 'multipolygon')])

    con.create_table(t_name, schema=schema)

    try:
        t = con.table(t_name)

        assert isinstance(t.a, ir.FloatingColumn)
        assert isinstance(t.b, ir.FloatingColumn)
        assert isinstance(t.c, ir.IntegerColumn)
        assert isinstance(t.d, ir.IntegerColumn)
        assert isinstance(t.x, ir.PointColumn)
        assert isinstance(t.y, ir.LineStringColumn)
        assert isinstance(t.z, ir.PolygonColumn)
        assert isinstance(t.w, ir.MultiPolygonColumn)
    finally:
        con.drop_table(t_name)

def test_read_csv(con, temp_table, filename):
    schema = ibis.schema(
        [
            ('index', 'int64'),
            ('Unnamed__0', 'int64'),
            ('id', 'int32'),
            ('bool_col', 'bool'),
            ('tinyint_col', 'int16'),
            ('smallint_col', 'int16'),
            ('int_col', 'int32'),
            ('bigint_col', 'int64'),
            ('float_col', 'float32'),
            ('double_col', 'double'),
            ('date_string_col', 'string'),
            ('string_col', 'string'),
            ('timestamp_col', 'timestamp'),
            ('year_', 'int32'),
            ('month_', 'int32'),
        ]
    )
    con.create_table(temp_table, schema=schema)

    # prepare csv file inside omnisci docker container
    # if the file exists, then it will be overwritten
    con._execute(
        "COPY (SELECT * FROM functional_alltypes) TO '{}'".format(filename)
    )

    db = con.database()
    table = db.table(temp_table)
    table.read_csv(filename, header=False, quotechar='"', delimiter=",")

    df_read_csv = table.execute()
    df_expected = db.table("functional_alltypes").execute()

    pd.testing.assert_frame_equal(df_expected, df_read_csv)

def test_sqla_schema_conversion(con):
    typespec = [
        # name, sqlalchemy type, nullable, ibis type
        ('smallint', sat.SmallInteger, False, dt.int16),
        ('int', sat.Integer, True, dt.int32),
        ('integer', sat.INTEGER(), True, dt.int32),
        ('bigint', sat.BigInteger, False, dt.int64),
        ('real', sat.REAL, True, dt.float32),
        ('bool', sat.Boolean, True, dt.bool),
        ('timestamp', sat.DateTime, True, dt.timestamp),
    ]

    sqla_types = []
    ibis_types = []
    for name, t, nullable, ibis_type in typespec:
        sqla_types.append(sa.Column(name, t, nullable=nullable))
        ibis_types.append((name, ibis_type(nullable=nullable)))

    table = sa.Table('tname', con.meta, *sqla_types)

    schema = schema_from_table(table)
    expected = ibis.schema(ibis_types)

    assert_equal(schema, expected)

def test_query_schema(backend, con, alltypes, expr_fn, expected):
    if not hasattr(con, '_build_ast'):
        pytest.skip(
            '{} backend has no _build_ast method'.format(
                type(backend).__name__
            )
        )

    expr = expr_fn(alltypes)

    # we might need a public API for it
    ast = con._build_ast(expr, backend.make_context())
    query = con.query_class(con, ast)
    schema = query.schema()

    # clickhouse columns have been defined as non-nullable,
    # whereas other backends don't support non-nullable columns yet
    expected = ibis.schema(
        [
            (name, dtype(nullable=schema[name].nullable))
            for name, dtype in expected
        ]
    )
    assert query.schema().equals(expected)