def test_put_get_directory(self):
    local_dir = util.guid()
    local_download_dir = util.guid()
    K = 5

    os.mkdir(local_dir)
    try:
        for i in range(K):
            self._make_random_file(directory=local_dir)

        remote_dir = pjoin(self.tmp_dir, local_dir)
        self.hdfs.put(remote_dir, local_dir)

        assert self.hdfs.exists(remote_dir)
        assert len(self.hdfs.ls(remote_dir)) == K

        # download directory and check contents
        self.hdfs.get(remote_dir, local_download_dir)

        _check_directories_equal(local_dir, local_download_dir)

        self._try_delete_directory(local_download_dir)

        self.hdfs.rmdir(remote_dir)
        assert not self.hdfs.exists(remote_dir)
    finally:
        shutil.rmtree(local_dir)

def pandas(self, df, name=None, database=None, persist=False):
    """
    Create a (possibly temporary) Parquet table from a local pandas
    DataFrame.
    """
    name, database = self._get_concrete_table_path(name, database,
                                                   persist=persist)
    qualified_name = self._fully_qualified_name(name, database)

    # write df to a temp CSV file on HDFS
    temp_csv_hdfs_dir = pjoin(options.impala.temp_hdfs_path, util.guid())
    buf = BytesIO()
    df.to_csv(buf, header=False, index=False, na_rep='\\N')
    self.hdfs.put(pjoin(temp_csv_hdfs_dir, '0.csv'), buf)

    # define a temporary table using delimited data
    schema = util.pandas_to_ibis_schema(df)
    table = self.delimited_file(
        temp_csv_hdfs_dir, schema,
        name='ibis_tmp_pandas_{0}'.format(util.guid()),
        database=database,
        external=True,
        persist=False)

    # CTAS into Parquet
    self.create_table(name, expr=table, database=database,
                      format='parquet', overwrite=False)

    # cleanup
    self.hdfs.delete(temp_csv_hdfs_dir, recursive=True)

    return self._wrap_new_table(qualified_name, persist)

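# Hedged usage sketch for the `pandas` method above (added, not from the
# original source). It assumes `con` is an ibis Impala client created with an
# HDFS connection attached (so `con.hdfs` works); the DataFrame contents and
# the table name are hypothetical examples.
import pandas as pd

example_df = pd.DataFrame({'key': [1, 2, 3], 'value': [0.1, 0.2, 0.3]})
parquet_table = con.pandas(example_df, name='pandas_roundtrip_example',
                           persist=True)
assert len(parquet_table.execute()) == len(example_df)
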
def test_get_directory_nested_dirs(self):
    local_dir = util.guid()
    local_download_dir = util.guid()
    K = 5

    os.mkdir(local_dir)
    try:
        for i in range(K):
            self._make_random_file(directory=local_dir)

        nested_dir = osp.join(local_dir, 'nested-dir')
        shutil.copytree(local_dir, nested_dir)

        remote_dir = pjoin(self.tmp_dir, local_dir)
        self.hdfs.put(remote_dir, local_dir)

        # download directory and check contents
        self.hdfs.get(remote_dir, local_download_dir)

        _check_directories_equal(local_dir, local_download_dir)

        self._try_delete_directory(local_download_dir)

        self.hdfs.rmdir(remote_dir)
        assert not self.hdfs.exists(remote_dir)
    finally:
        shutil.rmtree(local_dir)

def test_add_drop_partition_owned_by_impala(hdfs, con, temp_table):
    schema = ibis.schema(
        [('foo', 'string'), ('year', 'int32'), ('month', 'int16')]
    )
    name = temp_table
    con.create_table(name, schema=schema, partition=['year', 'month'])

    table = con.table(name)

    part = {'year': 2007, 'month': 4}

    subdir = util.guid()
    basename = util.guid()
    path = '/tmp/{}/{}'.format(subdir, basename)

    hdfs.mkdir('/tmp/{}'.format(subdir))
    hdfs.chown('/tmp/{}'.format(subdir), owner='impala', group='supergroup')

    table.add_partition(part, location=path)

    assert len(table.partitions()) == 2

    table.drop_partition(part)

    assert len(table.partitions()) == 1

def test_create_database_with_location(self):
    base = pjoin(self.tmp_dir, util.guid())
    name = '__ibis_test_{0}'.format(util.guid())
    tmp_path = pjoin(base, name)

    self.con.create_database(name, path=tmp_path)
    assert self.hdfs.exists(base)
    self.con.drop_database(name)
    self.hdfs.rmdir(base)

def create(create_tarball, push_to_s3):
    """Create Ibis test data"""
    print(str(ENV))

    con = make_ibis_client()

    # verify some assumptions before proceeding
    if push_to_s3 and not create_tarball:
        raise IbisError(
            "Must specify --create-tarball if specifying --push-to-s3")
    if osp.exists(IBIS_TEST_DATA_LOCAL_DIR):
        raise IbisError(
            'Local dir {0} already exists; please remove it first'.format(
                IBIS_TEST_DATA_LOCAL_DIR))
    if not con.exists_database('tpch'):
        raise IbisError('`tpch` database does not exist')
    if not con.hdfs.exists('/test-warehouse/tpch.region_avro'):
        raise IbisError(
            'HDFS dir /test-warehouse/tpch.region_avro does not exist')

    # generate tmp identifiers
    tmp_db_hdfs_path = pjoin(ENV.tmp_dir, guid())
    tmp_db = guid()
    os.mkdir(IBIS_TEST_DATA_LOCAL_DIR)
    try:
        # create the tmp data locally
        con.create_database(tmp_db, path=tmp_db_hdfs_path)
        print('Created database {0} at {1}'.format(tmp_db, tmp_db_hdfs_path))

        # create the local data set
        scrape_parquet_files(tmp_db, con)
        download_parquet_files(con, tmp_db_hdfs_path)
        download_avro_files(con)
        generate_csv_files()

        # Only populate SQLite here
        engines = [get_sqlite_engine()]
        load_sql_databases(con, engines)
    finally:
        con.drop_database(tmp_db, force=True)
        assert not con.hdfs.exists(tmp_db_hdfs_path)

    if create_tarball:
        check_call('tar -zc {0} > {1}'
                   .format(IBIS_TEST_DATA_LOCAL_DIR, TARBALL_NAME),
                   shell=True)

    if push_to_s3:
        import boto
        s3_conn = boto.connect_s3(IBIS_TEST_AWS_KEY_ID,
                                  IBIS_TEST_AWS_SECRET)
        bucket = s3_conn.get_bucket(IBIS_TEST_DATA_S3_BUCKET)
        # copy_tarball_to_versioned_backup(bucket)
        key = bucket.new_key(IBIS_TEST_DATA_TARBALL)
        print('Upload tarball to S3')
        key.set_contents_from_filename(TARBALL_NAME, replace=True)

def test_create_table_with_location(self):
    base = pjoin(self.tmp_dir, util.guid())
    name = "test_{0}".format(util.guid())
    tmp_path = pjoin(base, name)

    expr = self.alltypes
    table_name = _random_table_name()

    self.con.create_table(table_name, expr=expr, path=tmp_path,
                          database=self.test_data_db)
    self.temp_tables.append(".".join([self.test_data_db, table_name]))
    assert self.hdfs.exists(tmp_path)

def _create_777_tmp_dir(cls):
    base = pjoin(cls.tmp_dir, util.guid())
    tmp_path = pjoin(base, util.guid())
    env = IbisTestEnv()
    superuser_hdfs = ibis.hdfs_connect(
        host=env.nn_host,
        port=env.webhdfs_port,
        auth_mechanism=env.auth_mechanism,
        verify=(env.auth_mechanism not in ['GSSAPI', 'LDAP']),
        user=env.hdfs_superuser)
    superuser_hdfs.mkdir(base)
    superuser_hdfs.chmod(base, '777')
    return tmp_path

def test_create_database_with_location(con, tmp_dir, hdfs):
    base = pjoin(tmp_dir, util.guid())
    name = '__ibis_test_{}'.format(util.guid())
    tmp_path = pjoin(base, name)

    con.create_database(name, path=tmp_path)
    try:
        assert hdfs.exists(base)
    finally:
        try:
            con.drop_database(name)
        finally:
            hdfs.rmdir(base)

def _make_random_file(self, size=1024, directory=None):
    path = util.guid()

    if directory:
        path = osp.join(directory, path)

    # each guid is 32 characters, so write size // 32 of them
    units = size // 32
    with open(path, 'wb') as f:
        for i in range(units):
            f.write(util.guid().encode('utf8'))

    self.test_files.append(path)
    return path

def test_drop_non_empty_database(self):
    tmp_db = "__ibis_test_{0}".format(util.guid())
    self.con.create_database(tmp_db)

    self.con.create_table(util.guid(), self.alltypes, database=tmp_db)

    # Has a view, too
    self.con.create_view(util.guid(), self.alltypes, database=tmp_db)

    self.assertRaises(com.IntegrityError, self.con.drop_database, tmp_db)

    self.con.drop_database(tmp_db, force=True)
    assert not self.con.exists_database(tmp_db)

def test_create_table_with_location_execute(
    con, hdfs, tmp_dir, alltypes, test_data_db, temp_table
):
    base = pjoin(tmp_dir, util.guid())
    name = 'test_{}'.format(util.guid())
    tmp_path = pjoin(base, name)

    expr = alltypes
    table_name = temp_table

    con.create_table(
        table_name, obj=expr, location=tmp_path, database=test_data_db
    )
    assert hdfs.exists(tmp_path)

def test_put_buffer_like(self):
    data = b'peekaboo'

    buf = BytesIO()
    buf.write(data)
    buf.seek(0)

    remote_path = pjoin(self.tmp_dir, util.guid())
    self.hdfs.put(remote_path, buf)

    local_path = util.guid()
    self.test_files.append(local_path)

    self.hdfs.get(remote_path, local_path)
    with open(local_path, 'rb') as f:
        assert f.read() == data

def load_sql_databases(con, engines):
    csv_path = guid()

    generate_sql_csv_sources(csv_path, con.database('ibis_testing'))
    for engine in engines:
        make_testing_db(csv_path, engine)
    shutil.rmtree(csv_path)

def identity_func_testing(
    udf_ll, udfcon, test_data_db, datatype, literal, column
):
    inputs = [datatype]
    name = '__tmp_udf_' + util.guid()
    func = udf_creation_to_op(
        udf_ll, udfcon, test_data_db, name, 'Identity', inputs, datatype
    )

    expr = func(literal)
    assert issubclass(type(expr), ir.ScalarExpr)
    result = udfcon.execute(expr)
    # Hacky
    if datatype == 'timestamp':
        assert type(result) == pd.Timestamp
    else:
        lop = literal.op()
        if isinstance(lop, ir.Literal):
            np.testing.assert_allclose(result, lop.value)
        else:
            np.testing.assert_allclose(result, udfcon.execute(literal))

    expr = func(column)
    assert issubclass(type(expr), ir.ColumnExpr)
    udfcon.execute(expr)

def _new_kudu_example_table(self, kschema):
    kudu_name = 'ibis-tmp-{0}'.format(util.guid())
    self.kclient.create_table(kudu_name, kschema)
    self.temp_tables.append(kudu_name)
    return kudu_name

def _make_update_task(self, uda_class, cols, prior_state=None):
    # Overall layout here:
    # - task name
    # - serialized agg class
    # - prior state flag 1/0
    # - (optional) serialized prior state
    # - serialized table fragment
    payload = BytesIO()
    msg_writer = wire.PackedMessageWriter(payload)
    msg_writer.string('agg-update')
    msg_writer.string(pickle_dump(uda_class))

    if prior_state is not None:
        msg_writer.uint8(1)
        msg_writer.string(pickle_dump(prior_state))
    else:
        msg_writer.uint8(0)

    writer = IbisTableWriter(cols)

    # Create memory map of the appropriate size
    path = 'task_%s' % guid()
    size = writer.total_size() + payload.tell()
    offset = 0

    mm = SharedMmap(path, size, create=True)
    self.paths_to_delete.append(path)

    mm.write(payload.getvalue())
    writer.write(mm)

    task = IbisTaskMessage(self.lock.semaphore_id, path, offset, size)
    return task, mm

def test_insert_select_partitioned_table(self):
    df = self.df
    unpart_t = self.db.table(self.pd_name)
    part_name = util.guid()
    part_keys = ['year', 'month']

    self.db.create_table(part_name, schema=unpart_t.schema(),
                         partition=part_keys)
    part_t = self.db.table(part_name)
    unique_keys = df[part_keys].drop_duplicates()

    for i, (year, month) in enumerate(unique_keys.itertuples(index=False)):
        select_stmt = unpart_t[(unpart_t.year == year) &
                               (unpart_t.month == month)]

        # test both styles of insert
        if i:
            part = {'year': year, 'month': month}
        else:
            part = [year, month]
        part_t.insert(select_stmt, partition=part)

    result = (part_t.execute()
              .sort_values(by='id')
              .reset_index(drop=True)
              [df.columns])
    assert_frame_equal(result, df)

def mkdir(self, dir_path, create_parent=False):
    # ugh, see #252
    # create a temporary file, then delete it
    dummy = pjoin(dir_path, util.guid())
    self.client.write(dummy, '')
    self.client.delete(dummy)

def write_csv(self, path):
    import csv

    tmp_path = 'tmp_{0}.csv'.format(util.guid())
    f = open(tmp_path, 'w+')

    try:
        # Write the DataFrame to the temporary file path
        if options.verbose:
            log('Writing DataFrame to temporary file')

        self.df.to_csv(f, header=False, index=False,
                       sep=',', quoting=csv.QUOTE_NONE,
                       escapechar='\\', na_rep='#NULL')
        f.seek(0)

        if options.verbose:
            log('Writing CSV to: {0}'.format(path))

        self.hdfs.put(path, f)
    finally:
        f.close()
        try:
            os.remove(tmp_path)
        except os.error:
            pass

    return path

def __init__(self):
    # TODO: allow initializing values through a constructor
    self.impala_host = os.environ.get('IBIS_TEST_IMPALA_HOST', 'localhost')
    self.impala_protocol = os.environ.get('IBIS_TEST_IMPALA_PROTOCOL',
                                          'hiveserver2')
    self.impala_port = int(os.environ.get('IBIS_TEST_IMPALA_PORT', 21050))
    self.tmp_db = os.environ.get('IBIS_TEST_TMP_DB',
                                 '__ibis_tmp_{0}'.format(util.guid()))
    self.tmp_dir = os.environ.get('IBIS_TEST_TMP_HDFS_DIR',
                                  '/tmp/__ibis_test')
    self.test_data_db = os.environ.get('IBIS_TEST_DATA_DB', 'ibis_testing')
    self.test_data_dir = os.environ.get('IBIS_TEST_DATA_HDFS_DIR',
                                        '/__ibis/ibis-testing-data')
    self.nn_host = os.environ.get('IBIS_TEST_NN_HOST', 'localhost')
    # 5070 is default for impala dev env
    self.webhdfs_port = int(os.environ.get('IBIS_TEST_WEBHDFS_PORT', 5070))
    self.hdfs_superuser = os.environ.get('IBIS_TEST_HDFS_SUPERUSER', 'hdfs')
    self.use_codegen = os.environ.get('IBIS_TEST_USE_CODEGEN',
                                      'False').lower() == 'true'
    self.cleanup_test_data = os.environ.get('IBIS_TEST_CLEANUP_TEST_DATA',
                                            'True').lower() == 'true'
    self.use_kerberos = os.environ.get('IBIS_TEST_USE_KERBEROS',
                                       'False').lower() == 'true'

    # update global Ibis config where relevant
    options.impala.temp_db = self.tmp_db
    options.impala.temp_hdfs_path = self.tmp_dir

def to_operation(self, name=None):
    """
    Create and return an operator class that can be passed to
    add_operation().

    Parameters
    ----------
    name : string, optional
        Used internally to track the function

    Returns
    -------
    op : an operator class to use in constructing a function
    """
    (in_values, out_value) = _operation_type_conversion(self.inputs,
                                                        self.output)

    class_name = name
    if self.name and not name:
        class_name = self.name
    elif not (name or self.name):
        class_name = 'UDF_{0}'.format(util.guid())

    func_dict = {
        'input_type': in_values,
        'output_type': out_value,
    }

    UdfOp = type(class_name, (_ops.ValueOp,), func_dict)
    return UdfOp

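# Hedged sketch of using the operator class returned by `to_operation` above
# (added, not from the original source). `wrapper` stands for an instance of
# the surrounding UDF wrapper class, and `some_table.value` is a hypothetical
# column whose type matches the declared input type.
MyFuncOp = wrapper.to_operation(name='MyFuncOp')
expr = MyFuncOp(some_table.value).to_expr()
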
def _visit_select_Histogram(self, expr):
    op = expr.op()

    EPS = 1e-13

    if op.binwidth is None or op.base is None:
        aux_hash = op.aux_hash or util.guid()[:6]

        min_name = 'min_%s' % aux_hash
        max_name = 'max_%s' % aux_hash
        minmax = self.table_set.aggregate([op.arg.min().name(min_name),
                                           op.arg.max().name(max_name)])
        self.table_set = self.table_set.cross_join(minmax)

        if op.base is None:
            base = minmax[min_name] - EPS
        else:
            base = op.base

        binwidth = (minmax[max_name] - base) / (op.nbins - 1)
    else:
        # Have both a bin width and a base
        binwidth = op.binwidth
        base = op.base

    bucket = (op.arg - base) / binwidth
    return bucket.floor().name(expr._name)

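# Standalone numeric illustration of the binning above (added for clarity,
# not part of the original source): with nbins buckets, a value maps to
# floor((value - base) / binwidth), where base defaults to the column minimum
# (minus a tiny EPS) and binwidth spans (max - base) / (nbins - 1).
import math

nbins = 4
values = [1.0, 2.5, 4.0, 7.0, 10.0]
base = min(values) - 1e-13
binwidth = (max(values) - base) / (nbins - 1)
print([math.floor((v - base) / binwidth) for v in values])  # [0, 0, 1, 2, 3]
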
def test_create_table_as_select_ctas(self):
    # TODO
    kschema = self.example_schema()

    kudu_name = self._new_kudu_example_table(kschema)

    nrows = 100
    self._write_example_data(kudu_name, nrows)

    impala_name = self._temp_impala_name()
    impala_db = self.env.test_data_db
    self.con.kudu.table(kudu_name, name=impala_name,
                        database=impala_db, external=True,
                        persist=True)

    impala_name2 = self._temp_impala_name()
    expr = self.con.table(impala_name, database=impala_db)

    kudu_name2 = 'ibis-ctas-{0}'.format(util.guid())

    self.con.kudu.create_table(impala_name2, kudu_name2,
                               primary_keys=['key'],
                               obj=expr, database=impala_db)

    # TODO: should some stats be automatically computed?
    itable = self.con.table(impala_name2, database=impala_db)
    assert len(itable.execute()) == len(expr.execute())

    ktable = self.kclient.table(kudu_name2)
    assert ktable.schema.primary_keys() == ['key']

def _identity_func_testing(self, datatype, literal, column):
    inputs = [datatype]
    name = "__tmp_udf_" + util.guid()
    op = self._udf_creation_to_op(name, "Identity", inputs, datatype)

    def _identity_test(value):
        return op(value).to_expr()

    expr = _identity_test(literal)
    assert issubclass(type(expr), ir.ScalarExpr)
    result = self.con.execute(expr)
    # Hacky
    if datatype == "timestamp":
        import pandas as pd
        assert type(result) == pd.Timestamp
    else:
        lop = literal.op()
        if isinstance(lop, ir.Literal):
            self.assertAlmostEqual(result, lop.value, 5)
        else:
            self.assertAlmostEqual(result, self.con.execute(literal), 5)

    expr = _identity_test(column)
    assert issubclass(type(expr), ir.ArrayExpr)
    self.con.execute(expr)

def test_database_drop(self):
    tmp_name = '__ibis_test_{0}'.format(util.guid())
    self.con.create_database(tmp_name)

    db = self.con.database(tmp_name)
    self.temp_databases.append(tmp_name)
    db.drop()
    assert not self.con.exists_database(tmp_name)

def test_mv_to_directory(self):
    remote_file = self._make_random_hdfs_file()
    dest_dir = pjoin(self.tmp_dir, util.guid())
    self.hdfs.mkdir(dest_dir)

    self.hdfs.mv(remote_file, dest_dir)
    new_remote_file = pjoin(dest_dir, os.path.basename(remote_file))

    file_status = self.hdfs.status(new_remote_file)
    assert file_status['type'] == 'FILE'

def test_file_interface(self):
    path = guid()
    self.to_nuke.append(path)
    data = guid()
    mm = SharedMmap(path, len(data), create=True)
    assert mm.tell() == 0
    mm.write(data)
    assert mm.tell() == len(data)
    mm.seek(0)
    assert mm.tell() == 0

    result = mm.read(16)
    assert len(result) == 16
    assert result == data[:16]
    assert mm.tell() == 16

def temp_parquet_table2(con, tmp_db, temp_parquet_table_schema):
    name = util.guid()
    db = con.database(tmp_db)
    db.create_table(name, schema=temp_parquet_table_schema, format='parquet')
    try:
        yield db[name]
    finally:
        db.client.drop_table(name, database=tmp_db)

def setUpClass(cls):
    ImpalaE2E.setup_e2e(cls)

    cls.path_uuid = 'change-location-{0}'.format(util.guid())
    fake_path = pjoin(cls.tmp_dir, cls.path_uuid)

    cls.table_name = 'table_{0}'.format(util.guid())

    schema = ibis.schema([('foo', 'string'), ('bar', 'int64')])

    cls.con.create_table(cls.table_name,
                         database=cls.tmp_db,
                         schema=schema,
                         format='parquet',
                         external=True,
                         location=fake_path)
    cls.table = cls.con.table(cls.table_name, database=cls.tmp_db)

def _tmp_name():
    return 'tmp_partition_{0}'.format(util.guid())

def test_chmod_directory(self):
    new_permissions = '755'
    path = pjoin(self.tmp_dir, util.guid())

    self.hdfs.mkdir(path)
    self.hdfs.chmod(path, new_permissions)
    assert self.hdfs.status(path)['permission'] == new_permissions

def created_view(con, alltypes):
    name = util.guid()
    expr = alltypes.limit(10)
    con.create_view(name, expr)
    return name

def test_chown_group_directory(self):
    new_group = 'randomgroup'
    path = pjoin(self.tmp_dir, util.guid())

    self.hdfs.mkdir(path)
    self.hdfs.chown(path, group=new_group)
    assert self.hdfs.status(path)['group'] == new_group

class IbisTestEnv:
    def items(self):
        return [
            (name, getattr(self, name))
            for name, _ in inspect.getmembers(type(self),
                                              predicate=isproperty)
        ]

    def __repr__(self):
        lines = map('{}={!r},'.format, *zip(*self.items()))
        return '{}(\n{}\n)'.format(
            type(self).__name__, util.indent('\n'.join(lines), 4)
        )

    @property
    def impala_host(self):
        return os.environ.get('IBIS_TEST_IMPALA_HOST', 'localhost')

    @property
    def impala_port(self):
        return int(os.environ.get('IBIS_TEST_IMPALA_PORT', 21050))

    @property
    def tmp_db(self):
        options.impala.temp_db = tmp_db = os.environ.get(
            'IBIS_TEST_TMP_DB', 'ibis_testing_tmp_db'
        )
        return tmp_db

    @property
    def tmp_dir(self):
        options.impala.temp_hdfs_path = tmp_dir = os.environ.get(
            'IBIS_TEST_TMP_HDFS_DIR',
            '/tmp/__ibis_test_{}'.format(util.guid())
        )
        return tmp_dir

    @property
    def test_data_db(self):
        return os.environ.get('IBIS_TEST_DATA_DB', 'ibis_testing')

    @property
    def test_data_dir(self):
        return os.environ.get(
            'IBIS_TEST_DATA_HDFS_DIR', '/__ibis/ibis-testing-data'
        )

    @property
    def nn_host(self):
        return os.environ.get('IBIS_TEST_NN_HOST', 'localhost')

    @property
    def webhdfs_port(self):
        # 5070 is default for impala dev env
        return int(os.environ.get('IBIS_TEST_WEBHDFS_PORT', 50070))

    @property
    def hdfs_superuser(self):
        return os.environ.get('IBIS_TEST_HDFS_SUPERUSER', 'hdfs')

    @property
    def use_codegen(self):
        return (
            os.environ.get('IBIS_TEST_USE_CODEGEN', 'False').lower()
            == 'true'
        )

    @property
    def auth_mechanism(self):
        return os.environ.get('IBIS_TEST_AUTH_MECH', 'NOSASL')

    @property
    def webhdfs_user(self):
        return os.environ.get('IBIS_TEST_WEBHDFS_USER', 'hdfs')

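# Hedged sketch of how this environment object is typically consumed by the
# test fixtures (added, not from the original source; exact client
# construction varies between ibis versions):
ENV = IbisTestEnv()
hdfs_client = ibis.hdfs_connect(host=ENV.nn_host, port=ENV.webhdfs_port,
                                auth_mechanism=ENV.auth_mechanism,
                                user=ENV.webhdfs_user)
client = ibis.impala.connect(host=ENV.impala_host, port=ENV.impala_port,
                             database=ENV.test_data_db,
                             hdfs_client=hdfs_client,
                             auth_mechanism=ENV.auth_mechanism)
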
def _random_table_name():
    table_name = '__ibis_test_' + util.guid()
    return table_name

def _random_identifier(suffix):
    return '__ibis_test_{}_{}'.format(suffix, util.guid())

def test_exists_table(con):
    assert con.exists_table('functional_alltypes')
    assert not con.exists_table('foobarbaz_{}'.format(util.guid()))

def guidbytes():
    return util.guid().encode('utf8')

def test_chown_owner_directory(self):
    new_owner = 'randomowner'
    path = pjoin(self.tmp_dir, util.guid())

    self.hdfs.mkdir(path)
    self.hdfs.chown(path, new_owner)
    assert self.hdfs.status(path)['owner'] == new_owner

def guidbytes():
    if not compat.PY2:
        return util.guid().encode('utf8')
    else:
        return util.guid()

def test_drop_udf_not_exists(self):
    random_name = util.guid()
    self.assertRaises(Exception, self.con.drop_udf, random_name)

def test_drop_uda_not_exists(udfcon):
    random_name = util.guid()
    with pytest.raises(Exception):
        udfcon.drop_uda(random_name)

def wrapped_count_uda(uda_so):
    name = 'user_count_{0}'.format(util.guid())
    return api.wrap_uda(uda_so, ['int32'], 'int64', 'CountUpdate', name=name)

def test_drop_table_not_exist(self):
    random_name = util.guid()
    self.assertRaises(Exception, self.con.drop_table, random_name)
    self.con.drop_table(random_name, force=True)

def _temp_impala_name(self):
    return 'kudu_test_{0}'.format(util.guid())

def test_drop_table_not_exist(con):
    non_existent_table = 'ibis_table_{}'.format(util.guid())
    with pytest.raises(Exception):
        con.drop_table(non_existent_table)
    con.drop_table(non_existent_table, force=True)

def _get_class_name(self, name):
    if name is None:
        name = util.guid()
    return 'UDA_{0}'.format(name)

def path_uuid():
    return 'change-location-{0}'.format(util.guid())

def test_exists_table(self):
    assert self.con.exists_table('functional_alltypes')
    assert not self.con.exists_table(util.guid())

def __init__(self, name=None, lib_path=None):
    self.lib_path = lib_path
    self.name = name or util.guid()

    if lib_path is not None:
        self._check_library()

def guidbytes():
    guid = util.guid()
    return guid if compat.PY2 else guid.encode('utf8')

def created_view(client, alltypes):
    name = util.guid()
    expr = alltypes.limit(10)
    client.create_view(name, expr, temporary=True)
    return name