def pandas(self, df, name=None, database=None, persist=False):
    """
    Create a (possibly temp) parquet table from a local pandas DataFrame.
    """
    name, database = self._get_concrete_table_path(name, database,
                                                   persist=persist)
    qualified_name = self._fully_qualified_name(name, database)

    # write df to a temp CSV file on HDFS
    temp_csv_hdfs_dir = pjoin(options.impala.temp_hdfs_path, util.guid())
    buf = BytesIO()
    df.to_csv(buf, header=False, index=False, na_rep='\\N')
    self.hdfs.put(pjoin(temp_csv_hdfs_dir, '0.csv'), buf)

    # define a temporary table using delimited data
    schema = util.pandas_to_ibis_schema(df)
    table = self.delimited_file(
        temp_csv_hdfs_dir, schema,
        name='ibis_tmp_pandas_{0}'.format(util.guid()),
        database=database,
        external=True,
        persist=False)

    # CTAS into Parquet
    self.create_table(name, expr=table, database=database,
                      format='parquet', overwrite=False)

    # cleanup
    self.hdfs.delete(temp_csv_hdfs_dir, recursive=True)

    return self._wrap_new_table(qualified_name, persist)
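# Hypothetical usage sketch for the `pandas` helper above. Assumes an Impala
# client instance (here called `client`) that exposes this method along with
# a configured `client.hdfs` connection; the table and database names are
# illustrative only.
import pandas as pd

df = pd.DataFrame({'id': [1, 2, 3], 'name': ['a', 'b', 'c']})

# CTAS the DataFrame into a persistent Parquet-backed table, then query it.
t = client.pandas(df, name='my_parquet_table', database='ibis_testing',
                  persist=True)
print(t.count().execute())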
def put(self, hdfs_path, resource, overwrite=False, verbose=None,
        **kwargs):
    verbose = verbose or options.verbose
    is_path = isinstance(resource, six.string_types)

    if is_path and osp.isdir(resource):
        for dirpath, dirnames, filenames in os.walk(resource):
            rel_dir = osp.relpath(dirpath, resource)
            if rel_dir == '.':
                rel_dir = ''
            for fpath in filenames:
                abs_path = osp.join(dirpath, fpath)
                rel_hdfs_path = pjoin(hdfs_path, rel_dir, fpath)
                self.put(rel_hdfs_path, abs_path, overwrite=overwrite,
                         verbose=verbose, **kwargs)
    else:
        if is_path:
            basename = os.path.basename(resource)
            if self.exists(hdfs_path):
                if self.status(hdfs_path)['type'] == 'DIRECTORY':
                    hdfs_path = pjoin(hdfs_path, basename)

            if verbose:
                self.log('Writing local {0} to HDFS {1}'.format(resource,
                                                                hdfs_path))
            self.client.upload(hdfs_path, resource, overwrite=overwrite,
                               **kwargs)
        else:
            if verbose:
                self.log('Writing buffer to HDFS {0}'.format(hdfs_path))
            resource.seek(0)
            self.client.write(hdfs_path, resource, overwrite=overwrite,
                              **kwargs)
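# Hypothetical usage sketch for `put` above. Assumes an HDFS client instance
# (here `hdfs`) created elsewhere, e.g. via ibis.hdfs_connect; the paths are
# illustrative only.
from io import BytesIO

# Upload a local file or directory (directories are walked recursively).
hdfs.put('/tmp/ibis-example/data.csv', 'data.csv', overwrite=True)

# Upload an in-memory buffer (any file-like object supporting seek/read).
buf = BytesIO(b'1,foo\n2,bar\n')
hdfs.put('/tmp/ibis-example/buffer.csv', buf, overwrite=True)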
def test_load_data_partition(con, hdfs, tmp_dir, unpart_t, df, temp_table):
    part_keys = ['year', 'month']

    con.create_table(temp_table, schema=unpart_t.schema(),
                     partition=part_keys)
    part_t = con.table(temp_table)

    # trim the runtime of this test
    df = df[df.month == '1'].reset_index(drop=True)

    unique_keys = df[part_keys].drop_duplicates()

    hdfs_dir = pjoin(tmp_dir, 'load-data-partition')

    df2 = df.drop(['year', 'month'], axis='columns')

    csv_props = {'serialization.format': ',', 'field.delim': ','}

    for i, (year, month) in enumerate(unique_keys.itertuples(index=False)):
        chunk = df2[(df.year == year) & (df.month == month)]
        chunk_path = pjoin(hdfs_dir, '{}.csv'.format(i))

        con.write_dataframe(chunk, chunk_path)

        # test both styles of insert
        if i:
            part = {'year': year, 'month': month}
        else:
            part = [year, month]

        part_t.add_partition(part)
        part_t.alter_partition(part, format='text',
                               serde_properties=csv_props)
        part_t.load_data(chunk_path, partition=part)

    hdfs.rmdir(hdfs_dir)
    verify_partitioned_table(part_t, df, unique_keys)
def test_mv_to_directory(self):
    remote_file = self._make_random_hdfs_file()
    dest_dir = pjoin(self.tmp_dir, util.guid())
    self.hdfs.mkdir(dest_dir)
    self.hdfs.mv(remote_file, dest_dir)
    new_remote_file = pjoin(dest_dir, os.path.basename(remote_file))
    file_status = self.hdfs.status(new_remote_file)
    assert file_status['type'] == 'FILE'
def test_ls(self):
    test_dir = pjoin(self.tmp_dir, 'ls-test')
    self.hdfs.mkdir(test_dir)
    for i in xrange(10):
        local_path = self._make_random_file()
        hdfs_path = pjoin(test_dir, local_path)
        self.hdfs.put(hdfs_path, local_path)

    assert len(self.hdfs.ls(test_dir)) == 10
def __init__ (self, pic_dir, ** kw) :
    base          = Filename (pic_dir).base
    name          = pjoin (base, u"")
    pic_dir_abs   = sos.path.abspath (pic_dir)
    self.im_dir   = pjoin (pic_dir_abs, "im")
    self.th_dir   = pjoin (pic_dir_abs, "th")
    self._entries = []
    self.__super.__init__ (name = name, pic_dir = pic_dir, ** kw)
def test_create_database_with_location(self):
    base = pjoin(self.tmp_dir, util.guid())
    name = '__ibis_test_{0}'.format(util.guid())
    tmp_path = pjoin(base, name)

    self.con.create_database(name, path=tmp_path)
    assert self.hdfs.exists(base)
    self.con.drop_database(name)
    self.hdfs.rmdir(base)
def test_create_table_with_location(self):
    base = pjoin(self.tmp_dir, util.guid())
    name = "test_{0}".format(util.guid())
    tmp_path = pjoin(base, name)

    expr = self.alltypes
    table_name = _random_table_name()

    self.con.create_table(table_name, expr=expr, path=tmp_path,
                          database=self.test_data_db)
    self.temp_tables.append(".".join([self.test_data_db, table_name]))
    assert self.hdfs.exists(tmp_path)
def _create_777_tmp_dir(cls):
    base = pjoin(cls.tmp_dir, util.guid())
    tmp_path = pjoin(base, util.guid())
    env = IbisTestEnv()
    superuser_hdfs = ibis.hdfs_connect(
        host=env.nn_host,
        port=env.webhdfs_port,
        auth_mechanism=env.auth_mechanism,
        verify=(env.auth_mechanism not in ['GSSAPI', 'LDAP']),
        user=env.hdfs_superuser)
    superuser_hdfs.mkdir(base)
    superuser_hdfs.chmod(base, '777')
    return tmp_path
def test_create_database_with_location(con, tmp_dir, hdfs):
    base = pjoin(tmp_dir, util.guid())
    name = '__ibis_test_{}'.format(util.guid())
    tmp_path = pjoin(base, name)

    con.create_database(name, path=tmp_path)
    try:
        assert hdfs.exists(base)
    finally:
        try:
            con.drop_database(name)
        finally:
            hdfs.rmdir(base)
def build(self, target_dir=None):  # type: (Path) -> Path
    self._io.writeln(" - Building <info>sdist</info>")
    if target_dir is None:
        target_dir = self._path / "dist"

    if not target_dir.exists():
        target_dir.mkdir(parents=True)

    target = target_dir / "{}-{}.tar.gz".format(
        self._package.pretty_name, self._meta.version
    )
    gz = GzipFile(target.as_posix(), mode="wb")
    tar = tarfile.TarFile(
        target.as_posix(), mode="w", fileobj=gz, format=tarfile.PAX_FORMAT
    )

    try:
        tar_dir = "{}-{}".format(self._package.pretty_name, self._meta.version)

        files_to_add = self.find_files_to_add(exclude_build=False)

        for relpath in files_to_add:
            path = self._path / relpath
            tar_info = tar.gettarinfo(
                str(path), arcname=pjoin(tar_dir, str(relpath))
            )
            tar_info = self.clean_tarinfo(tar_info)

            if tar_info.isreg():
                with path.open("rb") as f:
                    tar.addfile(tar_info, f)
            else:
                tar.addfile(tar_info)  # Symlinks & ?

        setup = self.build_setup()
        tar_info = tarfile.TarInfo(pjoin(tar_dir, "setup.py"))
        tar_info.size = len(setup)
        tar.addfile(tar_info, BytesIO(setup))

        pkg_info = self.build_pkg_info()

        tar_info = tarfile.TarInfo(pjoin(tar_dir, "PKG-INFO"))
        tar_info.size = len(pkg_info)
        tar.addfile(tar_info, BytesIO(pkg_info))
    finally:
        tar.close()
        gz.close()

    self._io.writeln(" - Built <fg=cyan>{}</>".format(target.name))

    return target
def write_temp_csv(self):
    temp_hdfs_dir = pjoin(options.impala.temp_hdfs_path,
                          'pandas_{0}'.format(util.guid()))
    self.hdfs.mkdir(temp_hdfs_dir)

    # Keep track of the temporary HDFS directory
    self.temp_hdfs_dirs.append(temp_hdfs_dir)

    # Write the file to HDFS
    hdfs_path = pjoin(temp_hdfs_dir, '0.csv')
    self.write_csv(hdfs_path)

    return temp_hdfs_dir
def test_create_table_with_location_execute(
    con, hdfs, tmp_dir, alltypes, test_data_db, temp_table
):
    base = pjoin(tmp_dir, util.guid())
    name = 'test_{}'.format(util.guid())
    tmp_path = pjoin(base, name)

    expr = alltypes
    table_name = temp_table

    con.create_table(
        table_name, obj=expr, location=tmp_path, database=test_data_db
    )
    assert hdfs.exists(tmp_path)
def test_get_file_overwrite(self):
    local_path = self._make_random_file()
    local_path2 = self._make_random_file()

    remote_path = pjoin(self.tmp_dir, local_path)
    self.hdfs.put(remote_path, local_path)

    remote_path2 = pjoin(self.tmp_dir, local_path2)
    self.hdfs.put(remote_path2, local_path2)

    with self.assertRaises(IOError):
        self.hdfs.get(remote_path, '.')

    self.hdfs.get(remote_path, local_path2, overwrite=True)
    assert open(local_path2).read() == open(local_path).read()
def test_get_directory_overwrite(self):
    local_dir = self._make_test_directory()
    local_dir2 = self._make_test_directory()

    remote_dir = pjoin(self.tmp_dir, local_dir)
    remote_dir2 = pjoin(self.tmp_dir, local_dir2)

    self.hdfs.put(remote_dir, local_dir)
    self.hdfs.put(remote_dir2, local_dir2)

    self.hdfs.get(remote_dir, local_dir2, overwrite=True)
    _check_directories_equal(local_dir2, local_dir)

    self.hdfs.get(remote_dir, local_dir2, overwrite=True)
    _check_directories_equal(local_dir2, local_dir)
def test_size(self):
    test_dir = pjoin(self.tmp_dir, 'size-test')

    K = 2048
    path = self._make_random_file(size=K)
    hdfs_path = pjoin(test_dir, path)
    self.hdfs.put(hdfs_path, path)
    assert self.hdfs.size(hdfs_path) == K

    size_test_dir = self._sample_nested_directory()

    hdfs_path = pjoin(test_dir, size_test_dir)
    self.hdfs.put(hdfs_path, size_test_dir)
    assert self.hdfs.size(hdfs_path) == K * 7
def create_parquet_tables(con):
    parquet_files = con.hdfs.ls(pjoin(ENV.test_data_dir, 'parquet'))
    schemas = {
        'functional_alltypes': ibis.schema(
            [('id', 'int32'),
             ('bool_col', 'boolean'),
             ('tinyint_col', 'int8'),
             ('smallint_col', 'int16'),
             ('int_col', 'int32'),
             ('bigint_col', 'int64'),
             ('float_col', 'float'),
             ('double_col', 'double'),
             ('date_string_col', 'string'),
             ('string_col', 'string'),
             ('timestamp_col', 'timestamp'),
             ('year', 'int32'),
             ('month', 'int32')]),
        'tpch_region': ibis.schema(
            [('r_regionkey', 'int16'),
             ('r_name', 'string'),
             ('r_comment', 'string')])}
    tables = []
    for path in parquet_files:
        head, table_name = posixpath.split(path)
        print('Creating {0}'.format(table_name))
        # if no schema is listed above, infer it from the Parquet file
        schema = schemas.get(table_name)
        t = con.parquet_file(path, schema=schema, name=table_name,
                             database=ENV.test_data_db, persist=True)
        tables.append(t)
    return tables
def mkdir(self, dir_path, create_parent=False):
    # ugh, see #252
    # create a temporary file, then delete it
    dummy = pjoin(dir_path, util.guid())
    self.client.write(dummy, '')
    self.client.delete(dummy)
def create_parquet_tables(con):
    parquet_files = con.hdfs.ls(pjoin(ENV.test_data_dir, "parquet"))
    schemas = {
        "functional_alltypes": ibis.schema(
            [
                ("id", "int32"),
                ("bool_col", "boolean"),
                ("tinyint_col", "int8"),
                ("smallint_col", "int16"),
                ("int_col", "int32"),
                ("bigint_col", "int64"),
                ("float_col", "float"),
                ("double_col", "double"),
                ("date_string_col", "string"),
                ("string_col", "string"),
                ("timestamp_col", "timestamp"),
                ("year", "int32"),
                ("month", "int32"),
            ]
        ),
        "tpch_region": ibis.schema(
            [("r_regionkey", "int16"),
             ("r_name", "string"),
             ("r_comment", "string")]
        ),
    }
    tables = []
    for path in parquet_files:
        head, table_name = posixpath.split(path)
        print("Creating {0}".format(table_name))
        # if no schema is listed above, infer it from the Parquet file
        schema = schemas.get(table_name)
        t = con.parquet_file(path, schema=schema, name=table_name,
                             database=ENV.test_data_db, persist=True)
        tables.append(t)
    return tables
def test_get_directory_nested_dirs(self):
    local_dir = util.guid()
    local_download_dir = util.guid()

    K = 5
    os.mkdir(local_dir)

    try:
        for i in xrange(K):
            self._make_random_file(directory=local_dir)

        nested_dir = osp.join(local_dir, 'nested-dir')
        shutil.copytree(local_dir, nested_dir)

        remote_dir = pjoin(self.tmp_dir, local_dir)
        self.hdfs.put(remote_dir, local_dir)

        # download directory and check contents
        self.hdfs.get(remote_dir, local_download_dir)

        _check_directories_equal(local_dir, local_download_dir)

        self._try_delete_directory(local_download_dir)

        self.hdfs.rmdir(remote_dir)
        assert not self.hdfs.exists(remote_dir)
    finally:
        shutil.rmtree(local_dir)
def put_tarfile(self, hdfs_path, local_path, compression='gzip',
                verbose=None, overwrite=False):
    """
    Write the contents of a tar archive to HDFS directly, without having
    to decompress it locally first.

    Parameters
    ----------
    hdfs_path : string
    local_path : string
    compression : {'gzip', 'bz2', None}
    overwrite : boolean, default False
    verbose : boolean, default None (global default)
    """
    import tarfile

    modes = {
        None: 'r',
        'gzip': 'r:gz',
        'bz2': 'r:bz2'
    }

    if compression not in modes:
        raise ValueError('Invalid compression type {0}'
                         .format(compression))
    mode = modes[compression]

    tf = tarfile.open(local_path, mode=mode)
    for info in tf:
        if not info.isfile():
            continue

        buf = tf.extractfile(info)
        abspath = pjoin(hdfs_path, info.path)
        self.put(abspath, buf, verbose=verbose, overwrite=overwrite)
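# Hypothetical usage sketch for `put_tarfile` above. Assumes an HDFS client
# instance (here `hdfs`) on which the method is defined; the archive path and
# HDFS destination are illustrative only.
hdfs.put_tarfile('/tmp/ibis-example/unpacked', 'data.tar.gz',
                 compression='gzip', overwrite=True, verbose=True)

# Each regular file inside data.tar.gz is streamed to
# /tmp/ibis-example/unpacked/<path inside the archive> without first being
# extracted to the local filesystem.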
def test_get_directory_into_directory(self):
    local_path1 = self._make_test_directory()
    local_path2 = self._make_test_directory()

    remote_path = pjoin(self.tmp_dir, local_path1)
    self.hdfs.put(remote_path, local_path1)

    local_path3 = self.hdfs.get(remote_path, local_path2)
    _check_directories_equal(local_path3, local_path1)
def test_cleanup_tmp_table_on_gc(self):
    hdfs_path = pjoin(self.test_data_dir, 'parquet/tpch_region')
    table = self.con.parquet_file(hdfs_path)
    name = table.op().name
    table = None
    gc.collect()
    _assert_table_not_exists(self.con, name)
def test_get_directory_overwrite_directory(self):
    local_path1 = self._make_test_directory()
    local_path2 = self._make_test_directory()

    remote_path = pjoin(self.tmp_dir, local_path2)
    self.hdfs.put(remote_path, local_path1)
    self.hdfs.get(remote_path, osp.dirname(local_path2), overwrite=True)
    _check_directories_equal(local_path1, local_path2)
def test_put_get_directory(self):
    local_dir = util.guid()
    local_download_dir = util.guid()

    K = 5
    os.mkdir(local_dir)

    try:
        for i in range(K):
            self._make_random_file(directory=local_dir)

        remote_dir = pjoin(self.tmp_dir, local_dir)
        self.hdfs.put(remote_dir, local_dir)

        assert self.hdfs.exists(remote_dir)
        assert len(self.hdfs.ls(remote_dir)) == K

        # download directory and check contents
        self.hdfs.get(remote_dir, local_download_dir)

        _check_directories_equal(local_dir, local_download_dir)

        self._try_delete_directory(local_download_dir)

        self.hdfs.rmdir(remote_dir)
        assert not self.hdfs.exists(remote_dir)
    finally:
        shutil.rmtree(local_dir)
def test_temp_table_concurrency(self):
    pytest.skip('Cannot get this test to run under pytest')

    from threading import Thread, Lock
    import gc

    nthreads = 4
    hdfs_path = pjoin(self.test_data_dir, 'parquet/tpch_region')

    lock = Lock()
    results = []

    def do_something():
        t = self.con.parquet_file(hdfs_path)

        with lock:
            t.limit(10).execute()

        t = None
        gc.collect()
        results.append(True)

    threads = []
    for i in range(nthreads):
        t = Thread(target=do_something)
        t.start()
        threads.append(t)

    [x.join() for x in threads]
    assert results == [True] * nthreads
def test_query_avro(self):
    hdfs_path = pjoin(self.test_data_dir, 'avro/tpch_region_avro')

    avro_schema = {
        "fields": [
            {"type": ["int", "null"], "name": "R_REGIONKEY"},
            {"type": ["string", "null"], "name": "R_NAME"},
            {"type": ["string", "null"], "name": "R_COMMENT"}],
        "type": "record",
        "name": "a"
    }

    table = self.con.avro_file(hdfs_path, avro_schema,
                               database=self.tmp_db)

    name = table.op().name
    assert name.startswith('{0}.'.format(self.tmp_db))

    # table exists
    self.con.table(name)

    expr = table.r_name.value_counts()
    expr.execute()

    assert table.count().execute() == 5

    df = table.execute()
    assert len(df) == 5
def test_change_location(con, table, tmp_dir, path_uuid):
    old_loc = table.metadata().location

    new_path = pjoin(tmp_dir, 'new-path')
    table.alter(location=new_path)

    new_loc = table.metadata().location
    assert new_loc == old_loc.replace(path_uuid, 'new-path')
def test_change_location(self):
    old_loc = self.table.metadata().location

    new_path = pjoin(self.tmp_dir, 'new-path')
    self.table.alter(location=new_path)

    new_loc = self.table.metadata().location
    assert new_loc == old_loc.replace(self.path_uuid, 'new-path')
def joinPath(*path):
    """
    Join two or more paths forming a command word.

    >>> api.joinPath('/ip', 'address', 'print')
    '/ip/address/print'
    """
    return pjoin('/', *path).rstrip('/')
def test_put_get_delete_file(self):
    dirpath = pjoin(self.tmp_dir, 'write-delete-test')
    self.hdfs.mkdir(dirpath)

    lpath = self._make_random_file()
    fpath = pjoin(dirpath, lpath)

    self.hdfs.put(fpath, lpath)
    assert self.hdfs.exists(fpath)

    try:
        dpath = util.guid()
        self.hdfs.get(fpath, dpath)
        assert _contents_equal(dpath, lpath)
        os.remove(dpath)
    finally:
        self.hdfs.rm(fpath)
        assert not self.hdfs.exists(fpath)
def test_put_get_delete_file(hdfs, tmp_dir, random_file):
    dirpath = pjoin(tmp_dir, 'write-delete-test')
    hdfs.mkdir(dirpath)

    lpath = random_file
    fpath = pjoin(dirpath, lpath)

    hdfs.put(fpath, lpath)
    assert hdfs.exists(fpath)

    try:
        dpath = util.guid()
        hdfs.get(fpath, dpath)
        assert filecmp.cmp(dpath, lpath, shallow=False)
        os.remove(dpath)
    finally:
        hdfs.rm(fpath)
        assert not hdfs.exists(fpath)
def get_ibis_test_data(local_path):
    cmd = 'cd {0} && wget {1} && tar -xzf {2}'.format(
        local_path, IBIS_TEST_DATA_URL,
        os.path.basename(IBIS_TEST_DATA_URL))
    subprocess.check_call(cmd, shell=True)
    data_dir = pjoin(local_path,
                     os.path.basename(IBIS_TEST_DATA_URL).split('.', 2)[0])
    print('Downloaded {0} and unpacked it to {1}'.format(
        IBIS_TEST_DATA_URL, data_dir))
    return data_dir
def test_put_get_tarfile(hdfs, tmp_dir, tmp_path):
    test_dir = pjoin(tmp_dir, 'tarfile-test')

    dirname = sample_nested_directory()

    try:
        tf_name = tmp_path / f'{dirname}.tar.gz'
        subprocess.check_call(f'tar zc {dirname} > {tf_name}', shell=True)

        randname = util.guid()
        hdfs_path = pjoin(test_dir, randname)

        hdfs.put_tarfile(hdfs_path, tf_name, compression='gzip')

        hdfs.get(hdfs_path, '.')
        _check_directories_equal(osp.join(randname, dirname), dirname)
    finally:
        shutil.rmtree(dirname, ignore_errors=True)
        shutil.rmtree(osp.join(randname, dirname), ignore_errors=True)
def _gen(parent, src_dir):
    for f in sos.expanded_globs(pjoin(src_dir, "*.txt")):
        info = _page_info(f)
        if info:
            n = info.get("name", f)
            info["perma_name"] = base = Filename(n).base
            info["name"] = name = "%s.html" % (base, )
            yield GTW.RST.TOP.Page_ReST \
                (parent = parent, src_dir = src_dir, ** info)
def test_cleanup_tmp_table_on_gc(con, test_data_dir):
    import gc

    hdfs_path = pjoin(test_data_dir, 'parquet/tpch_region')
    table = con.parquet_file(hdfs_path)
    name = table.op().name
    table = None
    gc.collect()
    assert not con.exists_table(name)
def test_create_table_persist_fails_if_called_twice(self):
    tname = util.guid()

    hdfs_path = pjoin(self.test_data_dir, 'parquet/tpch_region')
    self.con.parquet_file(hdfs_path, name=tname, persist=True)
    self.temp_tables.append(tname)

    with self.assertRaises(HS2Error):
        self.con.parquet_file(hdfs_path, name=tname, persist=True)
def test_create_table_persist_fails_if_called_twice(
    con, temp_table_db, test_data_dir
):
    tmp_db, tname = temp_table_db
    hdfs_path = pjoin(test_data_dir, 'parquet/tpch_region')
    con.parquet_file(hdfs_path, name=tname, persist=True, database=tmp_db)

    with pytest.raises(HS2Error):
        con.parquet_file(hdfs_path, name=tname, persist=True,
                         database=tmp_db)
def test_chown_owner_directory(
    hdfs_superuser, tmp_dir, random_hdfs_superuser_file,
):
    new_owner = 'randomowner'
    path = pjoin(tmp_dir, util.guid())
    hdfs_superuser.mkdir(path)
    hdfs_superuser.chown(path, new_owner)
    assert hdfs_superuser.status(path)['owner'] == new_owner
def test_close_drops_temp_tables(con, test_data_dir):
    hdfs_path = pjoin(test_data_dir, 'parquet/tpch_region')
    table = con.parquet_file(hdfs_path)

    name = table.op().name
    assert len(con.list_tables(like=name))
    con.close()
    assert not len(con.list_tables(like=name))
def test_chown_group_directory(
    hdfs_superuser, tmp_dir, random_hdfs_superuser_file,
):
    new_group = 'randomgroup'
    path = pjoin(tmp_dir, util.guid())
    hdfs_superuser.mkdir(path)
    hdfs_superuser.chown(path, group=new_group)
    assert hdfs_superuser.status(path)['group'] == new_group
def write_csv(self):
    import csv

    temp_hdfs_dir = pjoin(options.impala.temp_hdfs_path,
                          'pandas_{0}'.format(util.guid()))
    tmp_path = 'tmp_{0}.csv'.format(util.guid())

    f = open(tmp_path, 'w+')

    try:
        # Write the DataFrame to the temporary file path
        if options.verbose:
            log('Writing DataFrame to temporary file')

        self.df.to_csv(f, header=False, index=False, sep=',',
                       quoting=csv.QUOTE_NONE, escapechar='\\',
                       na_rep='#NULL')
        f.seek(0)

        # Write the file to HDFS
        hdfs_path = pjoin(temp_hdfs_dir, '0.csv')

        if options.verbose:
            log('Writing CSV to HDFS: {0}'.format(hdfs_path))

        self.hdfs.put(hdfs_path, f)

        # Keep track of the temporary HDFS directory
        self.temp_hdfs_dirs.append(temp_hdfs_dir)
        self.csv_dir = temp_hdfs_dir
    finally:
        f.close()
        try:
            os.remove(tmp_path)
        except os.error:
            pass

    return temp_hdfs_dir
def put(self, hdfs_path, resource, overwrite=False, verbose=None,
        **kwargs):
    verbose = verbose or options.verbose
    is_path = isinstance(resource, six.string_types)

    if is_path and osp.isdir(resource):
        for dirpath, dirnames, filenames in os.walk(resource):
            rel_dir = osp.relpath(dirpath, resource)
            if rel_dir == '.':
                rel_dir = ''
            for fpath in filenames:
                abs_path = osp.join(dirpath, fpath)
                rel_hdfs_path = pjoin(hdfs_path, rel_dir, fpath)
                self.put(rel_hdfs_path, abs_path, overwrite=overwrite,
                         verbose=verbose, **kwargs)
    else:
        if is_path:
            basename = os.path.basename(resource)
            if self.exists(hdfs_path):
                if self.status(hdfs_path)['type'] == 'DIRECTORY':
                    hdfs_path = pjoin(hdfs_path, basename)

            if verbose:
                self.log('Writing local {0} to HDFS {1}'.format(
                    resource, hdfs_path))
            self.client.upload(hdfs_path, resource, overwrite=overwrite,
                               **kwargs)
        else:
            if verbose:
                self.log('Writing buffer to HDFS {0}'.format(hdfs_path))
            resource.seek(0)
            self.client.write(hdfs_path, resource, overwrite=overwrite,
                              **kwargs)
def Page_ReST_F(parent, src_dir, name, **kw):
    src_path = pjoin(src_dir, Filename(".txt", name).name)
    src_contents = _file_contents(src_path)
    return GTW.RST.TOP.Page_ReST \
        ( parent       = parent
        , src_dir      = src_dir
        , name         = name
        , src_contents = src_contents
        , ** kw
        )
def setUpClass(cls):
    cls.ENV = ENV
    cls.tmp_dir = pjoin(cls.ENV.tmp_dir, util.guid())
    if cls.ENV.use_kerberos:
        print("Warning: ignoring invalid Certificate Authority errors")
    cls.hdfs = ibis.hdfs_connect(host=cls.ENV.nn_host,
                                 port=cls.ENV.webhdfs_port,
                                 use_kerberos=cls.ENV.use_kerberos,
                                 verify=(not cls.ENV.use_kerberos))
    cls.hdfs.mkdir(cls.tmp_dir)
def add_setup_py(self, files_to_add, target_tarfile):
    if 'setup.py' in files_to_add:
        log.warning(
            "Using setup.py from repository, not generating setup.py")
    else:
        setup_py = self.make_setup_py()
        log.info("Writing generated setup.py")
        ti = tarfile.TarInfo(pjoin(self.dir_name, 'setup.py'))
        ti.size = len(setup_py)
        target_tarfile.addfile(ti, io.BytesIO(setup_py))
def test_query_parquet_file_like_table(self):
    hdfs_path = pjoin(self.test_data_dir, 'parquet/tpch_region')

    ex_schema = ibis.schema([('r_regionkey', 'int16'),
                             ('r_name', 'string'),
                             ('r_comment', 'string')])

    table = self.con.parquet_file(hdfs_path, like_table='tpch_region')

    assert_equal(table.schema(), ex_schema)
def build(self, target_dir: Path = None):
    if target_dir is None:
        target_dir = self.ini_path.parent / 'dist'
    if not target_dir.exists():
        target_dir.mkdir(parents=True)
    target = target_dir / '{}-{}.tar.gz'.format(self.metadata.name,
                                                self.metadata.version)
    tf = tarfile.open(str(target), mode='w:gz')
    tf_dir = '{}-{}'.format(self.metadata.name, self.metadata.version)

    files_to_add = self.find_tracked_files()

    for relpath in files_to_add:
        path = self.srcdir / relpath
        tf.add(str(path), arcname=pjoin(tf_dir, relpath))

    if 'setup.py' in files_to_add:
        log.warning(
            "Using setup.py from repository, not generating setup.py")
    else:
        setup_py = self.make_setup_py()
        log.info("Writing generated setup.py")
        ti = tarfile.TarInfo(pjoin(tf_dir, 'setup.py'))
        ti.size = len(setup_py)
        tf.addfile(ti, io.BytesIO(setup_py))

    pkg_info = PKG_INFO.format(
        name=self.metadata.name,
        version=self.metadata.version,
        summary=self.metadata.summary,
        home_page=self.metadata.home_page,
        author=self.metadata.author,
        author_email=self.metadata.author_email,
    ).encode('utf-8')
    ti = tarfile.TarInfo(pjoin(tf_dir, 'PKG-INFO'))
    ti.size = len(pkg_info)
    tf.addfile(ti, io.BytesIO(pkg_info))

    tf.close()

    log.info("Built sdist: %s", target)
    return target
def create_udf_data(con):
    ibis_home = posixpath.dirname(
        posixpath.dirname(os.path.abspath(__file__)))
    sep = os.sep
    path_list = ibis_home.split(sep)
    path_list += ['testing', 'udf']
    udf_dir = sep.join(path_list)
    build_list = path_list + ['build']
    build_dir = sep.join(build_list)
    subprocess.check_call('cmake . && make', shell=True, cwd=udf_dir)
    so_dir = pjoin(ENV.test_data_dir, 'udf')
    con.hdfs.put(so_dir, build_dir, verbose=True)
def test_get_directory_overwrite_file(self):
    try:
        local_path1 = self._make_test_directory()
        local_path2 = self._make_random_file()

        remote_path = pjoin(self.tmp_dir, local_path1)
        self.hdfs.put(remote_path, local_path1)
        self.hdfs.get(remote_path, local_path2, overwrite=True)
        _check_directories_equal(local_path1, local_path2)
    finally:
        # Path changed from file to directory, must be cleaned manually.
        self._try_delete_directory(local_path2)
def test_size(hdfs, tmp_dir):
    test_dir = pjoin(tmp_dir, 'size-test')

    K = 2048
    path = make_random_file(size=K)
    try:
        hdfs_path = pjoin(test_dir, path)
        hdfs.put(hdfs_path, path)
        assert hdfs.size(hdfs_path) == K

        size_test_dir = sample_nested_directory()
        try:
            hdfs_path = pjoin(test_dir, size_test_dir)
            hdfs.put(hdfs_path, size_test_dir)

            assert hdfs.size(hdfs_path) == K * 7
        finally:
            shutil.rmtree(size_test_dir)
    finally:
        os.remove(path)
def _scrape_dir(path, dst):
    objs = self.client.list(path)
    for hpath, detail in objs:
        relpath = posixpath.relpath(hpath, hdfs_path)
        full_opath = pjoin(dst, relpath)

        if detail['type'] == 'FILE':
            _get_file(hpath, full_opath)
        else:
            os.makedirs(full_opath)
            _scrape_dir(hpath, dst)
def test_temp_table_concurrency(con, test_data_dir):
    def limit_10(i, hdfs_path):
        t = con.parquet_file(hdfs_path)
        return t.sort_by(t.r_regionkey).limit(1, offset=i).execute()

    nthreads = 4
    hdfs_path = pjoin(test_data_dir, 'parquet/tpch_region')

    with concurrent.futures.ThreadPoolExecutor(max_workers=nthreads) as e:
        futures = [e.submit(limit_10, i, hdfs_path)
                   for i in range(nthreads)]
    assert all(len(future.result()) for future in futures)
def build(self, target_dir, gen_setup_py=True):
    os.makedirs(str(target_dir), exist_ok=True)
    target = target_dir / '{}-{}.tar.gz'.format(
        self.metadata.name, self.metadata.version
    )
    source_date_epoch = os.environ.get('SOURCE_DATE_EPOCH', '')
    mtime = int(source_date_epoch) if source_date_epoch else None
    gz = GzipFile(str(target), mode='wb', mtime=mtime)
    tf = tarfile.TarFile(str(target), mode='w', fileobj=gz,
                         format=tarfile.PAX_FORMAT)

    try:
        files_to_add = self.apply_includes_excludes(self.select_files())

        for relpath in files_to_add:
            path = str(self.cfgdir / relpath)
            ti = tf.gettarinfo(path, arcname=pjoin(self.dir_name, relpath))
            ti = clean_tarinfo(ti, mtime)

            if ti.isreg():
                with open(path, 'rb') as f:
                    tf.addfile(ti, f)
            else:
                tf.addfile(ti)  # Symlinks & ?

        if gen_setup_py:
            self.add_setup_py(files_to_add, tf)

        stream = io.StringIO()
        self.metadata.write_metadata_file(stream)
        pkg_info = stream.getvalue().encode()
        ti = tarfile.TarInfo(pjoin(self.dir_name, 'PKG-INFO'))
        ti.size = len(pkg_info)
        tf.addfile(ti, io.BytesIO(pkg_info))
    finally:
        tf.close()
        gz.close()

    log.info("Built sdist: %s", target)
    return target
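# Hypothetical usage sketch for the builder above: setting SOURCE_DATE_EPOCH
# pins the gzip and tar member timestamps, which helps make repeated builds of
# the same tree produce identical sdists. The builder class name and its
# construction are assumptions, not part of the code shown here.
import os
from pathlib import Path

os.environ['SOURCE_DATE_EPOCH'] = '1577836800'  # 2020-01-01T00:00:00Z
builder = SdistBuilder(...)  # construct however the surrounding project does
builder.build(Path('dist'))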
def test_load_data_partition(self):
    df = self.df

    unpart_t = self.db.table(self.pd_name)
    part_keys = ['year', 'month']
    part_t = self._create_partitioned_table(unpart_t.schema(), part_keys)

    # trim the runtime of this test
    df = df[df.month == '1'].reset_index(drop=True)

    unique_keys = df[part_keys].drop_duplicates()

    hdfs_dir = pjoin(self.tmp_dir, 'load-data-partition')

    df2 = df.drop(['year', 'month'], axis='columns')

    csv_props = {
        'serialization.format': ',',
        'field.delim': ','
    }

    for i, (year, month) in enumerate(unique_keys.itertuples(index=False)):
        chunk = df2[(df.year == year) & (df.month == month)]
        chunk_path = pjoin(hdfs_dir, '{0}.csv'.format(i))

        self.con.write_dataframe(chunk, chunk_path)

        # test both styles of insert
        if i:
            part = {'year': year, 'month': month}
        else:
            part = [year, month]

        part_t.add_partition(part)
        part_t.alter_partition(part, format='text',
                               serde_properties=csv_props)
        part_t.load_data(chunk_path, partition=part)

    self.hdfs.rmdir(hdfs_dir)
    self._verify_partitioned_table(part_t, df, unique_keys)
def test_close_drops_temp_tables(con, test_data_dir):
    from posixpath import join as pjoin

    hdfs_path = pjoin(test_data_dir, 'parquet/tpch_region')

    table = con.parquet_file(hdfs_path)
    name = table.op().name
    assert con.exists_table(name) is True
    con.close()
    assert not con.exists_table(name)
def setUpClass(cls):
    cls.ENV = ENV
    cls.tmp_dir = pjoin(cls.ENV.tmp_dir, util.guid())
    if cls.ENV.auth_mechanism in ['GSSAPI', 'LDAP']:
        print("Warning: ignoring invalid Certificate Authority errors")
    cls.hdfs = ibis.hdfs_connect(
        host=cls.ENV.nn_host,
        port=cls.ENV.webhdfs_port,
        auth_mechanism=cls.ENV.auth_mechanism,
        verify=(cls.ENV.auth_mechanism not in ['GSSAPI', 'LDAP']),
        user=cls.ENV.webhdfs_user)
    cls.hdfs.mkdir(cls.tmp_dir)
def test_query_parquet_infer_schema(self):
    hdfs_path = pjoin(self.test_data_dir, 'parquet/tpch_region')
    table = self.con.parquet_file(hdfs_path)

    # NOTE: the actual schema should have an int16, but because this is
    # being inferred from a Parquet file, which has no notion of int16,
    # the inferred schema will have an int32 instead.
    ex_schema = ibis.schema([('r_regionkey', 'int32'),
                             ('r_name', 'string'),
                             ('r_comment', 'string')])

    assert_equal(table.schema(), ex_schema)
def setUpClass(cls):
    cls.ENV = ENV
    cls.tmp_dir = pjoin(cls.ENV.tmp_dir, util.guid())
    if cls.ENV.use_kerberos:
        print("Warning: ignoring invalid Certificate Authority errors")
    # NOTE: specifying superuser as set in IbisTestEnv
    cls.hdfs = ibis.hdfs_connect(host=cls.ENV.nn_host,
                                 port=cls.ENV.webhdfs_port,
                                 use_kerberos=cls.ENV.use_kerberos,
                                 verify=(not cls.ENV.use_kerberos),
                                 user=cls.ENV.hdfs_superuser)
    cls.hdfs.mkdir(cls.tmp_dir)
def find_packages(cls, include):
    """
    Discover subpackages and data.

    It also retrieves necessary files.
    """
    pkgdir = None
    if include.source is not None:
        pkgdir = str(include.base)

    base = str(include.elements[0].parent)

    pkg_name = include.package
    pkg_data = defaultdict(list)
    # Undocumented distutils feature:
    # the empty string matches all package names
    pkg_data[""].append("*")
    packages = [pkg_name]
    subpkg_paths = set()

    def find_nearest_pkg(rel_path):
        parts = rel_path.split(os.sep)
        for i in reversed(range(1, len(parts))):
            ancestor = "/".join(parts[:i])
            if ancestor in subpkg_paths:
                pkg = ".".join([pkg_name] + parts[:i])
                return pkg, "/".join(parts[i:])

        # Relative to the top-level package
        return pkg_name, rel_path

    for path, dirnames, filenames in os.walk(str(base), topdown=True):
        if os.path.basename(path) == "__pycache__":
            continue

        from_top_level = os.path.relpath(path, base)
        if from_top_level == ".":
            continue

        is_subpkg = "__init__.py" in filenames
        if is_subpkg:
            subpkg_paths.add(from_top_level)
            parts = from_top_level.split(os.sep)
            packages.append(".".join([pkg_name] + parts))
        else:
            pkg, from_nearest_pkg = find_nearest_pkg(from_top_level)
            pkg_data[pkg].append(pjoin(from_nearest_pkg, "*"))

    # Sort values in pkg_data
    pkg_data = {k: sorted(v) for (k, v) in pkg_data.items()}

    return pkgdir, sorted(packages), pkg_data