Example #1
File: client.py Project: nataliaking/ibis
    def pandas(self, df, name=None, database=None, persist=False):
        """
        Create a (possibly temp) parquet table from a local pandas DataFrame.
        """
        name, database = self._get_concrete_table_path(name, database,
                                                       persist=persist)
        qualified_name = self._fully_qualified_name(name, database)

        # write df to a temp CSV file on HDFS
        temp_csv_hdfs_dir = pjoin(options.impala.temp_hdfs_path, util.guid())
        buf = BytesIO()
        df.to_csv(buf, header=False, index=False, na_rep=r'\N')  # raw string: \N is the Hive/Impala NULL marker
        self.hdfs.put(pjoin(temp_csv_hdfs_dir, '0.csv'), buf)

        # define a temporary table using delimited data
        schema = util.pandas_to_ibis_schema(df)
        table = self.delimited_file(
            temp_csv_hdfs_dir, schema,
            name='ibis_tmp_pandas_{0}'.format(util.guid()), database=database,
            external=True, persist=False)

        # CTAS into Parquet
        self.create_table(name, expr=table, database=database,
                          format='parquet', overwrite=False)

        # cleanup
        self.hdfs.delete(temp_csv_hdfs_dir, recursive=True)

        return self._wrap_new_table(qualified_name, persist)
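
A hedged usage sketch for Example #1; the connection parameters below are placeholders and the `ibis.hdfs_connect`/`ibis.impala.connect` setup is an assumption, not part of the source.

# Hypothetical driver code -- hosts and ports are illustrative only.
import ibis
import pandas as pd

hdfs = ibis.hdfs_connect(host='localhost', port=50070)
con = ibis.impala.connect(host='localhost', port=21050, hdfs_client=hdfs)

df = pd.DataFrame({'r_regionkey': [1, 2], 'r_name': ['AMERICA', 'ASIA']})
# Round-trips the frame through a temp CSV on HDFS, then CTAS into Parquet.
t = con.pandas(df, name='region_copy', persist=True)
print(t.execute())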
Example #2
    def put(self, hdfs_path, resource, overwrite=False, verbose=None,
            **kwargs):
        verbose = verbose or options.verbose
        is_path = isinstance(resource, six.string_types)

        if is_path and osp.isdir(resource):
            for dirpath, dirnames, filenames in os.walk(resource):
                rel_dir = osp.relpath(dirpath, resource)
                if rel_dir == '.':
                    rel_dir = ''
                for fpath in filenames:
                    abs_path = osp.join(dirpath, fpath)
                    rel_hdfs_path = pjoin(hdfs_path, rel_dir, fpath)
                    self.put(rel_hdfs_path, abs_path, overwrite=overwrite,
                             verbose=verbose, **kwargs)
        else:
            if is_path:
                basename = os.path.basename(resource)
                if self.exists(hdfs_path):
                    if self.status(hdfs_path)['type'] == 'DIRECTORY':
                        hdfs_path = pjoin(hdfs_path, basename)
                if verbose:
                    self.log('Writing local {0} to HDFS {1}'.format(resource,
                                                                    hdfs_path))
                self.client.upload(hdfs_path, resource,
                                   overwrite=overwrite, **kwargs)
            else:
                if verbose:
                    self.log('Writing buffer to HDFS {0}'.format(hdfs_path))
                resource.seek(0)
                self.client.write(hdfs_path, resource, overwrite=overwrite,
                                  **kwargs)
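
A short sketch of the two call modes `put` supports, a local path and a seekable buffer (paths here are illustrative):

from io import BytesIO

# Local file: delegated to client.upload; if hdfs_path is an existing
# DIRECTORY, the file lands inside it under its own basename.
hdfs.put('/tmp/demo', 'local/data.csv', overwrite=True)

# File-like object: rewound with seek(0), then streamed via client.write.
buf = BytesIO(b'a,b\n1,2\n')
hdfs.put('/tmp/demo/from-buffer.csv', buf, overwrite=True)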
Example #3
def test_load_data_partition(con, hdfs, tmp_dir, unpart_t, df, temp_table):
    part_keys = ['year', 'month']

    con.create_table(temp_table, schema=unpart_t.schema(), partition=part_keys)
    part_t = con.table(temp_table)

    # trim the runtime of this test
    df = df[df.month == '1'].reset_index(drop=True)

    unique_keys = df[part_keys].drop_duplicates()

    hdfs_dir = pjoin(tmp_dir, 'load-data-partition')

    df2 = df.drop(['year', 'month'], axis='columns')

    csv_props = {'serialization.format': ',', 'field.delim': ','}

    for i, (year, month) in enumerate(unique_keys.itertuples(index=False)):
        chunk = df2[(df.year == year) & (df.month == month)]
        chunk_path = pjoin(hdfs_dir, '{}.csv'.format(i))

        con.write_dataframe(chunk, chunk_path)

        # test both styles of insert
        if i:
            part = {'year': year, 'month': month}
        else:
            part = [year, month]

        part_t.add_partition(part)
        part_t.alter_partition(part, format='text', serde_properties=csv_props)
        part_t.load_data(chunk_path, partition=part)

    hdfs.rmdir(hdfs_dir)
    verify_partitioned_table(part_t, df, unique_keys)
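
The "test both styles of insert" comment exercises the two partition-spec forms the partition helpers accept; a minimal illustration (values made up):

# Keyed form: explicit column -> value mapping.
part_as_dict = {'year': 2010, 'month': 1}

# Positional form: values given in partition-key order (['year', 'month']).
part_as_list = [2010, 1]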
Example #4
    def test_mv_to_directory(self):
        remote_file = self._make_random_hdfs_file()
        dest_dir = pjoin(self.tmp_dir, util.guid())
        self.hdfs.mkdir(dest_dir)
        self.hdfs.mv(remote_file, dest_dir)
        new_remote_file = pjoin(dest_dir, os.path.basename(remote_file))
        file_status = self.hdfs.status(new_remote_file)
        assert file_status['type'] == 'FILE'
Example #5
    def test_ls(self):
        test_dir = pjoin(self.tmp_dir, 'ls-test')
        self.hdfs.mkdir(test_dir)
        for i in range(10):
            local_path = self._make_random_file()
            hdfs_path = pjoin(test_dir, local_path)
            self.hdfs.put(hdfs_path, local_path)
        assert len(self.hdfs.ls(test_dir)) == 10
Example #6
File: Gallery.py Project: Tapyr/tapyr
    def __init__ (self, pic_dir, ** kw) :
        base           = Filename (pic_dir).base
        name           = pjoin (base, u"")
        pic_dir_abs    = sos.path.abspath (pic_dir)
        self.im_dir    = pjoin (pic_dir_abs, "im")
        self.th_dir    = pjoin (pic_dir_abs, "th")
        self._entries  = []
        self.__super.__init__ (name = name, pic_dir = pic_dir, ** kw)
Example #7
File: test_ddl.py Project: koverholt/ibis
    def test_create_database_with_location(self):
        base = pjoin(self.tmp_dir, util.guid())
        name = '__ibis_test_{0}'.format(util.guid())
        tmp_path = pjoin(base, name)

        self.con.create_database(name, path=tmp_path)
        assert self.hdfs.exists(base)
        self.con.drop_database(name)
        self.hdfs.rmdir(base)
Example #8
    def test_create_table_with_location(self):
        base = pjoin(self.tmp_dir, util.guid())
        name = "test_{0}".format(util.guid())
        tmp_path = pjoin(base, name)

        expr = self.alltypes
        table_name = _random_table_name()

        self.con.create_table(table_name, expr=expr, path=tmp_path, database=self.test_data_db)
        self.temp_tables.append(".".join([self.test_data_db, table_name]))
        assert self.hdfs.exists(tmp_path)
Example #9
File: common.py Project: cloudorn/ibis
    def _create_777_tmp_dir(cls):
        base = pjoin(cls.tmp_dir, util.guid())
        tmp_path = pjoin(base, util.guid())
        env = IbisTestEnv()
        superuser_hdfs = ibis.hdfs_connect(host=env.nn_host,
                                           port=env.webhdfs_port,
                                           auth_mechanism=env.auth_mechanism,
                                           verify=(env.auth_mechanism
                                                   not in ['GSSAPI', 'LDAP']),
                                           user=env.hdfs_superuser)
        superuser_hdfs.mkdir(base)
        superuser_hdfs.chmod(base, '777')
        return tmp_path
Example #10
File: test_ddl.py Project: cloudera/ibis
def test_create_database_with_location(con, tmp_dir, hdfs):
    base = pjoin(tmp_dir, util.guid())
    name = '__ibis_test_{}'.format(util.guid())
    tmp_path = pjoin(base, name)

    con.create_database(name, path=tmp_path)
    try:
        assert hdfs.exists(base)
    finally:
        try:
            con.drop_database(name)
        finally:
            hdfs.rmdir(base)
Example #11
File: sdist.py Project: shawegit/poetry
    def build(self, target_dir=None):  # type: (Path) -> Path
        self._io.writeln(" - Building <info>sdist</info>")
        if target_dir is None:
            target_dir = self._path / "dist"

        if not target_dir.exists():
            target_dir.mkdir(parents=True)

        target = target_dir / "{}-{}.tar.gz".format(
            self._package.pretty_name, self._meta.version
        )
        gz = GzipFile(target.as_posix(), mode="wb")
        tar = tarfile.TarFile(
            target.as_posix(), mode="w", fileobj=gz, format=tarfile.PAX_FORMAT
        )

        try:
            tar_dir = "{}-{}".format(self._package.pretty_name, self._meta.version)

            files_to_add = self.find_files_to_add(exclude_build=False)

            for relpath in files_to_add:
                path = self._path / relpath
                tar_info = tar.gettarinfo(
                    str(path), arcname=pjoin(tar_dir, str(relpath))
                )
                tar_info = self.clean_tarinfo(tar_info)

                if tar_info.isreg():
                    with path.open("rb") as f:
                        tar.addfile(tar_info, f)
                else:
                    tar.addfile(tar_info)  # symlinks and other non-regular entries

            setup = self.build_setup()
            tar_info = tarfile.TarInfo(pjoin(tar_dir, "setup.py"))
            tar_info.size = len(setup)
            tar.addfile(tar_info, BytesIO(setup))

            pkg_info = self.build_pkg_info()

            tar_info = tarfile.TarInfo(pjoin(tar_dir, "PKG-INFO"))
            tar_info.size = len(pkg_info)
            tar.addfile(tar_info, BytesIO(pkg_info))
        finally:
            tar.close()
            gz.close()

        self._io.writeln(" - Built <fg=cyan>{}</>".format(target.name))

        return target
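
A hedged driver sketch for this builder; the construction below is an assumption inferred from the `self._io` and `self._package` attributes used above, not a documented API.

from pathlib import Path

# builder = SdistBuilder(poetry, env, io)   # hypothetical construction
# archive = builder.build(Path('dist'))     # returns the Path to the tarball
# print(archive.name)                       # e.g. 'mypkg-0.1.0.tar.gz'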
Example #12
    def write_temp_csv(self):
        temp_hdfs_dir = pjoin(options.impala.temp_hdfs_path,
                              'pandas_{0}'.format(util.guid()))
        self.hdfs.mkdir(temp_hdfs_dir)

        # Keep track of the temporary HDFS file
        self.temp_hdfs_dirs.append(temp_hdfs_dir)

        # Write the file to HDFS
        hdfs_path = pjoin(temp_hdfs_dir, '0.csv')

        self.write_csv(hdfs_path)

        return temp_hdfs_dir
Example #13
File: test_ddl.py Project: cloudera/ibis
def test_create_table_with_location_execute(
    con, hdfs, tmp_dir, alltypes, test_data_db, temp_table
):
    base = pjoin(tmp_dir, util.guid())
    name = 'test_{}'.format(util.guid())
    tmp_path = pjoin(base, name)

    expr = alltypes
    table_name = temp_table

    con.create_table(
        table_name, obj=expr, location=tmp_path, database=test_data_db
    )
    assert hdfs.exists(tmp_path)
Example #14
    def test_get_file_overwrite(self):
        local_path = self._make_random_file()
        local_path2 = self._make_random_file()

        remote_path = pjoin(self.tmp_dir, local_path)
        self.hdfs.put(remote_path, local_path)

        remote_path2 = pjoin(self.tmp_dir, local_path2)
        self.hdfs.put(remote_path2, local_path2)

        with self.assertRaises(IOError):
            self.hdfs.get(remote_path, '.')

        self.hdfs.get(remote_path, local_path2, overwrite=True)
        assert open(local_path2).read() == open(local_path).read()
Example #15
    def test_get_directory_overwrite(self):
        local_dir = self._make_test_directory()
        local_dir2 = self._make_test_directory()

        remote_dir = pjoin(self.tmp_dir, local_dir)
        remote_dir2 = pjoin(self.tmp_dir, local_dir2)

        self.hdfs.put(remote_dir, local_dir)
        self.hdfs.put(remote_dir2, local_dir2)

        self.hdfs.get(remote_dir, local_dir2, overwrite=True)
        _check_directories_equal(local_dir2, local_dir)

        self.hdfs.get(remote_dir, local_dir2, overwrite=True)
        _check_directories_equal(local_dir2, local_dir)
Example #16
    def test_size(self):
        test_dir = pjoin(self.tmp_dir, 'size-test')

        K = 2048
        path = self._make_random_file(size=K)
        hdfs_path = pjoin(test_dir, path)
        self.hdfs.put(hdfs_path, path)
        assert self.hdfs.size(hdfs_path) == K

        size_test_dir = self._sample_nested_directory()

        hdfs_path = pjoin(test_dir, size_test_dir)
        self.hdfs.put(hdfs_path, size_test_dir)

        assert self.hdfs.size(hdfs_path) == K * 7
Example #17
def create_parquet_tables(con):
    parquet_files = con.hdfs.ls(pjoin(ENV.test_data_dir, 'parquet'))
    schemas = {
        'functional_alltypes': ibis.schema(
            [('id', 'int32'),
             ('bool_col', 'boolean'),
             ('tinyint_col', 'int8'),
             ('smallint_col', 'int16'),
             ('int_col', 'int32'),
             ('bigint_col', 'int64'),
             ('float_col', 'float'),
             ('double_col', 'double'),
             ('date_string_col', 'string'),
             ('string_col', 'string'),
             ('timestamp_col', 'timestamp'),
             ('year', 'int32'),
             ('month', 'int32')]),
        'tpch_region': ibis.schema(
            [('r_regionkey', 'int16'),
             ('r_name', 'string'),
             ('r_comment', 'string')])}

    tables = []

    for path in parquet_files:
        head, table_name = posixpath.split(path)
        print('Creating {0}'.format(table_name))
        # fall back to Impala's schema inference when no schema is listed
        schema = schemas.get(table_name)
        t = con.parquet_file(path, schema=schema, name=table_name,
                             database=ENV.test_data_db, persist=True)
        tables.append(t)

    return tables
Example #18
    def mkdir(self, dir_path, create_parent=False):
        # ugh, see #252

        # create a temporary file, then delete it
        dummy = pjoin(dir_path, util.guid())
        self.client.write(dummy, '')
        self.client.delete(dummy)
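
The workaround leans on WebHDFS creating intermediate directories when a file is written. A hedged alternative, assuming the underlying `hdfs` client exposes `makedirs`:

# Equivalent without the dummy-file dance, if the client supports it:
# self.client.makedirs(dir_path)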
Example #19
def create_parquet_tables(con):
    parquet_files = con.hdfs.ls(pjoin(ENV.test_data_dir, "parquet"))
    schemas = {
        "functional_alltypes": ibis.schema(
            [
                ("id", "int32"),
                ("bool_col", "boolean"),
                ("tinyint_col", "int8"),
                ("smallint_col", "int16"),
                ("int_col", "int32"),
                ("bigint_col", "int64"),
                ("float_col", "float"),
                ("double_col", "double"),
                ("date_string_col", "string"),
                ("string_col", "string"),
                ("timestamp_col", "timestamp"),
                ("year", "int32"),
                ("month", "int32"),
            ]
        ),
        "tpch_region": ibis.schema([("r_regionkey", "int16"), ("r_name", "string"), ("r_comment", "string")]),
    }

    tables = []

    for path in parquet_files:
        head, table_name = posixpath.split(path)
        print("Creating {0}".format(table_name))
        # fall back to Impala's schema inference when no schema is listed
        schema = schemas.get(table_name)
        t = con.parquet_file(path, schema=schema, name=table_name, database=ENV.test_data_db, persist=True)
        tables.append(t)

    return tables
Example #20
    def test_get_directory_nested_dirs(self):
        local_dir = util.guid()
        local_download_dir = util.guid()

        K = 5

        os.mkdir(local_dir)

        try:
            for i in range(K):
                self._make_random_file(directory=local_dir)

            nested_dir = osp.join(local_dir, 'nested-dir')
            shutil.copytree(local_dir, nested_dir)

            remote_dir = pjoin(self.tmp_dir, local_dir)
            self.hdfs.put(remote_dir, local_dir)

            # download directory and check contents
            self.hdfs.get(remote_dir, local_download_dir)

            _check_directories_equal(local_dir, local_download_dir)

            self._try_delete_directory(local_download_dir)

            self.hdfs.rmdir(remote_dir)
            assert not self.hdfs.exists(remote_dir)
        finally:
            shutil.rmtree(local_dir)
Example #21
    def put_tarfile(self, hdfs_path, local_path, compression='gzip',
                    verbose=None, overwrite=False):
        """
        Write contents of tar archive to HDFS directly without having to
        decompress it locally first

        Parameters
        ----------
        hdfs_path : string
        local_path : string
        compression : {'gzip', 'bz2', None}
        overwrite : boolean, default False
        verbose : boolean, default None (global default)
        """
        import tarfile
        modes = {
            None: 'r',
            'gzip': 'r:gz',
            'bz2': 'r:bz2'
        }

        if compression not in modes:
            raise ValueError('Invalid compression type {0}'
                             .format(compression))
        mode = modes[compression]

        tf = tarfile.open(local_path, mode=mode)
        for info in tf:
            if not info.isfile():
                continue

            buf = tf.extractfile(info)
            abspath = pjoin(hdfs_path, info.path)
            self.put(abspath, buf, verbose=verbose, overwrite=overwrite)
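
Usage sketch (archive name and destination are placeholders): each regular member of the tarball is streamed into HDFS with one put() call, so the archive is never unpacked on local disk.

hdfs.put_tarfile('/tmp/unpacked', 'backup.tar.gz',
                 compression='gzip', overwrite=True, verbose=True)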
Example #22
    def test_get_directory_into_directory(self):
        local_path1 = self._make_test_directory()
        local_path2 = self._make_test_directory()
        remote_path = pjoin(self.tmp_dir, local_path1)
        self.hdfs.put(remote_path, local_path1)
        local_path3 = self.hdfs.get(remote_path, local_path2)
        _check_directories_equal(local_path3, local_path1)
Example #23
File: test_ddl.py Project: koverholt/ibis
    def test_cleanup_tmp_table_on_gc(self):
        hdfs_path = pjoin(self.test_data_dir, 'parquet/tpch_region')
        table = self.con.parquet_file(hdfs_path)
        name = table.op().name
        table = None
        gc.collect()
        _assert_table_not_exists(self.con, name)
Example #24
    def test_get_directory_overwrite_directory(self):
        local_path1 = self._make_test_directory()
        local_path2 = self._make_test_directory()
        remote_path = pjoin(self.tmp_dir, local_path2)
        self.hdfs.put(remote_path, local_path1)
        self.hdfs.get(remote_path, osp.dirname(local_path2), overwrite=True)
        _check_directories_equal(local_path1, local_path2)
Example #25
    def test_put_get_directory(self):
        local_dir = util.guid()
        local_download_dir = util.guid()

        K = 5

        os.mkdir(local_dir)

        try:
            for i in range(K):
                self._make_random_file(directory=local_dir)

            remote_dir = pjoin(self.tmp_dir, local_dir)
            self.hdfs.put(remote_dir, local_dir)

            assert self.hdfs.exists(remote_dir)
            assert len(self.hdfs.ls(remote_dir)) == K

            # download directory and check contents
            self.hdfs.get(remote_dir, local_download_dir)

            _check_directories_equal(local_dir, local_download_dir)

            self._try_delete_directory(local_download_dir)

            self.hdfs.rmdir(remote_dir)
            assert not self.hdfs.exists(remote_dir)
        finally:
            shutil.rmtree(local_dir)
Example #26
File: test_ddl.py Project: koverholt/ibis
    def test_temp_table_concurrency(self):
        pytest.skip('Cannot get this test to run under pytest')

        from threading import Thread, Lock
        import gc
        nthreads = 4

        hdfs_path = pjoin(self.test_data_dir, 'parquet/tpch_region')

        lock = Lock()

        results = []

        def do_something():
            t = self.con.parquet_file(hdfs_path)

            with lock:
                t.limit(10).execute()
                t = None
                gc.collect()
                results.append(True)

        threads = []
        for i in range(nthreads):
            t = Thread(target=do_something)
            t.start()
            threads.append(t)

        [x.join() for x in threads]

        assert results == [True] * nthreads
Example #27
File: test_ddl.py Project: koverholt/ibis
    def test_query_avro(self):
        hdfs_path = pjoin(self.test_data_dir, 'avro/tpch_region_avro')

        avro_schema = {
            "fields": [
                {"type": ["int", "null"], "name": "R_REGIONKEY"},
                {"type": ["string", "null"], "name": "R_NAME"},
                {"type": ["string", "null"], "name": "R_COMMENT"}],
            "type": "record",
            "name": "a"
        }

        table = self.con.avro_file(hdfs_path, avro_schema,
                                   database=self.tmp_db)

        name = table.op().name
        assert name.startswith('{0}.'.format(self.tmp_db))

        # table exists
        self.con.table(name)

        expr = table.r_name.value_counts()
        expr.execute()

        assert table.count().execute() == 5

        df = table.execute()
        assert len(df) == 5
Example #28
File: test_ddl.py Project: cloudera/ibis
def test_change_location(con, table, tmp_dir, path_uuid):
    old_loc = table.metadata().location

    new_path = pjoin(tmp_dir, 'new-path')
    table.alter(location=new_path)

    new_loc = table.metadata().location
    assert new_loc == old_loc.replace(path_uuid, 'new-path')
Example #29
File: test_ddl.py Project: koverholt/ibis
    def test_change_location(self):
        old_loc = self.table.metadata().location

        new_path = pjoin(self.tmp_dir, 'new-path')
        self.table.alter(location=new_path)

        new_loc = self.table.metadata().location
        assert new_loc == old_loc.replace(self.path_uuid, 'new-path')
Example #30
File: api.py Project: luqasz/librouteros
    def joinPath(*path):
        """
        Join two or more paths forming a command word.

        >>> api.joinPath('/ip', 'address', 'print')
        '/ip/address/print'
        """
        return pjoin('/', *path).rstrip('/')
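
A standalone check of the slash handling, reimplemented here so it runs without a router connection:

from posixpath import join as pjoin

def joinPath(*path):
    return pjoin('/', *path).rstrip('/')

assert joinPath('ip', 'address', 'print') == '/ip/address/print'
assert joinPath('/ip/') == '/ip'   # trailing slash stripped
assert joinPath() == ''            # bare root collapses to an empty string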
Example #31
    def test_put_get_delete_file(self):
        dirpath = pjoin(self.tmp_dir, 'write-delete-test')
        self.hdfs.mkdir(dirpath)

        lpath = self._make_random_file()
        fpath = pjoin(dirpath, lpath)

        self.hdfs.put(fpath, lpath)
        assert self.hdfs.exists(fpath)

        try:
            dpath = util.guid()
            self.hdfs.get(fpath, dpath)
            assert _contents_equal(dpath, lpath)
            os.remove(dpath)
        finally:
            self.hdfs.rm(fpath)
            assert not self.hdfs.exists(fpath)
Example #32
def test_put_get_delete_file(hdfs, tmp_dir, random_file):
    dirpath = pjoin(tmp_dir, 'write-delete-test')
    hdfs.mkdir(dirpath)

    lpath = random_file
    fpath = pjoin(dirpath, lpath)

    hdfs.put(fpath, lpath)
    assert hdfs.exists(fpath)

    try:
        dpath = util.guid()
        hdfs.get(fpath, dpath)
        assert filecmp.cmp(dpath, lpath, shallow=False)
        os.remove(dpath)
    finally:
        hdfs.rm(fpath)
        assert not hdfs.exists(fpath)
Example #33
def get_ibis_test_data(local_path):
    cmd = 'cd {0} && wget {1} && tar -xzf {2}'.format(
        local_path, IBIS_TEST_DATA_URL, os.path.basename(IBIS_TEST_DATA_URL))
    subprocess.check_call(cmd, shell=True)
    data_dir = pjoin(local_path,
                     os.path.basename(IBIS_TEST_DATA_URL).split('.', 2)[0])
    print('Downloaded {0} and unpacked it to {1}'.format(
        IBIS_TEST_DATA_URL, data_dir))
    return data_dir
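
The unpacked directory name is derived from the tarball's basename; a quick check of that string manipulation (URL is a placeholder):

from os.path import basename

url = 'https://example.com/ibis-testing-data.tar.gz'
assert basename(url).split('.', 2)[0] == 'ibis-testing-data'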
Example #34
def test_put_get_tarfile(hdfs, tmp_dir, tmp_path):
    test_dir = pjoin(tmp_dir, 'tarfile-test')

    dirname = sample_nested_directory()

    # Define randname before the try so the cleanup in the finally block
    # cannot hit a NameError if the tar step fails.
    randname = util.guid()

    try:
        tf_name = tmp_path / f'{dirname}.tar.gz'
        subprocess.check_call(f'tar zc {dirname} > {tf_name}', shell=True)
        hdfs_path = pjoin(test_dir, randname)
        hdfs.put_tarfile(hdfs_path, tf_name, compression='gzip')

        hdfs.get(hdfs_path, '.')
        _check_directories_equal(osp.join(randname, dirname), dirname)
    finally:
        shutil.rmtree(dirname, ignore_errors=True)
        shutil.rmtree(osp.join(randname, dirname), ignore_errors=True)
Example #35
def _gen(parent, src_dir):
    for f in sos.expanded_globs(pjoin(src_dir, "*.txt")):
        info = _page_info(f)
        if info:
            n = info.get("name", f)
            info["perma_name"] = base = Filename(n).base
            info["name"] = name = "%s.html" % (base, )
            yield GTW.RST.TOP.Page_ReST \
                (parent = parent, src_dir = src_dir, ** info)
Example #36
def test_cleanup_tmp_table_on_gc(con, test_data_dir):
    import gc

    hdfs_path = pjoin(test_data_dir, 'parquet/tpch_region')
    table = con.parquet_file(hdfs_path)
    name = table.op().name
    table = None
    gc.collect()
    assert not con.exists_table(name)
Example #37
    def test_create_table_persist_fails_if_called_twice(self):
        tname = util.guid()

        hdfs_path = pjoin(self.test_data_dir, 'parquet/tpch_region')
        self.con.parquet_file(hdfs_path, name=tname, persist=True)
        self.temp_tables.append(tname)

        with self.assertRaises(HS2Error):
            self.con.parquet_file(hdfs_path, name=tname, persist=True)
Example #38
def test_create_table_persist_fails_if_called_twice(con, temp_table_db,
                                                    test_data_dir):
    tmp_db, tname = temp_table_db

    hdfs_path = pjoin(test_data_dir, 'parquet/tpch_region')
    con.parquet_file(hdfs_path, name=tname, persist=True, database=tmp_db)

    with pytest.raises(HS2Error):
        con.parquet_file(hdfs_path, name=tname, persist=True, database=tmp_db)
Example #39
def test_chown_owner_directory(
    hdfs_superuser,
    tmp_dir,
    random_hdfs_superuser_file,
):
    new_owner = 'randomowner'
    path = pjoin(tmp_dir, util.guid())
    hdfs_superuser.mkdir(path)
    hdfs_superuser.chown(path, new_owner)
    assert hdfs_superuser.status(path)['owner'] == new_owner
Example #40
def test_close_drops_temp_tables(con, test_data_dir):
    hdfs_path = pjoin(test_data_dir, 'parquet/tpch_region')

    table = con.parquet_file(hdfs_path)

    name = table.op().name
    assert len(con.list_tables(like=name))
    con.close()

    assert not len(con.list_tables(like=name))
Example #41
def test_chown_group_directory(
    hdfs_superuser,
    tmp_dir,
    random_hdfs_superuser_file,
):
    new_group = 'randomgroup'
    path = pjoin(tmp_dir, util.guid())
    hdfs_superuser.mkdir(path)
    hdfs_superuser.chown(path, group=new_group)
    assert hdfs_superuser.status(path)['group'] == new_group
Example #42
    def write_csv(self):
        import csv

        temp_hdfs_dir = pjoin(options.impala.temp_hdfs_path,
                              'pandas_{0}'.format(util.guid()))

        tmp_path = 'tmp_{0}.csv'.format(util.guid())
        f = open(tmp_path, 'w+')

        try:
            # Write the DataFrame to the temporary file path
            if options.verbose:
                log('Writing DataFrame to temporary file')

            self.df.to_csv(f,
                           header=False,
                           index=False,
                           sep=',',
                           quoting=csv.QUOTE_NONE,
                           escapechar='\\',
                           na_rep='#NULL')
            f.seek(0)

            # Write the file to HDFS
            hdfs_path = pjoin(temp_hdfs_dir, '0.csv')

            if options.verbose:
                log('Writing CSV to HDFS: {0}'.format(hdfs_path))

            self.hdfs.put(hdfs_path, f)

            # Keep track of the temporary HDFS file
            self.temp_hdfs_dirs.append(temp_hdfs_dir)

            self.csv_dir = temp_hdfs_dir
        finally:
            f.close()
            try:
                os.remove(tmp_path)
            except os.error:
                pass

        return temp_hdfs_dir
Example #43
    def put(self,
            hdfs_path,
            resource,
            overwrite=False,
            verbose=None,
            **kwargs):
        verbose = verbose or options.verbose
        is_path = isinstance(resource, six.string_types)

        if is_path and osp.isdir(resource):
            for dirpath, dirnames, filenames in os.walk(resource):
                rel_dir = osp.relpath(dirpath, resource)
                if rel_dir == '.':
                    rel_dir = ''
                for fpath in filenames:
                    abs_path = osp.join(dirpath, fpath)
                    rel_hdfs_path = pjoin(hdfs_path, rel_dir, fpath)
                    self.put(rel_hdfs_path,
                             abs_path,
                             overwrite=overwrite,
                             verbose=verbose,
                             **kwargs)
        else:
            if is_path:
                basename = os.path.basename(resource)
                if self.exists(hdfs_path):
                    if self.status(hdfs_path)['type'] == 'DIRECTORY':
                        hdfs_path = pjoin(hdfs_path, basename)
                if verbose:
                    self.log('Writing local {0} to HDFS {1}'.format(
                        resource, hdfs_path))
                self.client.upload(hdfs_path,
                                   resource,
                                   overwrite=overwrite,
                                   **kwargs)
            else:
                if verbose:
                    self.log('Writing buffer to HDFS {0}'.format(hdfs_path))
                resource.seek(0)
                self.client.write(hdfs_path,
                                  resource,
                                  overwrite=overwrite,
                                  **kwargs)
Example #44
def Page_ReST_F(parent, src_dir, name, **kw):
    src_path = pjoin(src_dir, Filename(".txt", name).name)
    src_contents = _file_contents(src_path)
    return GTW.RST.TOP.Page_ReST \
        ( parent       = parent
        , src_dir      = src_dir
        , name         = name
        , src_contents = src_contents
        , ** kw
        )
Example #45
    def setUpClass(cls):
        cls.ENV = ENV
        cls.tmp_dir = pjoin(cls.ENV.tmp_dir, util.guid())
        if cls.ENV.use_kerberos:
            print("Warning: ignoring invalid Certificate Authority errors")
        cls.hdfs = ibis.hdfs_connect(host=cls.ENV.nn_host,
                                     port=cls.ENV.webhdfs_port,
                                     use_kerberos=cls.ENV.use_kerberos,
                                     verify=(not cls.ENV.use_kerberos))
        cls.hdfs.mkdir(cls.tmp_dir)
Example #46
    def add_setup_py(self, files_to_add, target_tarfile):
        if 'setup.py' in files_to_add:
            log.warning(
                "Using setup.py from repository, not generating setup.py")
        else:
            setup_py = self.make_setup_py()
            log.info("Writing generated setup.py")
            ti = tarfile.TarInfo(pjoin(self.dir_name, 'setup.py'))
            ti.size = len(setup_py)
            target_tarfile.addfile(ti, io.BytesIO(setup_py))
Example #47
    def test_query_parquet_file_like_table(self):
        hdfs_path = pjoin(self.test_data_dir, 'parquet/tpch_region')

        ex_schema = ibis.schema([('r_regionkey', 'int16'),
                                 ('r_name', 'string'),
                                 ('r_comment', 'string')])

        table = self.con.parquet_file(hdfs_path, like_table='tpch_region')

        assert_equal(table.schema(), ex_schema)
Example #48
File: sdist.py Project: choldgraf/flit
    def build(self, target_dir: Path = None):
        if target_dir is None:
            target_dir = self.ini_path.parent / 'dist'
        if not target_dir.exists():
            target_dir.mkdir(parents=True)
        target = target_dir / '{}-{}.tar.gz'.format(self.metadata.name,
                                                    self.metadata.version)
        tf = tarfile.open(str(target), mode='w:gz')
        tf_dir = '{}-{}'.format(self.metadata.name, self.metadata.version)

        files_to_add = self.find_tracked_files()

        for relpath in files_to_add:
            path = self.srcdir / relpath
            tf.add(str(path), arcname=pjoin(tf_dir, relpath))

        if 'setup.py' in files_to_add:
            log.warning(
                "Using setup.py from repository, not generating setup.py")
        else:
            setup_py = self.make_setup_py()
            log.info("Writing generated setup.py")
            ti = tarfile.TarInfo(pjoin(tf_dir, 'setup.py'))
            ti.size = len(setup_py)
            tf.addfile(ti, io.BytesIO(setup_py))

        pkg_info = PKG_INFO.format(
            name=self.metadata.name,
            version=self.metadata.version,
            summary=self.metadata.summary,
            home_page=self.metadata.home_page,
            author=self.metadata.author,
            author_email=self.metadata.author_email,
        ).encode('utf-8')
        ti = tarfile.TarInfo(pjoin(tf_dir, 'PKG-INFO'))
        ti.size = len(pkg_info)
        tf.addfile(ti, io.BytesIO(pkg_info))

        tf.close()

        log.info("Built sdist: %s", target)
        return target
Example #49
def create_udf_data(con):
    ibis_home = posixpath.dirname(posixpath.dirname(os.path.abspath(__file__)))
    sep = os.sep
    path_list = ibis_home.split(sep)
    path_list += ['testing', 'udf']
    udf_dir = sep.join(path_list)
    build_list = path_list + ['build']
    build_dir = sep.join(build_list)
    subprocess.check_call('cmake . && make', shell=True, cwd=udf_dir)
    so_dir = pjoin(ENV.test_data_dir, 'udf')
    con.hdfs.put(so_dir, build_dir, verbose=True)
Example #50
    def test_get_directory_overwrite_file(self):
        try:
            local_path1 = self._make_test_directory()
            local_path2 = self._make_random_file()
            remote_path = pjoin(self.tmp_dir, local_path1)
            self.hdfs.put(remote_path, local_path1)
            self.hdfs.get(remote_path, local_path2, overwrite=True)
            _check_directories_equal(local_path1, local_path2)
        finally:
            # Path changed from file to directory, must be cleaned manually.
            self._try_delete_directory(local_path2)
Example #51
def test_size(hdfs, tmp_dir):
    test_dir = pjoin(tmp_dir, 'size-test')

    K = 2048
    path = make_random_file(size=K)
    try:
        hdfs_path = pjoin(test_dir, path)
        hdfs.put(hdfs_path, path)
        assert hdfs.size(hdfs_path) == K

        size_test_dir = sample_nested_directory()
        try:
            hdfs_path = pjoin(test_dir, size_test_dir)
            hdfs.put(hdfs_path, size_test_dir)

            assert hdfs.size(hdfs_path) == K * 7
        finally:
            shutil.rmtree(size_test_dir)
    finally:
        os.remove(path)
Example #52
        def _scrape_dir(path, dst):
            objs = self.client.list(path)
            for hpath, detail in objs:
                relpath = posixpath.relpath(hpath, hdfs_path)
                full_opath = pjoin(dst, relpath)

                if detail['type'] == 'FILE':
                    _get_file(hpath, full_opath)
                else:
                    os.makedirs(full_opath)
                    _scrape_dir(hpath, dst)
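
Note that the recursive call passes the original `dst` rather than `full_opath`; this is still correct because `relpath` is always computed against the root `hdfs_path`, so the local destination is rebuilt from the root at every depth. A quick check:

import posixpath

hdfs_path = '/data/in'                  # root being scraped
hpath = '/data/in/a/b/file.txt'         # entry two levels down
assert posixpath.relpath(hpath, hdfs_path) == 'a/b/file.txt'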
Example #53
File: test_ddl.py Project: jelitox/ibis
def test_temp_table_concurrency(con, test_data_dir):
    def limit_10(i, hdfs_path):
        t = con.parquet_file(hdfs_path)
        return t.sort_by(t.r_regionkey).limit(1, offset=i).execute()

    nthreads = 4
    hdfs_path = pjoin(test_data_dir, 'parquet/tpch_region')

    with concurrent.futures.ThreadPoolExecutor(max_workers=nthreads) as e:
        futures = [e.submit(limit_10, i, hdfs_path) for i in range(nthreads)]
    assert all(len(future.result()) for future in futures)
Example #54
File: sdist.py Project: sbidoul/flit
    def build(self, target_dir, gen_setup_py=True):
        os.makedirs(str(target_dir), exist_ok=True)
        target = target_dir / '{}-{}.tar.gz'.format(
                self.metadata.name, self.metadata.version
        )
        source_date_epoch = os.environ.get('SOURCE_DATE_EPOCH', '')
        mtime = int(source_date_epoch) if source_date_epoch else None
        gz = GzipFile(str(target), mode='wb', mtime=mtime)
        tf = tarfile.TarFile(str(target), mode='w', fileobj=gz,
                             format=tarfile.PAX_FORMAT)

        try:
            files_to_add = self.apply_includes_excludes(self.select_files())

            for relpath in files_to_add:
                path = str(self.cfgdir / relpath)
                ti = tf.gettarinfo(path, arcname=pjoin(self.dir_name, relpath))
                ti = clean_tarinfo(ti, mtime)

                if ti.isreg():
                    with open(path, 'rb') as f:
                        tf.addfile(ti, f)
                else:
                    tf.addfile(ti)  # symlinks and other non-regular entries

            if gen_setup_py:
                self.add_setup_py(files_to_add, tf)

            stream = io.StringIO()
            self.metadata.write_metadata_file(stream)
            pkg_info = stream.getvalue().encode()
            ti = tarfile.TarInfo(pjoin(self.dir_name, 'PKG-INFO'))
            ti.size = len(pkg_info)
            tf.addfile(ti, io.BytesIO(pkg_info))

        finally:
            tf.close()
            gz.close()

        log.info("Built sdist: %s", target)
        return target
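
The `SOURCE_DATE_EPOCH` handling makes the sdist reproducible; a hedged sketch of how a caller would pin the timestamp (value illustrative):

import os

os.environ['SOURCE_DATE_EPOCH'] = '1577836800'   # 2020-01-01T00:00:00Z
# With mtime pinned in both the gzip header and the cleaned tar entries,
# two builds of the same tree yield byte-identical .tar.gz archives.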
Example #55
    def test_load_data_partition(self):
        df = self.df

        unpart_t = self.db.table(self.pd_name)
        part_keys = ['year', 'month']
        part_t = self._create_partitioned_table(unpart_t.schema(),
                                                part_keys)

        # trim the runtime of this test
        df = df[df.month == '1'].reset_index(drop=True)

        unique_keys = df[part_keys].drop_duplicates()

        hdfs_dir = pjoin(self.tmp_dir, 'load-data-partition')

        df2 = df.drop(['year', 'month'], axis='columns')

        csv_props = {
            'serialization.format': ',',
            'field.delim': ','
        }

        for i, (year, month) in enumerate(unique_keys.itertuples(index=False)):
            chunk = df2[(df.year == year) & (df.month == month)]
            chunk_path = pjoin(hdfs_dir, '{0}.csv'.format(i))

            self.con.write_dataframe(chunk, chunk_path)

            # test both styles of insert
            if i:
                part = {'year': year, 'month': month}
            else:
                part = [year, month]

            part_t.add_partition(part)
            part_t.alter_partition(part, format='text',
                                   serde_properties=csv_props)
            part_t.load_data(chunk_path, partition=part)

        self.hdfs.rmdir(hdfs_dir)
        self._verify_partitioned_table(part_t, df, unique_keys)
Example #56
def test_close_drops_temp_tables(con, test_data_dir):
    from posixpath import join as pjoin

    hdfs_path = pjoin(test_data_dir, 'parquet/tpch_region')

    table = con.parquet_file(hdfs_path)

    name = table.op().name
    assert con.exists_table(name) is True
    con.close()

    assert not con.exists_table(name)
Example #57
    def setUpClass(cls):
        cls.ENV = ENV
        cls.tmp_dir = pjoin(cls.ENV.tmp_dir, util.guid())
        if cls.ENV.auth_mechanism in ['GSSAPI', 'LDAP']:
            print("Warning: ignoring invalid Certificate Authority errors")
        cls.hdfs = ibis.hdfs_connect(host=cls.ENV.nn_host,
                                     port=cls.ENV.webhdfs_port,
                                     auth_mechanism=cls.ENV.auth_mechanism,
                                     verify=(cls.ENV.auth_mechanism
                                             not in ['GSSAPI', 'LDAP']),
                                     user=cls.ENV.webhdfs_user)
        cls.hdfs.mkdir(cls.tmp_dir)
Example #58
    def test_query_parquet_infer_schema(self):
        hdfs_path = pjoin(self.test_data_dir, 'parquet/tpch_region')
        table = self.con.parquet_file(hdfs_path)

        # NOTE: the actual schema should have an int16, but because this is
        # being inferred from a parquet file, which has no notion of int16,
        # the inferred schema will have an int32 instead.
        ex_schema = ibis.schema([('r_regionkey', 'int32'),
                                 ('r_name', 'string'),
                                 ('r_comment', 'string')])

        assert_equal(table.schema(), ex_schema)
Example #59
    def setUpClass(cls):
        cls.ENV = ENV
        cls.tmp_dir = pjoin(cls.ENV.tmp_dir, util.guid())
        if cls.ENV.use_kerberos:
            print("Warning: ignoring invalid Certificate Authority errors")
        # NOTE: specifying superuser as set in IbisTestEnv
        cls.hdfs = ibis.hdfs_connect(host=cls.ENV.nn_host,
                                     port=cls.ENV.webhdfs_port,
                                     use_kerberos=cls.ENV.use_kerberos,
                                     verify=(not cls.ENV.use_kerberos),
                                     user=cls.ENV.hdfs_superuser)
        cls.hdfs.mkdir(cls.tmp_dir)
Example #60
    def find_packages(cls, include):
        """
        Discover subpackages and data.

        It also retrieves necessary files.
        """
        pkgdir = None
        if include.source is not None:
            pkgdir = str(include.base)

        base = str(include.elements[0].parent)

        pkg_name = include.package
        pkg_data = defaultdict(list)
        # Undocumented distutils feature:
        # the empty string matches all package names
        pkg_data[""].append("*")
        packages = [pkg_name]
        subpkg_paths = set()

        def find_nearest_pkg(rel_path):
            parts = rel_path.split(os.sep)
            for i in reversed(range(1, len(parts))):
                ancestor = "/".join(parts[:i])
                if ancestor in subpkg_paths:
                    pkg = ".".join([pkg_name] + parts[:i])
                    return pkg, "/".join(parts[i:])

            # Relative to the top-level package
            return pkg_name, rel_path

        for path, dirnames, filenames in os.walk(str(base), topdown=True):
            if os.path.basename(path) == "__pycache__":
                continue

            from_top_level = os.path.relpath(path, base)
            if from_top_level == ".":
                continue

            is_subpkg = "__init__.py" in filenames
            if is_subpkg:
                subpkg_paths.add(from_top_level)
                parts = from_top_level.split(os.sep)
                packages.append(".".join([pkg_name] + parts))
            else:
                pkg, from_nearest_pkg = find_nearest_pkg(from_top_level)
                pkg_data[pkg].append(pjoin(from_nearest_pkg, "*"))

        # Sort values in pkg_data
        pkg_data = {k: sorted(v) for (k, v) in pkg_data.items()}

        return pkgdir, sorted(packages), pkg_data
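
A standalone sketch of the nearest-package lookup above (POSIX `os.sep` assumed; package names made up):

import os

subpkg_paths = {'pkg_a', 'pkg_a/sub'}   # dirs that contain __init__.py
pkg_name = 'top'

def find_nearest_pkg(rel_path):
    parts = rel_path.split(os.sep)
    for i in reversed(range(1, len(parts))):
        ancestor = "/".join(parts[:i])
        if ancestor in subpkg_paths:
            return ".".join([pkg_name] + parts[:i]), "/".join(parts[i:])
    # Relative to the top-level package
    return pkg_name, rel_path

# Data under pkg_a/sub/ is attributed to package top.pkg_a.sub.
assert find_nearest_pkg('pkg_a/sub/data') == ('top.pkg_a.sub', 'data')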