Пример #1
0
def main():
    """Write a table to a parquet file on the local disk or on S3.

    Command-line arguments:
        path: destination of the parquet file
            (default ``./data/data.parquet``).
        --source: optional local path passed to ``import_table``.
        --endpoint: S3 endpoint URL; when given, the file is written through
            ``S3FileSystem`` instead of ``LocalFileSystem``.
        --access_key / --secret_key: S3 credentials forwarded to
            ``S3FileSystem``.

    The written table is also printed as a pandas DataFrame.
    """
    parser = argparse.ArgumentParser(
        description="Generate sample parquet data")
    parser.add_argument('path',
                        type=str,
                        nargs='?',
                        help='path to save data to',
                        default="./data/data.parquet")
    parser.add_argument(
        '--source',
        type=str,
        help=
        'local path to import data from (optional; can be csv, json or parquet)'
    )
    parser.add_argument(
        '--endpoint',
        type=str,
        help=
        'S3 endpoint (e.g.: https://s3.eu-de.cloud-object-storage.appdomain.cloud'
    )
    parser.add_argument('--access_key', type=str, help='S3 access key')
    parser.add_argument('--secret_key', type=str, help='S3 secret key')
    args = parser.parse_args()

    if args.endpoint:
        print("Using S3 file system")
        # Split the URL so host and scheme can be passed separately.
        parsed_endpoint = urlparse(args.endpoint)
        fs = S3FileSystem(endpoint_override=parsed_endpoint.netloc,
                          scheme=parsed_endpoint.scheme,
                          access_key=args.access_key,
                          secret_key=args.secret_key,
                          background_writes=False)
    else:
        print("Using local file system")
        # os.path.dirname() returns "" for a bare file name and
        # os.makedirs("") raises FileNotFoundError, so only create the
        # parent directory when the path actually names one.
        parent_dir = os.path.dirname(args.path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        fs = LocalFileSystem()

    # args.source is None when --source is omitted; import_table is expected
    # to cope with that (its definition is outside this block).
    table = import_table(args.source)

    with fs.open_output_stream(args.path) as f:
        pq.write_table(table, f)
    print("Table written to", args.path)
    print(table.to_pandas())
Пример #2
0
class FakeHadoopFileSystem:
    """Stand-in for a Hadoop filesystem that stores everything locally.

    Each path given to a public method is re-rooted under the directory
    named by ``_hdfs_root`` and the call is then delegated to a pyarrow
    ``LocalFileSystem``.
    """

    def __init__(self, *args, **kwargs):
        """Prepare the root directory and the backing local filesystem.

        Positional and keyword arguments are accepted but not used.
        """
        from pyarrow.fs import LocalFileSystem

        self._root = Path(_hdfs_root.name)
        self._fs = LocalFileSystem()

    def _path(self, path):
        """Translate *path* (a string or ``FileSelector``) to the fake root.

        Leading slashes are stripped so that the path is always joined
        below ``self._root``. A ``FileSelector`` is rebuilt with the
        translated base directory and the same flags.
        """
        from pyarrow.fs import FileSelector

        if not isinstance(path, FileSelector):
            return os.fspath(self._root / path.lstrip("/"))

        local_base = os.fspath(self._root / path.base_dir.lstrip("/"))
        return FileSelector(local_base, path.allow_not_found, path.recursive)

    def create_dir(self, path):
        """Create the directory *path* below the fake root."""
        local_dir = self._path(path)
        return self._fs.create_dir(local_dir)

    def open_input_stream(self, path):
        """Open *path* below the fake root for reading."""
        local_file = self._path(path)
        return self._fs.open_input_stream(local_file)

    def open_output_stream(self, path):
        """Open *path* below the fake root for writing."""
        import posixpath

        # HadoopFileSystem.open_output_stream creates missing directories
        # on its own, so the parent is created here first to match that.
        parent_dir = posixpath.dirname(path)
        self.create_dir(parent_dir)

        local_file = self._path(path)
        return self._fs.open_output_stream(local_file)

    def get_file_info(self, path):
        """Return the local filesystem's file info for *path*."""
        local_target = self._path(path)
        return self._fs.get_file_info(local_target)

    def move(self, from_path, to_path):
        """Move *from_path* to *to_path*, both taken below the fake root."""
        source = self._path(from_path)
        destination = self._path(to_path)
        self._fs.move(source, destination)

    def delete_file(self, path):
        """Delete the file *path* below the fake root."""
        local_file = self._path(path)
        self._fs.delete_file(local_file)
Пример #3
0
class FakeHadoopFileSystem:
    """Local-disk stand-in exposing HadoopFileSystem-style methods.

    Paths handed to the public methods are re-rooted under the directory
    named by ``_hdfs_root`` before being forwarded to a pyarrow
    ``LocalFileSystem``. Results of ``get_file_info`` are mapped back to
    POSIX-style paths relative to that root.
    """

    def __init__(self, *args, **kwargs):
        """Prepare the root directory and the backing local filesystem.

        Positional and keyword arguments are accepted but not used.
        """
        from pyarrow.fs import LocalFileSystem

        self._root = Path(_hdfs_root.name)
        self._fs = LocalFileSystem()

    def _path(self, path):
        """Translate *path* to its location below the fake root.

        Accepts a string, a ``FileSelector`` or a list of either; lists are
        translated element by element. Leading slashes are stripped so the
        result always lies below ``self._root``.
        """
        from pyarrow.fs import FileSelector

        if isinstance(path, list):
            return [self._path(item) for item in path]

        if isinstance(path, FileSelector):
            local_base = os.fspath(self._root / path.base_dir.lstrip("/"))
            return FileSelector(
                local_base,
                path.allow_not_found,
                path.recursive,
            )

        return os.fspath(self._root / path.lstrip("/"))

    def create_dir(self, path, **kwargs):
        """Create the directory *path* below the fake root."""
        local_dir = self._path(path)
        return self._fs.create_dir(local_dir, **kwargs)

    def open_input_stream(self, path, **kwargs):
        """Open *path* below the fake root for reading."""
        local_file = self._path(path)
        return self._fs.open_input_stream(local_file, **kwargs)

    def open_output_stream(self, path, **kwargs):
        """Open *path* below the fake root for writing."""
        import posixpath

        # HadoopFileSystem.open_output_stream creates missing directories
        # on its own, so the parent is created here first to match that.
        parent_dir = posixpath.dirname(path)
        self.create_dir(parent_dir)

        local_file = self._path(path)
        return self._fs.open_output_stream(local_file, **kwargs)

    def get_file_info(self, path, **kwargs):
        """Return file info for *path* with root-relative POSIX paths.

        A single ``FileInfo`` from the local filesystem is adjusted and
        returned as-is; otherwise the result must be a list and every
        element is adjusted.
        """
        from pyarrow.fs import FileInfo

        found = self._fs.get_file_info(self._path(path), **kwargs)
        if not isinstance(found, FileInfo):
            assert isinstance(found, list)
            return [self._adjust_entry(item) for item in found]

        return self._adjust_entry(found)

    def _adjust_entry(self, entry):
        """Rebuild *entry* with its path relative to the fake root.

        The relative path is split on the OS separator and re-joined with
        POSIX separators; type, mtime and size are copied unchanged.
        """
        import posixpath

        from pyarrow.fs import FileInfo

        relative = os.path.relpath(entry.path, self._root)
        segments = relative.split(os.path.sep)
        return FileInfo(
            path=posixpath.join(*segments),
            type=entry.type,
            mtime=entry.mtime,
            size=entry.size,
        )

    def move(self, from_path, to_path):
        """Move *from_path* to *to_path*, both taken below the fake root."""
        source = self._path(from_path)
        destination = self._path(to_path)
        self._fs.move(source, destination)

    def delete_file(self, path):
        """Delete the file *path* below the fake root."""
        local_file = self._path(path)
        self._fs.delete_file(local_file)