def test_filesystem_from_uri_s3(s3_connection, s3_server):
    """Check that an ``s3://`` URI yields an S3FileSystem and a bucket path.

    The URI embeds the credentials and the endpoint override taken from the
    ``s3_connection`` fixture. The returned path is then used to create a
    directory, whose file info must report that same path and a directory
    type.
    """
    from pyarrow.fs import S3FileSystem

    host, port, access_key, secret_key = s3_connection

    uri_template = (
        "s3://{}:{}@mybucket/foo/bar?scheme=http&endpoint_override={}:{}"
    )
    uri = uri_template.format(access_key, secret_key, host, port)

    s3fs, bucket_path = FileSystem.from_uri(uri)
    assert isinstance(s3fs, S3FileSystem)
    assert bucket_path == "mybucket/foo/bar"

    s3fs.create_dir(bucket_path)

    # Exactly one entry is expected back for the single requested path.
    (created,) = s3fs.get_file_info([bucket_path])
    assert created.path == bucket_path
    assert created.type == FileType.Directory
def get_schema(self, uri: str):
    """Return the Spark row schema stored in a Parquet dataset's footer.

    The dataset directory is listed recursively and the footer of the first
    regular file whose path ends in ``.parquet`` is read. The value stored
    under ``self.SPARK_PARQUET_ROW_METADATA`` in the footer's key/value
    metadata is decoded as JSON and returned.

    Parameters
    ----------
    uri : str
        Location of the Parquet dataset. It is passed through
        ``normalize_uri`` before being resolved to a filesystem.

    Returns
    -------
    The JSON-decoded value of the Spark row metadata entry.

    Raises
    ------
    ValueError
        If no ``.parquet`` file exists under ``uri``, or if the footer does
        not carry the Spark row metadata entry.
    """
    fs, base_dir = FileSystem.from_uri(normalize_uri(uri))
    selector = FileSelector(base_dir, allow_not_found=True, recursive=True)

    # Only regular files qualify: a nested directory can also be named
    # "*.parquet" and cannot be opened as an input file.
    first_parquet = next(
        (
            finfo.path
            for finfo in fs.get_file_info(selector)
            if finfo.is_file and finfo.path.endswith(".parquet")
        ),
        None,
    )
    if first_parquet is None:
        # Fail with a clear message instead of passing None to the filesystem.
        raise ValueError(f"No parquet file found in dataset {uri}")

    # The footer is fully read inside the block, so the handle can be
    # released as soon as the metadata object has been built.
    with fs.open_input_file(first_parquet) as metadata_file:
        kv_metadata = pq.read_metadata(metadata_file).metadata

    try:
        # ``metadata`` may be None when the footer has no key/value entries;
        # treat that the same as a missing key.
        raw_schema = (kv_metadata or {})[self.SPARK_PARQUET_ROW_METADATA]
    except KeyError as exp:
        raise ValueError(
            f"Parquet dataset {uri} is not created via Spark") from exp
    return json.loads(raw_schema)
def __iter__(self):
    """Yield converted rows from the row groups assigned to this rank.

    Row groups are numbered consecutively across all of ``self.files``; a
    row group is read only when its running number modulo
    ``self.world_size`` equals ``self.rank``. Every row of a selected group
    is pushed through a ``RandomShuffler`` and handed to ``self._convert``
    together with ``self.spark_row_metadata`` when it is popped.
    """
    # Capacity falls back to 1 when shuffling is switched off.
    if self.shuffle:
        capacity = self.shuffler_capacity
    else:
        capacity = 1
    buffer = RandomShuffler(capacity, self.seed)

    # Running number of the next row group, counted across all files.
    next_group_no = 0
    for uri in self.files:
        filesystem, file_path = FileSystem.from_uri(uri)
        with filesystem.open_input_file(file_path) as handle:
            reader = pg.ParquetFile(handle)
            for local_idx in range(reader.num_row_groups):
                # A simple form of row-group level bucketing without
                # memory overhead.
                # Pros:
                #  - It requires zero communication to initialize the
                #    distributed policy
                #  - It uses little memory and no startup overhead, i.e.
                #    collecting row groups.
                # Cons:
                #   The drawback would be if the world size is much larger
                #   than the average number of row groups. As a result,
                #   many of the file open operations would be wasted.
                is_mine = next_group_no % self.world_size == self.rank
                next_group_no += 1
                if not is_mine:
                    continue

                table = reader.read_row_group(local_idx, columns=self.columns)
                for record_batch in table.to_batches():
                    # TODO: read batches not using pandas
                    frame = record_batch.to_pandas()
                    for _, row in frame.iterrows():
                        buffer.append(row)
                        # Maintain the shuffler buffer around its capacity.
                        while buffer.full():
                            yield self._convert(
                                buffer.pop().to_dict(),
                                self.spark_row_metadata,
                            )

    # Drain whatever is still buffered once every file has been visited.
    while buffer:
        yield self._convert(buffer.pop().to_dict(), self.spark_row_metadata)
def test_open_dataset_from_uri_s3(minio_server):
    """Open a Parquet dataset on S3 via a full URI and via a filesystem.

    A small table is written to ``mybucket/data.parquet`` and must be read
    back unchanged both when ``ds.dataset`` receives the complete URI and
    when it receives the parsed path together with the filesystem object.
    """
    # open dataset from non-localfs string path
    from pyarrow.fs import FileSystem
    import pyarrow.parquet as pq

    address, access_key, secret_key = minio_server
    uri_template = (
        "s3://{}:{}@mybucket/data.parquet?scheme=http&endpoint_override={}"
    )
    uri = uri_template.format(
        access_key, secret_key, urllib.parse.quote(address)
    )

    s3fs, object_path = FileSystem.from_uri(uri)
    s3fs.create_dir("mybucket")

    expected = pa.table({'a': [1, 2, 3]})
    with s3fs.open_output_stream("mybucket/data.parquet") as sink:
        pq.write_table(expected, sink)

    # full string URI
    from_uri = ds.dataset(uri, format="parquet")
    assert from_uri.to_table().equals(expected)

    # passing filesystem object
    from_fs = ds.dataset(object_path, format="parquet", filesystem=s3fs)
    assert from_fs.to_table().equals(expected)
def _ensure_fs(fs_or_uri):
    """Resolve a filesystem object or URI into ``(filesystem, is_local)``.

    Parameters
    ----------
    fs_or_uri : FileSystem or str
        When a string is given it is parsed as a filesystem URI. A non-empty
        path component must point to a directory and is then used as the
        prefix of a ``SubTreeFileSystem``. Any other value is handed to
        ``_ensure_filesystem``.

    Returns
    -------
    (FileSystem, bool)
        The resolved filesystem and whether it is considered local.

    Raises
    ------
    ValueError
        If the URI's path component does not point to a directory.
    TypeError
        If a non-string argument is rejected by ``_ensure_filesystem``.
    """
    from pyarrow.fs import (
        FileSystem, LocalFileSystem, SubTreeFileSystem, FileType,
        _ensure_filesystem
    )

    # Non-string input: it must already be (or wrap) a filesystem object.
    if not isinstance(fs_or_uri, str):
        try:
            resolved = _ensure_filesystem(fs_or_uri)
        except TypeError:
            raise TypeError(
                '`filesystem` argument must be a FileSystem instance or a valid '
                'file system URI'
            )
        local_kinds = (LocalFileSystem, _MockFileSystem)
        return resolved, isinstance(resolved, local_kinds)

    # instantiate the file system from an uri, if the uri has a path
    # component then it will be treated as a path prefix
    base_fs, prefix = FileSystem.from_uri(fs_or_uri)
    is_local = isinstance(base_fs, LocalFileSystem)
    prefix = base_fs.normalize_path(prefix)
    if not prefix:
        return base_fs, is_local

    # validate that the prefix is pointing to a directory
    prefix_info = base_fs.get_file_info([prefix])[0]
    if prefix_info.type != FileType.Directory:
        raise ValueError(
            "The path component of the filesystem URI must point to a "
            "directory but it has a type: `{}`. The path component "
            "is `{}` and the given filesystem URI is `{}`".format(
                prefix_info.type.name, prefix_info.path, fs_or_uri
            )
        )
    return SubTreeFileSystem(prefix, base_fs), is_local
def test_filesystem_from_uri(uri, expected_klass, expected_path):
    """Check the filesystem class and path that ``from_uri`` yields for ``uri``.

    NOTE(review): the arguments are presumably supplied by a parametrize
    decorator that is not visible here -- confirm.
    """
    resolved_fs, resolved_path = FileSystem.from_uri(uri)

    assert isinstance(resolved_fs, expected_klass)
    assert resolved_path == expected_path
def _ensure_single_source(path, filesystem=None):
    """
    Resolve path to a filesystem plus a single file or a directory selector.

    Parameters
    ----------
    path : path-like
    filesystem : FileSystem or str, optional
        If an URI is passed, then its path component will act as a prefix for
        the file paths. When omitted, the path is first looked up on the
        local filesystem and, if it is not found there, parsed as an URI.

    Returns
    -------
    (FileSystem, list of str or fs.Selector)
        File system object and either a single item list pointing to a file
        or a recursive fs.Selector object pointing to a directory.

    Raises
    ------
    TypeError
        If the passed filesystem has wrong type.
    FileNotFoundError
        If the referenced file or directory doesn't exist.
    """
    from pyarrow.fs import FileSystem, LocalFileSystem, FileType, FileSelector

    path = _stringify_path(path)

    # File info gathered so far; None means "not queried yet" or "stale".
    info = None

    if filesystem is None:
        # No filesystem given: probe the local filesystem first.
        filesystem = LocalFileSystem()
        try:
            info = filesystem.get_file_info([path])[0]
        except OSError:
            info = None
        found_locally = info is not None and info.type != FileType.NotFound

        if not found_locally:
            # Not a local file or directory, so assume that the path is an
            # URI which describes the file system as well.
            try:
                filesystem, path = FileSystem.from_uri(path)
            except ValueError as exc:
                # ARROW-8213: neither an URI nor a locally existing path,
                # so assume that local path was given and propagate a nicer
                # file not found error instead of a more confusing scheme
                # parsing error
                if "empty scheme" not in str(exc):
                    raise
            else:
                # The local lookup result does not apply to the new
                # filesystem, so force a fresh query below.
                info = None

    # construct a filesystem if it is a valid URI
    filesystem, _ = _ensure_fs(filesystem)

    # ensure that the path is normalized before passing to dataset discovery
    path = filesystem.normalize_path(path)

    # retrieve the file descriptor
    if info is None:
        info = filesystem.get_file_info([path])[0]

    # A directory becomes a recursive selector, a file a one-element list;
    # anything else is reported as missing.
    if info.type == FileType.Directory:
        return filesystem, FileSelector(path, recursive=True)
    if info.type == FileType.File:
        return filesystem, [path]
    raise FileNotFoundError(path)
def test_filesystem_from_path_object(path):
    """Check that a ``pathlib.Path`` passed to ``from_uri`` maps to local fs.

    The returned path must equal the resolved, absolute POSIX form of the
    input path.
    """
    path_obj = pathlib.Path(path)

    local_fs, resolved = FileSystem.from_uri(path_obj)

    assert isinstance(local_fs, LocalFileSystem)
    expected = path_obj.resolve().absolute().as_posix()
    assert resolved == expected
def __init__(self):
    """Create the filesystem client from the ``Envs.HDFS_SERVER`` URI."""
    # from_uri also returns the path component of the URI; it is not needed.
    client, _unused_path = FileSystem.from_uri(Envs.HDFS_SERVER)
    self._client = client
def test_open_dataset_from_s3_with_filesystem_uri(s3_connection, s3_server):
    """Open an S3 Parquet dataset with the filesystem passed as an URI.

    Covers the full-URI form, several ways of splitting the object path
    between the filesystem URI prefix and the dataset path, and the error
    cases: a missing bucket name, a prefix that does not exist, and a prefix
    that points to a file.
    """
    from pyarrow.fs import FileSystem
    import pyarrow.parquet as pq

    host, port, access_key, secret_key = s3_connection
    bucket = 'theirbucket'
    object_key = 'nested/folder/data.parquet'
    full_uri = "s3://{}:{}@{}/{}?scheme=http&endpoint_override={}:{}".format(
        access_key, secret_key, bucket, object_key, host, port
    )

    s3fs, object_path = FileSystem.from_uri(full_uri)
    assert object_path == 'theirbucket/nested/folder/data.parquet'

    s3fs.create_dir(bucket)

    expected = pa.table({'a': [1, 2, 3]})
    with s3fs.open_output_stream(object_path) as sink:
        pq.write_table(expected, sink)

    # full string URI
    dataset = ds.dataset(full_uri, format="parquet")
    assert dataset.to_table().equals(expected)

    # passing filesystem as an uri
    template = (
        "s3://{}:{}@{{}}?scheme=http&endpoint_override={}:{}".format(
            access_key, secret_key, host, port
        )
    )
    # (prefix placed in the filesystem URI, path handed to ds.dataset)
    cases = [
        ('theirbucket/nested/folder/', '/data.parquet'),
        ('theirbucket/nested/folder', 'data.parquet'),
        ('theirbucket/nested/', 'folder/data.parquet'),
        ('theirbucket/nested', 'folder/data.parquet'),
        ('theirbucket', '/nested/folder/data.parquet'),
        ('theirbucket', 'nested/folder/data.parquet'),
    ]
    for prefix, relative_path in cases:
        fs_uri = template.format(prefix)
        dataset = ds.dataset(relative_path, filesystem=fs_uri,
                             format="parquet")
        assert dataset.to_table().equals(expected)

    # An URI whose path is just "/" carries no bucket name.
    no_bucket_uri = template.format('/')
    with pytest.raises(pa.ArrowInvalid, match='Missing bucket name'):
        ds.dataset('/theirbucket/nested/folder/data.parquet',
                   filesystem=no_bucket_uri)

    error = (
        "The path component of the filesystem URI must point to a directory "
        "but it has a type: `{}`. The path component is `{}` and the given "
        "filesystem URI is `{}`"
    )

    # The prefix must be an existing directory: a missing path and a plain
    # file are both rejected, each reporting the offending type name.
    invalid_prefixes = [
        ('NotFound', 'theirbucket/doesnt/exist'),
        ('File', 'theirbucket/nested/folder/data.parquet'),
    ]
    for type_name, bad_prefix in invalid_prefixes:
        bad_uri = template.format(bad_prefix)
        with pytest.raises(ValueError) as exc:
            ds.dataset('data.parquet', filesystem=bad_uri)
        assert str(exc.value) == error.format(type_name, bad_prefix, bad_uri)