def _make_manifest(path_or_paths, fs, pathsep='/', metadata_nthreads=1):
    partitions = None
    common_metadata_path = None
    metadata_path = None

    if isinstance(path_or_paths, list) and len(path_or_paths) == 1:
        # Dask passes a directory as a list of length 1
        path_or_paths = path_or_paths[0]

    if _is_path_like(path_or_paths) and fs.isdir(path_or_paths):
        manifest = ParquetManifest(path_or_paths, filesystem=fs,
                                   pathsep=fs.pathsep,
                                   metadata_nthreads=metadata_nthreads)
        common_metadata_path = manifest.common_metadata_path
        metadata_path = manifest.metadata_path
        pieces = manifest.pieces
        partitions = manifest.partitions
    else:
        if not isinstance(path_or_paths, list):
            path_or_paths = [path_or_paths]

        # List of paths
        if len(path_or_paths) == 0:
            raise ValueError('Must pass at least one file path')

        pieces = []
        for path in path_or_paths:
            if not fs.isfile(path):
                raise IOError('Passed non-file path: {0}'.format(path))
            piece = ParquetDatasetPiece(path)
            pieces.append(piece)

    return pieces, partitions, common_metadata_path, metadata_path
def write_table(table, where, row_group_size=None, version='1.0',
                use_dictionary=True, compression='snappy',
                use_deprecated_int96_timestamps=None,
                coerce_timestamps=None,
                allow_truncated_timestamps=False,
                flavor=None, **kwargs):
    row_group_size = kwargs.pop('chunk_size', row_group_size)
    use_int96 = use_deprecated_int96_timestamps
    try:
        with ParquetWriter(
                where, table.schema,
                version=version,
                flavor=flavor,
                use_dictionary=use_dictionary,
                coerce_timestamps=coerce_timestamps,
                allow_truncated_timestamps=allow_truncated_timestamps,
                compression=compression,
                use_deprecated_int96_timestamps=use_int96,
                **kwargs) as writer:
            writer.write_table(table, row_group_size=row_group_size)
    except Exception:
        if _is_path_like(where):
            try:
                os.remove(_stringify_path(where))
            except os.error:
                pass
        raise
def write_table(table, where, row_group_size=None, version='1.0',
                use_dictionary=True, compression='snappy',
                use_deprecated_int96_timestamps=None,
                coerce_timestamps=None,
                allow_truncated_timestamps=False,
                flavor=None, filesystem=None, **kwargs):
    row_group_size = kwargs.pop('chunk_size', row_group_size)
    use_int96 = use_deprecated_int96_timestamps
    try:
        with ParquetWriter(
                where, table.schema,
                filesystem=filesystem,
                version=version,
                flavor=flavor,
                use_dictionary=use_dictionary,
                coerce_timestamps=coerce_timestamps,
                allow_truncated_timestamps=allow_truncated_timestamps,
                compression=compression,
                use_deprecated_int96_timestamps=use_int96,
                **kwargs) as writer:
            writer.write_table(table, row_group_size=row_group_size)
    except Exception:
        if _is_path_like(where):
            try:
                os.remove(_stringify_path(where))
            except os.error:
                pass
        raise
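# A hedged usage sketch for the write_table() variants above, using the public
# pyarrow.parquet API. The table contents, file names and the LocalFileSystem
# instance are illustrative assumptions, not taken from the code above.
import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow.fs import LocalFileSystem

table = pa.table({"id": [1, 2, 3], "value": ["a", "b", "c"]})

# Plain path: a filesystem is resolved implicitly behind the scenes.
pq.write_table(table, "example.parquet", compression="snappy")

# Explicit 'filesystem' keyword, matching the extended signature above.
pq.write_table(table, "example_fs.parquet", filesystem=LocalFileSystem())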
def resolve_filesystem_and_path(where, filesystem=None):
    """
    Return filesystem from path which could be an HDFS URI.
    """
    if not _is_path_like(where):
        if filesystem is not None:
            raise ValueError("filesystem passed but where is file-like, so"
                             " there is nothing to open with filesystem.")
        return filesystem, where

    # input can be a hdfs URI such as hdfs://host:port/myfile.parquet
    path = _stringify_path(where)

    if filesystem is not None:
        return _ensure_filesystem(filesystem), path

    parsed_uri = urlparse(path)
    if parsed_uri.scheme == 'hdfs':
        netloc_split = parsed_uri.netloc.split(':')
        host = netloc_split[0]
        if host == '':
            host = 'default'
        port = 0
        if len(netloc_split) == 2 and netloc_split[1].isnumeric():
            port = int(netloc_split[1])
        fs = pa.hdfs.connect(host=host, port=port)
    else:
        fs = LocalFileSystem.get_instance()

    return fs, parsed_uri.path
def _resolve_filesystem_and_path(
    path, filesystem=None, allow_legacy_filesystem=False
):
    """
    Return filesystem/path from path which could be an URI or a plain
    filesystem path.
    """
    if not _is_path_like(path):
        if filesystem is not None:
            raise ValueError(
                "'filesystem' passed but the specified path is file-like, so"
                " there is nothing to open with 'filesystem'."
            )
        return filesystem, path

    if filesystem is not None:
        filesystem = _ensure_filesystem(
            filesystem, allow_legacy_filesystem=allow_legacy_filesystem
        )
        if isinstance(filesystem, LocalFileSystem):
            path = _stringify_path(path)
        elif not isinstance(path, str):
            raise TypeError(
                "Expected string path; path-like objects are only allowed "
                "with a local filesystem"
            )
        if not allow_legacy_filesystem:
            path = filesystem.normalize_path(path)
        return filesystem, path

    path = _stringify_path(path)

    # if filesystem is not given, try to automatically determine one:
    # first check if the file exists as a local (relative) file path,
    # if not then try to parse the path as an URI
    filesystem = LocalFileSystem()

    try:
        file_info = filesystem.get_file_info(path)
    except ValueError:
        # ValueError means the path is likely an URI
        file_info = None
        exists_locally = False
    else:
        exists_locally = (file_info.type != FileType.NotFound)

    # if the file or directory doesn't exist locally, then assume that
    # the path is an URI describing the file system as well
    if not exists_locally:
        try:
            filesystem, path = FileSystem.from_uri(path)
        except ValueError as e:
            # neither an URI nor a locally existing path, so assume that a
            # local path was given and propagate a nicer file-not-found error
            # instead of a more confusing scheme parsing error
            if "empty scheme" not in str(e):
                raise
    else:
        path = filesystem.normalize_path(path)

    return filesystem, path
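# Hedged sketch of the fallback behaviour implemented above, expressed with the
# public pyarrow.fs API only. The 'file://' URI is an illustrative assumption;
# FileSystem.from_uri is the documented entry point used for non-local paths.
from pyarrow.fs import FileSystem, LocalFileSystem

fs, path = FileSystem.from_uri("file:///tmp/data.parquet")
assert isinstance(fs, LocalFileSystem)
print(path)  # -> /tmp/data.parquet

# URIs with other schemes resolve to the matching filesystem implementation,
# e.g. 's3://bucket/key.parquet' -> S3FileSystem (requires the S3 bindings):
# fs, path = FileSystem.from_uri("s3://bucket/key.parquet")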
def read_table(source, columns=None, use_threads=True, metadata=None,
               use_pandas_metadata=False, memory_map=True):
    if _is_path_like(source):
        fs = _get_fs_from_path(source)
        return fs.read_parquet(source, columns=columns,
                               use_threads=use_threads, metadata=metadata,
                               use_pandas_metadata=use_pandas_metadata)

    pf = ParquetFile(source, metadata=metadata)
    return pf.read(columns=columns, use_threads=use_threads,
                   use_pandas_metadata=use_pandas_metadata)
def read_table(source, columns=None, use_threads=True, metadata=None,
               use_pandas_metadata=False, memory_map=True, filesystem=None):
    if _is_path_like(source):
        fs, path = _get_filesystem_and_path(filesystem, source)
        return fs.read_parquet(path, columns=columns,
                               use_threads=use_threads, metadata=metadata,
                               use_pandas_metadata=use_pandas_metadata)

    pf = ParquetFile(source, metadata=metadata)
    return pf.read(columns=columns, use_threads=use_threads,
                   use_pandas_metadata=use_pandas_metadata)
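# Hedged round-trip sketch for read_table(): assumes a local file written with
# write_table first; the file name below is illustrative, not from the code
# above.
import pyarrow as pa
import pyarrow.parquet as pq

pq.write_table(pa.table({"id": [1, 2, 3]}), "roundtrip.parquet")
restored = pq.read_table("roundtrip.parquet", columns=["id"], use_threads=True)
print(restored.num_rows, restored.column_names)  # -> 3 ['id']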
def resolve_filesystem_and_path(where, filesystem=None):
    """
    Return filesystem from path which could be an HDFS URI, a local URI,
    or a plain filesystem path.
    """
    if not _is_path_like(where):
        if filesystem is not None:
            raise ValueError("filesystem passed but where is file-like, so"
                             " there is nothing to open with filesystem.")
        return filesystem, where

    if filesystem is not None:
        filesystem = _ensure_filesystem(filesystem)
        if isinstance(filesystem, LocalFileSystem):
            path = _stringify_path(where)
        elif not isinstance(where, str):
            raise TypeError(
                "Expected string path; path-like objects are only allowed "
                "with a local filesystem"
            )
        else:
            path = where
        return filesystem, path

    path = _stringify_path(where)

    parsed_uri = urllib.parse.urlparse(path)
    if parsed_uri.scheme == 'hdfs' or parsed_uri.scheme == 'viewfs':
        # Input is hdfs URI such as hdfs://host:port/myfile.parquet
        netloc_split = parsed_uri.netloc.split(':')
        host = netloc_split[0]
        if host == '':
            host = 'default'
        else:
            host = parsed_uri.scheme + "://" + host
        port = 0
        if len(netloc_split) == 2 and netloc_split[1].isnumeric():
            port = int(netloc_split[1])
        fs = pa.hdfs._connect(host=host, port=port)
        fs_path = parsed_uri.path
    elif parsed_uri.scheme == 'file':
        # Input is local URI such as file:///home/user/myfile.parquet
        fs = LocalFileSystem._get_instance()
        fs_path = parsed_uri.path
    else:
        # Input is local path such as /home/user/myfile.parquet
        fs = LocalFileSystem._get_instance()
        fs_path = path

    return fs, fs_path
def __init__(self, where, schema, flavor=None,
             version='1.0',
             use_dictionary=True,
             compression='snappy',
             use_deprecated_int96_timestamps=None,
             filesystem=None, **options):
    if use_deprecated_int96_timestamps is None:
        # Use int96 timestamps for Spark
        if flavor is not None and 'spark' in flavor:
            use_deprecated_int96_timestamps = True
        else:
            use_deprecated_int96_timestamps = False

    self.flavor = flavor
    if flavor is not None:
        schema, self.schema_changed = _sanitize_schema(schema, flavor)
    else:
        self.schema_changed = False

    self.schema = schema
    self.where = where

    # If we open a file through an implied filesystem, keep the handle so
    # it is guaranteed to be closed later.
    self.file_handle = None

    if _is_path_like(where):
        fs, path = _get_filesystem_and_path(filesystem, where)
        sink = self.file_handle = fs.open(path, 'wb')
    else:
        sink = where

    self.writer = _parquet.ParquetWriter(
        sink, schema,
        version=version,
        compression=compression,
        use_dictionary=use_dictionary,
        use_deprecated_int96_timestamps=use_deprecated_int96_timestamps,
        **options)
    self.is_open = True
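# Hedged sketch of the ParquetWriter context-manager pattern that write_table()
# builds on: one writer, several row groups appended to a single file. Schema,
# file name and data are illustrative assumptions.
import pyarrow as pa
import pyarrow.parquet as pq

schema = pa.schema([("id", pa.int64()), ("value", pa.string())])
chunk = pa.table({"id": [1, 2], "value": ["x", "y"]}, schema=schema)

with pq.ParquetWriter("incremental.parquet", schema,
                      compression="snappy") as writer:
    # Each call appends another row group to the same file; the writer (and
    # any implied file handle) is closed on exiting the block.
    writer.write_table(chunk)
    writer.write_table(chunk)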
def _ensure_source(src, filesystem=None, partitioning=None, format=None):
    if _is_path_like(src):
        src = source(src, filesystem=filesystem, partitioning=partitioning,
                     format=format)
    # TODO also accept Source?
    elif isinstance(src, FileSystemSourceFactory):
        # when passing a SourceFactory, the arguments cannot be specified
        if any(kwarg is not None
               for kwarg in [filesystem, partitioning, format]):
            raise ValueError(
                "When passing a Source(Factory), you cannot pass any "
                "additional arguments")
    else:
        raise ValueError("Expected a path-like or Source, got {0}".format(
            type(src)))
    return src
def _ensure_factory(src, **kwargs):
    # Need to return DatasetFactory since `dataset` might need to finish the
    # factory with a unified schema.
    # TODO: return Dataset if a specific schema was passed?
    if _is_path_like(src):
        return factory(src, **kwargs)
    elif isinstance(src, DatasetFactory):
        # when passing a DatasetFactory, the arguments cannot be specified
        if any(v is not None for v in kwargs.values()):
            raise ValueError(
                "When passing a DatasetFactory, you cannot pass any "
                "additional arguments")
        return src
    elif isinstance(src, Dataset):
        raise TypeError(
            "Dataset objects are currently not supported, only "
            "DatasetFactory instances. Use the factory() function to create "
            "such objects.")
    else:
        raise TypeError(
            "Expected a path-like or DatasetFactory, got {}".format(type(src)))
def _resolve_filesystem_and_path(path, filesystem=None,
                                 allow_legacy_filesystem=False):
    """
    Return filesystem/path from path which could be an URI or a plain
    filesystem path.
    """
    if not _is_path_like(path):
        if filesystem is not None:
            raise ValueError(
                "'filesystem' passed but the specified path is file-like, so"
                " there is nothing to open with 'filesystem'.")
        return filesystem, path

    path = _stringify_path(path)

    if filesystem is not None:
        filesystem = _ensure_filesystem(
            filesystem, allow_legacy_filesystem=allow_legacy_filesystem)
        return filesystem, path
    else:
        return FileSystem.from_uri(path)
def resolve_filesystem_and_path(where, filesystem=None):
    """
    Return filesystem from path which could be an HDFS URI, a local URI,
    or a plain filesystem path.
    """
    if not _is_path_like(where):
        if filesystem is not None:
            raise ValueError("filesystem passed but where is file-like, so"
                             " there is nothing to open with filesystem.")
        return filesystem, where

    path = _stringify_path(where)

    if filesystem is not None:
        return _ensure_filesystem(filesystem), path

    parsed_uri = urlparse(path)
    if parsed_uri.scheme == 'hdfs':
        # Input is hdfs URI such as hdfs://host:port/myfile.parquet
        netloc_split = parsed_uri.netloc.split(':')
        host = netloc_split[0]
        if host == '':
            host = 'default'
        port = 0
        if len(netloc_split) == 2 and netloc_split[1].isnumeric():
            port = int(netloc_split[1])
        fs = pa.hdfs.connect(host=host, port=port)
        fs_path = parsed_uri.path
    elif parsed_uri.scheme == 'file':
        # Input is local URI such as file:///home/user/myfile.parquet
        fs = LocalFileSystem.get_instance()
        fs_path = parsed_uri.path
    else:
        # Input is local path such as /home/user/myfile.parquet
        fs = LocalFileSystem.get_instance()
        fs_path = where

    return fs, fs_path
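# Hedged illustration of the URI splitting that the legacy helper above does by
# hand: urlparse separates scheme, host:port and path. The HDFS URI below is an
# illustrative assumption and no connection is attempted here.
from urllib.parse import urlparse

parsed = urlparse("hdfs://namenode:8020/warehouse/myfile.parquet")
host, _, port = parsed.netloc.partition(":")
print(parsed.scheme)                         # -> hdfs
print(host or "default")                     # -> namenode
print(int(port) if port.isnumeric() else 0)  # -> 8020
print(parsed.path)                           # -> /warehouse/myfile.parquet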
def _make_manifest(path_or_paths, fs, pathsep='/', metadata_nthreads=1,
                   open_file_func=None):
    partitions = None
    common_metadata_path = None
    metadata_path = None

    if isinstance(path_or_paths, list) and len(path_or_paths) == 1:
        # Dask passes a directory as a list of length 1
        path_or_paths = path_or_paths[0]

    if _is_path_like(path_or_paths) and fs.isdir(path_or_paths):
        manifest = ParquetManifest(path_or_paths, filesystem=fs,
                                   open_file_func=open_file_func,
                                   pathsep=fs.pathsep,
                                   metadata_nthreads=metadata_nthreads)
        common_metadata_path = manifest.common_metadata_path
        metadata_path = manifest.metadata_path
        pieces = manifest.pieces
        partitions = manifest.partitions
    else:
        if not isinstance(path_or_paths, list):
            path_or_paths = [path_or_paths]

        # List of paths
        if len(path_or_paths) == 0:
            raise ValueError('Must pass at least one file path')

        pieces = []
        for path in path_or_paths:
            if not fs.isfile(path):
                raise IOError('Passed non-file path: {0}'
                              .format(path))
            piece = ParquetDatasetPiece(path, open_file_func=open_file_func)
            pieces.append(piece)

    return pieces, partitions, common_metadata_path, metadata_path
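# Hedged sketch of the public entry point that exercises _make_manifest(): a
# list of existing file paths goes through the "list of paths" branch above,
# building one piece per file. File names are illustrative assumptions and the
# snippet targets the legacy ParquetDataset API.
import pyarrow as pa
import pyarrow.parquet as pq

pq.write_table(pa.table({"id": [1]}), "part-0.parquet")
pq.write_table(pa.table({"id": [2]}), "part-1.parquet")

dataset = pq.ParquetDataset(["part-0.parquet", "part-1.parquet"])
print(dataset.read().num_rows)  # -> 2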
def dataset(source, schema=None, format=None, filesystem=None,
            partitioning=None, partition_base_dir=None,
            exclude_invalid_files=None, ignore_prefixes=None):
    """
    Open a dataset.

    Datasets provide functionality to efficiently work with tabular,
    potentially larger than memory and multi-file datasets:

    - A unified interface for different sources, like Parquet and Feather
    - Discovery of sources (crawling directories, handling directory-based
      partitioned datasets, basic schema normalization)
    - Optimized reading with predicate pushdown (filtering rows), projection
      (selecting columns), parallel reading or fine-grained managing of tasks.

    Note that this is the high-level API; to have more control over the
    dataset construction, use the low-level API classes (FileSystemDataset,
    FileSystemDatasetFactory, etc.)

    Parameters
    ----------
    source : path, list of paths, dataset, list of datasets, (list of) batches\
or tables, iterable of batches, RecordBatchReader, or URI
        Path pointing to a single file:
            Open a FileSystemDataset from a single file.
        Path pointing to a directory:
            The directory gets discovered recursively according to a
            partitioning scheme if given.
        List of file paths:
            Create a FileSystemDataset from explicitly given files. The files
            must be located on the same filesystem given by the filesystem
            parameter.
            Note that in contrast to constructing from a single file, passing
            URIs as paths is not allowed.
        List of datasets:
            A nested UnionDataset gets constructed, it allows arbitrary
            composition of other datasets.
            Note that additional keyword arguments are not allowed.
        (List of) batches or tables, iterable of batches, or RecordBatchReader:
            Create an InMemoryDataset. If an iterable or empty list is given,
            a schema must also be given. If an iterable or RecordBatchReader
            is given, the resulting dataset can only be scanned once; further
            attempts will raise an error.
    schema : Schema, optional
        Optionally provide the Schema for the Dataset, in which case it will
        not be inferred from the source.
    format : FileFormat or str
        Currently "parquet" and "ipc"/"arrow"/"feather" are supported. For
        Feather, only version 2 files are supported.
    filesystem : FileSystem or URI string, default None
        If a single path is given as source and filesystem is None, then the
        filesystem will be inferred from the path.
        If an URI string is passed, then a filesystem object is constructed
        using the URI's optional path component as a directory prefix. See the
        examples below.
        Note that the URIs on Windows must follow 'file:///C:...' or
        'file:/C:...' patterns.
    partitioning : Partitioning, PartitioningFactory, str, list of str
        The partitioning scheme specified with the ``partitioning()``
        function. A flavor string can be used as shortcut, and with a list of
        field names a DirectoryPartitioning will be inferred.
    partition_base_dir : str, optional
        For the purposes of applying the partitioning, paths will be stripped
        of the partition_base_dir. Files not matching the partition_base_dir
        prefix will be skipped for partitioning discovery. The ignored files
        will still be part of the Dataset, but will not have partition
        information.
    exclude_invalid_files : bool, optional (default True)
        If True, invalid files will be excluded (file format specific check).
        This will incur IO for each file in a serial and single threaded
        fashion. Disabling this feature will skip the IO, but unsupported
        files may be present in the Dataset (resulting in an error at scan
        time).
    ignore_prefixes : list, optional
        Files matching any of these prefixes will be ignored by the discovery
        process. This is matched to the basename of a path. By default this
        is ['.', '_']. Note that discovery happens only if a directory is
        passed as source.

    Returns
    -------
    dataset : Dataset
        Either a FileSystemDataset or a UnionDataset depending on the source
        parameter.

    Examples
    --------
    Opening a single file:

    >>> dataset("path/to/file.parquet", format="parquet")

    Opening a single file with an explicit schema:

    >>> dataset("path/to/file.parquet", schema=myschema, format="parquet")

    Opening a dataset for a single directory:

    >>> dataset("path/to/nyc-taxi/", format="parquet")
    >>> dataset("s3://mybucket/nyc-taxi/", format="parquet")

    Opening a dataset from a list of relative local paths:

    >>> dataset([
    ...     "part0/data.parquet",
    ...     "part1/data.parquet",
    ...     "part3/data.parquet",
    ... ], format='parquet')

    With filesystem provided:

    >>> paths = [
    ...     'part0/data.parquet',
    ...     'part1/data.parquet',
    ...     'part3/data.parquet',
    ... ]
    >>> dataset(paths, filesystem='file:///directory/prefix',
    ...         format='parquet')

    Which is equivalent to:

    >>> fs = SubTreeFileSystem("/directory/prefix", LocalFileSystem())
    >>> dataset(paths, filesystem=fs, format='parquet')

    With a remote filesystem URI:

    >>> paths = [
    ...     'nested/directory/part0/data.parquet',
    ...     'nested/directory/part1/data.parquet',
    ...     'nested/directory/part3/data.parquet',
    ... ]
    >>> dataset(paths, filesystem='s3://bucket/', format='parquet')

    Similarly to the local example, the directory prefix may be included in
    the filesystem URI:

    >>> dataset(paths, filesystem='s3://bucket/nested/directory',
    ...         format='parquet')

    Construction of a nested dataset:

    >>> dataset([
    ...     dataset("s3://old-taxi-data", format="parquet"),
    ...     dataset("local/path/to/data", format="ipc")
    ... ])
    """
    # collect the keyword arguments for later reuse
    kwargs = dict(
        schema=schema,
        filesystem=filesystem,
        partitioning=partitioning,
        format=format,
        partition_base_dir=partition_base_dir,
        exclude_invalid_files=exclude_invalid_files,
        selector_ignore_prefixes=ignore_prefixes
    )

    if _is_path_like(source):
        return _filesystem_dataset(source, **kwargs)
    elif isinstance(source, (tuple, list)):
        if all(_is_path_like(elem) for elem in source):
            return _filesystem_dataset(source, **kwargs)
        elif all(isinstance(elem, Dataset) for elem in source):
            return _union_dataset(source, **kwargs)
        elif all(
            isinstance(elem, (pa.RecordBatch, pa.Table)) for elem in source
        ):
            return _in_memory_dataset(source, **kwargs)
        else:
            unique_types = set(type(elem).__name__ for elem in source)
            type_names = ', '.join('{}'.format(t) for t in unique_types)
            raise TypeError(
                'Expected a list of path-like or dataset objects, or a list '
                'of batches or tables. The given list contains the following '
                'types: {}'.format(type_names))
    elif isinstance(source, (pa.RecordBatch, pa.Table)):
        return _in_memory_dataset(source, **kwargs)
    else:
        raise TypeError(
            'Expected a path-like, list of path-likes or a list of Datasets '
            'instead of the given type: {}'.format(type(source).__name__))
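# Hedged usage sketch for dataset(): writes a tiny local Parquet file and opens
# it through the high-level API. File name and column names are illustrative
# assumptions; only documented pyarrow.dataset calls are used.
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq

pq.write_table(pa.table({"year": [2020, 2021], "n": [1, 2]}), "tiny.parquet")

d = ds.dataset("tiny.parquet", format="parquet")
print(d.schema)
# Projection (column selection) is pushed down into the scan:
print(d.to_table(columns=["n"]).to_pydict())  # -> {'n': [1, 2]}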