Example #1
def _make_manifest(path_or_paths, fs, pathsep='/', metadata_nthreads=1):
    partitions = None
    common_metadata_path = None
    metadata_path = None

    if isinstance(path_or_paths, list) and len(path_or_paths) == 1:
        # Dask passes a directory as a list of length 1
        path_or_paths = path_or_paths[0]

    if _is_path_like(path_or_paths) and fs.isdir(path_or_paths):
        manifest = ParquetManifest(path_or_paths,
                                   filesystem=fs,
                                   pathsep=fs.pathsep,
                                   metadata_nthreads=metadata_nthreads)
        common_metadata_path = manifest.common_metadata_path
        metadata_path = manifest.metadata_path
        pieces = manifest.pieces
        partitions = manifest.partitions
    else:
        if not isinstance(path_or_paths, list):
            path_or_paths = [path_or_paths]

        # List of paths
        if len(path_or_paths) == 0:
            raise ValueError('Must pass at least one file path')

        pieces = []
        for path in path_or_paths:
            if not fs.isfile(path):
                raise IOError('Passed non-file path: {0}'.format(path))
            piece = ParquetDatasetPiece(path)
            pieces.append(piece)

    return pieces, partitions, common_metadata_path, metadata_path
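For reference, _is_path_like (used throughout these examples) simply tests
whether an object can be treated as a filesystem path. A minimal sketch of
such a check, not necessarily pyarrow's exact implementation:

import pathlib

def _is_path_like(path):
    # Strings, pathlib.Path objects, and anything implementing the
    # os.PathLike protocol (__fspath__) count as path-like.
    return (isinstance(path, str) or
            isinstance(path, pathlib.Path) or
            hasattr(path, '__fspath__'))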
Example #2
def write_table(table,
                where,
                row_group_size=None,
                version='1.0',
                use_dictionary=True,
                compression='snappy',
                use_deprecated_int96_timestamps=None,
                coerce_timestamps=None,
                allow_truncated_timestamps=False,
                flavor=None,
                **kwargs):
    row_group_size = kwargs.pop('chunk_size', row_group_size)
    use_int96 = use_deprecated_int96_timestamps
    try:
        with ParquetWriter(
                where,
                table.schema,
                version=version,
                flavor=flavor,
                use_dictionary=use_dictionary,
                coerce_timestamps=coerce_timestamps,
                allow_truncated_timestamps=allow_truncated_timestamps,
                compression=compression,
                use_deprecated_int96_timestamps=use_int96,
                **kwargs) as writer:
            writer.write_table(table, row_group_size=row_group_size)
    except Exception:
        if _is_path_like(where):
            try:
                os.remove(_stringify_path(where))
            except os.error:
                pass
        raise
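A minimal usage sketch for the write_table shown above; the table contents
and the output path 'example.parquet' are hypothetical:

import pyarrow as pa
import pyarrow.parquet as pq

# Build a small in-memory table and write it with snappy compression.
table = pa.table({'x': [1, 2, 3], 'y': ['a', 'b', 'c']})
pq.write_table(table, 'example.parquet', compression='snappy')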
Example #3
def write_table(table, where, row_group_size=None, version='1.0',
                use_dictionary=True, compression='snappy',
                use_deprecated_int96_timestamps=None,
                coerce_timestamps=None,
                allow_truncated_timestamps=False,
                flavor=None, filesystem=None, **kwargs):
    row_group_size = kwargs.pop('chunk_size', row_group_size)
    use_int96 = use_deprecated_int96_timestamps
    try:
        with ParquetWriter(
                where, table.schema,
                filesystem=filesystem,
                version=version,
                flavor=flavor,
                use_dictionary=use_dictionary,
                coerce_timestamps=coerce_timestamps,
                allow_truncated_timestamps=allow_truncated_timestamps,
                compression=compression,
                use_deprecated_int96_timestamps=use_int96,
                **kwargs) as writer:
            writer.write_table(table, row_group_size=row_group_size)
    except Exception:
        if _is_path_like(where):
            try:
                os.remove(_stringify_path(where))
            except os.error:
                pass
        raise
Example #4
def resolve_filesystem_and_path(where, filesystem=None):
    """
    Return a (filesystem, path) pair from a path, which could be an HDFS URI.
    """
    if not _is_path_like(where):
        if filesystem is not None:
            raise ValueError("filesystem passed but where is file-like, so"
                             " there is nothing to open with filesystem.")
        return filesystem, where

    # Input can be an HDFS URI such as hdfs://host:port/myfile.parquet
    path = _stringify_path(where)

    if filesystem is not None:
        return _ensure_filesystem(filesystem), path

    parsed_uri = urlparse(path)
    if parsed_uri.scheme == 'hdfs':
        netloc_split = parsed_uri.netloc.split(':')
        host = netloc_split[0]
        if host == '':
            host = 'default'
        port = 0
        if len(netloc_split) == 2 and netloc_split[1].isnumeric():
            port = int(netloc_split[1])
        fs = pa.hdfs.connect(host=host, port=port)
    else:
        fs = LocalFileSystem.get_instance()

    return fs, parsed_uri.path
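A usage sketch for resolve_filesystem_and_path; both paths are hypothetical,
and the HDFS branch only succeeds against a reachable cluster:

# A plain local path (no URI scheme) falls through to LocalFileSystem.
fs, path = resolve_filesystem_and_path('/data/myfile.parquet')

# An HDFS URI is parsed for host and port, then opened via pa.hdfs.connect().
fs, path = resolve_filesystem_and_path('hdfs://namenode:8020/myfile.parquet')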
Example #5
File: fs.py  Project: tallamjr/arrow
def _resolve_filesystem_and_path(
    path, filesystem=None, allow_legacy_filesystem=False
):
    """
    Return filesystem/path from path which could be a URI or a plain
    filesystem path.
    """
    if not _is_path_like(path):
        if filesystem is not None:
            raise ValueError(
                "'filesystem' passed but the specified path is file-like, so"
                " there is nothing to open with 'filesystem'."
            )
        return filesystem, path

    if filesystem is not None:
        filesystem = _ensure_filesystem(
            filesystem, allow_legacy_filesystem=allow_legacy_filesystem
        )
        if isinstance(filesystem, LocalFileSystem):
            path = _stringify_path(path)
        elif not isinstance(path, str):
            raise TypeError(
                "Expected string path; path-like objects are only allowed "
                "with a local filesystem"
            )
        if not allow_legacy_filesystem:
            path = filesystem.normalize_path(path)
        return filesystem, path

    path = _stringify_path(path)

    # if filesystem is not given, try to automatically determine one
    # first check if the file exists as a local (relative) file path
    # if not then try to parse the path as a URI
    filesystem = LocalFileSystem()
    try:
        file_info = filesystem.get_file_info(path)
    except ValueError:  # ValueError means path is likely a URI
        file_info = None
        exists_locally = False
    else:
        exists_locally = (file_info.type != FileType.NotFound)

    # if the file or directory doesn't exist locally, then assume that
    # the path is a URI describing the file system as well
    if not exists_locally:
        try:
            filesystem, path = FileSystem.from_uri(path)
        except ValueError as e:
            # neither a URI nor a locally existing path, so assume that
            # local path was given and propagate a nicer file not found error
            # instead of a more confusing scheme parsing error
            if "empty scheme" not in str(e):
                raise
    else:
        path = filesystem.normalize_path(path)

    return filesystem, path
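The URI handling above builds on FileSystem.from_uri, which can also be
called directly; a sketch with hypothetical paths:

from pyarrow.fs import FileSystem

# A URI with a scheme resolves to the matching filesystem implementation.
fs, path = FileSystem.from_uri('s3://my-bucket/data.parquet')

# A file:// URI resolves to the local filesystem.
fs, path = FileSystem.from_uri('file:///tmp/data.parquet')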
Example #6
def read_table(source, columns=None, use_threads=True, metadata=None,
               use_pandas_metadata=False, memory_map=True):
    if _is_path_like(source):
        fs = _get_fs_from_path(source)
        return fs.read_parquet(source, columns=columns,
                               use_threads=use_threads, metadata=metadata,
                               use_pandas_metadata=use_pandas_metadata)

    pf = ParquetFile(source, metadata=metadata)
    return pf.read(columns=columns, use_threads=use_threads,
                   use_pandas_metadata=use_pandas_metadata)
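A usage sketch for read_table; 'example.parquet' is a hypothetical path:

import pyarrow.parquet as pq

# Read only the 'x' column; the selection is pushed down to the reader.
table = pq.read_table('example.parquet', columns=['x'])
df = table.to_pandas()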
Example #7
def read_table(source, columns=None, use_threads=True, metadata=None,
               use_pandas_metadata=False, memory_map=True,
               filesystem=None):
    if _is_path_like(source):
        fs, path = _get_filesystem_and_path(filesystem, source)
        return fs.read_parquet(path, columns=columns,
                               use_threads=use_threads, metadata=metadata,
                               use_pandas_metadata=use_pandas_metadata)

    pf = ParquetFile(source, metadata=metadata)
    return pf.read(columns=columns, use_threads=use_threads,
                   use_pandas_metadata=use_pandas_metadata)
Example #9
def resolve_filesystem_and_path(where, filesystem=None):
    """
    Return filesystem from path which could be an HDFS URI, a local URI,
    or a plain filesystem path.
    """
    if not _is_path_like(where):
        if filesystem is not None:
            raise ValueError("filesystem passed but where is file-like, so"
                             " there is nothing to open with filesystem.")
        return filesystem, where

    if filesystem is not None:
        filesystem = _ensure_filesystem(filesystem)
        if isinstance(filesystem, LocalFileSystem):
            path = _stringify_path(where)
        elif not isinstance(where, str):
            raise TypeError(
                "Expected string path; path-like objects are only allowed "
                "with a local filesystem"
            )
        else:
            path = where
        return filesystem, path

    path = _stringify_path(where)

    parsed_uri = urllib.parse.urlparse(path)
    if parsed_uri.scheme == 'hdfs' or parsed_uri.scheme == 'viewfs':
        # Input is hdfs URI such as hdfs://host:port/myfile.parquet
        netloc_split = parsed_uri.netloc.split(':')
        host = netloc_split[0]
        if host == '':
            host = 'default'
        else:
            host = parsed_uri.scheme + "://" + host
        port = 0
        if len(netloc_split) == 2 and netloc_split[1].isnumeric():
            port = int(netloc_split[1])
        fs = pa.hdfs._connect(host=host, port=port)
        fs_path = parsed_uri.path
    elif parsed_uri.scheme == 'file':
        # Input is local URI such as file:///home/user/myfile.parquet
        fs = LocalFileSystem._get_instance()
        fs_path = parsed_uri.path
    else:
        # Input is local path such as /home/user/myfile.parquet
        fs = LocalFileSystem._get_instance()
        fs_path = path

    return fs, fs_path
Example #10
    def __init__(self,
                 where,
                 schema,
                 flavor=None,
                 version='1.0',
                 use_dictionary=True,
                 compression='snappy',
                 use_deprecated_int96_timestamps=None,
                 filesystem=None,
                 **options):
        if use_deprecated_int96_timestamps is None:
            # Use int96 timestamps for Spark
            if flavor is not None and 'spark' in flavor:
                use_deprecated_int96_timestamps = True
            else:
                use_deprecated_int96_timestamps = False

        self.flavor = flavor
        if flavor is not None:
            schema, self.schema_changed = _sanitize_schema(schema, flavor)
        else:
            self.schema_changed = False

        self.schema = schema
        self.where = where

        # If we open a file via an inferred filesystem, keep a reference to
        # the handle so we can make sure it gets closed
        self.file_handle = None

        if _is_path_like(where):
            fs, path = _get_filesystem_and_path(filesystem, where)
            sink = self.file_handle = fs.open(path, 'wb')
        else:
            sink = where

        self.writer = _parquet.ParquetWriter(
            sink,
            schema,
            version=version,
            compression=compression,
            use_dictionary=use_dictionary,
            use_deprecated_int96_timestamps=use_deprecated_int96_timestamps,
            **options)
        self.is_open = True
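Because __init__ may open a file handle itself (the file_handle attribute
above), ParquetWriter is typically used as a context manager so the handle
is closed deterministically; a sketch with a hypothetical output path:

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({'x': [1, 2, 3]})
with pq.ParquetWriter('out.parquet', table.schema) as writer:
    writer.write_table(table)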
Example #11
def _ensure_source(src, filesystem=None, partitioning=None, format=None):
    if _is_path_like(src):
        src = source(src,
                     filesystem=filesystem,
                     partitioning=partitioning,
                     format=format)
    # TODO also accept Source?
    elif isinstance(src, FileSystemSourceFactory):
        # when passing a SourceFactory, the arguments cannot be specified
        if any(kwarg is not None
               for kwarg in [filesystem, partitioning, format]):
            raise ValueError(
                "When passing a Source(Factory), you cannot pass any "
                "additional arguments")
    else:
        raise ValueError("Expected a path-like or Source, got {0}".format(
            type(src)))
    return src
Example #12
File: dataset.py  Project: techfoxy/arrow
def _ensure_factory(src, **kwargs):
    # Need to return DatasetFactory since `dataset` might need to finish the
    # factory with a unified schema.
    # TODO: return Dataset if a specific schema was passed?
    if _is_path_like(src):
        return factory(src, **kwargs)
    elif isinstance(src, DatasetFactory):
        if any(v is not None for v in kwargs.values()):
            # when passing a DatasetFactory, the arguments cannot be specified
            raise ValueError(
                "When passing a DatasetFactory, you cannot pass any "
                "additional arguments")
        return src
    elif isinstance(src, Dataset):
        raise TypeError(
            "Dataset objects are currently not supported, only DatasetFactory "
            "instances. Use the factory() function to create such objects.")
    else:
        raise TypeError(
            "Expected a path-like or DatasetFactory, got {}".format(type(src)))
Example #13
def _resolve_filesystem_and_path(path,
                                 filesystem=None,
                                 allow_legacy_filesystem=False):
    """
    Return filesystem/path from path which could be a URI or a plain
    filesystem path.
    """
    if not _is_path_like(path):
        if filesystem is not None:
            raise ValueError(
                "'filesystem' passed but the specified path is file-like, so"
                " there is nothing to open with 'filesystem'.")
        return filesystem, path

    path = _stringify_path(path)

    if filesystem is not None:
        filesystem = _ensure_filesystem(
            filesystem, allow_legacy_filesystem=allow_legacy_filesystem)
        return filesystem, path
    else:
        return FileSystem.from_uri(path)
Example #14
File: filesystem.py  Project: rok/arrow
def resolve_filesystem_and_path(where, filesystem=None):
    """
    Return filesystem from path which could be an HDFS URI, a local URI,
    or a plain filesystem path.
    """
    if not _is_path_like(where):
        if filesystem is not None:
            raise ValueError("filesystem passed but where is file-like, so"
                             " there is nothing to open with filesystem.")
        return filesystem, where

    path = _stringify_path(where)

    if filesystem is not None:
        return _ensure_filesystem(filesystem), path

    parsed_uri = urlparse(path)
    if parsed_uri.scheme == 'hdfs':
        # Input is hdfs URI such as hdfs://host:port/myfile.parquet
        netloc_split = parsed_uri.netloc.split(':')
        host = netloc_split[0]
        if host == '':
            host = 'default'
        port = 0
        if len(netloc_split) == 2 and netloc_split[1].isnumeric():
            port = int(netloc_split[1])
        fs = pa.hdfs.connect(host=host, port=port)
        fs_path = parsed_uri.path
    elif parsed_uri.scheme == 'file':
        # Input is local URI such as file:///home/user/myfile.parquet
        fs = LocalFileSystem.get_instance()
        fs_path = parsed_uri.path
    else:
        # Input is local path such as /home/user/myfile.parquet
        fs = LocalFileSystem.get_instance()
        fs_path = where

    return fs, fs_path
Example #15
def _make_manifest(path_or_paths, fs, pathsep='/', metadata_nthreads=1,
                   open_file_func=None):
    partitions = None
    common_metadata_path = None
    metadata_path = None

    if isinstance(path_or_paths, list) and len(path_or_paths) == 1:
        # Dask passes a directory as a list of length 1
        path_or_paths = path_or_paths[0]

    if _is_path_like(path_or_paths) and fs.isdir(path_or_paths):
        manifest = ParquetManifest(path_or_paths, filesystem=fs,
                                   open_file_func=open_file_func,
                                   pathsep=fs.pathsep,
                                   metadata_nthreads=metadata_nthreads)
        common_metadata_path = manifest.common_metadata_path
        metadata_path = manifest.metadata_path
        pieces = manifest.pieces
        partitions = manifest.partitions
    else:
        if not isinstance(path_or_paths, list):
            path_or_paths = [path_or_paths]

        # List of paths
        if len(path_or_paths) == 0:
            raise ValueError('Must pass at least one file path')

        pieces = []
        for path in path_or_paths:
            if not fs.isfile(path):
                raise IOError('Passed non-file path: {0}'
                              .format(path))
            piece = ParquetDatasetPiece(path, open_file_func=open_file_func)
            pieces.append(piece)

    return pieces, partitions, common_metadata_path, metadata_path
Example #16
def dataset(source,
            schema=None,
            format=None,
            filesystem=None,
            partitioning=None,
            partition_base_dir=None,
            exclude_invalid_files=None,
            ignore_prefixes=None):
    """
    Open a dataset.

    Datasets provide functionality to efficiently work with tabular,
    potentially larger-than-memory, and multi-file datasets.

    - A unified interface for different sources, like Parquet and Feather
    - Discovery of sources (crawling directories, handle directory-based
      partitioned datasets, basic schema normalization)
    - Optimized reading with predicate pushdown (filtering rows), projection
      (selecting columns), parallel reading or fine-grained managing of tasks.

    Note that this is the high-level API; to have more control over the
    dataset construction, use the low-level API classes (FileSystemDataset,
    FileSystemDatasetFactory, etc.)

    Parameters
    ----------
    source : path, list of paths, dataset, list of datasets, (list of) batches\
or tables, iterable of batches, RecordBatchReader, or URI
        Path pointing to a single file:
            Open a FileSystemDataset from a single file.
        Path pointing to a directory:
            The directory gets discovered recursively according to a
            partitioning scheme if given.
        List of file paths:
            Create a FileSystemDataset from explicitly given files. The files
            must be located on the same filesystem given by the filesystem
            parameter.
            Note that, in contrast to construction from a single file,
            passing URIs as paths is not allowed.
        List of datasets:
            A nested UnionDataset gets constructed; it allows arbitrary
            composition of other datasets.
            Note that additional keyword arguments are not allowed.
        (List of) batches or tables, iterable of batches, or RecordBatchReader:
            Create an InMemoryDataset. If an iterable or empty list is given,
            a schema must also be given. If an iterable or RecordBatchReader
            is given, the resulting dataset can only be scanned once; further
            attempts will raise an error.
    schema : Schema, optional
        Optionally provide the Schema for the Dataset, in which case it will
        not be inferred from the source.
    format : FileFormat or str
        Currently "parquet" and "ipc"/"arrow"/"feather" are supported. For
        Feather, only version 2 files are supported.
    filesystem : FileSystem or URI string, default None
        If a single path is given as source and filesystem is None, then the
        filesystem will be inferred from the path.
        If a URI string is passed, then a filesystem object is constructed
        using the URI's optional path component as a directory prefix. See the
        examples below.
        Note that the URIs on Windows must follow 'file:///C:...' or
        'file:/C:...' patterns.
    partitioning : Partitioning, PartitioningFactory, str, list of str
        The partitioning scheme specified with the ``partitioning()``
        function. A flavor string can be used as shortcut, and with a list of
        field names a DirectoryPartitioning will be inferred.
    partition_base_dir : str, optional
        For the purposes of applying the partitioning, paths will be
        stripped of the partition_base_dir. Files not matching the
        partition_base_dir prefix will be skipped for partitioning discovery.
        The ignored files will still be part of the Dataset, but will not
        have partition information.
    exclude_invalid_files : bool, optional (default True)
        If True, invalid files will be excluded (file format specific check).
        This will incur IO for each file in a serial and single-threaded
        fashion. Disabling this feature will skip the IO, but unsupported
        files may be present in the Dataset (resulting in an error at scan
        time).
    ignore_prefixes : list, optional
        Files matching any of these prefixes will be ignored by the
        discovery process. This is matched to the basename of a path.
        By default this is ['.', '_'].
        Note that discovery happens only if a directory is passed as source.

    Returns
    -------
    dataset : Dataset
        Either a FileSystemDataset or a UnionDataset depending on the source
        parameter.

    Examples
    --------
    Opening a single file:

    >>> dataset("path/to/file.parquet", format="parquet")

    Opening a single file with an explicit schema:

    >>> dataset("path/to/file.parquet", schema=myschema, format="parquet")

    Opening a dataset for a single directory:

    >>> dataset("path/to/nyc-taxi/", format="parquet")
    >>> dataset("s3://mybucket/nyc-taxi/", format="parquet")

    Opening a dataset from a list of relative local paths:

    >>> dataset([
    ...     "part0/data.parquet",
    ...     "part1/data.parquet",
    ...     "part3/data.parquet",
    ... ], format='parquet')

    With filesystem provided:

    >>> paths = [
    ...     'part0/data.parquet',
    ...     'part1/data.parquet',
    ...     'part3/data.parquet',
    ... ]
    >>> dataset(paths, filesystem='file:///directory/prefix',
    ...         format='parquet')

    Which is equivalent to:

    >>> fs = SubTreeFileSystem("/directory/prefix", LocalFileSystem())
    >>> dataset(paths, filesystem=fs, format='parquet')

    With a remote filesystem URI:

    >>> paths = [
    ...     'nested/directory/part0/data.parquet',
    ...     'nested/directory/part1/data.parquet',
    ...     'nested/directory/part3/data.parquet',
    ... ]
    >>> dataset(paths, filesystem='s3://bucket/', format='parquet')

    Similarly to the local example, the directory prefix may be included in the
    filesystem URI:

    >>> dataset(paths, filesystem='s3://bucket/nested/directory',
    ...         format='parquet')

    Construction of a nested dataset:

    >>> dataset([
    ...     dataset("s3://old-taxi-data", format="parquet"),
    ...     dataset("local/path/to/data", format="ipc")
    ... ])
    """
    # collect the keyword arguments for later reuse
    kwargs = dict(schema=schema,
                  filesystem=filesystem,
                  partitioning=partitioning,
                  format=format,
                  partition_base_dir=partition_base_dir,
                  exclude_invalid_files=exclude_invalid_files,
                  selector_ignore_prefixes=ignore_prefixes)

    if _is_path_like(source):
        return _filesystem_dataset(source, **kwargs)
    elif isinstance(source, (tuple, list)):
        if all(_is_path_like(elem) for elem in source):
            return _filesystem_dataset(source, **kwargs)
        elif all(isinstance(elem, Dataset) for elem in source):
            return _union_dataset(source, **kwargs)
        elif all(
                isinstance(elem, (pa.RecordBatch, pa.Table))
                for elem in source):
            return _in_memory_dataset(source, **kwargs)
        else:
            unique_types = set(type(elem).__name__ for elem in source)
            type_names = ', '.join('{}'.format(t) for t in unique_types)
            raise TypeError(
                'Expected a list of path-like or dataset objects, or a list '
                'of batches or tables. The given list contains the following '
                'types: {}'.format(type_names))
    elif isinstance(source, (pa.RecordBatch, pa.Table)):
        return _in_memory_dataset(source, **kwargs)
    else:
        raise TypeError(
            'Expected a path-like, list of path-likes or a list of Datasets '
            'instead of the given type: {}'.format(type(source).__name__))
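A usage sketch for the dataset() entry point above, combining column
projection with predicate pushdown; the directory path is hypothetical:

import pyarrow.dataset as ds

# Discover a directory of parquet files as a single dataset.
data = ds.dataset('path/to/data', format='parquet')

# Project one column and filter rows at scan time.
table = data.to_table(columns=['x'], filter=ds.field('x') > 0)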