Пример #1
0
def source(path_or_paths, filesystem=None, partitioning=None, format=None):
    """
    Create a discovery factory for a (multi-file) data source.

    Parameters
    ----------
    path_or_paths : str, pathlib.Path, or list of those
        Path to a file or to a directory containing the data files, or
        a list of paths.
    filesystem : FileSystem, default None
        By default will be inferred from the path.
    partitioning : Partitioning or PartitioningFactory or str or list of str
        The partitioning scheme specified with the ``partitioning()``
        function. A flavor string can be used as shortcut, and with a list of
        field names a DirectoryPartitioning will be inferred.
    format : str, default None
        File format name; "parquet" is substituted when the argument is
        falsy. Currently only "parquet" is supported.

    Returns
    -------
    FileSystemSourceFactory
        Factory built from the resolved filesystem, paths (or selector),
        format and factory options.
    """
    resolved_fs, selection = _ensure_fs_and_paths(path_or_paths, filesystem)
    scheme = _ensure_partitioning(partitioning)
    file_format = _ensure_format(format if format else "parquet")

    # TODO pass through options
    factory_options = FileSystemFactoryOptions()

    # A factory and a concrete partitioning are stored on different option
    # fields; any other value (e.g. None) leaves the options untouched.
    if isinstance(scheme, PartitioningFactory):
        factory_options.partitioning_factory = scheme
    elif isinstance(scheme, Partitioning):
        factory_options.partitioning = scheme

    return FileSystemSourceFactory(
        resolved_fs, selection, file_format, factory_options
    )
Пример #2
0
def source(path_or_paths, filesystem=None, partitioning=None, format=None):
    """
    Create a discovery factory for a (multi-file) data source.

    Parameters
    ----------
    path_or_paths : str, pathlib.Path, or list of those
        Path to a file or to a directory containing the data files, or
        a list of paths.
    filesystem : FileSystem, default None
        By default will be inferred from the path.
    partitioning : Partitioning or PartitioningFactory or str or list of str
        The partitioning scheme specified with the ``partitioning()``
        function. A flavor string can be used as shortcut, and with a list of
        field names a DirectoryPartitioning will be inferred.
    format : str or FileFormat, default None
        The string "parquet" (also used when the argument is falsy) selects
        ``ParquetFileFormat``; a ``FileFormat`` instance is used as given.

    Returns
    -------
    FileSystemSourceFactory
        The unfinished factory rather than a finished source, because
        ``dataset`` might need to finish it with a unified schema.

    Raises
    ------
    ValueError
        If ``format`` is neither "parquet" nor a ``FileFormat`` instance.
    """
    resolved_fs, selection = _ensure_fs_and_paths(path_or_paths, filesystem)
    scheme = _ensure_partitioning(partitioning)

    # Resolve the requested format into a FileFormat instance.
    requested = format if format else "parquet"
    if requested == "parquet":
        file_format = ParquetFileFormat()
    elif isinstance(requested, FileFormat):
        file_format = requested
    else:
        raise ValueError("format '{0}' is not supported".format(requested))

    # TODO pass through options
    factory_options = FileSystemFactoryOptions()

    # A factory and a concrete partitioning are stored on different option
    # fields; any other value (e.g. None) leaves the options untouched.
    if isinstance(scheme, PartitioningFactory):
        factory_options.partitioning_factory = scheme
    elif isinstance(scheme, Partitioning):
        factory_options.partitioning = scheme

    # TODO return Source if a specific schema was passed?

    # need to return SourceFactory since `dataset` might need to
    # finish the factory with a unified schema
    return FileSystemSourceFactory(
        resolved_fs, selection, file_format, factory_options
    )
Пример #3
0
def factory(path_or_paths, filesystem=None, partitioning=None,
            format=None):
    """
    Create a factory which can be used to build a Dataset.

    Every entry of ``path_or_paths`` gets its own
    ``FileSystemDatasetFactory``; several entries are combined into a
    ``UnionDatasetFactory``.

    Parameters
    ----------
    path_or_paths : str, pathlib.Path, or list of those
        Path to a file or to a directory containing the data files, or
        a list (or tuple) of paths. Any other value is treated as a single
        path.
    filesystem : FileSystem, default None
        By default will be inferred from the path.
    partitioning : Partitioning or PartitioningFactory or str or list of str
        The partitioning scheme specified with the ``partitioning()``
        function. A flavor string can be used as shortcut, and with a list of
        field names a DirectoryPartitioning will be inferred.
    format : str, default None
        File format name; "parquet" is substituted when the argument is
        falsy. Currently only "parquet" is supported.

    Returns
    -------
    FileSystemDatasetFactory or UnionDatasetFactory
        The single factory when exactly one path was given, otherwise a
        union over all per-path factories.

    Raises
    ------
    ValueError
        If an empty list or tuple of paths was given.
    """
    if isinstance(path_or_paths, (list, tuple)):
        candidates = path_or_paths
    else:
        candidates = [path_or_paths]

    scheme = _ensure_partitioning(partitioning)
    file_format = _ensure_format(format if format else "parquet")

    # TODO pass through options
    factory_options = FileSystemFactoryOptions()

    # A factory and a concrete partitioning are stored on different option
    # fields; any other value (e.g. None) leaves the options untouched.
    if isinstance(scheme, PartitioningFactory):
        factory_options.partitioning_factory = scheme
    elif isinstance(scheme, Partitioning):
        factory_options.partitioning = scheme

    def _build(path):
        # The filesystem is resolved per path; format and options are shared.
        fs, selection = _ensure_fs_and_paths(path, filesystem)
        return FileSystemDatasetFactory(fs, selection, file_format,
                                        factory_options)

    built = [_build(path) for path in candidates]

    if not built:
        raise ValueError("Need at least one path")
    if len(built) == 1:
        return built[0]
    return UnionDatasetFactory(built)