# Imports assumed by the test snippets below; mockfs is a pytest fixture
# supplied by the surrounding test suite.
import pyarrow as pa
import pyarrow.dataset as ds
import pytest


def test_filesystem_dataset(mockfs):
    schema = pa.schema([])
    file_format = ds.ParquetFileFormat()
    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.ScalarExpression(True), ds.ScalarExpression(True)]

    source = ds.FileSystemDataset(schema,
                                  root_partition=None,
                                  file_format=file_format,
                                  filesystem=mockfs,
                                  paths_or_selector=paths,
                                  partitions=partitions)
    assert isinstance(source.format, ds.ParquetFileFormat)

    root_partition = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.FieldExpression('source'),
        ds.ScalarExpression(1337)
    )
    partitions = [
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(1)
        ),
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(2)
        )
    ]
    source = ds.FileSystemDataset(paths_or_selector=paths,
                                  schema=schema,
                                  root_partition=root_partition,
                                  filesystem=mockfs,
                                  partitions=partitions,
                                  file_format=file_format)
    assert source.partition_expression.equals(root_partition)
    assert set(source.files) == set(paths)

def test_filesystem_dataset(mockfs):
    schema = pa.schema([
        pa.field('const', pa.int64())
    ])
    file_format = ds.ParquetFileFormat()
    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.ScalarExpression(True), ds.ScalarExpression(True)]

    dataset = ds.FileSystemDataset(schema,
                                   root_partition=None,
                                   file_format=file_format,
                                   filesystem=mockfs,
                                   paths_or_selector=paths,
                                   partitions=partitions)
    assert isinstance(dataset.format, ds.ParquetFileFormat)

    root_partition = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.FieldExpression('level'),
        ds.ScalarExpression(1337)
    )
    partitions = [
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(1)
        ),
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(2)
        )
    ]
    dataset = ds.FileSystemDataset(paths_or_selector=paths,
                                   schema=schema,
                                   root_partition=root_partition,
                                   filesystem=mockfs,
                                   partitions=partitions,
                                   file_format=file_format)
    assert dataset.partition_expression.equals(root_partition)
    assert set(dataset.files) == set(paths)

    fragments = list(dataset.get_fragments())
    for fragment, partition, path in zip(fragments, partitions, paths):
        assert fragment.partition_expression.equals(
            ds.AndExpression(root_partition, partition))
        assert fragment.path == path
        assert isinstance(fragment, ds.ParquetFileFragment)
        assert fragment.row_groups is None

        row_group_fragments = list(fragment.get_row_group_fragments())
        assert len(row_group_fragments) == 1
        assert isinstance(fragment, ds.ParquetFileFragment)
        assert row_group_fragments[0].path == path
        assert row_group_fragments[0].row_groups == {0}

    # test predicate pushdown using row group metadata
    fragments = list(dataset.get_fragments(filter=ds.field("const") == 0))
    assert len(fragments) == 2
    assert len(list(fragments[0].get_row_group_fragments())) == 1
    assert len(list(fragments[1].get_row_group_fragments())) == 0

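
# Side sketch (not part of the original test): in the pyarrow version assumed
# above, ds.field() plus operator overloading is expected to build the same
# expression tree as the verbose Expression constructors, so the filter used
# in the predicate-pushdown check could be spelled either way.
def _expression_spellings_sketch():
    verbose = ds.ComparisonExpression(ds.CompareOperator.Equal,
                                      ds.FieldExpression('const'),
                                      ds.ScalarExpression(0))
    compact = ds.field('const') == 0
    assert verbose.equals(compact)
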
def test_filesystem_dataset(mockfs):
    schema = pa.schema([])
    file_format = ds.ParquetFileFormat()
    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.ScalarExpression(True), ds.ScalarExpression(True)]

    dataset = ds.FileSystemDataset(
        schema,
        root_partition=None,
        file_format=file_format,
        filesystem=mockfs,
        paths_or_selector=paths,
        partitions=partitions
    )
    assert isinstance(dataset.format, ds.ParquetFileFormat)

    root_partition = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.FieldExpression('level'),
        ds.ScalarExpression(1337)
    )
    partitions = [
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(1)
        ),
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(2)
        )
    ]
    dataset = ds.FileSystemDataset(
        paths_or_selector=paths,
        schema=schema,
        root_partition=root_partition,
        filesystem=mockfs,
        partitions=partitions,
        file_format=file_format
    )
    assert dataset.partition_expression.equals(root_partition)
    assert set(dataset.files) == set(paths)

    fragments = list(dataset.get_fragments())
    assert fragments[0].partition_expression.equals(
        ds.AndExpression(root_partition, partitions[0]))
    assert fragments[1].partition_expression.equals(
        ds.AndExpression(root_partition, partitions[1]))
    assert fragments[0].path == paths[0]
    assert fragments[1].path == paths[1]

import os

import numpy as np

# Assumed imports for this snippet: pyarrow for parquet/dataset I/O, astropy
# for FITS headers, and hpgeom for the UNSEEN sentinel.  The remaining names
# (HealSparseCoverage, _compute_bitshift, is_integer_value, check_sentinel,
# WIDE_MASK) are healsparse internals defined elsewhere in the package.
from pyarrow import dataset, parquet
from astropy.io import fits
import hpgeom as hpg


def _read_map_parquet(healsparse_class, filepath, pixels=None, header=False,
                      degrade_nside=None, weightfile=None, reduction='mean',
                      use_threads=False):
    """
    Internal function to read in a HealSparseMap from a parquet dataset.

    Parameters
    ----------
    healsparse_class : `type`
        Type value of the HealSparseMap class.
    filepath : `str`
        Name of the file path to read.  Must be a parquet dataset.
    pixels : `list`, optional
        List of coverage map pixels to read.
    header : `bool`, optional
        Return the parquet metadata as well as map?  Default is False.
    degrade_nside : `int`, optional
        Degrade map to this nside on read.  None means leave as-is.
        Not yet implemented for parquet.
    weightfile : `str`, optional
        Floating-point map to supply weights for degrade wmean.  Must
        be a HealSparseMap (weighted degrade not supported for
        healpix degrade-on-read).  Not yet implemented for parquet.
    reduction : `str`, optional
        Reduction method with degrade-on-read.
        (mean, median, std, max, min, and, or, sum, prod, wmean).
        Not yet implemented for parquet.
    use_threads : `bool`, optional
        Use multithreaded reading.

    Returns
    -------
    healSparseMap : `HealSparseMap`
        HealSparseMap from file, covered by pixels
    header : `astropy.io.fits.Header` (if header=True)
        Header metadata for the map file.
    """
    ds = dataset.dataset(filepath, format='parquet', partitioning='hive')
    schema = ds.schema
    # Convert from byte strings
    md = {key.decode(): schema.metadata[key].decode()
          for key in schema.metadata}

    if 'healsparse::filetype' not in md:
        raise RuntimeError("Filepath %s is not a healsparse parquet map." % (filepath))
    if md['healsparse::filetype'] != 'healsparse':
        raise RuntimeError("Filepath %s is not a healsparse parquet map." % (filepath))
    cov_fname = os.path.join(filepath, '_coverage.parquet')
    if not os.path.isfile(cov_fname):
        # Note that this could be reconstructed from the information in the
        # file inefficiently.  This feature could be added in the future.
        raise RuntimeError("Filepath %s is missing coverage map %s" % (filepath, cov_fname))

    nside_sparse = int(md['healsparse::nside_sparse'])
    nside_coverage = int(md['healsparse::nside_coverage'])
    nside_io = int(md['healsparse::nside_io'])
    bitshift_io = _compute_bitshift(nside_io, nside_coverage)

    cov_tab = parquet.read_table(cov_fname, use_threads=use_threads)
    cov_pixels = cov_tab['cov_pix'].to_numpy()
    row_groups = cov_tab['row_group'].to_numpy()

    if pixels is not None:
        _pixels = np.atleast_1d(pixels)
        if len(np.unique(_pixels)) < len(_pixels):
            raise RuntimeError("Input list of pixels must be unique.")

        sub = np.clip(np.searchsorted(cov_pixels, _pixels), 0, cov_pixels.size - 1)
        ok, = np.where(cov_pixels[sub] == _pixels)
        if ok.size == 0:
            raise RuntimeError("None of the specified pixels are in the coverage map.")
        _pixels = np.sort(_pixels[ok])

        _pixels_io = np.right_shift(_pixels, bitshift_io)

        # Figure out row groups...
        matches = np.searchsorted(cov_pixels, _pixels)
        _row_groups_io = row_groups[matches]
    else:
        _pixels = cov_pixels
        _pixels_io = None
        _row_groups_io = None

    cov_map = HealSparseCoverage.make_from_pixels(nside_coverage, nside_sparse, _pixels)

    if md['healsparse::widemask'] == 'True':
        is_wide_mask = True
        wmult = int(md['healsparse::wwidth'])
    else:
        is_wide_mask = False
        wmult = 1

    if md['healsparse::primary'] != '':
        # This is a multi-column table.
        is_rec_array = True
        primary = md['healsparse::primary']
        columns = [name for name in schema.names
                   if name not in ['iopix', 'cov_pix']]
        dtype = [(name, schema.field(name).type.to_pandas_dtype())
                 for name in columns]
        primary_dtype = schema.field(primary).type.to_pandas_dtype()
    else:
        is_rec_array = False
        primary = None
        dtype = schema.field('sparse').type.to_pandas_dtype()
        primary_dtype = dtype
        columns = ['sparse']

    if md['healsparse::sentinel'] == 'UNSEEN':
        sentinel = primary_dtype(hpg.UNSEEN)
    elif md['healsparse::sentinel'] == 'False':
        sentinel = False
    elif md['healsparse::sentinel'] == 'True':
        sentinel = True
    else:
        sentinel = primary_dtype(md['healsparse::sentinel'])

        if is_integer_value(sentinel):
            sentinel = int(sentinel)
        elif not isinstance(sentinel, np.bool_):
            sentinel = float(sentinel)

    if is_rec_array:
        sparse_map = np.zeros((_pixels.size + 1) * cov_map.nfine_per_cov,
                              dtype=dtype)
        # Fill in the overflow (primary)
        sparse_map[primary][:cov_map.nfine_per_cov] = sentinel
        # Fill in the overflow (not primary)
        for d in dtype:
            if d[0] == primary:
                continue
            sparse_map[d[0]][:cov_map.nfine_per_cov] = check_sentinel(d[1], None)
    else:
        sparse_map = np.zeros((_pixels.size + 1) * cov_map.nfine_per_cov * wmult,
                              dtype=dtype)
        sparse_map[:cov_map.nfine_per_cov * wmult] = sentinel

    if _pixels_io is None:
        # Read the full table
        tab = ds.to_table(columns=columns, use_threads=use_threads)
    else:
        _pixels_io_unique = list(np.unique(_pixels_io))

        fragments = list(ds.get_fragments(
            filter=dataset.field('iopix').isin(_pixels_io_unique)))
        group_fragments = []
        for pixel_io, fragment in zip(_pixels_io_unique, fragments):
            groups = fragment.split_by_row_group()
            # Only append groups that are relevant
            use, = np.where(_pixels_io == pixel_io)
            for ind in use:
                group_fragments.append(groups[_row_groups_io[ind]])

        ds2 = dataset.FileSystemDataset(group_fragments, schema, ds.format)
        tab = ds2.to_table(columns=columns, use_threads=use_threads)

    if is_rec_array:
        for name in columns:
            sparse_map[name][cov_map.nfine_per_cov:] = tab[name].to_numpy()
    else:
        sparse_map[cov_map.nfine_per_cov * wmult:] = tab['sparse'].to_numpy()

        if is_wide_mask:
            sparse_map = sparse_map.reshape(
                (sparse_map.size // wmult, wmult)).astype(WIDE_MASK)

    healsparse_map = healsparse_class(cov_map=cov_map, sparse_map=sparse_map,
                                      nside_sparse=nside_sparse, primary=primary,
                                      sentinel=sentinel)

    if header:
        if 'healsparse::header' in md:
            hdr_string = md['healsparse::header']
            hdr = fits.Header.fromstring(hdr_string)
        else:
            hdr = fits.Header()

        return (healsparse_map, hdr)
    else:
        return healsparse_map

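
# Hypothetical usage sketch (not part of the library): the path
# 'mymap.hsparquet' and the pixel list are placeholders, and HealSparseMap is
# the public class healsparse exposes.  Only coverage pixels recorded in
# _coverage.parquet can be requested, so only their row groups are read.
def _read_map_parquet_usage_sketch():
    from healsparse import HealSparseMap

    hsp_map, hdr = _read_map_parquet(HealSparseMap, 'mymap.hsparquet',
                                     pixels=[1000, 1001], header=True)
    return hsp_map, hdr
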
def test_filesystem_dataset(mockfs):
    schema = pa.schema([
        pa.field('const', pa.int64())
    ])
    file_format = ds.ParquetFileFormat()
    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.scalar(True), ds.scalar(True)]

    dataset = ds.FileSystemDataset(
        schema=schema,
        format=file_format,
        filesystem=mockfs,
        paths_or_selector=paths,
        partitions=partitions
    )
    assert isinstance(dataset.format, ds.ParquetFileFormat)

    # the root_partition and partitions keywords have defaults
    dataset = ds.FileSystemDataset(
        paths, schema,
        format=file_format,
        filesystem=mockfs,
    )
    assert isinstance(dataset.format, ds.ParquetFileFormat)

    # validation of required arguments
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, format=file_format, filesystem=mockfs)
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, schema=schema, filesystem=mockfs)
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, schema=schema, format=file_format)

    # validation of root_partition
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, schema=schema, format=file_format,
                             filesystem=mockfs, root_partition=1)

    root_partition = ds.field('level') == ds.scalar(1337)
    partitions = [ds.field('part') == x for x in range(1, 3)]
    dataset = ds.FileSystemDataset(
        paths_or_selector=paths,
        schema=schema,
        root_partition=root_partition,
        filesystem=mockfs,
        partitions=partitions,
        format=file_format
    )
    assert dataset.partition_expression.equals(root_partition)
    assert set(dataset.files) == set(paths)

    fragments = list(dataset.get_fragments())
    for fragment, partition, path in zip(fragments, partitions, paths):
        assert fragment.partition_expression.equals(partition)
        assert fragment.path == path
        assert isinstance(fragment.format, ds.ParquetFileFormat)
        assert isinstance(fragment, ds.ParquetFileFragment)
        assert fragment.row_groups is None

        row_group_fragments = list(fragment.get_row_group_fragments())
        assert len(row_group_fragments) == 1
        assert isinstance(fragment, ds.ParquetFileFragment)
        assert row_group_fragments[0].path == path
        assert row_group_fragments[0].row_groups == {0}

    fragments = list(dataset.get_fragments(filter=ds.field("const") == 0))
    assert len(fragments) == 2
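
# Forward-compatibility sketch (an assumption, not part of the test above): in
# later pyarrow releases the path-based construction moved to the
# FileSystemDataset.from_paths classmethod with the same keyword names, so an
# equivalent dataset would be built like this.
def make_dataset_from_paths_sketch(paths, schema, file_format, filesystem):
    return ds.FileSystemDataset.from_paths(
        paths, schema=schema, format=file_format, filesystem=filesystem)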