# Imports assumed by the test snippets below; mockfs is a pytest fixture
# supplied by the surrounding test suite.
import pyarrow as pa
import pyarrow.dataset as ds
import pytest


def test_filesystem_dataset(mockfs):
    schema = pa.schema([])
    file_format = ds.ParquetFileFormat()
    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.ScalarExpression(True), ds.ScalarExpression(True)]

    source = ds.FileSystemDataset(schema,
                                  root_partition=None,
                                  file_format=file_format,
                                  filesystem=mockfs,
                                  paths_or_selector=paths,
                                  partitions=partitions)
    assert isinstance(source.format, ds.ParquetFileFormat)

    root_partition = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.FieldExpression('source'),
        ds.ScalarExpression(1337)
    )
    partitions = [
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(1)
        ),
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(2)
        )
    ]
    source = ds.FileSystemDataset(paths_or_selector=paths,
                                  schema=schema,
                                  root_partition=root_partition,
                                  filesystem=mockfs,
                                  partitions=partitions,
                                  file_format=file_format)
    assert source.partition_expression.equals(root_partition)
    assert set(source.files) == set(paths)

def test_filesystem_dataset(mockfs):
    schema = pa.schema([
        pa.field('const', pa.int64())
    ])
    file_format = ds.ParquetFileFormat()
    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.ScalarExpression(True), ds.ScalarExpression(True)]

    dataset = ds.FileSystemDataset(schema,
                                   root_partition=None,
                                   file_format=file_format,
                                   filesystem=mockfs,
                                   paths_or_selector=paths,
                                   partitions=partitions)
    assert isinstance(dataset.format, ds.ParquetFileFormat)

    root_partition = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.FieldExpression('level'),
        ds.ScalarExpression(1337)
    )
    partitions = [
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(1)
        ),
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(2)
        )
    ]
    dataset = ds.FileSystemDataset(paths_or_selector=paths,
                                   schema=schema,
                                   root_partition=root_partition,
                                   filesystem=mockfs,
                                   partitions=partitions,
                                   file_format=file_format)
    assert dataset.partition_expression.equals(root_partition)
    assert set(dataset.files) == set(paths)

    fragments = list(dataset.get_fragments())
    for fragment, partition, path in zip(fragments, partitions, paths):
        assert fragment.partition_expression.equals(
            ds.AndExpression(root_partition, partition))
        assert fragment.path == path
        assert isinstance(fragment, ds.ParquetFileFragment)
        assert fragment.row_groups is None

        row_group_fragments = list(fragment.get_row_group_fragments())
        assert len(row_group_fragments) == 1
        assert isinstance(fragment, ds.ParquetFileFragment)
        assert row_group_fragments[0].path == path
        assert row_group_fragments[0].row_groups == {0}

    # test predicate pushdown using row group metadata
    fragments = list(dataset.get_fragments(filter=ds.field("const") == 0))
    assert len(fragments) == 2
    assert len(list(fragments[0].get_row_group_fragments())) == 1
    assert len(list(fragments[1].get_row_group_fragments())) == 0

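
# Side sketch (not part of the original test): in the pyarrow version assumed
# above, ds.field() plus operator overloading is expected to build the same
# expression tree as the verbose Expression constructors, so the filter used
# in the predicate-pushdown check could be spelled either way.
def _expression_spellings_sketch():
    verbose = ds.ComparisonExpression(ds.CompareOperator.Equal,
                                      ds.FieldExpression('const'),
                                      ds.ScalarExpression(0))
    compact = ds.field('const') == 0
    assert verbose.equals(compact)
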
def test_filesystem_dataset(mockfs):
    schema = pa.schema([])
    file_format = ds.ParquetFileFormat()
    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.ScalarExpression(True), ds.ScalarExpression(True)]

    dataset = ds.FileSystemDataset(
        schema,
        root_partition=None,
        file_format=file_format,
        filesystem=mockfs,
        paths_or_selector=paths,
        partitions=partitions
    )
    assert isinstance(dataset.format, ds.ParquetFileFormat)

    root_partition = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.FieldExpression('level'),
        ds.ScalarExpression(1337)
    )
    partitions = [
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(1)
        ),
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(2)
        )
    ]
    dataset = ds.FileSystemDataset(
        paths_or_selector=paths,
        schema=schema,
        root_partition=root_partition,
        filesystem=mockfs,
        partitions=partitions,
        file_format=file_format
    )
    assert dataset.partition_expression.equals(root_partition)
    assert set(dataset.files) == set(paths)

    fragments = list(dataset.get_fragments())
    assert fragments[0].partition_expression.equals(
        ds.AndExpression(root_partition, partitions[0]))
    assert fragments[1].partition_expression.equals(
        ds.AndExpression(root_partition, partitions[1]))
    assert fragments[0].path == paths[0]
    assert fragments[1].path == paths[1]

import os

import numpy as np

# Assumed imports for this snippet: pyarrow for parquet/dataset I/O, astropy
# for FITS headers, and hpgeom for the UNSEEN sentinel.  The remaining names
# (HealSparseCoverage, _compute_bitshift, is_integer_value, check_sentinel,
# WIDE_MASK) are healsparse internals defined elsewhere in the package.
from pyarrow import dataset, parquet
from astropy.io import fits
import hpgeom as hpg


def _read_map_parquet(healsparse_class, filepath, pixels=None, header=False,
                      degrade_nside=None, weightfile=None, reduction='mean',
                      use_threads=False):
    """
    Internal function to read in a HealSparseMap from a parquet dataset.

    Parameters
    ----------
    healsparse_class : `type`
        Type value of the HealSparseMap class.
    filepath : `str`
        Name of the file path to read.  Must be a parquet dataset.
    pixels : `list`, optional
        List of coverage map pixels to read.
    header : `bool`, optional
        Return the parquet metadata as well as map?  Default is False.
    degrade_nside : `int`, optional
        Degrade map to this nside on read.  None means leave as-is.
        Not yet implemented for parquet.
    weightfile : `str`, optional
        Floating-point map to supply weights for degrade wmean.  Must
        be a HealSparseMap (weighted degrade not supported for
        healpix degrade-on-read).  Not yet implemented for parquet.
    reduction : `str`, optional
        Reduction method with degrade-on-read.
        (mean, median, std, max, min, and, or, sum, prod, wmean).
        Not yet implemented for parquet.
    use_threads : `bool`, optional
        Use multithreaded reading.

    Returns
    -------
    healSparseMap : `HealSparseMap`
        HealSparseMap from file, covered by pixels
    header : `astropy.io.fits.Header` (if header=True)
        Header metadata for the map file.
    """
    ds = dataset.dataset(filepath, format='parquet', partitioning='hive')
    schema = ds.schema
    # Convert from byte strings
    md = {key.decode(): schema.metadata[key].decode()
          for key in schema.metadata}

    if 'healsparse::filetype' not in md:
        raise RuntimeError("Filepath %s is not a healsparse parquet map." % (filepath))
    if md['healsparse::filetype'] != 'healsparse':
        raise RuntimeError("Filepath %s is not a healsparse parquet map." % (filepath))
    cov_fname = os.path.join(filepath, '_coverage.parquet')
    if not os.path.isfile(cov_fname):
        # Note that this could be reconstructed from the information in the
        # file inefficiently.  This feature could be added in the future.
        raise RuntimeError("Filepath %s is missing coverage map %s" % (filepath, cov_fname))

    nside_sparse = int(md['healsparse::nside_sparse'])
    nside_coverage = int(md['healsparse::nside_coverage'])
    nside_io = int(md['healsparse::nside_io'])
    bitshift_io = _compute_bitshift(nside_io, nside_coverage)

    cov_tab = parquet.read_table(cov_fname, use_threads=use_threads)
    cov_pixels = cov_tab['cov_pix'].to_numpy()
    row_groups = cov_tab['row_group'].to_numpy()

    if pixels is not None:
        _pixels = np.atleast_1d(pixels)
        if len(np.unique(_pixels)) < len(_pixels):
            raise RuntimeError("Input list of pixels must be unique.")

        sub = np.clip(np.searchsorted(cov_pixels, _pixels), 0, cov_pixels.size - 1)
        ok, = np.where(cov_pixels[sub] == _pixels)
        if ok.size == 0:
            raise RuntimeError("None of the specified pixels are in the coverage map.")
        _pixels = np.sort(_pixels[ok])

        _pixels_io = np.right_shift(_pixels, bitshift_io)

        # Figure out row groups...
        matches = np.searchsorted(cov_pixels, _pixels)
        _row_groups_io = row_groups[matches]
    else:
        _pixels = cov_pixels
        _pixels_io = None
        _row_groups_io = None

    cov_map = HealSparseCoverage.make_from_pixels(nside_coverage, nside_sparse, _pixels)

    if md['healsparse::widemask'] == 'True':
        is_wide_mask = True
        wmult = int(md['healsparse::wwidth'])
    else:
        is_wide_mask = False
        wmult = 1

    if md['healsparse::primary'] != '':
        # This is a multi-column table.
        is_rec_array = True
        primary = md['healsparse::primary']
        columns = [name for name in schema.names
                   if name not in ['iopix', 'cov_pix']]
        dtype = [(name, schema.field(name).type.to_pandas_dtype())
                 for name in columns]
        primary_dtype = schema.field(primary).type.to_pandas_dtype()
    else:
        is_rec_array = False
        primary = None
        dtype = schema.field('sparse').type.to_pandas_dtype()
        primary_dtype = dtype
        columns = ['sparse']

    if md['healsparse::sentinel'] == 'UNSEEN':
        sentinel = primary_dtype(hpg.UNSEEN)
    elif md['healsparse::sentinel'] == 'False':
        sentinel = False
    elif md['healsparse::sentinel'] == 'True':
        sentinel = True
    else:
        sentinel = primary_dtype(md['healsparse::sentinel'])

        if is_integer_value(sentinel):
            sentinel = int(sentinel)
        elif not isinstance(sentinel, np.bool_):
            sentinel = float(sentinel)

    if is_rec_array:
        sparse_map = np.zeros((_pixels.size + 1) * cov_map.nfine_per_cov,
                              dtype=dtype)
        # Fill in the overflow (primary)
        sparse_map[primary][:cov_map.nfine_per_cov] = sentinel
        # Fill in the overflow (not primary)
        for d in dtype:
            if d[0] == primary:
                continue
            sparse_map[d[0]][:cov_map.nfine_per_cov] = check_sentinel(d[1], None)
    else:
        sparse_map = np.zeros((_pixels.size + 1) * cov_map.nfine_per_cov * wmult,
                              dtype=dtype)
        sparse_map[:cov_map.nfine_per_cov * wmult] = sentinel

    if _pixels_io is None:
        # Read the full table
        tab = ds.to_table(columns=columns, use_threads=use_threads)
    else:
        _pixels_io_unique = list(np.unique(_pixels_io))

        fragments = list(ds.get_fragments(
            filter=dataset.field('iopix').isin(_pixels_io_unique)))
        group_fragments = []
        for pixel_io, fragment in zip(_pixels_io_unique, fragments):
            groups = fragment.split_by_row_group()
            # Only append groups that are relevant
            use, = np.where(_pixels_io == pixel_io)
            for ind in use:
                group_fragments.append(groups[_row_groups_io[ind]])

        ds2 = dataset.FileSystemDataset(group_fragments, schema, ds.format)
        tab = ds2.to_table(columns=columns, use_threads=use_threads)

    if is_rec_array:
        for name in columns:
            sparse_map[name][cov_map.nfine_per_cov:] = tab[name].to_numpy()
    else:
        sparse_map[cov_map.nfine_per_cov * wmult:] = tab['sparse'].to_numpy()

        if is_wide_mask:
            sparse_map = sparse_map.reshape(
                (sparse_map.size // wmult, wmult)).astype(WIDE_MASK)

    healsparse_map = healsparse_class(cov_map=cov_map, sparse_map=sparse_map,
                                      nside_sparse=nside_sparse, primary=primary,
                                      sentinel=sentinel)

    if header:
        if 'healsparse::header' in md:
            hdr_string = md['healsparse::header']
            hdr = fits.Header.fromstring(hdr_string)
        else:
            hdr = fits.Header()

        return (healsparse_map, hdr)
    else:
        return healsparse_map

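
# Hypothetical usage sketch (not part of the library): the path
# 'mymap.hsparquet' and the pixel list are placeholders, and HealSparseMap is
# the public class healsparse exposes.  Only coverage pixels recorded in
# _coverage.parquet can be requested, so only their row groups are read.
def _read_map_parquet_usage_sketch():
    from healsparse import HealSparseMap

    hsp_map, hdr = _read_map_parquet(HealSparseMap, 'mymap.hsparquet',
                                     pixels=[1000, 1001], header=True)
    return hsp_map, hdr
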
def test_filesystem_dataset(mockfs):
    schema = pa.schema([
        pa.field('const', pa.int64())
    ])
    file_format = ds.ParquetFileFormat()
    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.scalar(True), ds.scalar(True)]

    dataset = ds.FileSystemDataset(
        schema=schema,
        format=file_format,
        filesystem=mockfs,
        paths_or_selector=paths,
        partitions=partitions
    )
    assert isinstance(dataset.format, ds.ParquetFileFormat)

    # the root_partition and partitions keywords have defaults
    dataset = ds.FileSystemDataset(
        paths, schema,
        format=file_format,
        filesystem=mockfs,
    )
    assert isinstance(dataset.format, ds.ParquetFileFormat)

    # validation of required arguments
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, format=file_format, filesystem=mockfs)
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, schema=schema, filesystem=mockfs)
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, schema=schema, format=file_format)

    # validation of root_partition
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, schema=schema, format=file_format,
                             filesystem=mockfs, root_partition=1)

    root_partition = ds.field('level') == ds.scalar(1337)
    partitions = [ds.field('part') == x for x in range(1, 3)]
    dataset = ds.FileSystemDataset(
        paths_or_selector=paths,
        schema=schema,
        root_partition=root_partition,
        filesystem=mockfs,
        partitions=partitions,
        format=file_format
    )
    assert dataset.partition_expression.equals(root_partition)
    assert set(dataset.files) == set(paths)

    fragments = list(dataset.get_fragments())
    for fragment, partition, path in zip(fragments, partitions, paths):
        assert fragment.partition_expression.equals(partition)
        assert fragment.path == path
        assert isinstance(fragment.format, ds.ParquetFileFormat)
        assert isinstance(fragment, ds.ParquetFileFragment)
        assert fragment.row_groups is None

        row_group_fragments = list(fragment.get_row_group_fragments())
        assert len(row_group_fragments) == 1
        assert isinstance(fragment, ds.ParquetFileFragment)
        assert row_group_fragments[0].path == path
        assert row_group_fragments[0].row_groups == {0}

    fragments = list(dataset.get_fragments(filter=ds.field("const") == 0))
    assert len(fragments) == 2
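
# Forward-compatibility sketch (an assumption, not part of the test above): in
# later pyarrow releases the path-based construction moved to the
# FileSystemDataset.from_paths classmethod with the same keyword names, so an
# equivalent dataset would be built like this.
def make_dataset_from_paths_sketch(paths, schema, file_format, filesystem):
    return ds.FileSystemDataset.from_paths(
        paths, schema=schema, format=file_format, filesystem=filesystem)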