Example #1
async def putStorBytes(app, key, data, filter_ops=None, bucket=None):
    """ Store byte string as S3 object with given key
    """

    client = _getStorageClient(app)
    if not bucket:
        bucket = app['bucket_name']
    if key[0] == '/':
        key = key[1:]  # no leading slash
    shuffle = -1  # auto-shuffle
    clevel = 5
    cname = None  # compressor name
    if filter_ops:
        if "compressor" in filter_ops:
            cname = filter_ops["compressor"]
        if "use_shuffle" in filter_ops and not filter_ops['use_shuffle']:
            shuffle = 0 # client indicates to turn off shuffling
        if "level" in filter_ops:
            clevel = filter_ops["level"]
    log.info(f"putStorBytes({bucket}/{key}), {len(data)} bytes shuffle: {shuffle} compressor: {cname} level: {clevel}")
   
    if cname:
        try:
            blosc = codecs.Blosc(cname=cname, clevel=clevel, shuffle=shuffle)
            cdata = blosc.encode(data)
            # TBD: add cname in blosc constructor
            log.info(f"compressed from {len(data)} bytes to {len(cdata)} bytes using filter: {blosc.cname} with level: {blosc.clevel}")
            data = cdata
        except Exception as e:
            log.error(f"got exception using blosc encoding: {e}")
            raise HTTPInternalServerError()

    rsp = await client.put_object(key, data, bucket=bucket)

    return rsp
Example #2
def process_one_tile(lat, lon):
    """
    Given lat and lon to select a region, calculate the
    corresponding emissions for each year from 2001 to 2018

    Parameters
    ----------
    lat : float
        Latitude in degrees
    lon : float
        Longitude in degrees

    Returns
    -------
    url : string
        Url where a processed tile is located
    """

    url = f"gs://carbonplan-climatetrace/v0/tiles/{lat}_{lon}.zarr"

    encoding = {"emissions": {"compressor": numcodecs.Blosc()}}

    mapper = fsspec.get_mapper(url)

    with dask.config.set(scheduler="threads"):
        ds = open_hansen_2018_tile(lat, lon)
        ds = calc_one_tile(ds)[["emissions"]]
        ds = ds.chunk({"lat": 4000, "lon": 4000, "year": 2})
        ds.to_zarr(mapper, encoding=encoding, mode="w", consolidated=True)
        return url
Example #3
    def test_read_zarr(self):
        from z5py.dataset import Dataset
        dtypes = list(Dataset._dtype_dict.keys())
        zarr_compressors = {'blosc': numcodecs.Blosc(),
                            'zlib': numcodecs.Zlib(),
                            'raw': None,
                            'bzip2': numcodecs.BZ2()}

        # the conda-forge version of numcodecs is not up to date
        # for python 3.5 and GZip is missing,
        # that's why we need to check explicitly here to not fail the test
        if hasattr(numcodecs, 'GZip'):
            zarr_compressors.update({'gzip': numcodecs.GZip()})

        zarr.open(self.path)
        for dtype in dtypes:
            for compression in zarr_compressors:
                data = np.random.randint(0, 127, size=self.shape).astype(dtype)
                # write the data with zarr
                key = 'test_%s_%s' % (dtype, compression)
                ar = zarr.open(os.path.join(self.path, key), mode='w',
                               shape=self.shape, chunks=self.chunks,
                               dtype=dtype, compressor=zarr_compressors[compression])
                ar[:] = data
                # read with z5py
                out = z5py.File(self.path)[key][:]
                self.assertEqual(data.shape, out.shape)
                self.assertTrue(np.allclose(data, out))
Example #4
    def test_read_zarr(self):
        import numcodecs
        from z5py.dataset import Dataset
        dtypes = list(Dataset._zarr_dtype_dict.values())
        compressions = Dataset.compressors_zarr
        zarr_compressors = {
            'blosc': numcodecs.Blosc(),
            'zlib': numcodecs.Zlib(),
            'raw': None,
            'bzip2': numcodecs.BZ2()
        }

        for dtype in dtypes:
            for compression in compressions:
                data = np.random.randint(0, 127, size=self.shape).astype(dtype)
                # write the data with zarr
                key = 'test_%s_%s' % (dtype, compression)
                ar = zarr.open(os.path.join(self.path, key),
                               mode='w',
                               shape=self.shape,
                               chunks=self.chunks,
                               dtype=dtype,
                               compressor=zarr_compressors[compression])
                ar[:] = data
                # read with z5py
                out = z5py.File(self.path)[key][:]
                self.assertEqual(data.shape, out.shape)
                self.assertTrue(np.allclose(data, out))
Example #5
def save_da_to_zarr(da,
                    zarr_bucket,
                    dim_order=['time', 'x', 'y', 'variable'],
                    zarr_mode='a'):
    da = da.transpose(*dim_order)
    da['time'] = get_time_as_unix(da)

    _, y_size, x_size, _ = da.shape
    out_store = gcsfs.GCSMap(root=zarr_bucket, gcs=gcsfs.GCSFileSystem())

    chunks = (36, y_size, x_size, 1)

    ds = xr.Dataset({'stacked_eumetsat_data': da.chunk(chunks)})

    zarr_mode_to_extra_kwargs = {
        'a': {
            'append_dim': 'time'
        },
        'w': {
            'encoding': {
                'stacked_eumetsat_data': {
                    'compressor': numcodecs.Blosc(cname='zstd', clevel=5),
                    'chunks': chunks
                }
            }
        }
    }

    assert zarr_mode in ['a', 'w'], '`zarr_mode` must be one of: `a`, `w`'
    extra_kwargs = zarr_mode_to_extra_kwargs[zarr_mode]

    ds.to_zarr(out_store, mode=zarr_mode, consolidated=True, **extra_kwargs)
    print('Saved file to zarr bucket')
    return ds
Example #6
 def __init__(self,
              fs: fsspec.AbstractFileSystem,
              root: str,
              compressor: Optional[numcodecs.Blosc] = None):
     self.fs = fs
     self.compressor = compressor or numcodecs.Blosc()
     self.root = root
     self._transactions = set()
     self._deleted = set()
     self.fs.mkdirs(root, exist_ok=True)
Example #7
def create_coarsened_global_raster():
    with fsspec.open(HANSEN_FILE_LIST) as f:
        lines = f.read().decode().splitlines()
    print("We are working with {} different files".format(len(lines)))

    # the arrays where you'll throw your active lat/lon permutations
    lats = []
    lons = []

    encoding = {"emissions": {"compressor": numcodecs.Blosc()}}

    for line in lines:
        pieces = line.split("_")
        lat = pieces[-2]
        lon = pieces[-1].split(".")[0]

        if (lat in LATS_TO_RUN) and (lon in LONS_TO_RUN):
            lats.append(lat)
            lons.append(lon)
    all_to_do = len(lats)
    done = 0
    list_all_coarsened = []
    for lat, lon in list(zip(lats, lons)):

        try:
            # We only have data over land so this will throw
            # an exception if the tile errors (likely for lack of data - could be improved to check
            # that it fails precisely because it is an ocean tile - aka we check that all of the land
            # cells run appropriately)
            mapper = fsspec.get_mapper(OUT_TILE_TEMPLATE.format(lat, lon))
            da_global = xr.open_zarr(mapper, consolidated=True)
            # We only need a single year slice here to compute the grid-cell areas
            da_mask = da_global.isel(year=0, drop=True)
            da_area = compute_grid_area(da_mask)
            list_all_coarsened.append((da_global * da_area).coarsen(
                lat=COARSENING_FACTOR,
                lon=COARSENING_FACTOR).sum().compute(retries=4))
        except ValueError:
            print("{} {} did not work (likely because it is ocean) booooo".
                  format(lat, lon))
        done += 1
        print("completed {} of {} tiles".format(done, all_to_do))
    coarsened_url = OUT_RASTER_FILE

    mapper = fsspec.get_mapper(coarsened_url)

    combined_ds = xr.combine_by_coords(list_all_coarsened,
                                       compat="override",
                                       coords="minimal")
    combined_ds = combined_ds.chunk({"lat": -1, "lon": -1, "year": 1})
    task = combined_ds.to_zarr(mapper,
                               encoding=encoding,
                               mode="w",
                               compute=False)
    dask.compute(task, retries=4)
Example #8
    def create(self, mode="w", compressor=numcodecs.Blosc("zstd", 5)):
        """
        Create or open for append a dataset

        :param mode: "w" to create (overwriting any existing data), otherwise open for append
        :param compressor: the numcodecs compressor to use for the zarr arrays
        """
        store = zarr.NestedDirectoryStore(
            self.dest)
        self.zgroup = zarr.group(store,
                                 overwrite=(mode == "w"))
        self.compressor = compressor
Example #9
 def create(cls, path: Path, array_info: ArrayInfo) -> "ZarrArray":
     assert array_info.data_format == cls.data_format
     assert array_info.chunks_per_shard == Vec3Int.full(
         1), "Zarr storage doesn't support sharding yet"
     zarr.create(
         shape=(array_info.num_channels, 1, 1, 1),
         chunks=(array_info.num_channels, ) +
         array_info.chunk_size.to_tuple(),
         dtype=array_info.voxel_type,
         compressor=(numcodecs.Blosc(
             cname="zstd", clevel=3, shuffle=numcodecs.Blosc.SHUFFLE)
                     if array_info.compression_mode else None),
         store=_fsstore_from_path(path),
         order="F",
     )
     return ZarrArray(path)
Example #10
    def __init__(self,
                 filename,
                 overwrite=False,
                 separate=False,
                 out_block_type='zarr',
                 keep_blocks=False,
                 gdal_cache=512,
                 **kwargs):

        if out_block_type == 'zarr':
            if not ZARR_INSTALLED:
                logger.exception('Zarr and numcodecs must be installed.')

        self.filename = filename
        self.overwrite = overwrite
        self.separate = separate
        self.out_block_type = out_block_type
        self.keep_blocks = keep_blocks
        self.gdal_cache = gdal_cache
        self.kwargs = kwargs

        self.d_name, f_name = os.path.split(self.filename)
        self.f_base, self.f_ext = os.path.splitext(f_name)

        self.root = None
        self.compressor = None
        self.sub_dir = None
        self.zarr_file = None

        if self.separate:

            if self.out_block_type.lower() not in ['gtiff', 'zarr']:

                logger.warning('  The output block type is not recognized. Saving blocks as zarr files.')
                self.out_block_type = 'zarr'

            self.sub_dir = os.path.join(self.d_name, 'sub_tmp_')
            self.zarr_file = os.path.join(self.sub_dir, 'data.zarr')

            self.compressor = numcodecs.Blosc(cname='zstd',
                                              clevel=3,
                                              shuffle=numcodecs.Blosc.BITSHUFFLE)

            if os.path.isdir(self.sub_dir):
                shutil.rmtree(self.sub_dir)

            os.makedirs(self.sub_dir)
Example #11
def output_to_zarr(path, seq_id, sample_id, arrays, cname, clevel, shuffle):

    log('Output zarr to {!r} ...'.format(path))

    store = zarr.ZipStore(path, mode='w')
    root = zarr.group(store=store)
    callset = root.create_group(sample_id)
    seq_group = callset.require_group(seq_id)
    calldata_group = seq_group.require_group('calldata')
    variants_group = seq_group.require_group('variants')

    compressor = numcodecs.Blosc(cname=cname, clevel=clevel, shuffle=shuffle)

    for key, value in arrays.items():
        calldata_group.create_dataset(key, data=value, compressor=compressor)
        log('Created output array: ' + repr(key))

    store.close()
Example #12
def setup_output(output_path, seqid, field, example_arr, samples, cname,
                 clevel, shuffle, chunk_width):
    log('Setting up output at {!r} ...'.format(output_path))
    callset = zarr.open_group(output_path, mode='a')
    seq_group = callset.require_group(seqid)
    field_root, field_id = field.split("/")
    root_group = seq_group.require_group(field_root)
    output_shape = (example_arr.shape[0], len(samples)) + example_arr.shape[2:]

    c1 = 2**26 // np.prod((chunk_width, ) + example_arr.shape[2:])
    output_chunks = (c1, chunk_width) + example_arr.chunks[2:]

    compressor = numcodecs.Blosc(cname=cname, clevel=clevel, shuffle=shuffle)
    output_arr = root_group.empty_like(field_id,
                                       example_arr,
                                       shape=output_shape,
                                       chunks=output_chunks,
                                       overwrite=True,
                                       compressor=compressor)
    log('Created output array: ' + repr(output_arr))
    return output_arr
Example #13
def vcf_to_zarr(vcf_in, tabix_exec, chrom):
    """Convert on-disk VCF to on-disk Zarr database using
    scikit-allel and zarr modules

    Zarr database written to same directory as input VCF
    
    Args:
        vcf_in (str): Path to input VCF on disk
        tabix_exec (str): Full path to tabix executable
        chrom (str): Chromosome for which Zarr database should be created

    Returns:
        None
    """
    vcf_path = os.path.dirname(vcf_in)

    # allel.vcf_to_zarr creates a directory containing the Zarr database
    # Set Zarr database outdir
    zarr_base = os.path.basename(vcf_in).split('.')[0]
    zarr_out = vcf_path + '/' + zarr_base + '.zarr'

    # Rename 'numalt' field. Required by Zarr to distinguish `NUMALT` from `numalt`
    # `numalt` is automatically computed by scikit-allel
    rename_dict = {'variants/numalt':'variants/numalt_sci'}

    # Use vcf_to_zarr function from scikit-allel to create zarr database
    # Currently optimized for biallelic SNP VCF but easy to extend functionality
    allel.vcf_to_zarr(
            input=vcf_in,
            output=zarr_out,
            overwrite=True,
            group=chrom,
            rename_fields=rename_dict,
            fields='*',
            alt_number=1,
            tabix=tabix_exec,
            region=chrom,
            compressor=numcodecs.Blosc(cname='zstd', clevel=1, shuffle=False)
            )
Example #14
def save_samples(path, mcmc, title=None):
    path = os.fspath(path)

    names = pd.Series(mcmc.column_names).str.replace(r'\.\d+$', '').unique()
    _log.info('saving %d draws of %d variables to %s',
              mcmc.chains * mcmc.draws, len(names), path)

    comp = nc.Blosc('zstd', 9, shuffle=nc.blosc.BITSHUFFLE)
    with zarr.ZipStore(path) as store:
        g = zarr.group(store)
        for name in names:
            if name.startswith('_') or name == 'log_lik':
                continue  # we don't save names prefixed with _
            draws = mcmc.get_drawset([name])
            nrows, ncols = draws.shape
            if ncols > 1:
                _log.info('saving %d draws of %d-dimensional vector %s', nrows,
                          ncols, name)
                arr = draws.to_numpy()
            else:
                _log.info('saving %d draws of scalar %s', nrows, name)
                arr = draws.to_numpy().reshape(nrows)
            g.array(name, arr, compressor=comp)

        if 'log_lik' in names:
            _log.info('computing LPPD')
            ll = mcmc.get_drawset(['log_lik'])
            draws, dims = ll.shape
            ll_exp = logsumexp(ll, axis=0) - np.log(draws)
            ll_var = np.var(ll, axis=0)
            lppd = np.sum(ll_exp)
            pwaic = np.sum(ll_var)
            _log.info('LPPD=%.2f, pWAIC=%.2f, WAIC=%.2f', lppd, pwaic,
                      -2 * (lppd - pwaic))
            g.array('ll_exp', ll_exp, compressor=comp)
            g.array('ll_var', ll_var, compressor=comp)
Example #15
def vcf2zarr(chrom, zarr_path, vcf_path):
    """Convert vcf to zarr.

    Parameters
    ----------
    chrom : str
        Chromosome to convert; used as the group name in the Zarr store.
    zarr_path : str
        Path of the output Zarr store.
    vcf_path : str
        Path of the input VCF file.

    Returns
    -------
    None.

    """
    if path.isdir(path.join(zarr_path, chrom)):
        pass
    else:
        allel.vcf_to_zarr(vcf_path, zarr_path, group=chrom,
                          fields='*', alt_number=2, log=sys.stdout,
                          compressor=numcodecs.Blosc(cname='zstd', clevel=1, shuffle=False))
    return None
Example #16
 def codecs(self, obj):
     codecs = []
     if obj.dtype == np.float64:
         codecs.append(nc.AsType('f4', 'f8'))
     codecs.append(nc.Blosc('zstd', 5))
     return codecs
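The codecs() method above only assembles a codec list; a minimal sketch of one common way such a list is applied when creating a zarr array (the trailing codec used as the compressor, the rest as filters) is shown below. The array contents, shape, and chunking are made up for illustration:

import numcodecs as nc
import numpy as np
import zarr

data = np.random.random((100, 100))             # float64 input
codec_list = [nc.AsType('f4', 'f8'), nc.Blosc('zstd', 5)]
z = zarr.array(data,
               chunks=(50, 50),
               filters=codec_list[:-1],          # AsType stores the data as float32
               compressor=codec_list[-1])        # Blosc/zstd compresses the chunks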
Example #17
ds["nav_lat"] = template["nav_lat"]
ds["nav_lon"] = template["nav_lon"]

del template

compressor = numcodecs.Blosc(cname='snappy', clevel=6, shuffle=-1)
encoding = {vname: {'compressor': compressor} for vname in ds.variables}

outdir = '/store/albert7a/eNATL60/zarr/eNATL60-BLB002-SSH-1h-new'
ds = ds.chunk(chunks=dict(time_counter=240, y=240, x=480))

print(str(datetime.datetime.now()))
ds.to_zarr(outdir, encoding=encoding, mode="w")
print(str(datetime.datetime.now()))
Example #18
async def getStorBytes(app,
                       key,
                       filter_ops=None,
                       offset=0,
                       length=None,
                       bucket=None):
    """ Get object identified by key and read as bytes
    """

    client = _getStorageClient(app)
    if not bucket:
        bucket = app['bucket_name']
    if key[0] == '/':
        key = key[1:]  # no leading slash
    log.info(f"getStorBytes({bucket}/{key})")

    shuffle = 0
    compressor = None
    if filter_ops:
        log.debug(f"getStorBytes for {key} with filter_ops: {filter_ops}")
        if "is_shuffle" in filter_ops and filter_ops['is_shuffle']:
            shuffle = filter_ops['item_size']
        if "compressor" in filter_ops:
            # TBD - enable blosc compressors
            compressor = filter_ops["compressor"]

    data = await client.get_object(bucket=bucket,
                                   key=key,
                                   offset=offset,
                                   length=length)
    if data is None or len(data) == 0:
        log.info(f"no data found for {key}")
        return data

    log.info(f"read: {len(data)} bytes for key: {key}")
    if compressor:

        # compressed chunk data...

        # first check if this was compressed with blosc
        blosc_metainfo = codecs.blosc.cbuffer_metainfo(
            data)  # returns typesize, isshuffle, and memcopied
        if blosc_metainfo[0] > 0:
            log.info(f"blosc compressed data for {key}")
            try:
                blosc = codecs.Blosc()
                udata = blosc.decode(data)
                log.info(f"uncompressed to {len(udata)} bytes")
                data = udata
            except Exception as e:
                log.error(
                    f"got exception: {e} using blosc decompression for {key}")
                raise HTTPInternalServerError()
        elif compressor == "zlib":
            # data may have been compressed without blosc, try using zlib directly
            log.info(f"using zlib to decompress {key}")
            try:
                udata = zlib.decompress(data)
                log.info(f"uncompressed to {len(udata)} bytes")
                data = udata
            except zlib.error as zlib_error:
                log.info(f"zlib_err: {zlib_error}")
                log.error(f"unable to uncompress obj: {key}")
                raise HTTPInternalServerError()
        else:
            log.error(
                f"don't know how to decompress data in {compressor} format for {key}"
            )
            raise HTTPInternalServerError()

    if shuffle > 0:
        log.debug(f"shuffle is {shuffle}")
        unshuffled = _unshuffle(shuffle, data)
        if unshuffled is not None:
            log.debug(f"unshuffled to {len(unshuffled)} bytes")
            data = unshuffled

    return data
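The _unshuffle helper used above (and in Example #26 below) is not shown in these snippets. A minimal numpy sketch of a byte unshuffle for a given item size could look like the following; it is a hypothetical stand-in, not necessarily HSDS's actual implementation:

import numpy as np

def _unshuffle(item_size, data):
    """Undo a byte shuffle: the shuffled buffer stores all first bytes of each
    element, then all second bytes, and so on (hypothetical helper)."""
    if item_size <= 1 or len(data) % item_size != 0:
        return None
    arr = np.frombuffer(data, dtype=np.uint8)
    arr = arr.reshape(item_size, len(data) // item_size)
    # transpose back to element-major order and flatten to bytes
    return arr.T.tobytes()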
Example #19
File: zarr_.py Project: jgrss/geowombat
from pathlib import Path

import zarr
import numcodecs


if hasattr(numcodecs, 'blosc'):

    numcodecs.blosc.use_threads = False

    compressor = numcodecs.Blosc(cname='zstd',
                                 clevel=2,
                                 shuffle=numcodecs.Blosc.BITSHUFFLE)


def to_zarr(filename, data, window, chunks, root=None):

    """
    Writes data to a zarr file

    Args:
        filename (str): The output file name.
        data (ndarray): The data to write.
        window (namedtuple): A ``rasterio.window.Window`` object.
        chunks (int or tuple): The ``zarr`` chunks.
        root (Optional[object]): The ``zarr`` root.

    Returns:
        ``str``
    """
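The example above is cut off after the docstring. The following is a minimal, hypothetical sketch of a body consistent with that docstring; the window-keyed group layout and attribute names are assumptions for illustration, not geowombat's actual implementation:

def to_zarr(filename, data, window, chunks, root=None):

    if root is None:
        root = zarr.open(filename, mode='a')

    # one group per raster window, keyed by its offsets (illustrative layout)
    group_name = '{:d}_{:d}'.format(window.row_off, window.col_off)
    group = root.require_group(group_name)

    group.create_dataset('data',
                         data=data,
                         chunks=chunks,
                         compressor=compressor,  # module-level Blosc codec above
                         overwrite=True)

    group.attrs['window'] = [window.row_off, window.col_off,
                             window.height, window.width]

    return group_name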
Example #20
def compress_zarr(ts, root, variants_only=False):

    provenance_dict = provenance.get_provenance_dict(
        {"variants_only": variants_only})

    if variants_only:
        logging.info("Using lossy variants-only compression")
        # Reduce to site topology. Note that we will remove
        # any sites, individuals and populations here that have no references.
        ts = ts.simplify(reduce_to_site_topology=True)

    tables = ts.tables

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # When using a zipfile in Zarr we get some harmless warnings. See
        # https://zarr.readthedocs.io/en/stable/api/storage.html#zarr.storage.ZipStore
        root.attrs["format_name"] = FORMAT_NAME
        root.attrs["format_version"] = FORMAT_VERSION
        root.attrs["sequence_length"] = tables.sequence_length
        root.attrs["provenance"] = provenance_dict

    columns = {}
    for key, value in tables.asdict().items():
        if isinstance(value, dict):
            for sub_key, sub_value in value.items():
                columns[f"{key}/{sub_key}"] = sub_value
        else:
            columns[key] = value

    if variants_only:
        time = np.unique(tables.nodes.time)
        columns["node/time"] = np.searchsorted(time, tables.nodes.time)

    # Encoding array is a tuple so must be converted
    columns["encoding_version"] = np.asarray(columns["encoding_version"])

    # Sequence length is stored as an attr for compatibility with older versions of tszip
    del columns["sequence_length"]

    # Schemas, metadata and units need to be converted to arrays
    for name in columns:
        if name.endswith("metadata_schema") or name in [
                "time_units",
                "reference_sequence/data",
                "reference_sequence/url",
        ]:
            columns[name] = np.frombuffer(columns[name].encode("utf-8"),
                                          np.int8)
        if name.endswith("metadata"):
            columns[name] = np.frombuffer(columns[name], np.int8)

    # Some columns benefit from being quantised
    coordinates = np.unique(
        np.hstack([
            [0, ts.sequence_length],
            tables.edges.left,
            tables.edges.right,
            tables.sites.position,
            tables.migrations.left,
            tables.migrations.right,
        ]))
    columns["coordinates"] = coordinates
    for name in [
            "edges/left",
            "edges/right",
            "migrations/left",
            "migrations/right",
            "sites/position",
    ]:
        columns[name] = np.searchsorted(coordinates, columns[name])

    # Some columns benefit from additional options
    delta_filter_cols = ["edges/parent", "sites/position"]

    # Note: we're not providing any options to set this here because Blosc+Zstd seems to
    # have a clear advantage in compression performance and speed. There is very little
    # difference between compression level 6 and 9, and it's extremely fast in any case
    # so there's no point in adding complexity. The shuffle filter in particular makes
    # a big difference.
    compressor = numcodecs.Blosc(cname="zstd",
                                 clevel=9,
                                 shuffle=numcodecs.Blosc.SHUFFLE)
    for name, data in columns.items():
        Column(name,
               data,
               delta_filter="_offset" in name
               or name in delta_filter_cols).compress(root, compressor)
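The Column helper used above is not included in this snippet. As a rough illustration, a compress method along these lines might delta-encode selected columns before handing them to the shared Blosc compressor; the class below is a hypothetical stand-in, not tszip's actual code:

import numcodecs
import numpy as np

class Column:
    """Illustrative stand-in for the Column helper used above."""

    def __init__(self, name, data, delta_filter=False):
        self.name = name
        self.data = np.asarray(data)
        self.delta_filter = delta_filter

    def compress(self, root, compressor):
        filters = None
        if self.delta_filter:
            # delta-encode the column so the compressor sees small, repetitive values
            filters = [numcodecs.Delta(dtype=self.data.dtype)]
        root.create_dataset(self.name,
                            data=self.data,
                            filters=filters,
                            compressor=compressor,
                            overwrite=True)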
Example #21
    def _ensure_datasets_exist(self, volume_config):
        dtype = volume_config["zarr"]["creation-settings"]["dtype"]
        create_if_necessary = volume_config["zarr"]["create-if-necessary"]
        writable = volume_config["zarr"]["writable"]
        if writable is None:
            writable = create_if_necessary

        mode = 'r'
        if writable:
            mode = 'a'
        self._filemode = mode

        block_shape = volume_config["zarr"]["creation-settings"]["chunk-shape"][::-1]

        global_offset = volume_config["zarr"]["global-offset"][::-1]
        bounding_box_zyx = np.array(volume_config["geometry"]["bounding-box"])[:,::-1]
        creation_shape = np.array(volume_config["zarr"]["creation-settings"]["shape"][::-1])
        replace_default_entries(creation_shape, bounding_box_zyx[1] - global_offset)

        compression = volume_config["zarr"]["creation-settings"]["compression"]
        if compression == 'gzip':
            compressor = numcodecs.GZip()
        elif compression.startswith('blosc-'):
            cname = compression[len('blosc-'):]
            compressor = numcodecs.Blosc(cname)
        else:
            assert compression == "", f"Unimplemented compression: {compression}"

        if create_if_necessary:
            max_scale = volume_config["zarr"]["creation-settings"]["max-scale"]
            if max_scale == -1:
                if -1 in creation_shape:
                    raise RuntimeError("Can't auto-determine the appropriate max-scale to create "
                                       "(or extend) the data with, because you didn't specify a "
                                       "volume creation shape (or bounding box).")
                max_scale = choose_pyramid_depth(creation_shape, 512)

            available_scales = [*range(1+max_scale)]
        else:
            available_scales = volume_config["geometry"]["available-scales"]

            if not os.path.exists(self._path):
                raise RuntimeError(f"File does not exist: {self._path}\n"
                                   "You did not specify 'create-if-necessary' in the config, so I won't create it.\n")

            if self._dataset_name and not os.path.exists(f"{self._path}/{self._dataset_name}"):
                raise RuntimeError(f"File does not exist: {self._path}/{self._dataset_name}\n"
                                   "You did not specify 'create-if-necessary' in the config, so I won't create it.\n")

        for scale in available_scales:
            if scale == 0:
                name = self._dataset_name
            else:
                name = self._dataset_name[:-1] + f'{scale}'

            if name not in self.zarr_file:
                if not writable:
                    raise RuntimeError(f"Dataset for scale {scale} does not exist, and you "
                                       "didn't specify 'writable' in the config, so I won't create it.")

                if dtype == "auto":
                    raise RuntimeError(f"Can't create Zarr array {self._path}/{self._dataset_name}: "
                                       "No dtype specified in the config.")

                # Use 128 if the user didn't specify a chunkshape
                replace_default_entries(block_shape, 3*[128])

                # zarr misbehaves if the chunks are larger than the shape,
                # which could happen here if we aren't careful (for higher scales).
                scaled_shape = (creation_shape // (2**scale))
                chunks = np.minimum(scaled_shape, block_shape).tolist()
                if (chunks != block_shape) and (scale == 0):
                    logger.warning(f"Block shape ({block_shape}) is too small for "
                                   f"the dataset shape ({creation_shape}). Shrinking block shape.")

                self._zarr_datasets[scale] = self.zarr_file.create_dataset( name,
                                                                            shape=scaled_shape.tolist(),
                                                                            dtype=np.dtype(dtype),
                                                                            chunks=chunks,
                                                                            compressor=compressor )
Example #22
def compress_zarr(ts, root, variants_only=False):

    provenance_dict = provenance.get_provenance_dict({"variants_only": variants_only})

    if variants_only:
        logging.info("Using lossy variants-only compression")
        # Reduce to site topology and quantise node times. Note that we will remove
        # any sites, individuals and populations here that have no references.
        ts = ts.simplify(reduce_to_site_topology=True)
        tables = ts.tables
        time = np.unique(tables.nodes.time)
        node_time = np.searchsorted(time, tables.nodes.time)
    else:
        tables = ts.tables
        node_time = tables.nodes.time

    coordinates = np.unique(np.hstack([
        [0, ts.sequence_length], tables.edges.left, tables.edges.right,
        tables.sites.position, tables.migrations.left, tables.migrations.right]))

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # When using a zipfile in Zarr we get some harmless warnings. See
        # https://zarr.readthedocs.io/en/stable/api/storage.html#zarr.storage.ZipStore
        root.attrs["format_name"] = FORMAT_NAME
        root.attrs["format_version"] = FORMAT_VERSION
        root.attrs["sequence_length"] = tables.sequence_length
        root.attrs["provenance"] = provenance_dict

    columns = [
        Column("coordinates", coordinates),
        Column("individuals/flags", tables.individuals.flags),
        Column("individuals/location", tables.individuals.location),
        Column(
            "individuals/location_offset", tables.individuals.location_offset,
            delta_filter=True),
        Column("individuals/metadata", tables.individuals.metadata),
        Column(
            "individuals/metadata_offset", tables.individuals.metadata_offset,
            delta_filter=True),

        Column("nodes/time", node_time),
        Column("nodes/flags", tables.nodes.flags),
        Column("nodes/population", tables.nodes.population),
        Column("nodes/individual", tables.nodes.individual),
        Column("nodes/metadata", tables.nodes.metadata),
        Column(
            "nodes/metadata_offset", tables.nodes.metadata_offset, delta_filter=True),

        # Delta filtering makes storage slightly worse for everything except parent.
        Column("edges/left", np.searchsorted(coordinates, tables.edges.left)),
        Column("edges/right", np.searchsorted(coordinates, tables.edges.right)),
        Column("edges/parent", tables.edges.parent, delta_filter=True),
        Column("edges/child", tables.edges.child),

        Column("migrations/left", np.searchsorted(coordinates, tables.migrations.left)),
        Column(
            "migrations/right", np.searchsorted(coordinates, tables.migrations.right)),
        Column("migrations/node", tables.migrations.node),
        Column("migrations/source", tables.migrations.source),
        Column("migrations/dest", tables.migrations.dest),
        Column("migrations/time", tables.migrations.time),

        Column(
            "sites/position", np.searchsorted(coordinates, tables.sites.position),
            delta_filter=True),
        Column("sites/ancestral_state", tables.sites.ancestral_state),
        Column("sites/ancestral_state_offset", tables.sites.ancestral_state_offset),
        Column("sites/metadata", tables.sites.metadata),
        Column("sites/metadata_offset", tables.sites.metadata_offset),

        Column("mutations/site", tables.mutations.site),
        Column("mutations/node", tables.mutations.node),
        Column("mutations/parent", tables.mutations.parent),
        Column("mutations/derived_state", tables.mutations.derived_state),
        Column("mutations/derived_state_offset", tables.mutations.derived_state_offset),
        Column("mutations/metadata", tables.mutations.metadata),
        Column("mutations/metadata_offset", tables.mutations.metadata_offset),

        Column("populations/metadata", tables.populations.metadata),
        Column("populations/metadata_offset", tables.populations.metadata_offset),

        Column("provenances/timestamp", tables.provenances.timestamp),
        Column("provenances/timestamp_offset", tables.provenances.timestamp_offset),
        Column("provenances/record", tables.provenances.record),
        Column("provenances/record_offset", tables.provenances.record_offset),
    ]

    # Note: we're not providing any options to set this here because Blosc+Zstd seems to
    # have a clear advantage in compression performance and speed. There is very little
    # difference between compression level 6 and 9, and it's extremely fast in any case
    # so there's no point in adding complexity. The shuffle filter in particular makes
    # a big difference.
    compressor = numcodecs.Blosc(cname='zstd', clevel=9, shuffle=numcodecs.Blosc.SHUFFLE)
    for column in columns:
        column.compress(root, compressor)
Example #23
# Create and fill a caterva array using a block iterator
t0 = time()
a = cat.empty(shape, chunkshape=chunkshape, blockshape=blockshape,
              dtype=content.dtype, filename=fname_cat,
              cname=cname, clevel=clevel, filters=[filter], nthreads=nthreads)
for block, info in a.iter_write():
    block[:] = content[info.slice]
acratio = a.cratio
if persistent:
    del a
t1 = time()
print("Time for filling array (caterva, iter): %.3fs ; CRatio: %.1fx" % ((t1 - t0), acratio))

# Create and fill a zarr array
t0 = time()
compressor = numcodecs.Blosc(cname=cname, clevel=clevel, shuffle=filter, blocksize=blocksize)
numcodecs.blosc.set_nthreads(nthreads)
if persistent:
    z = zarr.open(fname_zarr, mode='w', shape=shape, chunks=chunkshape, dtype=dtype, compressor=compressor)
else:
    z = zarr.empty(shape=shape, chunks=chunkshape, dtype=dtype, compressor=compressor)
z[:] = content
zratio = z.nbytes / z.nbytes_stored
if persistent:
    del z
t1 = time()
print("Time for filling array (zarr): %.3fs ; CRatio: %.1fx" % ((t1 - t0), zratio))

# Create and fill a hdf5 array
t0 = time()
filters = tables.Filters(complevel=clevel, complib="blosc:%s" % cname, shuffle=True)
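The example is cut off right after setting up the PyTables filters. A sketch of how the HDF5 array might be filled and timed under the same settings could look like the following; the file name fname_h5 and node name "carray" are assumptions, while content, chunkshape, filters, and t0 come from the snippet above:

import tables

h5f = tables.open_file(fname_h5, mode="w")   # fname_h5 is an assumed variable
h5ca = h5f.create_carray(h5f.root, "carray", obj=content,
                         chunkshape=chunkshape, filters=filters)
h5f.flush()
hratio = content.nbytes / h5ca.size_on_disk
h5f.close()
t1 = time()
print("Time for filling array (hdf5): %.3fs ; CRatio: %.1fx" % ((t1 - t0), hratio))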
Example #24
import numcodecs
import numpy as np
import zarr


def write_n5(path, shape, block_size, compressor):
  store = zarr.N5Store(path)
  data = np.arange(np.prod(shape), dtype=np.uint16)
  data = data.reshape(shape)
  data_transpose = data.transpose()
  z = zarr.zeros(
      data_transpose.shape,
      chunks=block_size[::-1],
      store=store,
      dtype=data.dtype,
      overwrite=True,
      compressor=compressor)
  z[...] = data_transpose


write_n5(path='raw', shape=[5, 4], block_size=[3, 2], compressor=None)
write_n5(
    path='gzip', shape=[5, 4], block_size=[3, 2], compressor=numcodecs.GZip())
write_n5(
    path='bzip2', shape=[5, 4], block_size=[3, 2], compressor=numcodecs.BZ2())
write_n5(
    path='xz',
    shape=[5, 4],
    block_size=[3, 2],
    compressor=numcodecs.LZMA(preset=4))
write_n5(
    path='blosc', shape=[5, 4], block_size=[3, 2], compressor=numcodecs.Blosc())
Example #25
# 'variants/MEND',
# 'variants/MLEN',
# 'variants/MSTART',
# 'variants/SVLEN',
# 'variants/SVTYPE',
# 'variants/TSD',
# 'variants/AC',
# 'variants/AF',
# 'variants/NS',
# 'variants/AN',
# 'variants/EAS_AF',
# 'variants/EUR_AF',
# 'variants/AFR_AF',
# 'variants/AMR_AF',
# 'variants/SAS_AF',
# 'variants/DP',
# 'variants/AA',
# 'variants/VT',
# 'variants/EX_TARGET',
# 'variants/MULTI_ALLELIC']

# test_fields += ['variants/numalt', 'variants/svlen', 'variants/is_snp']
test_fields += ['variants/numalt','variants/is_snp', 'variants/svlen']


# test_fields = ['variants/*']

ska.vcf_to_zarr(vcf_file, vcf_file.replace('.vcf.gz', '.zarr'),
                  fields=test_fields, alt_number=8, overwrite=True,
                  compressor=numcodecs.Blosc(cname='zstd', clevel=1, shuffle=False))
Example #26
async def getStorBytes(app, key, filter_ops=None, offset=0, length=-1, bucket=None, use_proxy=False):
    """ Get object identified by key and read as bytes
    """

    client = _getStorageClient(app)
    if not bucket:
        bucket = app['bucket_name']
    if key[0] == '/':
        key = key[1:]  # no leading slash
    if offset is None:
        offset = 0
    if length is None:
        length = 0
    log.info(f"getStorBytes({bucket}/{key}, offset={offset}, length: {length})")

    data_cache_page_size = int(config.get("data_cache_page_size"))

    shuffle = 0
    compressor = None
    if filter_ops:
        log.debug(f"getStorBytes for {key} with filter_ops: {filter_ops}")
        if "use_shuffle" in filter_ops and filter_ops['use_shuffle']:
            shuffle = filter_ops['item_size']
            log.debug("using shuffle filter")
        if "compressor" in filter_ops:
            compressor = filter_ops["compressor"]
            log.debug(f"using compressor: {compressor}")

    if offset > 0 and use_proxy and length < data_cache_page_size:
        # use rangeget proxy
        data = await rangegetProxy(app, bucket=bucket, key=key, offset=offset, length=length)
    else:
        data = await client.get_object(bucket=bucket, key=key, offset=offset, length=length)
    if data is None or len(data) == 0:
        log.info(f"no data found for {key}")
        return data

    log.info(f"read: {len(data)} bytes for key: {key}")
    if length > 0 and len(data) != length:
        log.warn(f"requested {length} bytes but got {len(data)} bytes")
    if compressor:

        # compressed chunk data...

        # first check if this was compressed with blosc
        blosc_metainfo = codecs.blosc.cbuffer_metainfo(data) # returns typesize, isshuffle, and memcopied 
        if blosc_metainfo[0] > 0:      
            log.info(f"blosc compressed data for {key}") 
            try:
                blosc = codecs.Blosc()
                udata = blosc.decode(data)
                log.info(f"uncompressed to {len(udata)} bytes")
                data = udata
                shuffle = 0 # blosc will unshuffle the bytes for us
            except Exception as e:
                log.error(f"got exception: {e} using blosc decompression for {key}")
                raise HTTPInternalServerError()
        elif compressor == "zlib":
            # data may have been compressed without blosc, try using zlib directly
            log.info(f"using zlib to decompress {key}")
            try:
                udata = zlib.decompress(data)
                log.info(f"uncompressed to {len(udata)} bytes")
                data = udata
            except zlib.error as zlib_error:
                log.info(f"zlib_err: {zlib_error}")
                log.error(f"unable to uncompress obj: {key}")
                raise HTTPInternalServerError()
        else:
            log.error(f"don't know how to decompress data in {compressor} format for {key}")
            raise HTTPInternalServerError()
    
    if shuffle > 0:
        log.debug(f"shuffle is {shuffle}")
        start_time = time.time()
        unshuffled = _unshuffle(shuffle, data)
        if unshuffled is not None:
            log.debug(f"unshuffled to {len(unshuffled)} bytes")
            data = unshuffled
        finish_time = time.time()
        log.debug(f"unshuffled {len(data)} bytes, {(finish_time - start_time):.2f} elapsed")
        

    return data
Example #27
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division
import sys

import numcodecs as codecs
from numcodecs import blosc
import numpy as np
from numpy.testing import assert_array_equal

codec = codecs.Blosc()
data = np.arange(int(sys.argv[1]))
for i in range(int(sys.argv[2])):
    enc = codec.encode(data)
    dec = codec.decode(enc)
    arr = np.frombuffer(dec, dtype=data.dtype)
    assert_array_equal(data, arr)
Example #28
def calc_obsStats(vcfpath, chrom, pops, coord_bed, zarrpath, outpath):
    """Calculate stats from a VCF file."""
    # if reuse_zarr is true
    if zarrpath.exists():
        zarrfile = zarrpath
    else:
        zarrfile = zarrpath
        allel.vcf_to_zarr(str(vcfpath),
                          str(zarrpath),
                          group=chrom,
                          fields='*',
                          alt_number=2,
                          log=sys.stdout,
                          compressor=numcodecs.Blosc(cname='zstd',
                                                     clevel=1,
                                                     shuffle=False))

    # load pop info
    panel = pd.read_csv(pops, sep='\t', usecols=['sampleID', 'population'])

    # load zarr
    callset = zarr.open_group(str(zarrfile), mode='r')
    samples = callset[f'{chrom}/samples'][:]
    samples_list = list(samples)
    samples_callset_index = [samples_list.index(s) for s in panel['sampleID']]
    panel['callset_index'] = samples_callset_index
    panel = panel.sort_values(by='callset_index')

    # load gt
    pos = allel.SortedIndex(callset[f'{chrom}/variants/POS'])
    gt = allel.GenotypeArray(callset[f'{chrom}/calldata/GT'])

    # separate gt for each population
    ix_s = 0
    pop_dt = {}
    pop_ix = []
    for i, p in enumerate(panel["population"].unique()):
        p_ix = panel[panel["population"] == p]["callset_index"].values
        ix_e = len(p_ix) * 2 + ix_s
        pop_ix.append(list(range(ix_s, ix_e)))
        pop_dt[p] = gt.take(p_ix, axis=1).to_haplotypes()
        ix_s = ix_e

    # combine and transpose
    haps = np.concatenate(list(pop_dt.values()), axis=1).T

    # prep progress bar
    ln_count = 0
    with open(coord_bed, 'r') as cb:
        for line in cb:
            if not line.startswith("chrom"):
                ln_count += 1

    progressbar = tqdm(total=ln_count, desc="window numb", unit='window')

    # update stats_dt
    stats_dt["num_haps"] = haps.shape[0]
    stats_dt["pop_config"] = pop_ix
    stats_dt["length_bp"] = int(
        line.split()[-1])  # may be shorter than expected due to last window
    stats_dt["reps"] = ln_count

    # write headers
    outfile = outpath.parent / f"{outpath.stem}.Obs.pop_stats.txt"
    pops_outfile = open(outfile, 'w')
    pops_outfile, header_, header_ls = headers(pops_outfile,
                                               stats_dt,
                                               pop_names=list(pop_dt.keys()),
                                               obs=True)

    # calc stats
    # TODO: parallel
    chrom_ls = []
    i = 0
    stat_mat = np.zeros([ln_count, len(header_ls) - 1])
    with open(coord_bed, 'r') as cb:
        for line in cb:
            if not line.startswith("chrom"):
                cb_lin = line.split()
                chrom = cb_lin[0]
                chrom_ls.append(chrom)
                start = int(cb_lin[1])
                stop = int(cb_lin[2])
                len_bp = stop - start
                stats_dt["length_bp"] = len_bp
                sites = int(cb_lin[3])
                try:
                    pos_ix = pos.locate_range(start, stop)
                except KeyError:
                    continue
                pos_t = pos[pos_ix] - start
                haps_t = haps[:, pos_ix]
                counts_t = haps_t.sum(axis=0).astype(int)
                # run stats
                stats_ls = [start, stop, sites]
                popsumstats = PopSumStats(pos_t, haps_t, counts_t, stats_dt)
                for stat in stats_dt["calc_stats"]:
                    stat_fx = getattr(popsumstats, stat)
                    try:
                        ss = stat_fx()
                        # print(f"{stat} =  {len(ss)}")
                    except IndexError:
                        ss = [np.nan] * len(stats_dt["pw_quants"])
                    stats_ls.extend(ss)
                try:
                    stat_mat[i, :] = stats_ls
                    i += 1
                    progressbar.update()
                except ValueError:
                    continue
    # write stats out
    stat_mean = np.round(np.nanmean(stat_mat, axis=0), 5)
    stats_str = "\t".join(map(str, stat_mean[3:]))
    pops_outfile.write(
        f"mean_{chrom}\t{int(stat_mat[0, 0])}\t{stop}\t{np.sum(stat_mat[:, 2])}\t{stats_str}\n"
    )
    for stat in range(stat_mat.shape[0]):
        chrom = chrom_ls[stat]
        start = int(stat_mat[stat, 0])
        stop = int(stat_mat[stat, 1])
        sites = int(stat_mat[stat, 2])
        rd = [round(num, 5) for num in stat_mat[stat, 3:]]
        stats_str = "\t".join(map(str, rd))
        pops_outfile.write(f"{chrom}\t{start}\t{stop}\t{sites}\t{stats_str}\n")
    progressbar.close()
    pops_outfile.close()

    return outfile
Example #29
def compress_zarr(ts, root):
    # TODO this current version is the most extreme option where we throw away
    # all the non-site information.

    # First reduce to site topology
    tables = ts.dump_tables()
    tables.simplify(reduce_to_site_topology=True)

    nodes = root.create_group("nodes")
    flags = nodes.empty("flags", shape=len(tables.nodes), dtype=np.uint8)
    flags[:] = tables.nodes.flags
    logger.debug(flags.info)

    # Get the indexes into the position array.
    pos_map = np.hstack([tables.sites.position, [tables.sequence_length]])
    pos_map[0] = 0
    left_mapped = np.searchsorted(pos_map, tables.edges.left)
    if np.any(pos_map[left_mapped] != tables.edges.left):
        raise ValueError("Invalid left coordinates")
    right_mapped = np.searchsorted(pos_map, tables.edges.right)
    if np.any(pos_map[right_mapped] != tables.edges.right):
        raise ValueError("Invalid right coordinates")

    filters = [numcodecs.Delta(dtype=np.int32, astype=np.int32)]
    compressor = numcodecs.Blosc(cname='zstd',
                                 clevel=9,
                                 shuffle=numcodecs.Blosc.SHUFFLE)
    edges = root.create_group("edges")
    parent = edges.empty("parent",
                         shape=len(tables.edges),
                         dtype=np.int32,
                         filters=filters,
                         compressor=compressor)
    child = edges.empty("child",
                        shape=len(tables.edges),
                        dtype=np.int32,
                        filters=filters,
                        compressor=compressor)
    left = edges.empty("left",
                       shape=len(tables.edges),
                       dtype=np.uint32,
                       filters=filters,
                       compressor=compressor)
    right = edges.empty("right",
                        shape=len(tables.edges),
                        dtype=np.uint32,
                        filters=filters,
                        compressor=compressor)
    parent[:] = tables.edges.parent
    child[:] = tables.edges.child
    left[:] = left_mapped
    right[:] = right_mapped

    mutations = root.create_group("mutations")
    site = mutations.empty("site",
                           shape=len(tables.mutations),
                           dtype=np.int32,
                           compressor=compressor)
    node = mutations.empty("node",
                           shape=len(tables.mutations),
                           dtype=np.int32,
                           compressor=compressor)
    site[:] = tables.mutations.site
    node[:] = tables.mutations.node
Example #30
import zarr
import numcodecs
from skimage.data import astronaut

# choose chunks s.t. we do have overhanging edge-chunks
CHUNKS = (100, 100, 1)
STR_TO_COMPRESSOR = {
    'gzip': numcodecs.GZip(),
    'blosc': numcodecs.Blosc(),
    'zlib': numcodecs.Zlib()
}


def generate_zarr_format(compressors=['gzip', 'blosc', 'zlib', None]):
    path = '../data/zarr.zr'
    im = astronaut()

    f = zarr.open(path)
    for compressor in compressors:
        name = compressor if compressor is not None else 'raw'
        compressor_impl = STR_TO_COMPRESSOR[
            compressor] if compressor is not None else None
        f.create_dataset(name,
                         data=im,
                         chunks=CHUNKS,
                         compressor=compressor_impl)


# this needs PR https://github.com/zarr-developers/zarr/pull/309
def generate_n5_format(compressors=['gzip', None]):
    path = '../data/zarr.n5'