Example #1
    def _write(refs, outpath, filetype=None):
        types = {
            "json": "json",
            "parquet": "parquet",
            "zarr": "zarr"
        }
        if filetype is None:
            ext = os.path.splitext(outpath)[1].lstrip(".")
            filetype = types[ext]
        elif filetype not in types:
            raise KeyError(f"Unknown filetype: {filetype}")
        if filetype == "json":
            with open(outpath, "w") as f:
                json.dump(refs, f)
            return
        import pandas as pd
        references2 = {
            k: {"data": v.encode('ascii') if not isinstance(v, list) else None,
                "url": v[0] if isinstance(v, list) else None,
                "offset": v[1] if isinstance(v, list) else None,
                "size": v[2] if isinstance(v, list) else None}
            for k, v in refs['refs'].items()}
        # use pandas for sorting
        df = pd.DataFrame(references2.values(), index=list(references2)).sort_values("offset")

        if filetype == "zarr":
            import zarr
            import numcodecs
            # compression should be NONE, if intent is to store in single zip
            g = zarr.open_group(outpath, mode='w')
            g.attrs.update({k: v for k, v in refs.items() if k in ['version', "templates", "gen"]})
            g.array(name="key", data=df.index.values, dtype="object", compression="zstd",
                    object_codec=numcodecs.VLenUTF8())
            g.array(name="offset", data=df.offset.values, dtype="uint32", compression="zstd")
            g.array(name="size", data=df['size'].values, dtype="uint32", compression="zstd")
            # the inline reference data was ascii-encoded to bytes above, so store the
            # "data" column (not "url" again) with a variable-length bytes codec
            g.array(name="data", data=df["data"].values, dtype="object",
                    object_codec=numcodecs.VLenBytes(), compression="gzip")
            # may be better as fixed length
            g.array(name="url", data=df.url.values, dtype="object",
                    object_codec=numcodecs.VLenUTF8(), compression='gzip')
        if filetype == "parquet":
            import fastparquet
            metadata = json.dumps(
                {k: v for k, v in refs.items() if k in ['version', "templates", "gen"]}
            )
            fastparquet.write(
                outpath,
                df,  # the sorted reference table built above
                custom_metadata={"metadata": metadata},  # custom_metadata expects a dict; the key name here is arbitrary
                compression="ZSTD"
            )
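
For context, a minimal, hypothetical sketch of the reference mapping this _write function appears to expect, inferred from how refs['refs'] is consumed above: each key maps either to inline data (a string) or to a [url, offset, size] triple. The names and values below are made up.

# Hypothetical input, inferred from the code above; not taken from the original source.
refs = {
    "version": 1,
    "refs": {
        ".zgroup": '{"zarr_format": 2}',                  # inline data stored as a string
        "data/0.0": ["s3://bucket/file.nc", 1024, 4096],  # [url, offset, size] triple
    },
}
# _write(refs, "references.parquet")  # filetype is inferred from the ".parquet" extension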
Example #2
def write_series(group, key, series, dataset_kwargs=MappingProxyType({})):
    if series.dtype == object:
        group.create_dataset(
            key,
            shape=series.shape,
            dtype=object,
            object_codec=numcodecs.VLenUTF8(),
            **dataset_kwargs,
        )
        group[key][:] = series.values
    elif is_categorical_dtype(series):
        # This should work for categorical Index and Series
        categorical: pd.Categorical = series.values
        categories: np.ndarray = categorical.categories.values
        codes: np.ndarray = categorical.codes
        category_key = f"__categories/{key}"

        write_array(group,
                    category_key,
                    categories,
                    dataset_kwargs=dataset_kwargs)
        write_array(group, key, codes, dataset_kwargs=dataset_kwargs)

        group[key].attrs["categories"] = category_key
        # Must coerce np.bool_ to bool for json writing
        group[category_key].attrs["ordered"] = bool(categorical.ordered)
    else:
        group[key] = series.values
Example #3
    def table(self, data, names=None, expectedlen=None, **kwargs):

        # setup
        names, columns = _util.check_table_like(data, names=names)
        kwargs = self._set_defaults(kwargs)
        chunks = kwargs.pop('chunks', None)
        g = zarr.group(**kwargs)

        # create columns
        for n, c in zip(names, columns):
            if chunks is None:
                chunks = default_chunks(c, expectedlen)
            if c.dtype == object:
                # peek at first value
                peek = c[0]
                if isinstance(peek, bytes):
                    object_codec = numcodecs.VLenBytes()
                elif isinstance(peek, str):
                    object_codec = numcodecs.VLenUTF8()
                else:
                    object_codec = numcodecs.MsgPack()
            else:
                object_codec = None
            g.array(name=n, data=c, chunks=chunks, object_codec=object_codec)

        # create table
        ztbl = ZarrTable(g, names=names)
        return ztbl
Example #4
    def array(self, data, expectedlen=None, **kwargs):

        # setup
        data = _util.ensure_array_like(data)
        kwargs = self._set_defaults(kwargs)

        # determine chunks
        kwargs.setdefault('chunks', default_chunks(data, expectedlen))

        # determine object codec
        if data.dtype == object:
            # peek at first value
            peek = data[0]
            if isinstance(peek, bytes):
                object_codec = numcodecs.VLenBytes()
            elif isinstance(peek, str):
                object_codec = numcodecs.VLenUTF8()
            else:
                object_codec = numcodecs.MsgPack()
            kwargs.setdefault('object_codec', object_codec)

        # create
        z = zarr.array(data, **kwargs)

        return z
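
Both of the preceding examples peek at the first element of an object array to pick a codec; the underlying requirement is that zarr needs an explicit object_codec for dtype=object data and will not guess how to serialize arbitrary Python objects. A minimal standalone sketch of that requirement (the array contents are illustrative):

import numpy as np
import numcodecs
import zarr

names = np.array(["alpha", "beta", "gamma"], dtype=object)
z = zarr.array(names, chunks=(2,), object_codec=numcodecs.VLenUTF8())
print(z[:])  # ['alpha' 'bbeta' 'gamma'-style object array, stored as variable-length UTF-8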
Example #5
def prepare_zarr_storage(variations, out_path):
    store = zarr.DirectoryStore(str(out_path))
    root = zarr.group(store=store, overwrite=True)
    metadata = variations.metadata
    sources = []
    targets = []

    samples_array = variations.samples
    #samples_array.compute_chunk_sizes()
    sources.append(samples_array)

    object_codec = None
    if samples_array.dtype == object:
        object_codec = numcodecs.VLenUTF8()

    dataset = zarr.create(shape=samples_array.shape, path='samples', store=store,
                          dtype=samples_array.dtype, object_codec=object_codec)
    targets.append(dataset)

    variants = root.create_group(ZARR_VARIANTS_GROUP_NAME, overwrite=True)
    calls = root.create_group(ZARR_CALL_GROUP_NAME, overwrite=True)
    for field, array in variations.items():
        definition = ALLELE_ZARR_DEFINITION_MAPPINGS[field]

        field_metadata = metadata.get(field, None)
        array = variations[field]
        if array is None:
            continue
        array.compute_chunk_sizes()
        sources.append(array)

        group_name = definition['group']
        group = calls if group_name == ZARR_CALL_GROUP_NAME else variants
        path = os.path.sep + os.path.join(group.path, definition['field'])

        object_codec = None
        if array.dtype == object:
            object_codec = numcodecs.VLenUTF8()
        dataset = zarr.create(shape=array.shape, path=path, store=store,
                              object_codec=object_codec, dtype=array.dtype)
        if field_metadata is not None:
            for key, value in field_metadata.items():
                dataset.attrs[key] = value

        targets.append(dataset)
    # create a single lock outside the loop so it is defined even for empty inputs
    lock = SerializableLock()
    return da.store(sources, targets, compute=False, lock=lock)
Example #6
def extract_splits():
    """Extracts splits to ${DATASETS_DIR}/ucf101/splits.zarr."""
    f = zarr.open(SPLITS_FINAL_DIR, 'w')
    for split in (1, 2, 3):
        g = f.create_group(str(split))
        for subset in ('train', 'test'):
            names = load_split(SPLITS_DIR, split, subset)
            g.create_dataset(subset,
                             data=names,
                             dtype=object,
                             object_codec=numcodecs.VLenUTF8())
    print(f'Splits saved to {SPLITS_FINAL_DIR}')
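
A quick read-back sketch for the layout written by extract_splits, assuming the same SPLITS_FINAL_DIR; the group and dataset names match the ones created above:

f = zarr.open(SPLITS_FINAL_DIR, mode='r')
train_names = f['1']['train'][:]  # object array of clip names for split 1, decoded by VLenUTF8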
Example #7
def write_array(g, key, value, dataset_kwargs={}):
    if value.dtype == object:
        g.create_dataset(
            key,
            shape=value.shape,
            dtype=object,
            object_codec=numcodecs.VLenUTF8(),
            **dataset_kwargs,
        )
        g[key][:] = value
    else:
        g.create_dataset(key, data=value, **dataset_kwargs)
Example #8
def write_vlen_string_array_zarr(f,
                                 k,
                                 elem,
                                 dataset_kwargs=MappingProxyType({})):
    import numcodecs

    f.create_dataset(
        k,
        shape=elem.shape,
        dtype=object,
        object_codec=numcodecs.VLenUTF8(),
        **dataset_kwargs,
    )
    f[k][:] = elem
Example #9
def write_array(g, key, value, dataset_kwargs=MappingProxyType({})):
    if value.dtype == object:
        g.create_dataset(
            key,
            shape=value.shape,
            dtype=object,
            object_codec=numcodecs.VLenUTF8(),
            **dataset_kwargs,
        )
        g[key][:] = value
    elif value.dtype.kind == "V":
        # Structured dtype
        g.create_dataset(key, data=_to_fixed_length_strings(value), **dataset_kwargs)
    else:
        g.create_dataset(key, data=value, **dataset_kwargs)
Example #10
def write_series(g, k, s, dataset_kwargs={}):
    if s.dtype == object:
        g.create_dataset(
            k,
            shape=s.shape,
            dtype=object,
            object_codec=numcodecs.VLenUTF8(),
            **dataset_kwargs,
        )
        g[k][:] = s.values
    elif is_categorical_dtype(s):
        g.create_dataset(k, shape=s.shape, dtype=s.cat.codes.dtype)
        g[k][:] = s.cat.codes
        g[k].attrs["categories"] = list(s.cat.categories)
    else:
        g[k] = s.values
Example #11
def write_series(group, key, series, dataset_kwargs=MappingProxyType({})):
    if series.dtype == object:
        group.create_dataset(
            key,
            shape=series.shape,
            dtype=object,
            object_codec=numcodecs.VLenUTF8(),
            **dataset_kwargs,
        )
        group[key][:] = series.values
    elif is_categorical_dtype(series):
        cats = series.cat.categories.values
        codes = series.cat.codes.values
        category_key = f"__categories/{key}"

        write_array(group, category_key, cats, dataset_kwargs)
        write_array(group, key, codes, dataset_kwargs)
        group[key].attrs["categories"] = category_key
    else:
        group[key] = series.values
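
Examples #2 and #7 through #11 all share the same two-step idiom for string columns: create the dataset with shape, dtype=object and a VLenUTF8 codec, then fill it with a slice assignment. A minimal self-contained sketch of that idiom (the group path and values are illustrative):

import numpy as np
import numcodecs
import zarr

g = zarr.open_group("example.zarr", mode="w")
values = np.array(["a", "bb", "ccc"], dtype=object)
g.create_dataset("labels", shape=values.shape, dtype=object,
                 object_codec=numcodecs.VLenUTF8())
g["labels"][:] = values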
Example #12
def concat_zarrs_optimized(
    zarr_files: Sequence[str],
    output: Union[PathType, MutableMapping[str, bytes]],
    vars_to_rechunk: List[Hashable],
    vars_to_copy: List[Hashable],
    fix_strings: bool = False,
) -> None:
    if isinstance(output, Path):
        output = str(output)

    zarr_groups = [zarr.open_group(f) for f in zarr_files]

    first_zarr_group = zarr_groups[0]

    # create the top-level group
    zarr.open_group(output, mode="w")

    # copy variables that are to be rechunked
    # NOTE: that this uses _to_zarr function defined here that is needed to avoid
    # race conditions between writing the array contents and its metadata
    # see https://github.com/pystatgen/sgkit/pull/486
    delayed = []  # do all the rechunking operations in one computation
    for var in vars_to_rechunk:
        dtype = None
        if fix_strings and var in {"variant_id", "variant_allele"}:
            max_len = _get_max_len(zarr_groups, f"max_length_{var}")
            dtype = f"S{max_len}"
        arr = concatenate_and_rechunk([group[var] for group in zarr_groups],
                                      dtype=dtype)

        _to_zarr_kwargs = dict(
            compressor=first_zarr_group[var].compressor,
            filters=first_zarr_group[var].filters,
            fill_value=None,
        )
        if not fix_strings and arr.dtype == "O":
            # We assume that all object dtypes are variable length strings
            var_len_str_codec = numcodecs.VLenUTF8()
            _to_zarr_kwargs["object_codec"] = var_len_str_codec
            # Remove from filters to avoid double encoding error
            if var_len_str_codec in first_zarr_group[var].filters:
                filters = list(first_zarr_group[var].filters)
                filters.remove(var_len_str_codec)
                _to_zarr_kwargs["filters"] = filters

        d = _to_zarr(  # type: ignore[no-untyped-call]
            arr,
            output,
            component=var,
            overwrite=True,
            compute=False,
            attrs=first_zarr_group[var].attrs.asdict(),
            **_to_zarr_kwargs,
        )
        delayed.append(d)
    da.compute(*delayed)

    # copy unchanged variables and top-level metadata
    with zarr.open_group(output) as output_zarr:

        # copy variables that are not rechunked (e.g. sample_id)
        for var in vars_to_copy:
            output_zarr[var] = first_zarr_group[var]
            output_zarr[var].attrs.update(first_zarr_group[var].attrs)

        # copy top-level attributes
        group_attrs = dict(first_zarr_group.attrs)
        if "max_alt_alleles_seen" in group_attrs:
            max_alt_alleles_seen = _get_max_len(zarr_groups,
                                                "max_alt_alleles_seen")
            group_attrs["max_alt_alleles_seen"] = max_alt_alleles_seen
        output_zarr.attrs.update(group_attrs)

    # consolidate metadata
    zarr.consolidate_metadata(output)
Example #13
 def writeFactorData(self, factor_data, table_name, ifactor_name, if_exists="update", data_type=None):
     if data_type is None: data_type = _identifyDataType(factor_data.dtypes)
     if data_type=='double':
         try:
             factor_data = factor_data.astype('float')
             data_type = 'double'
         except:
             factor_data = factor_data.where(pd.notnull(factor_data), None)
             data_type = 'string'
     else:
         factor_data = factor_data.where(pd.notnull(factor_data), None)
     DTs = factor_data.index
     if pd.__version__>="0.20.0": factor_data.index = [idt.to_pydatetime().timestamp() for idt in factor_data.index]
     else: factor_data.index = [idt.timestamp() for idt in factor_data.index]
     TablePath = self.MainDir+os.sep+table_name
     with self._DataLock:
         if ifactor_name not in self._TableFactorDict.get(table_name, {}):
             self._TableFactorDict[table_name] = self._TableFactorDict.get(table_name, pd.Series()).append(pd.Series(data_type, index=[ifactor_name]))
         ZTable = zarr.open(TablePath, mode="a")
         if ifactor_name not in ZTable:
             ZFactor = ZTable.create_group(ifactor_name, overwrite=True)
             ZFactor.create_dataset("ID", shape=(factor_data.shape[1], ), data=factor_data.columns.values, dtype=object, object_codec=numcodecs.VLenUTF8(), overwrite=True)
             ZFactor.create_dataset("DateTime", shape=(factor_data.shape[0], ), data=factor_data.index.values, dtype="f8", overwrite=True)
             if data_type=="double":
                 ZFactor.create_dataset("Data", shape=factor_data.shape, data=factor_data.values, dtype="f8", fill_value=np.nan, overwrite=True)
             elif data_type=="string":
                 ZFactor.create_dataset("Data", shape=factor_data.shape, data=factor_data.values, dtype=object, object_codec=numcodecs.VLenUTF8(), overwrite=True)
             ZFactor.attrs["DataType"] = data_type
             ZTable.attrs["DataType"] = self._TableFactorDict[table_name].to_dict()
             factor_data.index = DTs
             return 0
     if if_exists=="update":
         self._updateFactorData(factor_data, table_name, ifactor_name, data_type)
     elif if_exists=="append":
         OldData = self.getTable(table_name).readFactorData(ifactor_name=ifactor_name, ids=factor_data.columns.tolist(), dts=DTs.tolist())
         OldData.index = factor_data.index
         factor_data = OldData.where(pd.notnull(OldData), factor_data)
         self._updateFactorData(factor_data, table_name, ifactor_name, data_type)
     factor_data.index = DTs
     return 0
Example #14
 def writeFactorData(self,
                     factor_data,
                     table_name,
                     ifactor_name,
                     if_exists="update",
                     data_type=None):
     TablePath = self.MainDir + os.sep + table_name
     with self._DataLock:
         ZTable = zarr.open(TablePath, mode="a")
         if ifactor_name not in ZTable:
             factor_data, data_type = _identifyDataType(
                 factor_data, data_type)
             ZFactor = ZTable.create_group(ifactor_name, overwrite=True)
             ZFactor.create_dataset("ID",
                                    shape=(factor_data.shape[1], ),
                                    data=factor_data.columns.values,
                                    dtype=object,
                                    object_codec=numcodecs.VLenUTF8(),
                                    overwrite=True)
             ZFactor.create_dataset("DateTime",
                                    shape=(factor_data.shape[0], ),
                                    data=factor_data.index.values,
                                    dtype="M8[ns]",
                                    overwrite=True)
             if data_type == "double":
                 ZFactor.create_dataset("Data",
                                        shape=factor_data.shape,
                                        data=factor_data.values,
                                        dtype="f8",
                                        fill_value=np.nan,
                                        overwrite=True)
             elif data_type == "string":
                 ZFactor.create_dataset("Data",
                                        shape=factor_data.shape,
                                        data=factor_data.values,
                                        dtype=object,
                                        object_codec=numcodecs.VLenUTF8(),
                                        overwrite=True)
             elif data_type == "object":
                 ZFactor.create_dataset("Data",
                                        shape=factor_data.shape,
                                        data=factor_data.values,
                                        dtype=object,
                                        object_codec=numcodecs.Pickle(),
                                        overwrite=True)
             ZFactor.attrs["DataType"] = data_type
             DataType = ZTable.attrs.get("DataType", {})
             DataType[ifactor_name] = data_type
             ZTable.attrs["DataType"] = DataType
             return 0
     if if_exists == "update":
         self._updateFactorData(factor_data, table_name, ifactor_name,
                                data_type)
     elif if_exists == "append":
         OldData = self.getTable(table_name).readFactorData(
             ifactor_name=ifactor_name,
             ids=factor_data.columns.tolist(),
             dts=factor_data.index.tolist())
         OldData.index = factor_data.index
         factor_data = OldData.where(pd.notnull(OldData), factor_data)
         self._updateFactorData(factor_data, table_name, ifactor_name,
                                data_type)
     return 0
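
Finally, a hedged sketch of reading such a factor table back into pandas, assuming the ID/DateTime/Data layout created above; the table path and factor name are placeholders, not values from the original source:

import pandas as pd
import zarr

ztable = zarr.open("/path/to/MainDir/table_name", mode="r")
zfactor = ztable["ifactor_name"]
df = pd.DataFrame(zfactor["Data"][:],
                  index=zfactor["DateTime"][:],
                  columns=zfactor["ID"][:])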