Python ZstdFilter примеры, tiledb.ZstdFilter Python примеры использования

Пример #1

0

Показать файл

 def create_matrix_array(matrix_name, number_of_rows, number_of_columns,
                         encode_as_sparse_array):
     """Create an empty two-dimensional float32 TileDB array.

     The array has the dimensions "obs" (rows) and "var" (columns), one
     unnamed float32 attribute compressed with Zstd, row-major cell order
     and col-major tile order.

     :param matrix_name: URI at which the array is created.
     :param number_of_rows: number of cells along the "obs" dimension.
     :param number_of_columns: number of cells along the "var" dimension.
     :param encode_as_sparse_array: truthy to create a sparse array (tile
         extents capped at 512 x 2048), falsy for a dense array (50 x 100).
     """
     zstd_filters = tiledb.FilterList([tiledb.ZstdFilter()])
     value_attrs = [tiledb.Attr(dtype=np.float32, filters=zstd_filters)]

     # Only the tile-extent caps differ between the two encodings.
     obs_cap, var_cap = (512, 2048) if encode_as_sparse_array else (50, 100)

     obs_dim = tiledb.Dim(name="obs",
                          domain=(0, number_of_rows - 1),
                          tile=min(number_of_rows, obs_cap),
                          dtype=np.uint32)
     var_dim = tiledb.Dim(name="var",
                          domain=(0, number_of_columns - 1),
                          tile=min(number_of_columns, var_cap),
                          dtype=np.uint32)

     matrix_schema = tiledb.ArraySchema(domain=tiledb.Domain(obs_dim, var_dim),
                                        sparse=encode_as_sparse_array,
                                        attrs=value_attrs,
                                        cell_order="row-major",
                                        tile_order="col-major")

     array_class = tiledb.SparseArray if encode_as_sparse_array else tiledb.DenseArray
     array_class.create(matrix_name, matrix_schema)

Пример #2

0

Показать файл

Файл: sklearn.py Проект: TileDB-Inc/TileDB-ML

    def _create_array(self) -> None:
        """Create the single-cell dense TileDB array that holds a Sklearn model.

        The array has one int32 dimension "model" with domain (1, 1) and one
        variable-length bytes attribute "model_params" compressed with Zstd.
        When ``self.namespace`` is truthy the array's file properties are
        updated afterwards.
        """
        model_dim = tiledb.Dim(name="model",
                               domain=(1, 1),
                               tile=1,
                               dtype=np.int32,
                               ctx=self.ctx)
        model_domain = tiledb.Domain(model_dim)

        params_attr = tiledb.Attr(
            name="model_params",
            dtype=bytes,
            var=True,
            filters=tiledb.FilterList([tiledb.ZstdFilter()]),
            ctx=self.ctx,
        )

        model_schema = tiledb.ArraySchema(domain=model_domain,
                                          sparse=False,
                                          attrs=[params_attr],
                                          ctx=self.ctx)
        tiledb.Array.create(self.uri, model_schema, ctx=self.ctx)

        # On TileDB-Cloud (a namespace is set) the model array's file
        # properties have to be updated as well.
        if self.namespace:
            update_file_properties(self.uri, self._file_properties)

Пример #3

0

Показать файл

Файл: cxgtool.py Проект: saeedseyyedi/cellxgene

def create_X(X_name, shape, is_sparse):
    """
    Create the empty TileDB array for the X matrix.

    X is read both row-wise and column-wise depending on the operation, and
    for its data type the default compression works best.  The tile extents,
    (50, 100) for dense and (512, 2048) for sparse, and the global layout
    (row-major cells, col-major tiles) were chosen empirically by
    benchmarking the current cellxgene backend.

    :param X_name: URI at which the array is created.
    :param shape: sequence whose first two entries are the row and column counts.
    :param is_sparse: truthy to create a sparse array, falsy for a dense one.
    """
    zstd_filters = tiledb.FilterList([tiledb.ZstdFilter()])
    value_attrs = [tiledb.Attr(dtype=np.float32, filters=zstd_filters)]

    # The two encodings differ only in their tile-extent caps.
    obs_cap, var_cap = (512, 2048) if is_sparse else (50, 100)
    obs_dim = tiledb.Dim(name="obs", domain=(0, shape[0] - 1), tile=min(shape[0], obs_cap), dtype=np.uint32)
    var_dim = tiledb.Dim(name="var", domain=(0, shape[1] - 1), tile=min(shape[1], var_cap), dtype=np.uint32)

    x_schema = tiledb.ArraySchema(
        domain=tiledb.Domain(obs_dim, var_dim),
        sparse=is_sparse,
        attrs=value_attrs,
        cell_order="row-major",
        tile_order="col-major",
    )

    array_class = tiledb.SparseArray if is_sparse else tiledb.DenseArray
    array_class.create(X_name, x_schema)

Пример #4

0

Показать файл

    def _create_array(self):
        """
        Create the TileDB array that stores a Tensorflow model.

        The array is dense, with one int32 dimension "model" of domain (1, 1)
        and two variable-length "S1" attributes, "model_weights" and
        "optimizer_weights", each compressed with Zstd.

        :raises HTTPError: (code 400) when TileDB reports
            "Error while listing with prefix", which can happen when the
            default S3 credentials do not match the default S3 path.
        :raises tiledb.TileDBError: for every other TileDB failure, including
            "already exists" (a warning is logged first).
        """
        try:
            dom = tiledb.Domain(
                tiledb.Dim(name="model", domain=(1, 1), tile=1,
                           dtype=np.int32), )

            attrs = [
                tiledb.Attr(
                    name="model_weights",
                    dtype="S1",
                    var=True,
                    filters=tiledb.FilterList([tiledb.ZstdFilter()]),
                ),
                tiledb.Attr(
                    name="optimizer_weights",
                    dtype="S1",
                    var=True,
                    filters=tiledb.FilterList([tiledb.ZstdFilter()]),
                ),
            ]

            schema = tiledb.ArraySchema(
                domain=dom,
                sparse=False,
                attrs=attrs,
            )

            tiledb.Array.create(self.uri, schema)
        except tiledb.TileDBError as error:
            message = str(error)
            if "Error while listing with prefix" in message:
                # It is possible to land here if user sets wrong default s3 credentials
                # with respect to default s3 path
                raise HTTPError(
                    code=400,
                    msg=
                    f"Error creating file, {error} Are your S3 credentials valid?",
                ) from error

            if "already exists" in message:
                logging.warning(
                    "TileDB array already exists but update=False. "
                    "Next time set update=True. Returning")

            # Re-raise every TileDB failure that was not translated above.
            # Previously only the "already exists" case was re-raised, so any
            # other error was swallowed and the caller could not tell that
            # the array had not been created.
            raise

Пример #5

0

Показать файл

 def create_ndarray_array(ndarray_name, ndarray):
     """Create an empty dense TileDB array shaped like ``ndarray``.

     One uint32 dimension is created per axis of ``ndarray``, each with a
     tile extent capped at 1000.  The single unnamed attribute uses the
     dtype of ``ndarray`` and Zstd compression.

     :param ndarray_name: URI at which the array is created.
     :param ndarray: object exposing ``dtype``, ``shape`` and ``ndim``.
     """
     zstd_filters = tiledb.FilterList([tiledb.ZstdFilter()])
     value_attrs = [tiledb.Attr(dtype=ndarray.dtype, filters=zstd_filters)]

     axis_dims = []
     for axis in range(ndarray.ndim):
         axis_dims.append(
             tiledb.Dim(
                 domain=(0, ndarray.shape[axis] - 1),
                 tile=min(ndarray.shape[axis], 1000),
                 dtype=np.uint32,
             )
         )

     array_schema = tiledb.ArraySchema(
         domain=tiledb.Domain(*axis_dims),
         sparse=False,
         attrs=value_attrs,
         capacity=1_000_000,
         cell_order="row-major",
         tile_order="row-major",
     )
     tiledb.DenseArray.create(ndarray_name, array_schema)

Пример #6

0

Показать файл

Файл: cxgtool.py Проект: saeedseyyedi/cellxgene

def create_emb(e_name, emb):
    """
    Create the empty dense TileDB array for an embedding.

    Embeddings are usually read in very large slices (or entirely) and, due
    to their format, gain little from overly aggressive compression, so the
    array uses:
    * a large tile extent (capped at 1000 per dimension)
    * the default compression level

    :param e_name: URI at which the array is created.
    :param emb: object exposing ``dtype``, ``shape`` and ``ndim``.
    """
    zstd_filters = tiledb.FilterList([tiledb.ZstdFilter()])
    value_attrs = [tiledb.Attr(dtype=emb.dtype, filters=zstd_filters)]

    # One uint32 dimension per axis of the embedding.
    axis_dims = [
        tiledb.Dim(domain=(0, emb.shape[axis] - 1), tile=min(emb.shape[axis], 1000), dtype=np.uint32)
        for axis in range(emb.ndim)
    ]

    emb_schema = tiledb.ArraySchema(
        domain=tiledb.Domain(*axis_dims),
        sparse=False,
        attrs=value_attrs,
        capacity=1_000_000,
        cell_order="row-major",
        tile_order="row-major",
    )
    tiledb.DenseArray.create(e_name, emb_schema)

Пример #7

0

Показать файл

 def create_dataframe_array(array_name, dataframe):
     """Create an empty dense TileDB array with one attribute per dataframe column.

     The array has a single uint32 row dimension with a tile extent capped at
     1000.  Each column becomes an attribute named after the column, with the
     dtype reported by ``get_dtype_of_array`` for that column.

     :param array_name: URI at which the array is created.
     :param dataframe: dataframe whose columns define the attributes and
         whose ``shape[0]`` defines the number of rows.
     """
     # Attempt aggressive compression as many of these dataframes are very
     # repetitive strings, bools and other non-float data.
     column_filters = tiledb.FilterList([tiledb.ZstdFilter(level=22)])

     column_attrs = []
     for column in dataframe:
         column_dtype = get_dtype_of_array(dataframe[column])
         column_attrs.append(tiledb.Attr(name=column, dtype=column_dtype, filters=column_filters))

     row_dim = tiledb.Dim(
         domain=(0, dataframe.shape[0] - 1),
         tile=min(dataframe.shape[0], 1000),
         dtype=np.uint32,
     )
     frame_schema = tiledb.ArraySchema(
         domain=tiledb.Domain(row_dim),
         sparse=False,
         attrs=column_attrs,
         cell_order="row-major",
         tile_order="row-major",
     )
     tiledb.DenseArray.create(array_name, frame_schema)

Пример #8

0

Показать файл

def create_X(X_name, shape):
    """
    Create the empty, always dense, TileDB array for the X matrix.

    Future task: explore whether sparse encoding is worth the trouble below
    a sparsity threshold.

    The X matrix is accessed in both row- and column-oriented patterns,
    depending on the particular operation.  Because of the data type, default
    compression works best.  The tile size (50, 100) and the global layout
    (row-major cells, col-major tiles) were chosen empirically by
    benchmarking the current cellxgene backend.

    :param X_name: URI at which the array is created.
    :param shape: sequence whose first two entries are the row and column counts.
    """
    zstd_filters = tiledb.FilterList([tiledb.ZstdFilter()])
    value_attrs = [tiledb.Attr(dtype=np.float32, filters=zstd_filters)]

    obs_dim = tiledb.Dim(name="obs", domain=(0, shape[0] - 1), tile=min(shape[0], 50), dtype=np.uint32)
    var_dim = tiledb.Dim(name="var", domain=(0, shape[1] - 1), tile=min(shape[1], 100), dtype=np.uint32)

    x_schema = tiledb.ArraySchema(
        domain=tiledb.Domain(obs_dim, var_dim),
        sparse=False,
        attrs=value_attrs,
        cell_order="row-major",
        tile_order="col-major",
    )
    tiledb.DenseArray.create(X_name, x_schema)

Пример #9

0

Показать файл

Файл: cxgtool.py Проект: saeedseyyedi/cellxgene

def create_dataframe(name, df, ctx):
    """
    Create an empty dense TileDB array with one attribute per dataframe column.

    Current access patterns read very large slices of the dataframe one
    attribute at a time, and attribute data is often repetitive (bools,
    categories, strings).  Given this, the array uses:
    * a large tile extent (capped at 1000)
    * a very aggressive compression level

    :param name: URI at which the array is created.
    :param df: dataframe whose columns define the attributes; each attribute
        dtype comes from ``cxg_dtype`` applied to the column.
    :param ctx: accepted for interface compatibility; not used in this function.
    """
    # Attempt aggressive compression as many of these dataframes are very
    # repetitive strings, bools and other non-float data.
    column_filters = tiledb.FilterList([tiledb.ZstdFilter(level=22)])

    column_attrs = []
    for col in df:
        column_attrs.append(tiledb.Attr(name=col, dtype=cxg_dtype(df[col]), filters=column_filters))

    row_dim = tiledb.Dim(domain=(0, df.shape[0] - 1), tile=min(df.shape[0], 1000), dtype=np.uint32)
    frame_schema = tiledb.ArraySchema(
        domain=tiledb.Domain(row_dim),
        sparse=False,
        attrs=column_attrs,
        cell_order="row-major",
        tile_order="row-major",
    )
    tiledb.DenseArray.create(name, frame_schema)

Пример #10

0

Показать файл

Файл: pytorch.py Проект: TileDB-Inc/TileDB-ML

    def _create_array(self, serialized_model_info: Mapping[str, bytes]) -> None:
        """
        Create the single-cell dense TileDB array that holds a PyTorch model.

        The array always has a "model_state_dict" attribute.  An
        "optimizer_state_dict" attribute is added when ``self.optimizer`` is
        truthy, followed by one attribute per key of ``serialized_model_info``.
        When ``self.namespace`` is truthy the array's file properties are
        updated afterwards.

        :param serialized_model_info: A mapping with pickled information of a PyTorch model.
        """

        def zstd_bytes_attr(attr_name):
            # Every attribute is a variable-length bytes value with its own
            # Zstd filter list.
            return tiledb.Attr(
                name=attr_name,
                dtype=bytes,
                var=True,
                filters=tiledb.FilterList([tiledb.ZstdFilter()]),
                ctx=self.ctx,
            )

        model_dim = tiledb.Dim(name="model",
                               domain=(1, 1),
                               tile=1,
                               dtype=np.int32,
                               ctx=self.ctx)
        model_domain = tiledb.Domain(model_dim)

        # Keep model's state dictionary
        model_attrs = [zstd_bytes_attr("model_state_dict")]

        # If optimizer is provided we also keep optimizer's state dictionary
        if self.optimizer:
            model_attrs.append(zstd_bytes_attr("optimizer_state_dict"))

        # Add extra attributes in case model information is provided by the user
        if serialized_model_info:
            model_attrs.extend(zstd_bytes_attr(key) for key in serialized_model_info)

        model_schema = tiledb.ArraySchema(
            domain=model_domain,
            sparse=False,
            attrs=model_attrs,
            ctx=self.ctx,
        )
        tiledb.Array.create(self.uri, model_schema, ctx=self.ctx)

        # In case we are on TileDB-Cloud we have to update model array's file properties
        if self.namespace:
            update_file_properties(self.uri, self._file_properties)

Пример #11

0

Показать файл

Файл: cxgtool.py Проект: saeedseyyedi/cellxgene

def save_X(container, xdata, ctx, sparse_threshold, expect_sparse=False):
    """
    Write the X matrix ``xdata`` into the TileDB array ``{container}/X``.

    The encoding is chosen from ``sparse_threshold``: 100 forces sparse,
    0 forces dense, and any other value decides from the percentage of
    non-zero values reported by the evaluation helpers.  When a column shift
    is found, it is stored in a separate dense array
    ``{container}/X_col_shift`` and subtracted from every chunk before the
    non-zero values are written.

    The data is written in row chunks, then the array is consolidated and,
    if the installed ``tiledb`` module provides ``vacuum``, vacuumed.

    :param container: URI prefix under which the arrays are created.
    :param xdata: two-dimensional matrix supporting ``shape`` and row slicing;
        chunks that are not exactly ``np.ndarray`` are converted with
        ``.toarray()`` (presumably scipy sparse matrices - confirm with callers).
    :param ctx: TileDB context used when opening and consolidating the arrays.
    :param sparse_threshold: 100, 0, or a percentage of non-zero values below
        which the sparse encoding is chosen.
    :param expect_sparse: when True and the dense encoding was chosen, nothing
        is created or written.
    :return: False if ``expect_sparse`` is True but the matrix is dense;
        otherwise whether the sparse encoding was used.
    """
    # Save X count matrix
    X_name = f"{container}/X"

    shape = xdata.shape
    log(1, "\t...shape:", str(shape))

    col_shift = None
    if sparse_threshold == 100:
        is_sparse = True
    elif sparse_threshold == 0:
        is_sparse = False
    else:
        is_sparse, nnz, nelem = evaluate_for_sparse_encoding(xdata, sparse_threshold)
        percent = 100.0 * nnz / nelem
        # nelem differing from the full cell count is reported as an estimate
        # (presumably the helper sampled the matrix - confirm in the helper).
        if nelem != shape[0] * shape[1]:
            log(1, "\t...sparse=", is_sparse, "non-zeros percent (estimate): %6.2f" % percent)
        else:
            log(1, "\t...sparse=", is_sparse, "non-zeros:", nnz, "percent: %6.2f" % percent)

        # This overrides the is_sparse value returned by the helper above.
        is_sparse = percent < sparse_threshold
        if not is_sparse:
            # Plain sparse encoding was rejected; try the column-shift variant.
            col_shift, nnz, nelem = evaluate_for_sparse_column_shift_encoding(xdata, sparse_threshold)
            is_sparse = col_shift is not None
            percent = 100.0 * nnz / nelem
            if nelem != shape[0] * shape[1]:
                log(1, "\t...sparse=", is_sparse, "col shift non-zeros percent (estimate): %6.2f" % percent)
            else:
                log(1, "\t...sparse=", is_sparse, "col shift non-zeros:", nnz, "percent: %6.2f" % percent)

    # Bail out before creating anything when the caller insisted on sparse.
    if expect_sparse is True and is_sparse is False:
        return False

    create_X(X_name, shape, is_sparse)
    # Rows per write: 1e9 / number of columns, with its base-10 exponent
    # rounded so the result is a power of ten, capped at 10,000.
    stride = min(int(np.power(10, np.around(np.log10(1e9 / shape[1])))), 10_000)
    if is_sparse:
        if col_shift is not None:
            log(1, "\t...output X as sparse matrix with column shift encoding")
            # Store the per-column shift in its own one-dimensional dense array.
            X_col_shift_name = f"{container}/X_col_shift"
            filters = tiledb.FilterList([tiledb.ZstdFilter()])
            attrs = [tiledb.Attr(dtype=np.float32, filters=filters)]
            domain = tiledb.Domain(tiledb.Dim(domain=(0, shape[1] - 1), tile=min(shape[1], 5000), dtype=np.uint32))
            schema = tiledb.ArraySchema(domain=domain, attrs=attrs)
            # NOTE(review): create() is called without ctx, unlike the open below.
            tiledb.DenseArray.create(X_col_shift_name, schema)
            with tiledb.DenseArray(X_col_shift_name, mode="w", ctx=ctx) as X_col_shift:
                X_col_shift[:] = col_shift
            tiledb.consolidate(X_col_shift_name, ctx=ctx)
        else:
            log(1, "\t...output X as sparse matrix")

        with tiledb.SparseArray(X_name, mode="w", ctx=ctx) as X:
            nnz = 0
            for row in range(0, shape[0], stride):
                lim = min(row + stride, shape[0])
                a = xdata[row:lim, :]
                if type(a) is not np.ndarray:
                    a = a.toarray()
                if col_shift is not None:
                    a = a - col_shift
                # Only the non-zero cells of the chunk are written; the row
                # indices are offset back into whole-matrix coordinates.
                indices = np.nonzero(a)
                trow = indices[0] + row
                nnz += indices[0].shape[0]
                X[trow, indices[1]] = a[indices[0], indices[1]]
                # The last logged value is the fraction of non-zeros written so far.
                log(2, "\t...rows", lim, "of", shape[0], "nnz", nnz, "sparse", nnz / (lim * shape[1]))

    else:
        log(1, "\t...output X as dense matrix")
        with tiledb.DenseArray(X_name, mode="w", ctx=ctx) as X:
            for row in range(0, shape[0], stride):
                lim = min(row + stride, shape[0])
                a = xdata[row:lim, :]
                if type(a) is not np.ndarray:
                    a = a.toarray()
                X[row:lim, :] = a
                log(2, "\t...rows", row, "to", lim)

    tiledb.consolidate(X_name, ctx=ctx)
    # vacuum is called only when the installed tiledb module provides it.
    if hasattr(tiledb, "vacuum"):
        tiledb.vacuum(X_name)

    return is_sparse

Пример #12

0

Показать файл

    def _create_array(self, uri, retry=0):
        """
        Create a new array for storing a notebook file.

        The array is sparse and one-dimensional, with a uint64 "position"
        dimension and a uint8 "contents" attribute compressed with Zstd.  If
        TileDB reports that the array already exists, the file name is
        incremented and creation is attempted again.

        :param uri: location to create the array; the last two "/"-separated
            segments are used as namespace and array name
        :param retry: number of times to retry the request after other TileDB errors
        :return: ``(tiledb_uri, array_name)`` on success, or None when a
            TileDB error other than "already exists" occurs and no retries remain
        """
        try:
            # The array is 1 dimensional with a uint64 domain starting at 0.
            # We use a tile extent of 1024 bytes.
            position_dim = tiledb.Dim(
                name="position",
                domain=(0, numpy.iinfo(numpy.uint64).max - 1025),
                tile=1024,
                dtype=numpy.uint64,
                ctx=tiledb.cloud.Ctx(),
            )
            dom = tiledb.Domain(position_dim, ctx=tiledb.cloud.Ctx())

            contents_attr = tiledb.Attr(
                name="contents",
                dtype=numpy.uint8,
                filters=tiledb.FilterList([tiledb.ZstdFilter()]),
            )
            schema = tiledb.ArraySchema(
                domain=dom,
                sparse=True,
                attrs=[contents_attr],
                ctx=tiledb.cloud.Ctx(),
            )

            segments = uri.split("/")
            segment_count = len(segments)
            namespace = segments[segment_count - 2]
            array_name = segments[segment_count - 1]

            s3_prefix = get_s3_prefix(namespace)
            if s3_prefix is None:
                raise http_error(
                    400,
                    "You must set the default s3 prefix path for notebooks in {} profile settings"
                    .format(namespace),
                )

            # Create the (empty) array on disk, under the namespace's S3 prefix.
            storage_uri = "tiledb://{}/{}".format(namespace,
                                                  s3_prefix + array_name)
            tiledb.SparseArray.create(storage_uri, schema)

            tiledb_uri = "tiledb://{}/{}".format(namespace, array_name)
            # Short pause before tagging the new array (the reason is not
            # stated in this code).
            time.sleep(0.25)
            tiledb.cloud.array.update_info(uri=tiledb_uri,
                                           array_name=array_name,
                                           tags=[TAG_JUPYTER_NOTEBOOK])

            return tiledb_uri, array_name
        except tiledb.TileDBError as e:
            if "already exists" in str(e):
                # Name clash: bump the file name and try again with the same
                # retry budget.
                segments = uri.split("/")
                segments[-1] = self._increment_filename(segments[-1])
                return self._create_array("/".join(segments), retry)
            if retry:
                return self._create_array(uri, retry - 1)
        except HTTPError:
            raise
        except Exception as e:
            raise http_error(400, "Error creating file %s " % str(e))

        return None

Пример #13

0

Показать файл

    def _create_array(self) -> None:
        """Create the dense TileDB array that holds a Tensorflow model.

        A Functional or Sequential model gets a single cell (domain (1, 1))
        with "model_weights" and "optimizer_weights" attributes.  Any other
        model gets one cell per entry of ``self.model.layers`` with the
        attributes "weight_names", "weight_values", "layer_name" and
        "optimizer_weights".  Every attribute is variable-length and
        compressed with Zstd.  When ``self.namespace`` is truthy the array's
        file properties are updated afterwards.
        """
        assert self.model
        whole_model = isinstance(self.model, (Functional, Sequential))

        last_cell = 1 if whole_model else len(self.model.layers)
        model_dim = tiledb.Dim(
            name="model",
            domain=(1, last_cell),
            tile=1,
            dtype=np.int32,
            ctx=self.ctx,
        )
        dom = tiledb.Domain(model_dim)

        if whole_model:
            attr_specs = [
                ("model_weights", bytes),
                ("optimizer_weights", bytes),
            ]
        else:
            attr_specs = [
                # String names of weights of each layer of the model
                ("weight_names", bytes),
                # The values of weights of each layer of the model
                ("weight_values", bytes),
                # Layer names TF format of the saved/loaded model
                ("layer_name", str),
                # The weight values of the optimizer in case the model is saved compiled
                ("optimizer_weights", bytes),
            ]

        # Each attribute gets its own Zstd filter list.
        attrs = [
            tiledb.Attr(
                name=attr_name,
                dtype=attr_dtype,
                var=True,
                filters=tiledb.FilterList([tiledb.ZstdFilter()]),
                ctx=self.ctx,
            )
            for attr_name, attr_dtype in attr_specs
        ]

        schema = tiledb.ArraySchema(
            domain=dom,
            sparse=False,
            attrs=attrs,
            ctx=self.ctx,
        )
        tiledb.Array.create(self.uri, schema, ctx=self.ctx)

        # In case we are on TileDB-Cloud we have to update model array's file properties
        if self.namespace:
            update_file_properties(self.uri, self._file_properties)

Python ZstdFilter примеры использования