示例#1
0
 def create_matrix_array(matrix_name, number_of_rows, number_of_columns,
                         encode_as_sparse_array):
     """Create an empty 2-D float32 TileDB array at ``matrix_name``.

     The array has an "obs" dimension with ``number_of_rows`` cells and a "var"
     dimension with ``number_of_columns`` cells, both starting at 0 with uint32
     coordinates, plus one unnamed float32 attribute filtered with Zstd.  Cells
     are laid out row-major and tiles column-major.

     :param matrix_name: URI/path at which the array is created.
     :param number_of_rows: number of cells along "obs".
     :param number_of_columns: number of cells along "var".
     :param encode_as_sparse_array: truthy for a sparse array (tile extents
         capped at 512 x 2048), falsy for a dense one (capped at 50 x 100).
     """
     value_attr = tiledb.Attr(
         dtype=np.float32,
         filters=tiledb.FilterList([tiledb.ZstdFilter()]),
     )

     # Only the tile-extent caps differ between the two encodings.
     if encode_as_sparse_array:
         obs_cap, var_cap = 512, 2048
     else:
         obs_cap, var_cap = 50, 100

     obs_dim = tiledb.Dim(name="obs",
                          domain=(0, number_of_rows - 1),
                          tile=min(number_of_rows, obs_cap),
                          dtype=np.uint32)
     var_dim = tiledb.Dim(name="var",
                          domain=(0, number_of_columns - 1),
                          tile=min(number_of_columns, var_cap),
                          dtype=np.uint32)

     schema = tiledb.ArraySchema(domain=tiledb.Domain(obs_dim, var_dim),
                                 sparse=encode_as_sparse_array,
                                 attrs=[value_attr],
                                 cell_order="row-major",
                                 tile_order="col-major")

     array_class = (tiledb.SparseArray
                    if encode_as_sparse_array else tiledb.DenseArray)
     array_class.create(matrix_name, schema)
示例#2
0
    def _create_array(self) -> None:
        """Create the dense, single-cell TileDB array that stores a Sklearn model.

        The array has one int32 dimension "model" whose domain is the single
        coordinate 1, and one variable-length bytes attribute "model_params"
        filtered with Zstd.  When ``self.namespace`` is truthy the array's file
        properties are updated after creation.
        """
        model_dim = tiledb.Dim(name="model",
                               domain=(1, 1),
                               tile=1,
                               dtype=np.int32,
                               ctx=self.ctx)
        params_attr = tiledb.Attr(
            name="model_params",
            dtype=bytes,
            var=True,
            filters=tiledb.FilterList([tiledb.ZstdFilter()]),
            ctx=self.ctx,
        )
        schema = tiledb.ArraySchema(domain=tiledb.Domain(model_dim),
                                    sparse=False,
                                    attrs=[params_attr],
                                    ctx=self.ctx)

        tiledb.Array.create(self.uri, schema, ctx=self.ctx)

        # A namespace means we are on TileDB-Cloud, where the model array's
        # file properties have to be updated as well.
        if self.namespace:
            update_file_properties(self.uri, self._file_properties)
示例#3
0
def create_X(X_name, shape, is_sparse):
    """
    Create the empty X matrix array, sparse or dense.

    X is read in both row- and column-oriented patterns depending on the
    operation, and default Zstd compression works best for its data type.
    The tile extents -- (50, 100) for dense, (512, 2048) for sparse -- and the
    row-major cell / col-major tile layout were chosen empirically by
    benchmarking the current cellxgene backend.

    :param X_name: URI/path at which the array is created.
    :param shape: indexable whose first two items are the "obs" and "var" sizes.
    :param is_sparse: truthy to create a sparse array, falsy for a dense one.
    """
    value_attr = tiledb.Attr(dtype=np.float32, filters=tiledb.FilterList([tiledb.ZstdFilter()]))

    extent_caps = (512, 2048) if is_sparse else (50, 100)
    dims = [
        tiledb.Dim(name=label, domain=(0, length - 1), tile=min(length, cap), dtype=np.uint32)
        for label, length, cap in zip(("obs", "var"), (shape[0], shape[1]), extent_caps)
    ]

    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(*dims),
        sparse=is_sparse,
        attrs=[value_attr],
        cell_order="row-major",
        tile_order="col-major",
    )

    array_class = tiledb.SparseArray if is_sparse else tiledb.DenseArray
    array_class.create(X_name, schema)
示例#4
0
def create_new_array(size,
                     array_out_name,
                     tile_size,
                     attribute_config,
                     compressor='gzip',
                     compression_level=-1):
    '''
    Create an empty 1-D dense tileDB array on disk.

    The single uint32 dimension "genome_coordinate" spans [0, size - 1] with a
    tile extent of min(size, tile_size).  One Gzip-filtered attribute is
    created per key returned by get_attribute_info(attribute_config), using
    that entry's 'dtype'.

    compressor and compression_level are accepted but not used by this
    function.
    '''
    coordinate_dim = tiledb.Dim(name='genome_coordinate',
                                domain=(0, size - 1),
                                tile=min(size, tile_size),
                                dtype='uint32')
    domain = tiledb.Domain(coordinate_dim, ctx=tdb_Context)

    # One attribute per configured key, each with its own Gzip filter list.
    attribute_info = get_attribute_info(attribute_config)
    attributes = tuple(
        tiledb.Attr(name=key,
                    filters=tiledb.FilterList([tiledb.GzipFilter()]),
                    dtype=attribute_info[key]['dtype'])
        for key in attribute_info)

    schema = tiledb.ArraySchema(domain=domain,
                                attrs=attributes,
                                cell_order='row-major',
                                tile_order='row-major')

    tiledb.DenseArray.create(array_out_name, schema, ctx=tdb_Context)
    print("created empty array on disk")
    gc.collect()
示例#5
0
    def _parse_single_attr(self, filter_and_options, param, ctx):
        filter_name_to_function = {
            "GzipFilter": tiledb.GzipFilter,
            "ZstdFilter": tiledb.ZstdFilter,
            "LZ4Filter": tiledb.LZ4Filter,
            "Bzip2Filter": tiledb.Bzip2Filter,
            "RleFilter": tiledb.RleFilter,
            "DoubleDeltaFilter": tiledb.DoubleDeltaFilter,
            "BitShuffleFilter": tiledb.BitShuffleFilter,
            "ByteShuffleFilter": tiledb.ByteShuffleFilter,
            "BitWidthReductionFilter": tiledb.BitWidthReductionFilter,
            "PositiveDeltaFilter": tiledb.PositiveDeltaFilter,
        }

        provided_filter_and_options = dict()
        for value in filter_and_options.split(","):
            filter_and_option = value.split("=")

            filter = filter_and_option[0]
            if len(filter_and_option) == 1:
                provided_filter_and_options[filter] = None
            elif len(filter_and_option) == 2:
                try:
                    provided_filter_and_options[filter] = int(
                        filter_and_option[1])
                except ValueError:
                    self.fail(
                        f"{filter_and_option[1]} is not a valid integer "
                        f"for {provided_filter_and_options[filter]}",
                        param,
                        ctx,
                    )
            else:
                self.fail(
                    "Too many arguments provided for "
                    f"{provided_filter_and_options[filter]}",
                    param,
                    ctx,
                )

        bad_filters = set(provided_filter_and_options.keys()) - set(
            filter_name_to_function.keys())
        if bad_filters:
            self.fail(
                f"Saw the following bad <filter names>: {bad_filters}",
                param,
                ctx,
            )

        filter_list = []
        for filter_name in provided_filter_and_options:
            filter_function = filter_name_to_function[filter_name]
            filter_option = provided_filter_and_options[filter_name]
            if filter_option is None:
                filter_list.append(filter_function())
            else:
                filter_list.append(filter_function(filter_option))

        return tiledb.FilterList(filter_list)
示例#6
0
def test_tiledb_test():
    """Check that a sparse-array slice reads the same before and after a full read.

    Writes 1000 random uint8 values into a 1000 x 1000 sparse array stored in a
    temporary directory, then reads the slice ``[1:10, 1:50]`` twice -- once
    before and once after reading the whole array -- and asserts both reads
    return the same number of "v" values.
    """
    import tiledb

    n = 1000
    m = 1000
    num_vals = 1000

    n_idxs = np.sort(np.random.choice(n, num_vals, replace=False))
    m_idxs = np.sort(np.random.choice(m, num_vals, replace=False))
    values = np.random.randint(0, 100, num_vals, np.uint8)

    ctx = tiledb.Ctx()

    n_tile_extent = min(100, n)

    d1 = tiledb.Dim("ndom",
                    domain=(0, n - 1),
                    tile=n_tile_extent,
                    dtype="uint32",
                    ctx=ctx)
    d2 = tiledb.Dim("mdom", domain=(0, m - 1), tile=m, dtype="uint32", ctx=ctx)

    domain = tiledb.Domain(d1, d2, ctx=ctx)

    v = tiledb.Attr(
        "v",
        filters=tiledb.FilterList([tiledb.LZ4Filter(level=-1)]),
        dtype="uint8",
        ctx=ctx,
    )

    schema = tiledb.ArraySchema(
        domain=domain,
        attrs=(v, ),
        capacity=10000,
        cell_order="row-major",
        tile_order="row-major",
        sparse=True,
        ctx=ctx,
    )

    with tempfile.TemporaryDirectory() as tdir:

        path = os.path.join(tdir, "arr.tiledb")

        tiledb.SparseArray.create(path, schema)

        with tiledb.SparseArray(path, mode="w", ctx=ctx) as A:
            A[n_idxs, m_idxs] = values

        # Read through a fresh context, and close the handle before the
        # temporary directory is removed.
        ctx2 = tiledb.Ctx()
        with tiledb.SparseArray(path, mode="r", ctx=ctx2) as s:
            vs1 = s[1:10, 1:50]

            # Full read in between; the result itself is not needed.
            _ = s[:, :]
            vs2 = s[1:10, 1:50]

        assert vs1["v"].shape[0] == vs2["v"].shape[0]
示例#7
0
    def _create_array(self):
        """
        Creates a TileDB array for a Tensorflow model

        The array is dense with a single int32 dimension "model" (domain 1..1)
        and two variable-length "S1" attributes, "model_weights" and
        "optimizer_weights", both filtered with Zstd.

        :raises HTTPError: when the TileDB error message contains
            "Error while listing with prefix".
        :raises tiledb.TileDBError: for every other TileDB failure, including
            "already exists" (a warning is logged first in that case).
        """
        try:
            dom = tiledb.Domain(
                tiledb.Dim(name="model", domain=(1, 1), tile=1,
                           dtype=np.int32), )

            attrs = [
                tiledb.Attr(
                    name="model_weights",
                    dtype="S1",
                    var=True,
                    filters=tiledb.FilterList([tiledb.ZstdFilter()]),
                ),
                tiledb.Attr(
                    name="optimizer_weights",
                    dtype="S1",
                    var=True,
                    filters=tiledb.FilterList([tiledb.ZstdFilter()]),
                ),
            ]

            schema = tiledb.ArraySchema(
                domain=dom,
                sparse=False,
                attrs=attrs,
            )

            tiledb.Array.create(self.uri, schema)
        except tiledb.TileDBError as error:
            message = str(error)
            if "Error while listing with prefix" in message:
                # It is possible to land here if user sets wrong default s3 credentials
                # with respect to default s3 path
                raise HTTPError(
                    code=400,
                    msg=
                    f"Error creating file, {error} Are your S3 credentials valid?",
                ) from error

            if "already exists" in message:
                logging.warning(
                    "TileDB array already exists but update=False. "
                    "Next time set update=True. Returning")

            # Re-raise every TileDB failure: previously only the
            # "already exists" case propagated and any other error was
            # silently dropped, leaving the caller without an array.
            raise
示例#8
0
    def create_datahealtharray(self, uri):
        """Create the sparse daily-metrics array when ``uri`` ends with "DAILY_METRICS".

        Nothing happens for any other ``uri``.  The array has one nanosecond
        datetime dimension "date" and three float64 attributes ("midclose",
        "logret", "logret_ema"); attributes, coordinates and offsets all use a
        Gzip filter (level -1) with a chunk size of 512000.
        """
        if not uri.endswith("DAILY_METRICS"):
            return

        def gzip_filters():
            # A fresh filter list per use, as every consumer gets its own.
            return tiledb.FilterList([tiledb.GzipFilter(level=-1)],
                                     chunksize=512000)

        # NOTE(review): the tile extent is 365 *nanoseconds* as written --
        # confirm whether 365 days was intended.
        date_dim = tiledb.Dim(name='date',
                              domain=(np.datetime64('1900-01-01'),
                                      np.datetime64('2262-01-01')),
                              tile=np.timedelta64(365, 'ns'),
                              dtype=np.datetime64('', 'ns').dtype)

        schema = tiledb.ArraySchema(
            domain=tiledb.Domain(date_dim),
            attrs=[
                tiledb.Attr(name=metric,
                            dtype='float64',
                            filters=gzip_filters())
                for metric in ('midclose', 'logret', 'logret_ema')
            ],
            cell_order='row-major',
            tile_order='row-major',
            capacity=10000,
            sparse=True,
            allows_duplicates=False,
            coords_filters=gzip_filters(),
            offsets_filters=gzip_filters())

        tiledb.SparseArray.create(uri, schema)
示例#9
0
    def store_df(self, datatype, name, df, sparse=True, data_df=True):
        """Write ``df`` to the TileDB array identified by ``datatype`` and ``name``.

        If the array does not exist yet and ``data_df`` is truthy, its schema
        is created first (``create_dataarray`` for ``self._RAW_DATA``,
        ``create_datahealtharray`` for ``self._HEALTH_DATA``) and the frame is
        appended.  If the array already exists the frame is appended.
        Otherwise the frame is ingested, letting ``tiledb.from_pandas`` create
        the array.

        :param datatype: data category used to resolve the URI and pick the schema.
        :param name: array name, passed to ``self.get_uri``.
        :param df: pandas DataFrame to store.
        :param sparse: forwarded to ``tiledb.from_pandas``.
        :param data_df: when truthy, pre-create the array with a fixed schema.
        """
        uri = self.get_uri(datatype, name)
        array_existed = tiledb.highlevel.array_exists(uri)

        if not array_existed and data_df:
            if datatype == self._RAW_DATA:
                self.create_dataarray(uri)
            elif datatype == self._HEALTH_DATA:
                self.create_datahealtharray(uri)

            # NOTE(review): this is set even when neither branch above created
            # an array -- confirm that other datatypes never reach this point.
            array_existed = True

        tiledb.from_pandas(
            uri,
            df,
            sparse=sparse,
            mode='append' if array_existed else 'ingest',
            # TileDB layout names are hyphenated ('row-major'); the previous
            # 'row_major' spelling is not a recognised layout and only went
            # unnoticed because append mode does not build a schema.
            tile_order='row-major',
            cell_order='row-major',
            attrs_filters=tiledb.FilterList([tiledb.GzipFilter(level=-1)],
                                            chunksize=512000),
            coords_filters=tiledb.FilterList([tiledb.GzipFilter(level=-1)],
                                             chunksize=512000))
示例#10
0
 def create_ndarray_array(ndarray_name, ndarray):
     """Create an empty dense TileDB array shaped and typed like ``ndarray``.

     One unnamed uint32 dimension is created per axis, spanning
     [0, length - 1] with a tile extent of at most 1000.  The single unnamed
     attribute uses ``ndarray.dtype`` and a Zstd filter.

     :param ndarray_name: URI/path at which the array is created.
     :param ndarray: array-like providing ``ndim``, ``shape`` and ``dtype``.
     """
     value_attr = tiledb.Attr(dtype=ndarray.dtype, filters=tiledb.FilterList([tiledb.ZstdFilter()]))

     axes = []
     for axis in range(ndarray.ndim):
         axis_length = ndarray.shape[axis]
         axes.append(
             tiledb.Dim(domain=(0, axis_length - 1), tile=min(axis_length, 1000), dtype=np.uint32)
         )

     schema = tiledb.ArraySchema(
         domain=tiledb.Domain(*axes),
         sparse=False,
         attrs=[value_attr],
         capacity=1_000_000,
         cell_order="row-major",
         tile_order="row-major",
     )
     tiledb.DenseArray.create(ndarray_name, schema)
示例#11
0
def create_new_array(tdb_Context,
                     size,
                     array_out_name,
                     coord_tile_size,
                     task_tile_size,
                     attribute_config,
                     attribute_config_file,
                     compressor='gzip',
                     compression_level=-1,
                     var=False):
    '''
    Create an empty 2-D dense tileDB array on disk.

    size = tuple(num_indices, num_tasks).  The "genome_coordinate" dimension
    spans [0, size[0]] and the "task" dimension spans [0, size[1]] (both upper
    bounds inclusive, as written).  Tile extents are clamped to the sizes, and
    the task tile extent is never smaller than 1.

    One Gzip-filtered attribute is created per key returned by
    get_attribute_info(attribute_config, attribute_config_file); var is
    forwarded to every attribute.  compressor and compression_level are
    accepted but not used by this function.
    '''
    num_indices, num_tasks = size[0], size[1]
    coord_extent = min(num_indices, coord_tile_size)
    task_extent = max(1, min(num_tasks, task_tile_size))

    coordinate_dim = tiledb.Dim(name='genome_coordinate',
                                domain=(0, num_indices),
                                tile=coord_extent,
                                dtype='uint32')
    task_dim = tiledb.Dim(name='task',
                          domain=(0, num_tasks),
                          tile=task_extent,
                          dtype='uint32')
    domain = tiledb.Domain(coordinate_dim, task_dim, ctx=tdb_Context)

    # One attribute per configured key, each with its own Gzip filter list.
    attribute_info = get_attribute_info(attribute_config, attribute_config_file)
    attributes = tuple(
        tiledb.Attr(name=key,
                    var=var,
                    filters=tiledb.FilterList([tiledb.GzipFilter()]),
                    dtype=attribute_info[key]['dtype'])
        for key in attribute_info)

    schema = tiledb.ArraySchema(domain=domain,
                                attrs=attributes,
                                cell_order='row-major',
                                tile_order='row-major')

    tiledb.DenseArray.create(array_out_name, schema)
    print("created empty array on disk")
示例#12
0
def create_emb(e_name, emb):
    """
    Create an empty dense array shaped and typed like the embedding ``emb``.

    Embeddings are usually read in very large slices (often whole), and their
    format gains little from aggressive compression, so the array uses:
    * a large tile extent (up to 1000 per axis)
    * Zstd at its default level

    :param e_name: URI/path at which the array is created.
    :param emb: array-like providing ``ndim``, ``shape`` and ``dtype``.
    """
    value_attr = tiledb.Attr(dtype=emb.dtype, filters=tiledb.FilterList([tiledb.ZstdFilter()]))
    axes = [
        tiledb.Dim(domain=(0, emb.shape[axis] - 1), tile=min(emb.shape[axis], 1000), dtype=np.uint32)
        for axis in range(emb.ndim)
    ]
    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(*axes),
        sparse=False,
        attrs=[value_attr],
        capacity=1_000_000,
        cell_order="row-major",
        tile_order="row-major",
    )
    tiledb.DenseArray.create(e_name, schema)
示例#13
0
 def create_dataframe_array(array_name, dataframe):
     """Create an empty dense 1-D TileDB array with one attribute per dataframe column.

     The single unnamed uint32 dimension spans [0, row count - 1] with a tile
     extent of at most 1000.  Each attribute's dtype comes from
     ``get_dtype_of_array`` applied to the column.

     :param array_name: URI/path at which the array is created.
     :param dataframe: dataframe whose columns and row count define the schema.
     """
     # Aggressive compression (Zstd level 22): many of these dataframes hold
     # very repetitive strings, bools and other non-float data.
     column_filters = tiledb.FilterList([tiledb.ZstdFilter(level=22)])

     column_attrs = []
     for column in dataframe:
         column_attrs.append(
             tiledb.Attr(name=column, dtype=get_dtype_of_array(dataframe[column]), filters=column_filters)
         )

     row_count = dataframe.shape[0]
     row_dim = tiledb.Dim(domain=(0, row_count - 1), tile=min(row_count, 1000), dtype=np.uint32)

     schema = tiledb.ArraySchema(
         domain=tiledb.Domain(row_dim),
         sparse=False,
         attrs=column_attrs,
         cell_order="row-major",
         tile_order="row-major",
     )
     tiledb.DenseArray.create(array_name, schema)
示例#14
0
def create_X(X_name, shape):
    """
    Create the empty X matrix array.  Always dense; whether sparse encoding
    pays off below some sparsity threshold is a future task.

    X is accessed in both row- and column-oriented patterns, depending on the
    operation, and default Zstd compression works best for its data type.
    The tile extents (50, 100) and the row-major cell / col-major tile layout
    were chosen empirically by benchmarking the current cellxgene backend.

    :param X_name: URI/path at which the array is created.
    :param shape: indexable whose first two items are the "obs" and "var" sizes.
    """
    obs_count, var_count = shape[0], shape[1]

    value_attr = tiledb.Attr(dtype=np.float32, filters=tiledb.FilterList([tiledb.ZstdFilter()]))
    obs_dim = tiledb.Dim(name="obs", domain=(0, obs_count - 1), tile=min(obs_count, 50), dtype=np.uint32)
    var_dim = tiledb.Dim(name="var", domain=(0, var_count - 1), tile=min(var_count, 100), dtype=np.uint32)

    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(obs_dim, var_dim),
        sparse=False,
        attrs=[value_attr],
        cell_order="row-major",
        tile_order="col-major",
    )
    tiledb.DenseArray.create(X_name, schema)
示例#15
0
def create_dataframe(name, df, ctx):
    """
    Create an empty dense 1-D array with one attribute per column of ``df``.

    Current access patterns read very large slices of the dataframe, one
    attribute at a time, and attribute data is often repetitive (bools,
    categories, strings).  The schema therefore uses:
    * a large tile extent (up to 1000)
    * very aggressive compression (Zstd level 22)

    :param name: URI/path at which the array is created.
    :param df: dataframe whose columns and row count define the schema.
    :param ctx: accepted but not used by this function.
    """
    # Shared by all attributes: aggressive compression suits the repetitive
    # strings, bools and other non-float data found in these dataframes.
    column_filters = tiledb.FilterList([tiledb.ZstdFilter(level=22)])

    column_attrs = []
    for col in df:
        column_attrs.append(tiledb.Attr(name=col, dtype=cxg_dtype(df[col]), filters=column_filters))

    row_count = df.shape[0]
    row_dim = tiledb.Dim(domain=(0, row_count - 1), tile=min(row_count, 1000), dtype=np.uint32)

    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(row_dim),
        sparse=False,
        attrs=column_attrs,
        cell_order="row-major",
        tile_order="row-major",
    )
    tiledb.DenseArray.create(name, schema)
示例#16
0
    def _create_array(self) -> None:
        """Create the dense TileDB array that stores a Tensorflow model.

        For ``Functional``/``Sequential`` models the "model" dimension holds a
        single cell and the attributes are "model_weights" and
        "optimizer_weights".  For any other model the dimension has one cell
        per entry of ``self.model.layers`` and the attributes are
        "weight_names", "weight_values", "layer_name" and "optimizer_weights".
        All attributes are variable-length and filtered with Zstd.
        """
        assert self.model
        whole_model = isinstance(self.model, (Functional, Sequential))

        def var_attr(name, dtype=bytes):
            # Variable-length attribute with its own Zstd filter list.
            return tiledb.Attr(
                name=name,
                dtype=dtype,
                var=True,
                filters=tiledb.FilterList([tiledb.ZstdFilter()]),
                ctx=self.ctx,
            )

        last_index = 1 if whole_model else len(self.model.layers)
        model_dim = tiledb.Dim(
            name="model",
            domain=(1, last_index),
            tile=1,
            dtype=np.int32,
            ctx=self.ctx,
        )
        dom = tiledb.Domain(model_dim)

        if whole_model:
            attrs = [
                var_attr("model_weights"),
                var_attr("optimizer_weights"),
            ]
        else:
            attrs = [
                # String names of weights of each layer of the model
                var_attr("weight_names"),
                # The values of weights of each layer of the model
                var_attr("weight_values"),
                # Layer names TF format of the saved/loaded model
                var_attr("layer_name", dtype=str),
                # The weight values of the optimizer in case the model is saved compiled
                var_attr("optimizer_weights"),
            ]

        schema = tiledb.ArraySchema(
            domain=dom,
            sparse=False,
            attrs=attrs,
            ctx=self.ctx,
        )

        tiledb.Array.create(self.uri, schema, ctx=self.ctx)

        # A namespace means we are on TileDB-Cloud, where the model array's
        # file properties have to be updated as well.
        if self.namespace:
            update_file_properties(self.uri, self._file_properties)
示例#17
0
    def create_dataarray(self, uri):
        """Create the sparse raw-data array at ``uri``.

        The array has one nanosecond datetime dimension "date", eight float64
        bid/ask attributes and one int64 "tickqty" attribute.  Attributes,
        coordinates and offsets all use a Gzip filter (level -1) with a chunk
        size of 512000.
        """

        def gzip_filters():
            # A fresh filter list per use, as every consumer gets its own.
            return tiledb.FilterList([tiledb.GzipFilter(level=-1)],
                                     chunksize=512000)

        # NOTE(review): the tile extent is 365 *nanoseconds* as written --
        # confirm whether 365 days was intended.
        date_dim = tiledb.Dim(name='date',
                              domain=(np.datetime64('1900-01-01'),
                                      np.datetime64('2262-01-01')),
                              tile=np.timedelta64(365, 'ns'),
                              dtype=np.datetime64('', 'ns').dtype)
        domain = tiledb.Domain(date_dim)

        columns = (
            ('bidopen', 'float64'),
            ('bidclose', 'float64'),
            ('bidhigh', 'float64'),
            ('bidlow', 'float64'),
            ('askopen', 'float64'),
            ('askclose', 'float64'),
            ('askhigh', 'float64'),
            ('asklow', 'float64'),
            ('tickqty', 'int64'),
        )
        attrs = [
            tiledb.Attr(name=column, dtype=dtype, filters=gzip_filters())
            for column, dtype in columns
        ]

        schema = tiledb.ArraySchema(domain=domain,
                                    attrs=attrs,
                                    cell_order='row-major',
                                    tile_order='row-major',
                                    capacity=10000,
                                    sparse=True,
                                    allows_duplicates=False,
                                    coords_filters=gzip_filters(),
                                    offsets_filters=gzip_filters())

        tiledb.SparseArray.create(uri, schema)
示例#18
0
    def _create_array(self, uri, retry=0):
        """
        Create a new sparse array for storing a notebook file and tag it as a
        Jupyter notebook.

        The namespace and array name are taken from the last two "/"-separated
        parts of ``uri``.  If TileDB reports that the array already exists, the
        array name is changed with ``self._increment_filename`` and creation is
        attempted again.

        :param uri: location to create array
        :param retry: number of times to retry request after a TileDB error
            other than "already exists"
        :return: ``(tiledb_uri, array_name)`` on success; ``None`` when a
            TileDB error other than "already exists" occurs and ``retry`` is
            falsy
        :raises: ``HTTPError`` is re-raised unchanged; any other non-TileDB
            exception is reported as ``http_error(400, ...)``, as is a missing
            default s3 prefix for the namespace
        """
        try:
            # The array is 1-dimensional with a uint64 domain from 0 to
            # (max uint64 - 1025) and a tile extent of 1024.
            dom = tiledb.Domain(
                tiledb.Dim(
                    name="position",
                    domain=(0, numpy.iinfo(numpy.uint64).max - 1025),
                    tile=1024,
                    dtype=numpy.uint64,
                    ctx=tiledb.cloud.Ctx(),
                ),
                ctx=tiledb.cloud.Ctx(),
            )

            # Single uint8 attribute "contents", filtered with Zstd.
            schema = tiledb.ArraySchema(
                domain=dom,
                sparse=True,
                attrs=[
                    tiledb.Attr(
                        name="contents",
                        dtype=numpy.uint8,
                        filters=tiledb.FilterList([tiledb.ZstdFilter()]),
                    )
                ],
                ctx=tiledb.cloud.Ctx(),
            )

            # The last two path components are the namespace and array name.
            parts = uri.split("/")
            parts_len = len(parts)
            namespace = parts[parts_len - 2]
            array_name = parts[parts_len - 1]

            s3_prefix = get_s3_prefix(namespace)
            if s3_prefix is None:
                raise http_error(
                    400,
                    "You must set the default s3 prefix path for notebooks in {} profile settings"
                    .format(namespace),
                )

            # Creation URI: the s3 prefix is prepended to the array name.
            tiledb_uri_s3 = "tiledb://{}/{}".format(namespace,
                                                    s3_prefix + array_name)

            # Create the (empty) array on disk.
            tiledb.SparseArray.create(tiledb_uri_s3, schema)

            # The array is addressed afterwards by namespace and name only.
            tiledb_uri = "tiledb://{}/{}".format(namespace, array_name)
            # NOTE(review): presumably gives the service time to register the
            # new array before its info is updated -- confirm.
            time.sleep(0.25)
            tiledb.cloud.array.update_info(uri=tiledb_uri,
                                           array_name=array_name,
                                           tags=[TAG_JUPYTER_NOTEBOOK])

            return tiledb_uri, array_name
        except tiledb.TileDBError as e:
            if "already exists" in str(e):
                # Name clash: pick the next file name and try again.  This
                # path does not consume a retry.
                parts = uri.split("/")
                parts_length = len(parts)
                array_name = parts[parts_length - 1]

                array_name = self._increment_filename(array_name)

                parts[parts_length - 1] = array_name
                uri = "/".join(parts)

                return self._create_array(uri, retry)
            elif retry:
                # Any other TileDB error: retry while attempts remain.
                retry -= 1
                return self._create_array(uri, retry)
            # Otherwise fall through to the final ``return None``.
        except HTTPError as e:
            raise e
        except Exception as e:
            raise http_error(400, "Error creating file %s " % str(e))

        return None
示例#19
0
    def _create_array(self, serialized_model_info: Mapping[str,
                                                           bytes]) -> None:
        """
        Create the dense, single-cell TileDB array that stores a PyTorch model.

        The array always has a "model_state_dict" attribute, gains an
        "optimizer_state_dict" attribute when ``self.optimizer`` is truthy, and
        one extra attribute per key of ``serialized_model_info``.  Every
        attribute is variable-length bytes filtered with Zstd.

        :param serialized_model_info: A mapping with pickled information of a PyTorch model.
        """

        def bytes_attr(name):
            # Variable-length bytes attribute with its own Zstd filter list.
            return tiledb.Attr(
                name=name,
                dtype=bytes,
                var=True,
                filters=tiledb.FilterList([tiledb.ZstdFilter()]),
                ctx=self.ctx,
            )

        model_dim = tiledb.Dim(name="model",
                               domain=(1, 1),
                               tile=1,
                               dtype=np.int32,
                               ctx=self.ctx)
        dom = tiledb.Domain(model_dim)

        # The model's state dictionary is always kept.
        attr_names = ["model_state_dict"]

        # The optimizer's state dictionary is kept only if one is provided.
        if self.optimizer:
            attr_names.append("optimizer_state_dict")

        # Extra attributes for any model information supplied by the user.
        if serialized_model_info:
            attr_names.extend(serialized_model_info)

        attrs = [bytes_attr(attr_name) for attr_name in attr_names]

        schema = tiledb.ArraySchema(
            domain=dom,
            sparse=False,
            attrs=attrs,
            ctx=self.ctx,
        )

        tiledb.Array.create(self.uri, schema, ctx=self.ctx)

        # A namespace means we are on TileDB-Cloud, where the model array's
        # file properties have to be updated as well.
        if self.namespace:
            update_file_properties(self.uri, self._file_properties)
示例#20
0
def save_X(container, xdata, ctx, sparse_threshold, expect_sparse=False):
    """
    Save the X count matrix to ``{container}/X`` as a sparse or dense array.

    The encoding is chosen from ``sparse_threshold``: 100 forces sparse, 0
    forces dense, and any other value compares the non-zero percentage against
    the threshold.  If that comparison says dense, a column-shift encoding is
    tried; when it succeeds the matrix is stored sparse with the shift
    subtracted, and the shift itself is stored in ``{container}/X_col_shift``.

    The matrix is written in row chunks, then consolidated (and vacuumed when
    ``tiledb.vacuum`` exists).

    :param container: URI/path prefix under which the arrays are created.
    :param xdata: 2-D matrix providing ``shape`` and row slicing; a chunk that
        is not an ``np.ndarray`` must provide ``toarray()``.
    :param ctx: TileDB context used when opening and consolidating arrays.
    :param sparse_threshold: non-zero percentage below which X is stored sparse.
    :param expect_sparse: when True, nothing is written unless the sparse
        encoding is chosen.
    :return: False if ``expect_sparse`` is True and dense was chosen;
        otherwise whether X was stored sparse.
    """
    # Save X count matrix
    X_name = f"{container}/X"

    shape = xdata.shape
    log(1, "\t...shape:", str(shape))

    col_shift = None
    if sparse_threshold == 100:
        is_sparse = True
    elif sparse_threshold == 0:
        is_sparse = False
    else:
        is_sparse, nnz, nelem = evaluate_for_sparse_encoding(xdata, sparse_threshold)
        percent = 100.0 * nnz / nelem
        # nelem differing from the full cell count means the counts are an estimate.
        if nelem != shape[0] * shape[1]:
            log(1, "\t...sparse=", is_sparse, "non-zeros percent (estimate): %6.2f" % percent)
        else:
            log(1, "\t...sparse=", is_sparse, "non-zeros:", nnz, "percent: %6.2f" % percent)

        # The decision is recomputed from the percentage; the value returned by
        # the helper above is only used for logging.
        is_sparse = percent < sparse_threshold
        if not is_sparse:
            # Not sparse as-is: see whether a per-column shift makes it sparse.
            col_shift, nnz, nelem = evaluate_for_sparse_column_shift_encoding(xdata, sparse_threshold)
            is_sparse = col_shift is not None
            percent = 100.0 * nnz / nelem
            if nelem != shape[0] * shape[1]:
                log(1, "\t...sparse=", is_sparse, "col shift non-zeros percent (estimate): %6.2f" % percent)
            else:
                log(1, "\t...sparse=", is_sparse, "col shift non-zeros:", nnz, "percent: %6.2f" % percent)

    # Caller required a sparse result: bail out before creating any array.
    if expect_sparse is True and is_sparse is False:
        return False

    create_X(X_name, shape, is_sparse)
    # Rows per write: 1e9 / column count rounded to the nearest power of ten
    # (in log10 terms), capped at 10,000.
    stride = min(int(np.power(10, np.around(np.log10(1e9 / shape[1])))), 10_000)
    if is_sparse:
        if col_shift is not None:
            log(1, "\t...output X as sparse matrix with column shift encoding")
            # Store the shift in its own dense 1-D array, one cell per column.
            X_col_shift_name = f"{container}/X_col_shift"
            filters = tiledb.FilterList([tiledb.ZstdFilter()])
            attrs = [tiledb.Attr(dtype=np.float32, filters=filters)]
            domain = tiledb.Domain(tiledb.Dim(domain=(0, shape[1] - 1), tile=min(shape[1], 5000), dtype=np.uint32))
            schema = tiledb.ArraySchema(domain=domain, attrs=attrs)
            tiledb.DenseArray.create(X_col_shift_name, schema)
            with tiledb.DenseArray(X_col_shift_name, mode="w", ctx=ctx) as X_col_shift:
                X_col_shift[:] = col_shift
            tiledb.consolidate(X_col_shift_name, ctx=ctx)
        else:
            log(1, "\t...output X as sparse matrix")

        with tiledb.SparseArray(X_name, mode="w", ctx=ctx) as X:
            nnz = 0
            for row in range(0, shape[0], stride):
                lim = min(row + stride, shape[0])
                a = xdata[row:lim, :]
                # Densify the chunk when it is not already a plain ndarray.
                if type(a) is not np.ndarray:
                    a = a.toarray()
                if col_shift is not None:
                    a = a - col_shift
                # Write only the non-zero cells, offsetting chunk-local row
                # indices to absolute row coordinates.
                indices = np.nonzero(a)
                trow = indices[0] + row
                nnz += indices[0].shape[0]
                X[trow, indices[1]] = a[indices[0], indices[1]]
                log(2, "\t...rows", lim, "of", shape[0], "nnz", nnz, "sparse", nnz / (lim * shape[1]))

    else:
        log(1, "\t...output X as dense matrix")
        with tiledb.DenseArray(X_name, mode="w", ctx=ctx) as X:
            for row in range(0, shape[0], stride):
                lim = min(row + stride, shape[0])
                a = xdata[row:lim, :]
                # Densify the chunk when it is not already a plain ndarray.
                if type(a) is not np.ndarray:
                    a = a.toarray()
                X[row:lim, :] = a
                log(2, "\t...rows", row, "to", lim)

    tiledb.consolidate(X_name, ctx=ctx)
    # vacuum is only called when this tiledb module provides it.
    if hasattr(tiledb, "vacuum"):
        tiledb.vacuum(X_name)

    return is_sparse
示例#21
0
def write_sparse_array(path, n, m, n_idxs, m_idxs, values, clip=True):
    """Create a sparse n x m uint8 TileDB array at ``path`` and write ``values`` into it.

    The (row, column, value) triples are passed through a COO -> CSC -> COO
    conversion with an int32 dtype before being written.

    :param path: location of the new array; must not exist yet.
    :param n: number of rows; row indexes must lie in [0, n - 1].
    :param m: number of columns; column indexes must lie in [0, m - 1].
    :param n_idxs: array of row indexes.
    :param m_idxs: array of column indexes.
    :param values: array of values, one per index pair.
    :param clip: when truthy, values above ``VPLOT_MAX_VALUE`` are lowered to it.
    :raises FileExistsError: if ``path`` already exists.
    :raises ValueError: if an index is out of range, or a value is outside
        [0, ``VPLOT_MAX_VALUE``] after optional clipping.

    ``ctx`` is not defined here; it is resolved from the enclosing scope.
    """
    if os.path.exists(path):
        raise FileExistsError(f"{path} already exists")

    if n_idxs.min() < 0 or n_idxs.max() >= n:
        raise ValueError("row indexes must be in range [0, n - 1]")

    if m_idxs.min() < 0 or m_idxs.max() >= m:
        raise ValueError("column indexes must in in range [0, m - 1]")

    # Round-trip through CSC and back to COO before reading the triples out.
    triples = coo_matrix((values, (n_idxs, m_idxs)), dtype=np.int32)
    triples = triples.tocsc(copy=False).tocoo(copy=False)
    n_idxs, m_idxs, values = triples.row, triples.col, triples.data

    if clip:
        values = np.minimum(values, VPLOT_MAX_VALUE)

    if values.min() < 0 or values.max() > VPLOT_MAX_VALUE:
        raise ValueError(
            f"vplot values must be in range [0, {VPLOT_MAX_VALUE}]")

    genome_dim = tiledb.Dim(
        GENOME_DOMAIN_NAME,
        domain=(0, n - 1),
        tile=min(DEFAULT_GENOME_TILE_EXTENT, n),
        dtype="uint32",
        ctx=ctx,
    )
    # The insert dimension is stored as a single tile spanning all m columns.
    insert_dim = tiledb.Dim(
        INSERT_DOMAIN_NAME,
        domain=(0, m - 1),
        tile=m,
        dtype="uint32",
        ctx=ctx,
    )
    value_attr = tiledb.Attr(
        "v",
        filters=tiledb.FilterList([tiledb.LZ4Filter(level=-1)]),
        dtype="uint8",
        ctx=ctx,
    )

    schema = tiledb.ArraySchema(
        ctx=ctx,
        domain=tiledb.Domain(genome_dim, insert_dim, ctx=ctx),
        attrs=(value_attr, ),
        capacity=1000,
        cell_order="row-major",
        tile_order="row-major",
        sparse=True,
    )

    tiledb.SparseArray.create(path, schema)

    with tiledb.SparseArray(path, mode="w", ctx=ctx) as array:
        values = values.astype(np.uint8)
        array[n_idxs, m_idxs] = values