def create_matrix_array(matrix_name, number_of_rows, number_of_columns, encode_as_sparse_array):
    """
    Create an empty 2-D float32 TileDB array with dimensions "obs" and "var" at `matrix_name`.

    Tile extents are capped at (512, 2048) for a sparse array and (50, 100) for a dense
    array; each extent is further limited to the length of its dimension. The single
    anonymous attribute is Zstd-compressed.
    """
    value_attr = tiledb.Attr(dtype=np.float32, filters=tiledb.FilterList([tiledb.ZstdFilter()]))

    # Only the tile caps differ between the sparse and the dense layout.
    row_tile_cap, column_tile_cap = (512, 2048) if encode_as_sparse_array else (50, 100)
    obs_dim = tiledb.Dim(
        name="obs",
        domain=(0, number_of_rows - 1),
        tile=min(number_of_rows, row_tile_cap),
        dtype=np.uint32,
    )
    var_dim = tiledb.Dim(
        name="var",
        domain=(0, number_of_columns - 1),
        tile=min(number_of_columns, column_tile_cap),
        dtype=np.uint32,
    )

    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(obs_dim, var_dim),
        sparse=encode_as_sparse_array,
        attrs=[value_attr],
        cell_order="row-major",
        tile_order="col-major",
    )

    array_class = tiledb.SparseArray if encode_as_sparse_array else tiledb.DenseArray
    array_class.create(matrix_name, schema)
def _create_array(self) -> None:
    """
    Create the TileDB array that stores a Sklearn model.

    The array is dense with a single cell (dimension "model", domain (1, 1)) and one
    variable-length, Zstd-compressed bytes attribute named "model_params".
    """
    model_dim = tiledb.Dim(name="model", domain=(1, 1), tile=1, dtype=np.int32, ctx=self.ctx)
    dom = tiledb.Domain(model_dim)

    params_attr = tiledb.Attr(
        name="model_params",
        dtype=bytes,
        var=True,
        filters=tiledb.FilterList([tiledb.ZstdFilter()]),
        ctx=self.ctx,
    )

    schema = tiledb.ArraySchema(domain=dom, sparse=False, attrs=[params_attr], ctx=self.ctx)
    tiledb.Array.create(self.uri, schema, ctx=self.ctx)

    if not self.namespace:
        return

    # On TileDB-Cloud the model array's file properties have to be updated as well.
    update_file_properties(self.uri, self._file_properties)
def create_X(X_name, shape, is_sparse):
    """
    Create the empty X matrix array (float32, dimensions "obs" x "var") at `X_name`.

    X is read in both row- and column-oriented patterns depending on the operation.
    Default Zstd compression works best for this data type. The tile sizes -- (50, 100)
    for dense and (512, 2048) for sparse -- and the global layout (row-major cells,
    col-major tiles) were chosen empirically by benchmarking the current cellxgene backend.
    """
    compression = tiledb.FilterList([tiledb.ZstdFilter()])
    attrs = [tiledb.Attr(dtype=np.float32, filters=compression)]

    if is_sparse:
        obs_tile_cap = 512
        var_tile_cap = 2048
    else:
        obs_tile_cap = 50
        var_tile_cap = 100

    dims = (
        tiledb.Dim(name="obs", domain=(0, shape[0] - 1), tile=min(shape[0], obs_tile_cap), dtype=np.uint32),
        tiledb.Dim(name="var", domain=(0, shape[1] - 1), tile=min(shape[1], var_tile_cap), dtype=np.uint32),
    )

    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(*dims),
        sparse=is_sparse,
        attrs=attrs,
        cell_order="row-major",
        tile_order="col-major",
    )

    creator = tiledb.SparseArray.create if is_sparse else tiledb.DenseArray.create
    creator(X_name, schema)
def _create_array(self):
    """
    Create the TileDB array that stores a Tensorflow model.

    The array is dense with a single cell (dimension "model", domain (1, 1)) and two
    variable-length, Zstd-compressed attributes: "model_weights" and "optimizer_weights".

    :raises HTTPError: (code 400) when TileDB reports "Error while listing with prefix".
    :raises tiledb.TileDBError: for every other TileDB failure, including an array that
        already exists at ``self.uri``.
    """
    try:
        dom = tiledb.Domain(
            tiledb.Dim(name="model", domain=(1, 1), tile=1, dtype=np.int32),
        )

        attrs = [
            tiledb.Attr(
                name="model_weights",
                dtype="S1",
                var=True,
                filters=tiledb.FilterList([tiledb.ZstdFilter()]),
            ),
            tiledb.Attr(
                name="optimizer_weights",
                dtype="S1",
                var=True,
                filters=tiledb.FilterList([tiledb.ZstdFilter()]),
            ),
        ]

        schema = tiledb.ArraySchema(
            domain=dom,
            sparse=False,
            attrs=attrs,
        )

        tiledb.Array.create(self.uri, schema)
    except tiledb.TileDBError as error:
        message = str(error)

        if "Error while listing with prefix" in message:
            # It is possible to land here if the user sets wrong default s3 credentials
            # with respect to the default s3 path. Keep the TileDB error as the cause.
            raise HTTPError(
                code=400,
                msg=f"Error creating file, {error} Are your S3 credentials valid?",
            ) from error

        if "already exists" in message:
            logging.warning(
                "TileDB array already exists but update=False. "
                "Next time set update=True. Returning")

        # Propagate every TileDB error that was not translated above, so that a failed
        # array creation is never silently ignored.
        raise
def create_ndarray_array(ndarray_name, ndarray):
    """
    Create an empty dense TileDB array at `ndarray_name` shaped like `ndarray`.

    One uint32 dimension is created per axis of `ndarray`, with a tile extent of at most
    1000. The single anonymous attribute uses the dtype of `ndarray` and Zstd compression.
    """
    value_attr = tiledb.Attr(dtype=ndarray.dtype, filters=tiledb.FilterList([tiledb.ZstdFilter()]))

    dimensions = []
    for axis in range(ndarray.ndim):
        axis_length = ndarray.shape[axis]
        dimensions.append(
            tiledb.Dim(domain=(0, axis_length - 1), tile=min(axis_length, 1000), dtype=np.uint32)
        )

    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(*dimensions),
        sparse=False,
        attrs=[value_attr],
        capacity=1_000_000,
        cell_order="row-major",
        tile_order="row-major",
    )
    tiledb.DenseArray.create(ndarray_name, schema)
def create_emb(e_name, emb):
    """
    Create an empty dense TileDB array at `e_name` for the embedding `emb`.

    Embeddings are typically read with very large slices (or in full) and do not benefit
    from overly aggressive compression due to their format, so the array uses:
        * a large tile size (1000)
        * the default compression level
    """
    attrs = [tiledb.Attr(dtype=emb.dtype, filters=tiledb.FilterList([tiledb.ZstdFilter()]))]

    axis_lengths = [emb.shape[axis] for axis in range(emb.ndim)]
    dims = [
        tiledb.Dim(domain=(0, length - 1), tile=min(length, 1000), dtype=np.uint32)
        for length in axis_lengths
    ]

    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(*dims),
        sparse=False,
        attrs=attrs,
        capacity=1_000_000,
        cell_order="row-major",
        tile_order="row-major",
    )
    tiledb.DenseArray.create(e_name, schema)
def create_dataframe_array(array_name, dataframe):
    """
    Create an empty dense 1-D TileDB array at `array_name` with one attribute per column.

    The single uint32 dimension spans the rows of `dataframe` with a tile extent of at
    most 1000. Attribute dtypes come from `get_dtype_of_array`.
    """
    # Attempt aggressive compression as many of these dataframes are very repetitive strings, bools and
    # other non-float data.
    compression = tiledb.FilterList([tiledb.ZstdFilter(level=22)])

    attrs = []
    for column in dataframe:
        column_dtype = get_dtype_of_array(dataframe[column])
        attrs.append(tiledb.Attr(name=column, dtype=column_dtype, filters=compression))

    row_dim = tiledb.Dim(
        domain=(0, dataframe.shape[0] - 1),
        tile=min(dataframe.shape[0], 1000),
        dtype=np.uint32,
    )

    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(row_dim),
        sparse=False,
        attrs=attrs,
        cell_order="row-major",
        tile_order="row-major",
    )
    tiledb.DenseArray.create(array_name, schema)
def create_X(X_name, shape):
    """
    Create the empty, always-dense X matrix array (float32, "obs" x "var") at `X_name`.

    Future task: explore whether sparse encoding is worth the trouble below a sparsity
    threshold.

    The X matrix is accessed in both row- and column-oriented patterns, depending on the
    particular operation. Because of the data type, default compression works best. The
    tile size (50, 100) and global layout (row/col) were chosen empirically, by
    benchmarking the current cellxgene backend.
    """
    value_attr = tiledb.Attr(dtype=np.float32, filters=tiledb.FilterList([tiledb.ZstdFilter()]))

    obs_dim = tiledb.Dim(name="obs", domain=(0, shape[0] - 1), tile=min(shape[0], 50), dtype=np.uint32)
    var_dim = tiledb.Dim(name="var", domain=(0, shape[1] - 1), tile=min(shape[1], 100), dtype=np.uint32)

    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(obs_dim, var_dim),
        sparse=False,
        attrs=[value_attr],
        cell_order="row-major",
        tile_order="col-major",
    )
    tiledb.DenseArray.create(X_name, schema)
def create_dataframe(name, df, ctx):
    """
    Create an empty dense 1-D TileDB array at `name` with one attribute per column of `df`.

    Current access patterns read very large slices of the dataframe, one attribute at a
    time, and attribute data is often repetitive (bools, categories, strings). Given
    this, the array uses:
        * a large tile size (1000)
        * very aggressive compression levels

    `ctx` is accepted but not used by this function.
    """
    # attempt aggressive compression as many of these dataframes are very repetitive
    # strings, bools and other non-float data.
    zstd_max = tiledb.FilterList([tiledb.ZstdFilter(level=22)])

    column_attrs = [tiledb.Attr(name=col, dtype=cxg_dtype(df[col]), filters=zstd_max) for col in df]

    row_dim = tiledb.Dim(domain=(0, df.shape[0] - 1), tile=min(df.shape[0], 1000), dtype=np.uint32)

    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(row_dim),
        sparse=False,
        attrs=column_attrs,
        cell_order="row-major",
        tile_order="row-major",
    )
    tiledb.DenseArray.create(name, schema)
def _create_array(self, serialized_model_info: Mapping[str, bytes]) -> None:
    """
    Create the TileDB array that stores a PyTorch model.

    The array is dense with a single cell (dimension "model", domain (1, 1)). Every
    attribute is a variable-length, Zstd-compressed bytes attribute.

    :param serialized_model_info: A mapping with pickled information of a PyTorch model.
        One extra attribute is created per key.
    """

    def _bytes_attr(attr_name):
        # All attributes of the model array share the same layout; only the name differs.
        return tiledb.Attr(
            name=attr_name,
            dtype=bytes,
            var=True,
            filters=tiledb.FilterList([tiledb.ZstdFilter()]),
            ctx=self.ctx,
        )

    dom = tiledb.Domain(
        tiledb.Dim(name="model", domain=(1, 1), tile=1, dtype=np.int32, ctx=self.ctx),
    )

    # The model's state dictionary is always stored.
    attrs = [_bytes_attr("model_state_dict")]

    # The optimizer's state dictionary is stored only when an optimizer is provided.
    if self.optimizer:
        attrs.append(_bytes_attr("optimizer_state_dict"))

    # Extra attributes for any model information provided by the user.
    if serialized_model_info:
        attrs.extend(_bytes_attr(key) for key in serialized_model_info)

    schema = tiledb.ArraySchema(domain=dom, sparse=False, attrs=attrs, ctx=self.ctx)
    tiledb.Array.create(self.uri, schema, ctx=self.ctx)

    if not self.namespace:
        return

    # On TileDB-Cloud the model array's file properties have to be updated as well.
    update_file_properties(self.uri, self._file_properties)
def save_X(container, xdata, ctx, sparse_threshold, expect_sparse=False):
    """
    Write the matrix `xdata` to the TileDB array `{container}/X`, sparse or dense.

    Encoding selection:
        * `sparse_threshold == 100` forces sparse, `sparse_threshold == 0` forces dense.
        * Otherwise the non-zero percentage derived from `evaluate_for_sparse_encoding`
          is compared against `sparse_threshold`. If that does not select sparse,
          `evaluate_for_sparse_column_shift_encoding` is tried; a non-None column shift
          selects sparse encoding of `xdata - col_shift`, and the shift itself is
          written to `{container}/X_col_shift`.

    :param container: prefix under which the "X" (and "X_col_shift") arrays are created
    :param xdata: 2-D matrix; row slices that are not `np.ndarray` must offer `toarray()`
    :param ctx: TileDB context used to open and consolidate the arrays
    :param sparse_threshold: non-zero percentage below which sparse encoding is chosen
    :param expect_sparse: when True and dense encoding is selected, nothing is created
        or written and False is returned
    :return: True if X was written as a sparse array, False otherwise
    """
    # Save X count matrix
    X_name = f"{container}/X"
    shape = xdata.shape
    log(1, "\t...shape:", str(shape))

    col_shift = None
    if sparse_threshold == 100:
        is_sparse = True
    elif sparse_threshold == 0:
        is_sparse = False
    else:
        is_sparse, nnz, nelem = evaluate_for_sparse_encoding(xdata, sparse_threshold)
        percent = 100.0 * nnz / nelem
        # nelem differing from the full cell count is reported as an estimate
        # NOTE(review): presumably the helper stopped early or sampled -- confirm.
        if nelem != shape[0] * shape[1]:
            log(1, "\t...sparse=", is_sparse, "non-zeros percent (estimate): %6.2f" % percent)
        else:
            log(1, "\t...sparse=", is_sparse, "non-zeros:", nnz, "percent: %6.2f" % percent)

        # The helper's own verdict is only logged; the decision is recomputed here.
        is_sparse = percent < sparse_threshold
        if not is_sparse:
            # Second chance: sparse encoding after subtracting a per-column shift.
            col_shift, nnz, nelem = evaluate_for_sparse_column_shift_encoding(xdata, sparse_threshold)
            is_sparse = col_shift is not None
            percent = 100.0 * nnz / nelem
            if nelem != shape[0] * shape[1]:
                log(1, "\t...sparse=", is_sparse, "col shift non-zeros percent (estimate): %6.2f" % percent)
            else:
                log(1, "\t...sparse=", is_sparse, "col shift non-zeros:", nnz, "percent: %6.2f" % percent)

    # Bail out before anything is created when the caller required a sparse result.
    if expect_sparse is True and is_sparse is False:
        return False

    create_X(X_name, shape, is_sparse)
    # Rows per write batch: 1e9 / column count, rounded to a power of ten in log10
    # space, and capped at 10,000 rows.
    stride = min(int(np.power(10, np.around(np.log10(1e9 / shape[1])))), 10_000)
    if is_sparse:
        if col_shift is not None:
            log(1, "\t...output X as sparse matrix with column shift encoding")
            # Store the per-column shift in its own dense 1-D array next to X.
            X_col_shift_name = f"{container}/X_col_shift"
            filters = tiledb.FilterList([tiledb.ZstdFilter()])
            attrs = [tiledb.Attr(dtype=np.float32, filters=filters)]
            domain = tiledb.Domain(tiledb.Dim(domain=(0, shape[1] - 1), tile=min(shape[1], 5000), dtype=np.uint32))
            schema = tiledb.ArraySchema(domain=domain, attrs=attrs)
            tiledb.DenseArray.create(X_col_shift_name, schema)
            with tiledb.DenseArray(X_col_shift_name, mode="w", ctx=ctx) as X_col_shift:
                X_col_shift[:] = col_shift
            tiledb.consolidate(X_col_shift_name, ctx=ctx)
        else:
            log(1, "\t...output X as sparse matrix")

        with tiledb.SparseArray(X_name, mode="w", ctx=ctx) as X:
            nnz = 0
            for row in range(0, shape[0], stride):
                lim = min(row + stride, shape[0])
                a = xdata[row:lim, :]
                if type(a) is not np.ndarray:
                    a = a.toarray()
                if col_shift is not None:
                    a = a - col_shift
                # Only non-zero cells are written; row indices are offset back into
                # the coordinates of the full matrix.
                indices = np.nonzero(a)
                trow = indices[0] + row
                nnz += indices[0].shape[0]
                X[trow, indices[1]] = a[indices[0], indices[1]]
                log(2, "\t...rows", lim, "of", shape[0], "nnz", nnz, "sparse", nnz / (lim * shape[1]))

    else:
        log(1, "\t...output X as dense matrix")
        with tiledb.DenseArray(X_name, mode="w", ctx=ctx) as X:
            for row in range(0, shape[0], stride):
                lim = min(row + stride, shape[0])
                a = xdata[row:lim, :]
                if type(a) is not np.ndarray:
                    a = a.toarray()
                X[row:lim, :] = a
                log(2, "\t...rows", row, "to", lim)

    tiledb.consolidate(X_name, ctx=ctx)
    # vacuum is only called when the installed tiledb module provides it.
    # NOTE(review): unlike consolidate, vacuum is called without ctx -- confirm intended.
    if hasattr(tiledb, "vacuum"):
        tiledb.vacuum(X_name)

    return is_sparse
def _create_array(self, uri, retry=0):
    """
    Create a new sparse TileDB array for storing a notebook file.

    The namespace and the array name are the last two "/"-separated components of
    `uri`. When TileDB reports that the array already exists, the file name is
    incremented and creation is attempted again under the new name.

    :param uri: location to create array
    :param retry: number of times to retry after a TileDB error other than
        "already exists"
    :return: tuple of (tiledb uri, array name) on success; None when a TileDB error
        other than "already exists" occurs and no retries are left
    """
    try:
        # The array is 1 dimensional with a domain of 0 to (almost) max uint64 and a
        # tile extent of 1024.
        position_dim = tiledb.Dim(
            name="position",
            domain=(0, numpy.iinfo(numpy.uint64).max - 1025),
            tile=1024,
            dtype=numpy.uint64,
            ctx=tiledb.cloud.Ctx(),
        )
        domain = tiledb.Domain(position_dim, ctx=tiledb.cloud.Ctx())

        contents_attr = tiledb.Attr(
            name="contents",
            dtype=numpy.uint8,
            filters=tiledb.FilterList([tiledb.ZstdFilter()]),
        )
        array_schema = tiledb.ArraySchema(
            domain=domain,
            sparse=True,
            attrs=[contents_attr],
            ctx=tiledb.cloud.Ctx(),
        )

        segments = uri.split("/")
        segment_count = len(segments)
        namespace = segments[segment_count - 2]
        array_name = segments[segment_count - 1]

        s3_prefix = get_s3_prefix(namespace)
        if s3_prefix is None:
            raise http_error(
                400,
                "You must set the default s3 prefix path for notebooks in {} profile settings"
                .format(namespace),
            )

        # Create the (empty) array on disk.
        storage_uri = "tiledb://{}/{}".format(namespace, s3_prefix + array_name)
        tiledb.SparseArray.create(storage_uri, array_schema)

        tiledb_uri = "tiledb://{}/{}".format(namespace, array_name)
        time.sleep(0.25)
        tiledb.cloud.array.update_info(uri=tiledb_uri, array_name=array_name, tags=[TAG_JUPYTER_NOTEBOOK])

        return tiledb_uri, array_name
    except tiledb.TileDBError as err:
        if "already exists" in str(err):
            # Name clash: bump the file name and try again with the same retry budget.
            segments = uri.split("/")
            segments[-1] = self._increment_filename(segments[-1])
            return self._create_array("/".join(segments), retry)
        if retry:
            return self._create_array(uri, retry - 1)
    except HTTPError:
        raise
    except Exception as err:
        raise http_error(400, "Error creating file %s " % str(err))

    return None
def _create_array(self) -> None:
    """
    Create the TileDB array that stores a Tensorflow model.

    Functional and Sequential models get a single-cell array with "model_weights" and
    "optimizer_weights". Any other model gets one cell per layer with "weight_names",
    "weight_values", "layer_name" and "optimizer_weights". All attributes are
    variable-length and Zstd-compressed.
    """
    assert self.model

    def _var_attr(attr_name, attr_dtype=bytes):
        # Every attribute of the model array shares this layout.
        return tiledb.Attr(
            name=attr_name,
            dtype=attr_dtype,
            var=True,
            filters=tiledb.FilterList([tiledb.ZstdFilter()]),
            ctx=self.ctx,
        )

    single_cell = isinstance(self.model, (Functional, Sequential))

    if single_cell:
        model_dim = tiledb.Dim(name="model", domain=(1, 1), tile=1, dtype=np.int32, ctx=self.ctx)
    else:
        model_dim = tiledb.Dim(
            name="model",
            domain=(1, len(self.model.layers)),
            tile=1,
            dtype=np.int32,
            ctx=self.ctx,
        )
    dom = tiledb.Domain(model_dim)

    if single_cell:
        attrs = [
            _var_attr("model_weights"),
            _var_attr("optimizer_weights"),
        ]
    else:
        attrs = [
            # String names of weights of each layer of the model
            _var_attr("weight_names"),
            # The values of weights of each layer of the model
            _var_attr("weight_values"),
            # Layer names TF format of the saved/loaded model
            _var_attr("layer_name", str),
            # The weight values of the optimizer in case the model is saved compiled
            _var_attr("optimizer_weights"),
        ]

    schema = tiledb.ArraySchema(domain=dom, sparse=False, attrs=attrs, ctx=self.ctx)
    tiledb.Array.create(self.uri, schema, ctx=self.ctx)

    if not self.namespace:
        return

    # On TileDB-Cloud the model array's file properties have to be updated as well.
    update_file_properties(self.uri, self._file_properties)