def execute(cls, ctx, op):
    """Consolidate the TileDB array described by *op*, then forward the input.

    Builds a ``tiledb.Config`` from the op's stored settings, consolidates the
    op's target URI (with its encryption key, if any), and finally copies the
    input chunk's data into the output slot so downstream operands see it
    unchanged.
    """
    config = tiledb.Config(op.tiledb_config)
    tiledb.consolidate(config=config, uri=op.tiledb_uri, key=op.tiledb_key)
    ctx[op.outputs[0].key] = ctx[op.inputs[0].key]
def convert_ndarray_to_cxg_dense_array(ndarray_name, ndarray, ctx):
    """Write *ndarray* to a new TileDB dense array at *ndarray_name*.

    Generally used to convert dataset embeddings. Embeddings are typically
    read in very large slices (often in full), so aggressive compression buys
    little given their format; we therefore pair a large tile size (1000) with
    default-level Zstd compression.
    """
    zstd = tiledb.FilterList([tiledb.ZstdFilter()])
    # One dimension per ndarray axis, tiled at up to 1000 cells.
    dims = []
    for axis in range(ndarray.ndim):
        extent = ndarray.shape[axis]
        dims.append(
            tiledb.Dim(domain=(0, extent - 1), tile=min(extent, 1000), dtype=np.uint32)
        )
    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(*dims),
        sparse=False,
        attrs=[tiledb.Attr(dtype=ndarray.dtype, filters=zstd)],
        capacity=1_000_000,
        cell_order="row-major",
        tile_order="row-major",
    )
    tiledb.DenseArray.create(ndarray_name, schema)

    with tiledb.DenseArray(ndarray_name, mode="w", ctx=ctx) as target:
        target[:] = ndarray
    tiledb.consolidate(ndarray_name, ctx=ctx)
def write_anndata_x_matrix_to_cxg(self, output_cxg_directory, ctx, sparse_threshold):
    """Persist the AnnData X matrix into the CXG container at
    ``{output_cxg_directory}/X``, choosing a sparse encoding when possible.

    If X is not sparse by raw non-zero count, column-shift encoding is
    attempted; when that succeeds the shift vector is stored alongside X and
    the matrix is written sparsely. The container is consolidated (and
    vacuumed, when the installed TileDB supports it) after the write.
    """
    container = f"{output_cxg_directory}/X"
    matrix = self.anndata.X

    col_shift = None
    is_sparse = is_matrix_sparse(matrix, sparse_threshold)
    if not is_sparse:
        # Dense by raw count — column-shift encoding may still make it sparse.
        col_shift = get_column_shift_encode_for_matrix(matrix, sparse_threshold)
        is_sparse = col_shift is not None

    if col_shift is not None:
        logging.info("Converting matrix X as sparse matrix with column shift encoding")
        convert_ndarray_to_cxg_dense_array(f"{output_cxg_directory}/X_col_shift", col_shift, ctx)

    convert_matrix_to_cxg_array(container, matrix, is_sparse, ctx, col_shift)
    tiledb.consolidate(container, ctx=ctx)
    if hasattr(tiledb, "vacuum"):
        tiledb.vacuum(container)
def _tiledb_array(self, uri: str, schema: tiledb.ArraySchema) -> Iterator[tiledb.Array]:
    """Create a TileDB array at *uri*, yield it open for writing, then
    consolidate and vacuum it after the caller's block finishes.

    NOTE(review): written as a single-yield generator — presumably decorated
    with ``@contextlib.contextmanager`` at the (unseen) definition site;
    confirm against the full file.
    """
    tiledb.Array.create(uri, schema)
    with tiledb.open(uri, mode="w") as tdb:
        yield tdb
    # After the writer is closed: merge the write's fragments and purge the
    # superseded ones so readers see a compact array. self.config supplies
    # the consolidation/vacuum settings.
    tiledb.consolidate(uri, config=self.config)
    tiledb.vacuum(uri, config=self.config)
def consolidate_fragments(
    uri,
    amplification,
    buffer_size,
    step_max_frags,
    step_min_frags,
    step_size_ratio,
    steps,
    vacuum,
):
    """Consolidate the fragments in an array located at uri.

    Args:
        uri: Location of the TileDB array to consolidate.
        amplification, buffer_size, step_max_frags, step_min_frags,
        step_size_ratio, steps: Forwarded verbatim to the corresponding
            ``sm.consolidation.*`` TileDB config parameters.
        vacuum: When truthy, also vacuum fragments after consolidation.
    """
    # Fixed: removed leftover debug statements (`print(vacuum)` and
    # `print("here?")`) — sibling consolidate_array_metadata has the same
    # structure without them.
    config = tiledb.Config()
    config["sm.consolidation.mode"] = "fragments"
    config["sm.consolidation.amplification"] = amplification
    config["sm.consolidation.buffer_size"] = buffer_size
    config["sm.consolidation.step_max_frags"] = step_max_frags
    config["sm.consolidation.step_min_frags"] = step_min_frags
    config["sm.consolidation.step_size_ratio"] = step_size_ratio
    config["sm.consolidation.steps"] = steps
    ctx = tiledb.Ctx(config)
    tiledb.consolidate(uri, ctx=ctx)
    if vacuum:
        # Vacuuming removes the fragments that consolidation superseded.
        config = tiledb.Config({"sm.vacuum.mode": "fragments"})
        tiledb.vacuum(uri, ctx=tiledb.Ctx(config))
def save_embeddings(container, adata, ctx):
    """Write each valid ``obsm`` embedding in *adata* as a TileDB dense array
    under *container* (named without the leading ``X_``-style prefix),
    consolidating each array after its write.
    """
    for name, coords in adata.obsm.items():
        # Skip anything that doesn't qualify as an embedding.
        if not is_valid_embedding(adata, name, coords):
            continue
        array_uri = f"{container}/{name[2:]}"
        create_emb(array_uri, coords)
        with tiledb.DenseArray(array_uri, mode="w", ctx=ctx) as dest:
            dest[:] = coords
        tiledb.consolidate(array_uri, ctx=ctx)
        log(1, f"\t\t...{name} embedding created")
def consolidate_array_metadata(uri, vacuum):
    """Consolidate the array metadata in an array located at uri,
    optionally vacuuming the consolidated metadata afterwards.
    """
    consolidation_cfg = tiledb.Config({"sm.consolidation.mode": "array_meta"})
    tiledb.consolidate(uri, ctx=tiledb.Ctx(consolidation_cfg))
    if vacuum:
        vacuum_cfg = tiledb.Config({"sm.vacuum.mode": "array_meta"})
        tiledb.vacuum(uri, ctx=tiledb.Ctx(vacuum_cfg))
def convert_dataframe_to_cxg_array(cxg_container, dataframe_name, dataframe, index_column_name, ctx):
    """Store *dataframe* as a TileDB dense array at
    ``{cxg_container}/{dataframe_name}``, one attribute per column.

    Current access patterns read very large slices of one attribute at a
    time, and column data tends to be repetitive (bools, categories,
    strings), so a large tile size (1000) is paired with very aggressive
    Zstd compression.
    """
    uri = f"{cxg_container}/{dataframe_name}"

    # Attempt aggressive compression as many of these dataframes are very
    # repetitive strings, bools and other non-float data.
    compression = tiledb.FilterList([tiledb.ZstdFilter(level=22)])
    attrs = [
        tiledb.Attr(name=column, dtype=get_dtype_of_array(dataframe[column]), filters=compression)
        for column in dataframe
    ]
    row_count = dataframe.shape[0]
    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(
            tiledb.Dim(domain=(0, row_count - 1), tile=min(row_count, 1000), dtype=np.uint32)
        ),
        sparse=False,
        attrs=attrs,
        cell_order="row-major",
        tile_order="row-major",
    )
    tiledb.DenseArray.create(uri, schema)

    with tiledb.DenseArray(uri, mode="w", ctx=ctx) as target:
        payload = {}
        schema_hints = {}
        for column_name, column_values in dataframe.items():
            dtype, hints = get_dtype_and_schema_of_array(column_values)
            payload[column_name] = column_values.to_numpy(dtype=dtype)
            if hints:
                schema_hints[column_name] = hints
        schema_hints["index"] = index_column_name
        target[:] = payload
        target.meta["cxg_schema"] = json.dumps(schema_hints)
    tiledb.consolidate(uri, ctx=ctx)
def save_dataframe(container, name, df, index_col_name, ctx):
    """Write *df* to a TileDB dense array at ``{container}/{name}``, one
    attribute per column, recording per-column schema hints and the index
    column name in the array's ``cxg_schema`` metadata.
    """
    array_uri = f"{container}/{name}"
    df, index_col_name = alias_index_col(df, name, index_col_name)
    create_dataframe(array_uri, df, ctx=ctx)
    with tiledb.DenseArray(array_uri, mode="w", ctx=ctx) as target:
        columns = {}
        hints_by_column = {}
        for column, series in df.items():
            dtype, hints = cxg_type(series)
            columns[column] = series.to_numpy(dtype=dtype)
            if hints:
                hints_by_column[column] = hints
        hints_by_column["index"] = index_col_name
        target[:] = columns
        target.meta["cxg_schema"] = json.dumps(hints_by_column)
    tiledb.consolidate(array_uri, ctx=ctx)
def save_X(container, adata, ctx):
    """Write the AnnData X count matrix to a TileDB dense array at
    ``{container}/X``, streaming it in row strides to bound memory use, then
    consolidate the result.
    """
    # Save X count matrix
    X_name = f"{container}/X"
    shape = adata.X.shape
    create_X(X_name, shape)
    # Stride targets roughly 1e9 elements per chunk (rounded to a power of
    # ten), capped at 10_000 rows.
    stride = min(int(np.power(10, np.around(np.log10(1e9 / shape[1])))), 10_000)
    with tiledb.DenseArray(X_name, mode="w", ctx=ctx) as X:
        for row in range(0, shape[0], stride):
            lim = min(row + stride, shape[0])
            a = adata.X[row:lim, :]
            # Densify scipy-sparse chunks before writing to the dense array.
            if type(a) is not np.ndarray:
                a = a.toarray()
            X[row:lim, :] = a
            log(2, "\t...rows", row, "to", lim)
    # Fixed: tiledb.consolidate was called twice back-to-back; once suffices.
    tiledb.consolidate(X_name, ctx=ctx)
def _run_consolidate(self, domain_names, data_array_name, verbose=False):
    """Consolidate each appended domain array so the resultant arrays are
    more performant, printing progress as it goes.
    """
    steps_key = "sm.consolidation.steps"
    steps_value = 100
    if self.ctx is None:
        ctx = tiledb.Ctx(tiledb.Config({steps_key: steps_value}))
    else:
        # Overlay the step count onto a copy of the existing context's config.
        merged = self.ctx.config().dict()
        merged[steps_key] = steps_value
        ctx = tiledb.Ctx(config=tiledb.Config(merged))

    for i, domain_name in enumerate(domain_names):
        if verbose:
            print()  # Clear last carriage-returned print statement.
            print(f'Consolidating array: {i+1}/{len(domain_names)}', end="\r")
        else:
            print('Consolidating...')
        array_path = self.array_path.construct_path(domain_name, data_array_name)
        tiledb.consolidate(array_path, ctx=ctx)
def main():
    """CLI entry point: re-encode a CXG directory's X matrix sparsely.

    Copies the input CXG directory (minus X / X_col_shift) to the output
    directory, reads the dense X array fully into memory, and rewrites it
    with a sparse (optionally column-shift) encoding — aborting and removing
    the output directory if X is not sparse enough.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="input cxg directory")
    parser.add_argument("output", help="output cxg directory")
    parser.add_argument("--overwrite", action="store_true", help="replace output cxg directory")
    parser.add_argument("--verbose", "-v", action="count", default=0, help="verbose output")
    parser.add_argument(
        "--sparse-threshold",
        "-s",
        type=float,
        default=5.0,  # default is 5% non-zero values
        help="The X array will be sparse if the percent of non-zeros falls below this value",
    )
    args = parser.parse_args()

    # Refuse to clobber an existing output directory unless --overwrite was given.
    if os.path.exists(args.output):
        print("output dir exists:", args.output)
        if args.overwrite:
            print("output dir removed:", args.output)
            shutil.rmtree(args.output)
        else:
            print("use the overwrite option to remove the output directory")
            sys.exit(1)
    if not os.path.isdir(args.input):
        print("input is not a directory", args.input)
        sys.exit(1)

    # Copy everything except the matrices we are about to rewrite.
    shutil.copytree(args.input, args.output, ignore=shutil.ignore_patterns("X", "X_col_shift"))

    ctx = tiledb.Ctx({
        "sm.num_reader_threads": 32,
        "sm.num_writer_threads": 32,
        "sm.consolidation.buffer_size": 1 * 1024 * 1024 * 1024,
    })

    # Materialize the entire X matrix in memory.
    # NOTE(review): assumes X fits in RAM — confirm for very large datasets.
    with tiledb.DenseArray(os.path.join(args.input, "X"), mode="r", ctx=ctx) as X_in:
        x_matrix_data = X_in[:, :]

    matrix_container = args.output

    is_sparse = is_matrix_sparse(x_matrix_data, args.sparse_threshold)
    if not is_sparse:
        # Dense by raw non-zero count — column-shift encoding may still make it sparse.
        col_shift = get_column_shift_encode_for_matrix(
            x_matrix_data, args.sparse_threshold)
        is_sparse = col_shift is not None
    else:
        col_shift = None

    if col_shift is not None:
        x_col_shift_name = f"{args.output}/X_col_shift"
        convert_ndarray_to_cxg_dense_array(x_col_shift_name, col_shift, ctx)
        # NOTE(review): the container is consolidated here and again after the
        # X write below — the first call looks redundant; confirm intent.
        tiledb.consolidate(matrix_container, ctx=ctx)

    if is_sparse:
        convert_matrix_to_cxg_array(matrix_container, x_matrix_data, is_sparse, ctx, col_shift)
        tiledb.consolidate(matrix_container, ctx=ctx)

    # Not sparse even with column-shift: this tool only produces sparse output.
    if not is_sparse:
        print("The array is not sparse, cleaning up, abort.")
        shutil.rmtree(args.output)
        sys.exit(1)
def save_X(container, xdata, ctx, sparse_threshold, expect_sparse=False):
    """Write the X count matrix *xdata* to ``{container}/X`` as either a
    sparse or dense TileDB array, choosing the encoding from
    *sparse_threshold* (a percentage of non-zero values).

    A threshold of 100 forces sparse, 0 forces dense; anything in between is
    decided by sampling/evaluating the matrix, with column-shift encoding
    tried as a fallback when the raw matrix is too dense. When *expect_sparse*
    is True and a sparse encoding is not achievable, nothing is written and
    False is returned. Returns whether a sparse encoding was used.
    """
    # Save X count matrix
    X_name = f"{container}/X"
    shape = xdata.shape
    log(1, "\t...shape:", str(shape))
    col_shift = None
    if sparse_threshold == 100:
        is_sparse = True
    elif sparse_threshold == 0:
        is_sparse = False
    else:
        is_sparse, nnz, nelem = evaluate_for_sparse_encoding(xdata, sparse_threshold)
        percent = 100.0 * nnz / nelem
        # nelem != full size means the evaluation sampled rather than scanned
        # the whole matrix, so percent is only an estimate.
        if nelem != shape[0] * shape[1]:
            log(1, "\t...sparse=", is_sparse, "non-zeros percent (estimate): %6.2f" % percent)
        else:
            log(1, "\t...sparse=", is_sparse, "non-zeros:", nnz, "percent: %6.2f" % percent)
            # NOTE(review): overrides the evaluator's verdict with the exact
            # percentage when the whole matrix was counted — confirm this
            # re-derivation is intentional.
            is_sparse = percent < sparse_threshold
        if not is_sparse:
            # Too dense as-is: try subtracting a per-column shift vector,
            # which can make near-constant columns sparse.
            col_shift, nnz, nelem = evaluate_for_sparse_column_shift_encoding(xdata, sparse_threshold)
            is_sparse = col_shift is not None
            percent = 100.0 * nnz / nelem
            if nelem != shape[0] * shape[1]:
                log(1, "\t...sparse=", is_sparse, "col shift non-zeros percent (estimate): %6.2f" % percent)
            else:
                log(1, "\t...sparse=", is_sparse, "col shift non-zeros:", nnz, "percent: %6.2f" % percent)
    # Caller demanded sparse output but it isn't achievable — write nothing.
    if expect_sparse is True and is_sparse is False:
        return False
    create_X(X_name, shape, is_sparse)
    # Row stride targets ~1e9 elements per chunk (rounded to a power of ten),
    # capped at 10_000 rows.
    stride = min(int(np.power(10, np.around(np.log10(1e9 / shape[1])))), 10_000)
    if is_sparse:
        if col_shift is not None:
            log(1, "\t...output X as sparse matrix with column shift encoding")
            # Persist the shift vector so readers can reconstruct raw values.
            X_col_shift_name = f"{container}/X_col_shift"
            filters = tiledb.FilterList([tiledb.ZstdFilter()])
            attrs = [tiledb.Attr(dtype=np.float32, filters=filters)]
            domain = tiledb.Domain(tiledb.Dim(domain=(0, shape[1] - 1), tile=min(shape[1], 5000), dtype=np.uint32))
            schema = tiledb.ArraySchema(domain=domain, attrs=attrs)
            tiledb.DenseArray.create(X_col_shift_name, schema)
            with tiledb.DenseArray(X_col_shift_name, mode="w", ctx=ctx) as X_col_shift:
                X_col_shift[:] = col_shift
            tiledb.consolidate(X_col_shift_name, ctx=ctx)
        else:
            log(1, "\t...output X as sparse matrix")
        with tiledb.SparseArray(X_name, mode="w", ctx=ctx) as X:
            nnz = 0
            for row in range(0, shape[0], stride):
                lim = min(row + stride, shape[0])
                a = xdata[row:lim, :]
                # Densify scipy-sparse chunks before the nonzero scan.
                if type(a) is not np.ndarray:
                    a = a.toarray()
                if col_shift is not None:
                    a = a - col_shift
                # Write only the non-zero cells of this stride, offset back
                # into global row coordinates.
                indices = np.nonzero(a)
                trow = indices[0] + row
                nnz += indices[0].shape[0]
                X[trow, indices[1]] = a[indices[0], indices[1]]
                log(2, "\t...rows", lim, "of", shape[0], "nnz", nnz, "sparse", nnz / (lim * shape[1]))
    else:
        log(1, "\t...output X as dense matrix")
        with tiledb.DenseArray(X_name, mode="w", ctx=ctx) as X:
            for row in range(0, shape[0], stride):
                lim = min(row + stride, shape[0])
                a = xdata[row:lim, :]
                if type(a) is not np.ndarray:
                    a = a.toarray()
                X[row:lim, :] = a
                log(2, "\t...rows", row, "to", lim)
    tiledb.consolidate(X_name, ctx=ctx)
    # vacuum only exists in newer TileDB-Py releases.
    if hasattr(tiledb, "vacuum"):
        tiledb.vacuum(X_name)
    return is_sparse