def create_test_array_sparse_25x12_mult(temp_rootdir):
    """
    Create a 25x12 sparse test array that is written at two timestamps.

    The array is created at ``<temp_rootdir>/sparse_25x12_mult`` with two
    float64 attributes, "a" and "b". Every cell of the grid is written at
    timestamp 1 and then written again at timestamp 2 with different values.
    """
    array_path = os.path.abspath(os.path.join(temp_rootdir, "sparse_25x12_mult"))
    ctx = tiledb.default_ctx()

    row_dim = tiledb.Dim("row", ctx=ctx, domain=(1, 25), dtype=np.int64)
    col_dim = tiledb.Dim("col", ctx=ctx, domain=(1, 12), dtype=np.int64)
    attributes = tuple(
        tiledb.Attr(name=attr_name, ctx=ctx, dtype=np.float64)
        for attr_name in ("a", "b")
    )
    schema = tiledb.ArraySchema(
        ctx=ctx,
        sparse=True,
        domain=tiledb.Domain(row_dim, col_dim, ctx=ctx),
        attrs=attributes,
    )
    tiledb.SparseArray.create(array_path, schema)

    # Coordinates of every cell in the grid, in row-major order.
    grid = np.array(list(itertools.product(np.arange(1, 26), np.arange(1, 13))))
    rows, cols = grid[:, 0], grid[:, 1]
    base = np.arange(300)

    # One full write per timestamp; the second changes both attributes.
    versions = (
        (1, {"a": base, "b": base}),
        (2, {"a": base / 2, "b": base * 2}),
    )
    for stamp, payload in versions:
        with tiledb.SparseArray(array_path, mode="w", timestamp=stamp) as A:
            A[rows, cols] = payload
def test_tiledb_test():
    """
    Check that a windowed read returns the same number of cells before and
    after a full-array read on the same handle.

    A 1000x1000 sparse uint8 array is filled with 1000 random cells inside a
    temporary directory, then read back through a second context.
    """
    import tiledb

    n = 1000
    m = 1000
    num_vals = 1000
    n_idxs = np.sort(np.random.choice(n, num_vals, replace=False))
    m_idxs = np.sort(np.random.choice(m, num_vals, replace=False))
    values = np.random.randint(0, 100, num_vals, np.uint8)

    ctx = tiledb.Ctx()
    n_tile_extent = min(100, n)
    d1 = tiledb.Dim("ndom", domain=(0, n - 1), tile=n_tile_extent, dtype="uint32", ctx=ctx)
    d2 = tiledb.Dim("mdom", domain=(0, m - 1), tile=m, dtype="uint32", ctx=ctx)
    domain = tiledb.Domain(d1, d2, ctx=ctx)
    v = tiledb.Attr(
        "v",
        filters=tiledb.FilterList([tiledb.LZ4Filter(level=-1)]),
        dtype="uint8",
        ctx=ctx,
    )
    schema = tiledb.ArraySchema(
        domain=domain,
        attrs=(v,),
        capacity=10000,
        cell_order="row-major",
        tile_order="row-major",
        sparse=True,
        ctx=ctx,
    )

    with tempfile.TemporaryDirectory() as tdir:
        path = os.path.join(tdir, "arr.tiledb")
        tiledb.SparseArray.create(path, schema)
        with tiledb.SparseArray(path, mode="w", ctx=ctx) as A:
            A[n_idxs, m_idxs] = values

        ctx2 = tiledb.Ctx()
        # Open the reader in a ``with`` so the handle is closed before the
        # temporary directory is removed.
        with tiledb.SparseArray(path, mode="r", ctx=ctx2) as s:
            vs1 = s[1:10, 1:50]
            _ = s[:, :]  # full read between the two windowed reads
            vs2 = s[1:10, 1:50]

        assert vs1["v"].shape[0] == vs2["v"].shape[0]
def test_ingest_csv_sparse_array_apppend_header_mismatch(
    udf_uri, array_name, key, secret, namespace, bucket, config
):
    """
    Ingest a CSV into a sparse array, then append a CSV with other headers.

    The first UDF call runs in "ingest" mode. The second runs in "append"
    mode on a file whose header names do not match the array, so replacement
    column names are supplied through ``names``.
    """
    target_uri = "tiledb://{}/s3://{}/{}.tdb".format(namespace, bucket, array_name)
    registered_uri = "tiledb://{}/{}.tdb".format(namespace, array_name)

    def read_frame():
        # Load the whole registered array into a DataFrame.
        with tiledb.SparseArray(registered_uri, "r", ctx=tiledb.Ctx(config)) as A:
            return pd.DataFrame(A[:])

    tiledb.cloud.udf.exec(
        "s3://{}/inputs/{}.csv".format(bucket, "increment_sparse1"),
        target_uri,
        key,
        secret,
        mode="ingest",
        full_domain=True,
        index_col="x",
        name=udf_uri,  # "unittest/test_ingest_csv" --> TileDB-Inc/ingest_csv
    )
    time.sleep(10)

    data = read_frame()
    number_of_rows = data.shape[0]
    assert number_of_rows == 20

    tiledb.cloud.udf.exec(
        "s3://{}/inputs/{}.csv".format(bucket, "increment_sparse2_mismatch"),
        target_uri,
        key,
        secret,
        mode="append",
        full_domain=True,
        index_col="x",
        header=0,
        names=["x", "c", "b", "a"],
        name=udf_uri,  # "unittest/test_ingest_csv" --> TileDB-Inc/ingest_csv
    )
    time.sleep(10)

    data = read_frame()
    row_base = np.arange(1, 21) * 10
    for col, attribute in enumerate(("a", "b", "c"), 1):
        # Each attribute is expected to hold the 20-value pattern twice.
        assert_array_equal(data[attribute], np.tile(row_base + col, 2))
def main():
    """Create the sparse array "my_sparse_array" with two dims and three attributes."""
    ctx = tiledb.Ctx()

    # Both dimensions share the same domain, tile extent and dtype.
    dims = [
        tiledb.Dim(ctx, dim_name, domain=(1, 4), tile=2, dtype="uint64")
        for dim_name in ("d1", "d2")
    ]
    domain = tiledb.Domain(ctx, *dims)

    # (name, compressor, dtype) for each attribute.
    attr_specs = (
        ("a1", "blosc-lz", "int32"),
        ("a2", "gzip", "S10"),
        ("a3", "zstd", "float32,float32"),
    )
    attrs = tuple(
        tiledb.Attr(ctx, attr_name, compressor=(codec, -1), dtype=attr_dtype)
        for attr_name, codec, attr_dtype in attr_specs
    )

    # Create the sparse array.
    tiledb.SparseArray(
        ctx,
        "my_sparse_array",
        domain=domain,
        attrs=attrs,
        capacity=2,
        cell_order="row-major",
        tile_order="row-major",
    )
def test_ingest_csv_sparse_array(
    udf_uri, array_name, key, secret, namespace, bucket, config
):
    """
    Ingest a CSV file into a sparse array through the ingest_csv UDF and
    check attributes "a", "b" and "c" against the expected pattern.
    """
    source_csv = "s3://{}/inputs/{}.csv".format(bucket, "increment")
    target_uri = "tiledb://{}/s3://{}/{}.tdb".format(namespace, bucket, array_name)
    tiledb.cloud.udf.exec(
        source_csv,
        target_uri,
        key,
        secret,
        name=udf_uri,  # unittest/test_ingest_csv --> TileDB-Inc/ingest_csv
    )
    time.sleep(10)

    registered_uri = "tiledb://{}/{}.tdb".format(namespace, array_name)
    with tiledb.SparseArray(registered_uri, "r", ctx=tiledb.Ctx(config)) as A:
        data = pd.DataFrame(A[:])

    row_base = np.arange(1, 21) * 10
    for col, attribute in enumerate(("a", "b", "c"), 1):
        assert_array_equal(data[attribute], row_base + col)
def test_store_tiledb_execution(setup):
    """
    Store tensors into TileDB via ``totiledb`` and read them back.

    Four cases run in sequence, each in its own temporary directory: a chunked
    dense tensor, a single-chunk dense tensor, a 2-d sparse tensor, and a
    Fortran-ordered dense tensor (which must yield a col-major schema).
    """
    ctx = tiledb.Ctx()

    # Case 1: dense tensor split into several chunks.
    workdir = tempfile.mkdtemp()
    try:
        dense_src = np.random.rand(8, 4, 3)
        totiledb(workdir, tensor(dense_src, chunk_size=(3, 3, 2)), ctx=ctx).execute()

        with tiledb.DenseArray(uri=workdir, ctx=ctx) as arr:
            np.testing.assert_allclose(dense_src, arr.read_direct())
    finally:
        shutil.rmtree(workdir)

    # Case 2: tensor that consists of a single chunk.
    workdir = tempfile.mkdtemp()
    try:
        totiledb(workdir, arange(12), ctx=ctx).execute()

        with tiledb.DenseArray(uri=workdir, ctx=ctx) as arr:
            np.testing.assert_allclose(np.arange(12), arr.read_direct())
    finally:
        shutil.rmtree(workdir)

    # Case 3: 2-d sparse tensor.
    workdir = tempfile.mkdtemp()
    try:
        sparse_src = sps.random(8, 7, density=0.1)
        totiledb(workdir, tensor(sparse_src, chunk_size=(3, 5)), ctx=ctx).execute()

        with tiledb.SparseArray(uri=workdir, ctx=ctx) as arr:
            cells = arr[:, :]
            coords = cells['coords']
            stored = cells[arr.attr(0).name]
            ij = tuple(coords[arr.domain.dim(k).name] for k in range(arr.ndim))
            rebuilt = sps.coo_matrix((stored, ij), shape=arr.shape)

            np.testing.assert_allclose(sparse_src.toarray(), rebuilt.toarray())
    finally:
        shutil.rmtree(workdir)

    # Case 4: Fortran-ordered dense tensor.
    workdir = tempfile.mkdtemp()
    try:
        fortran_src = np.asfortranarray(np.random.rand(8, 4, 3))
        totiledb(workdir, tensor(fortran_src, chunk_size=(3, 3, 2)), ctx=ctx).execute()

        with tiledb.DenseArray(uri=workdir, ctx=ctx) as arr:
            np.testing.assert_allclose(fortran_src, arr.read_direct())
            assert arr.schema.cell_order == 'col-major'
    finally:
        shutil.rmtree(workdir)
def test_sparse_schema(self):
    """Create a sparse array and check that its schema properties round-trip."""
    ctx = tiledb.Ctx()

    # Dimensions (the first one is anonymous) and their domain.
    dims = (
        tiledb.Dim(ctx, "", domain=(1, 1000), tile=10, dtype="uint64"),
        tiledb.Dim(ctx, "d2", domain=(101, 10000), tile=100, dtype="uint64"),
    )
    domain = tiledb.Domain(ctx, *dims)

    # Attributes (the first one is anonymous).
    attrs = (
        tiledb.Attr(ctx, "", dtype="int32,int32,int32"),
        tiledb.Attr(ctx, "a2", compressor=("gzip", -1), dtype="float32"),
    )

    # Create the sparse array with an explicit schema configuration.
    schema = tiledb.SparseArray(
        ctx,
        self.path("sparse_array_schema"),
        domain=domain,
        attrs=attrs,
        capacity=10,
        cell_order="col-major",
        tile_order="row-major",
        coords_compressor=("zstd", 4),
        offsets_compressor=("blosc-lz", 5),
    )

    expectations = (
        ("capacity", 10),
        ("cell_order", "col-major"),
        ("tile_order", "row-major"),
        ("coords_compressor", ("zstd", 4)),
        ("offsets_compressor", ("blosc-lz", 5)),
    )
    for prop_name, wanted in expectations:
        self.assertEqual(getattr(schema, prop_name), wanted)
def test_ingest_csv_sparse_array_null_replace(
    udf_uri, array_name, key, secret, namespace, bucket, config
):
    """
    Ingest a CSV file that contains NaNs with ``fillna=123`` and check that
    the resulting sparse array holds 123 where the NaNs were.
    """
    source_csv = "s3://{}/inputs/{}.csv".format(bucket, "increment_nulls")
    target_uri = "tiledb://{}/s3://{}/{}.tdb".format(namespace, bucket, array_name)
    tiledb.cloud.udf.exec(
        source_csv,
        target_uri,
        key,
        secret,
        fillna=123,
        name=udf_uri,  # "unittest/test_ingest_csv" --> TileDB-Inc/ingest_csv
    )
    time.sleep(10)

    registered_uri = "tiledb://{}/{}.tdb".format(namespace, array_name)
    with tiledb.SparseArray(registered_uri, "r", ctx=tiledb.Ctx(config)) as A:
        data = pd.DataFrame(A[:])

    expected_columns = (
        ("a", [1, 1, 1]),
        ("b", [2, 2, 123]),
        ("c", [3, 123, 123]),
    )
    for attribute, wanted in expected_columns:
        assert_array_equal(data[attribute], np.array(wanted))
def __init__(self, array, mode="r", subsample_rate=None):
    """Wrap a backing array selected by the type of ``array``.

    ``array`` may be ``None`` (no backing array), a ``DNAFragArray`` (its
    backing array and subsample rate are shared), a ``tiledb.SparseArray``
    or ``NumpyBackend`` (used directly), or anything else, which is treated
    as a filesystem path and opened as a sparse array.

    subsample_rate is the probability of *keeping* an entry.

    Raises ``ValueError("Array is empty")`` when the backing array reports
    no non-empty domain.
    """
    # NOTE(review): ``mode`` is accepted but never read in this method; the
    # path branch always opens with mode="r" -- confirm that is intended.
    if array is None:
        # empty array -- this is used by DNAFragMultiArray
        self._arr = None
    elif isinstance(array, DNAFragArray):
        # copy constructor -- this is used by DNAFragMultiArray
        # TODO: this is a "dangerous" shallow copy of a file handle.
        # We should probably open a new handle on the path, in case the
        # "other" array is destroyed and its file handle closes.
        self._arr = array._arr
        self.subsample_rate = array.subsample_rate
    elif isinstance(array, tiledb.SparseArray):
        self._arr = array
    elif isinstance(array, NumpyBackend):
        # used for testing
        self._arr = array
    else:
        assert os.path.exists(array)
        # ``ctx`` is not defined in this method; it is resolved from an
        # enclosing scope.
        self._arr = tiledb.SparseArray(array, ctx=ctx, mode="r")

    if self._arr is not None:
        # The unpacking fails unless the shape has exactly two entries;
        # n and m themselves are not used afterwards.
        (n, m) = self._arr.shape
        nonempty = self._arr.nonempty_domain()
        if nonempty is None:
            raise ValueError("Array is empty")
        # NB: this is not necessary with tiledb >= v0.2.1
        # NB: this seems to be necessary to prevent the array appearing empty
        # _ = self._arr.query(attrs=[COUNTS_RANGE_NAME], coords=True)[:, :]

    # NOTE(review): outside the copy-constructor branch this method only
    # assigns ``self.subsample_rate`` when a rate is passed -- verify that a
    # default exists elsewhere (e.g. on the class).
    if subsample_rate is not None:
        assert subsample_rate >= 0. and subsample_rate <= 1.
        self.subsample_rate = subsample_rate
def execute(cls, ctx, op):
    """Write one input chunk into the TileDB array described by ``op``.

    :param ctx: mapping from chunk key to chunk data; the input is read from
        ``ctx[op.input.key]`` and an empty placeholder is stored under the
        output chunk's key.
    :param op: operand carrying the TileDB settings (``tiledb_config``,
        ``tiledb_uri``, ``tiledb_key``, ``tiledb_timestamp``) and the
        per-axis offsets of this chunk inside the target array.
    """
    tiledb_ctx = get_tiledb_ctx(op.tiledb_config)
    uri = op.tiledb_uri
    key = op.tiledb_key
    timestamp = op.tiledb_timestamp
    axis_offsets = op.axis_offsets

    chunk = op.outputs[0]
    if not chunk.issparse():
        # dense
        to_store = np.ascontiguousarray(ctx[op.input.key])
        # Region of the target array covered by this chunk.
        slcs = []
        for axis in range(chunk.ndim):
            axis_offset = int(axis_offsets[axis])
            axis_length = int(op.input.shape[axis])
            slcs.append(slice(axis_offset, axis_offset + axis_length))
        with tiledb.DenseArray(uri=uri, ctx=tiledb_ctx, mode='w',
                               key=key, timestamp=timestamp) as arr:
            arr[tuple(slcs)] = to_store
        ctx[chunk.key] = np.empty((0,) * chunk.ndim, dtype=chunk.dtype)
    else:
        # sparse
        to_store = ctx[op.input.key].spmatrix.tocoo()
        if to_store.nnz > 0:
            with tiledb.SparseArray(uri=uri, ctx=tiledb_ctx, mode='w',
                                    key=key, timestamp=timestamp) as arr:
                if chunk.ndim == 1:
                    vec = to_store.col if to_store.shape[0] == 1 else to_store.row
                    # Build a new index array instead of ``vec += ...``: the
                    # in-place form rewrites the COO matrix's own index
                    # buffer, which may be the input held in ``ctx`` when
                    # ``tocoo()`` returned the matrix itself.
                    vec = vec + axis_offsets[0]
                    arr[vec] = to_store.data
                else:
                    i, j = to_store.row + axis_offsets[0], to_store.col + axis_offsets[1]
                    arr[i, j] = to_store.data
        ctx[chunk.key] = SparseNDArray(sps.csr_matrix((0, 0), dtype=chunk.dtype),
                                       shape=chunk.shape)
def read_array_cloud_local(rest_address, array_uri, token):
    """
    Log in to TileDB Cloud, read all cells whose second coordinate lies in the
    2020-02-17 time window, and print them ordered by their third coordinate.
    """
    tiledb.cloud.login(host=rest_address, token=token)

    from_time = int(get_timestamp_from_text("2020-02-17 00:00:00"))
    print(from_time)
    to_time = int(get_timestamp_from_text("2020-02-17 23:59:59"))
    print(to_time)

    with tiledb.SparseArray(array_uri, ctx=tiledb.cloud.Ctx()) as A:
        # Slicing only on the time dimension returns all articles. Earlier
        # experiments noted in this function:
        #   A[1:5000, from_time:to_time, 400000000:500000000] gave 217
        #   results instead of 268 for 2020-02-17;
        #   A[:, :, 443392586:443557994+1] needs the +1 because the upper
        #   bound of the slice is not inclusive.
        res = A[:, from_time:to_time, :]

        titles = res["title"]
        # Keyed by the third coordinate; a later duplicate overwrites an
        # earlier one.
        article_dict = {
            coord[2]: (get_datetime_from_timestamp(coord[1]), text, title)
            for coord, text, title in zip(res["coords"], res["medium_text"], titles)
        }

        for article_key in sorted(article_dict):
            print("%s: %s" % (article_key, article_dict[article_key]))

        print(len(titles))
def write_array_sparse():
    """Write values 1, 2, 3 to cells (1, 1), (2, 4) and (2, 3) of the sparse array."""
    rows = [1, 2, 2]
    cols = [1, 4, 3]
    cell_values = np.array([1, 2, 3])

    # Open the array for writing; it is closed when the block exits.
    with tiledb.SparseArray(array_name_sp, mode='w') as A:
        A[rows, cols] = cell_values
def read_array():
    """Read the slice ``[1:11]`` of the sparse array and print every returned cell."""
    ctx = tiledb.Ctx()
    with tiledb.SparseArray(ctx, array_name, mode='r') as A:
        result = A[1:11]
        # Coordinates and attribute values are returned as parallel arrays.
        for coord, a_val in zip(result["coords"], result["a"]):
            print("Cell (%d, %d) has data %d" % (coord[0], coord[1], a_val))
def execute(cls, ctx, op):
    """Read the region of a TileDB array that corresponds to one output chunk.

    The result is stored in ``ctx`` under the output chunk's key: the raw
    slice for dense arrays, or a ``SparseNDArray`` built from the returned
    coordinates and the first attribute's values for sparse arrays.

    Raises ``NotImplementedError`` for sparse arrays with more than two
    dimensions.
    """
    import tiledb
    from ..array_utils import array_module
    from ..utils import get_tiledb_ctx

    chunk = op.outputs[0]
    xp = array_module(op.gpu)

    # Chunk offsets shifted by where each TileDB dimension's domain starts.
    axis_offsets = [
        offset + dim_start
        for offset, dim_start in zip(op.axis_offsets, op.tiledb_dim_starts)
    ]
    region = tuple(
        slice(axis_offsets[axis], axis_offsets[axis] + chunk.shape[axis])
        for axis in range(chunk.ndim)
    )

    open_kwargs = dict(
        uri=op.tiledb_uri,
        ctx=get_tiledb_ctx(op.tiledb_config),
        key=op.tiledb_key,
        timestamp=op.tiledb_timestamp,
    )

    if not op.sparse:
        # read dense array from tiledb
        with tiledb.DenseArray(**open_kwargs) as tiledb_arr:
            ctx[chunk.key] = tiledb_arr[region]
        return

    # read sparse array from tiledb
    with tiledb.SparseArray(**open_kwargs) as tiledb_arr:
        if tiledb_arr.ndim > 2:
            raise NotImplementedError(
                'Does not support to read array with more than 2 dimensions')

        cells = tiledb_arr[region]
        coords = cells['coords']
        value = cells[tiledb_arr.attr(0).name]

        if tiledb_arr.ndim == 2:
            # 2-d: shift coordinates back so they are relative to the chunk
            ij = tuple(
                coords[tiledb_arr.domain.dim(k).name] - axis_offsets[k]
                for k in range(tiledb_arr.ndim)
            )
            ctx[chunk.key] = SparseNDArray(
                sps.coo_matrix((value, ij), shape=chunk.shape))
        else:
            # 1-d: stored as a single-row matrix
            row_index = xp.zeros(coords.shape)
            col_index = coords[tiledb_arr.domain.dim(0).name] - axis_offsets[0]
            spmatrix = sps.coo_matrix(
                (value, (row_index, col_index)), shape=(1,) + chunk.shape)
            ctx[chunk.key] = SparseNDArray(spmatrix, shape=chunk.shape)
def read_array_sparse():
    """Read rows 1-2 and columns 2-4 of the sparse array and print each cell."""
    with tiledb.SparseArray(array_name_sp, mode='r') as A:
        # Slice upper bounds are exclusive: rows 1, 2 and cols 2, 3, 4.
        result = A[1:3, 2:5]
        for coord, a_val in zip(result["coords"], result["a"]):
            print("Cell (%d, %d) has data %d" % (coord[0], coord[1], a_val))
def read_array_s3(rest_adress, array_uri, token):
    """Open ``array_uri`` through a REST-configured context and print its "title" attribute."""
    settings = (
        ("rest.token", token),
        ("rest.server_address", rest_adress),
        ("vfs.s3.region", "eu-central-1"),
    )
    config = tiledb.Config()
    for option, option_value in settings:
        config[option] = option_value

    rest_ctx = tiledb.Ctx(config)
    with tiledb.SparseArray(array_uri, ctx=rest_ctx) as A:
        titles = A[:]["title"]
        print(titles)
def write_array():
    """Write two batches of cells to the sparse array through one open handle."""
    ctx = tiledb.Ctx()

    # (rows, cols, values) per write; cell (2, 4) appears in both batches.
    batches = (
        ([1, 2, 2], [1, 4, 3], [1, 2, 3]),
        ([4, 2], [1, 4], [4, 20]),
    )
    with tiledb.SparseArray(ctx, array_name, mode='w') as A:
        for rows, cols, cell_values in batches:
            A[rows, cols] = np.array(cell_values)
def exec_and_fetch(
    query,
    output_uri,
    output_schema=None,
    namespace=None,
    task_name=None,
    output_array_name=None,
    init_commands=None,
    parameters=None,
):
    """
    Run a sql query and open the array the results were written to

    :param str query: query to run
    :param str output_uri: array to store results to, must be either a tiledb:// for an already registered array or a s3:// if passing a new schema to create new output array
    :param tiledb.ArraySchema output_schema: array schema to create output array with
    :param str namespace: optional namespace to charge the query to
    :param str task_name: optional name to assign the task for logging and audit purposes
    :param str output_array_name: optional name for registering new output array if output_schema schema is passed
    :param list init_commands: optional list of sql queries or commands to run before main query
    :param list parameters: optional list of sql parameters for use in query

    :return: TileDB Array with results
    """
    # Default to the user's own namespace, fetching and caching the client
    # profile first when it has not been loaded yet.
    if namespace is None:
        if config.user is None:
            config.user = client.user_profile()
        namespace = config.user.username

    try:
        # Execute the sql query (``exec`` here is this module's function).
        exec(
            query=query,
            output_uri=output_uri,
            output_schema=output_schema,
            namespace=namespace,
            task_name=task_name,
            output_array_name=output_array_name,
            init_commands=init_commands,
            parameters=parameters,
        )

        # The output schema decides which array class is used to open it.
        result_schema = tiledb.ArraySchema.load(output_uri, ctx=client.Ctx())
        array_cls = tiledb.SparseArray if result_schema.sparse else tiledb.DenseArray
        return array_cls(output_uri, ctx=client.Ctx())
    except GenApiException as exc:
        raise tiledb_cloud_error.check_exc(exc) from None
def read_array(order):
    """Print the non-empty domain, then every cell of ``[1:100]`` read in ``order``."""
    ctx = tiledb.Ctx()
    with tiledb.SparseArray(ctx, array_name, mode='r') as A:
        print("Non-empty domain: {}".format(A.nonempty_domain()))

        # Query attribute "a" together with coordinates in the given layout.
        result = A.query(attrs=["a"], order=order, coords=True)[1:100]
        for coord, a_val in zip(result["coords"], result["a"]):
            print("Cell {} has data {}".format(str(coord), str(a_val)))
def main():
    """Create the sparse array "sparse_array_schema" and print its schema details."""
    ctx = tiledb.Ctx()

    # Dimensions (the first one is anonymous) and their domain.
    dims = (
        tiledb.Dim(ctx, "", domain=(1, 1000), tile=10, dtype="uint64"),
        tiledb.Dim(ctx, "d2", domain=(101, 10000), tile=100, dtype="uint64"),
    )
    domain = tiledb.Domain(ctx, *dims)

    # Attributes (the first one is anonymous).
    attrs = (
        tiledb.Attr(ctx, "", dtype="int32,int32,int32"),
        tiledb.Attr(ctx, "a2", compressor=("gzip", -1), dtype="float32"),
    )

    # Create the sparse array with an explicit schema configuration.
    schema = tiledb.SparseArray(
        ctx,
        "sparse_array_schema",
        domain=domain,
        attrs=attrs,
        capacity=10,
        tile_order="row-major",
        cell_order="col-major",
        coords_compressor=("zstd", 4),
        offsets_compressor=("blosc-lz", 5),
    )
    schema.dump()

    # Print from schema
    array_type = "sparse" if schema.sparse else "dense"
    print("From schema properties:")
    print("- Array type: ", array_type)
    print("- Cell order: ", schema.cell_order)
    print("- Tile order: ", schema.tile_order)
    print("- Capacity: ", schema.capacity)
    print("- Coordinates compressor: ", schema.coords_compressor)
    print("- Offsets compressor: ", schema.offsets_compressor)
    print()

    # Print the attribute names
    print("Array schema attribute names: ")
    for attr_idx in range(schema.nattr):
        print("* {!r}".format(schema.attr(attr_idx).name))
    print()

    # Print domain
    domain = schema.domain
    domain.dump()

    # Print the dimension names
    print("Array schema dimension names: ")
    for dim_idx in range(schema.ndim):
        print("* {!r}".format(domain.dim(dim_idx).name))
    print()
def fromtiledb(uri, ctx=None, key=None, timestamp=None, gpu=False):
    """Create a tensor backed by the TileDB array at ``uri``.

    The array is opened only to read its metadata (density, domain starts,
    attribute dtype, cell order, tile extents, shape) and is closed again
    before returning.

    :param uri: location of the TileDB array
    :param ctx: optional TileDB context; when omitted a fresh ``tiledb.Ctx()``
        is used for the probe and no config is recorded on the operand
    :param key: passed through to TileDB when opening the array
    :param timestamp: passed through to TileDB when opening the array and
        recorded on the operand
    :param gpu: recorded on the operand
    :return: the tensor produced by calling the data-source operand
    :raises NotImplementedError: if the array has more than one attribute
    :raises ValueError: if any dimension has a float domain
    """
    import tiledb

    raw_ctx = ctx
    if raw_ctx is None:
        ctx = tiledb.Ctx()

    # get metadata from tiledb
    try:
        tiledb_arr = tiledb.DenseArray(uri=uri, ctx=ctx, key=key, timestamp=timestamp)
        sparse = False
    except ValueError:
        # if the array is not dense, ValueError will be raised by tiledb
        tiledb_arr = tiledb.SparseArray(uri=uri, ctx=ctx, key=key, timestamp=timestamp)
        sparse = True

    try:
        if tiledb_arr.nattr > 1:
            raise NotImplementedError("Does not supported TileDB array schema "
                                      "with more than 1 attr")
        tiledb_dim_starts = tuple(
            tiledb_arr.domain.dim(j).domain[0].item()
            for j in range(tiledb_arr.ndim))
        if any(isinstance(s, float) for s in tiledb_dim_starts):
            raise ValueError("Does not support TileDB array schema "
                             "whose dimensions has float domain")

        dtype = tiledb_arr.attr(0).dtype
        # Only a caller-supplied context has its config recorded.
        tiledb_config = None if raw_ctx is None else ctx.config().dict()
        tensor_order = (TensorOrder.C_ORDER
                        if tiledb_arr.schema.cell_order == "row-major"
                        else TensorOrder.F_ORDER)
        # The keyword must be ``tiledb_timestamp``: that is the attribute the
        # operand's execute step reads. The previous spelling
        # (``tiledb_timstamp``) did not match it.
        op = TensorTileDBDataSource(
            tiledb_config=tiledb_config,
            tiledb_uri=uri,
            tiledb_key=key,
            tiledb_timestamp=timestamp,
            tiledb_dim_starts=tiledb_dim_starts,
            gpu=gpu,
            sparse=sparse,
            dtype=dtype,
        )
        chunk_size = tuple(
            int(tiledb_arr.domain.dim(i).tile)
            for i in range(tiledb_arr.domain.ndim))
        return op(tiledb_arr.shape, chunk_size=chunk_size, order=tensor_order)
    finally:
        # The handle was only needed for metadata; do not leak it.
        tiledb_arr.close()
def read_array_cloud(rest_address, array_uri, token):
    """
    Log in to TileDB Cloud and apply ``mean`` to the "importance" attribute
    over a fixed region restricted to the 2020-02-04 time window.
    """
    tiledb.cloud.login(host=rest_address, token=token)

    from_time = int(get_timestamp_from_text("2020-02-04 00:00:00"))
    print(from_time)
    to_time = int(get_timestamp_from_text("2020-02-04 23:59:59"))
    print(to_time)

    # One (start, end) pair per dimension; the middle one is the time window.
    region = [(1, 5000), (from_time, to_time), (400000000, 500000000)]
    with tiledb.SparseArray(array_uri, ctx=tiledb.cloud.Ctx()) as A:
        res = A.apply(mean, region, attrs=["importance"])
        print(res)
def create_test_array_sparse_25x12(temp_rootdir):
    """
    Create a 25x12 sparse test array with one anonymous int64 attribute.

    The array is created at ``<temp_rootdir>/sparse_25x12`` and every cell of
    the grid is written with the values 0..299.
    """
    array_path = os.path.abspath(os.path.join(temp_rootdir, "sparse_25x12"))
    ctx = tiledb.default_ctx()

    row_dim = tiledb.Dim("row", ctx=ctx, domain=(1, 25), dtype=np.int64)
    col_dim = tiledb.Dim("col", ctx=ctx, domain=(1, 12), dtype=np.int64)
    schema = tiledb.ArraySchema(
        ctx=ctx,
        sparse=True,
        domain=tiledb.Domain(row_dim, col_dim, ctx=ctx),
        attrs=(tiledb.Attr(ctx=ctx, dtype=np.int64),),
    )
    tiledb.SparseArray.create(array_path, schema)

    with tiledb.SparseArray(array_path, mode="w") as A:
        # Coordinates of every cell in the grid, in row-major order.
        grid = np.array(list(itertools.product(np.arange(1, 26), np.arange(1, 13))))
        A[grid[:, 0], grid[:, 1]] = np.arange(300)
def write_sparse_array(path, n, m, n_idxs, m_idxs, values, clip=True):
    """Create a sparse uint8 TileDB array at ``path`` and write ``values`` to it.

    :param path: destination of the new array; must not exist yet
    :param n: size of the first dimension (valid indexes are 0..n-1)
    :param m: size of the second dimension (valid indexes are 0..m-1)
    :param n_idxs: numpy array of first-dimension indexes
    :param m_idxs: numpy array of second-dimension indexes
    :param values: numpy array with one value per index pair
    :param clip: when true, values above ``VPLOT_MAX_VALUE`` are capped at it
    :raises FileExistsError: if ``path`` already exists
    :raises ValueError: if an index is out of range, or a value is outside
        ``[0, VPLOT_MAX_VALUE]`` after optional clipping
    """
    if os.path.exists(path):
        raise FileExistsError("{} already exists".format(path))

    if n_idxs.min() < 0 or n_idxs.max() >= n:
        raise ValueError("row indexes must be in range [0, n - 1]")

    if m_idxs.min() < 0 or m_idxs.max() >= m:
        # Wording fixed (was "must in in range") to match the row message.
        raise ValueError("column indexes must be in range [0, m - 1]")

    # Round-trip through a scipy sparse matrix (COO -> CSC -> COO) and use
    # the resulting row/col/data arrays for the write.
    sparse = coo_matrix((values, (n_idxs, m_idxs)), dtype=np.int32)
    sparse = sparse.tocsc(copy=False).tocoo(copy=False)
    n_idxs = sparse.row
    m_idxs = sparse.col
    values = sparse.data

    if clip:
        values = np.minimum(values, VPLOT_MAX_VALUE)

    if values.min() < 0 or values.max() > VPLOT_MAX_VALUE:
        raise ValueError(
            "vplot values must be in range [0, {}]".format(VPLOT_MAX_VALUE))

    # ``ctx`` is not created here; it is resolved from the enclosing module
    # scope.
    n_tile_extent = min(DEFAULT_GENOME_TILE_EXTENT, n)
    d1 = tiledb.Dim(
        GENOME_DOMAIN_NAME,
        domain=(0, n - 1),
        tile=n_tile_extent,
        dtype="uint32",
        ctx=ctx,
    )
    d2 = tiledb.Dim(INSERT_DOMAIN_NAME,
                    domain=(0, m - 1),
                    tile=m,
                    dtype="uint32",
                    ctx=ctx)
    domain = tiledb.Domain(d1, d2, ctx=ctx)
    v = tiledb.Attr(
        "v",
        filters=tiledb.FilterList([tiledb.LZ4Filter(level=-1)]),
        dtype="uint8",
        ctx=ctx,
    )
    schema = tiledb.ArraySchema(
        ctx=ctx,
        domain=domain,
        attrs=(v, ),
        capacity=1000,
        cell_order="row-major",
        tile_order="row-major",
        sparse=True,
    )
    tiledb.SparseArray.create(path, schema)

    with tiledb.SparseArray(path, mode="w", ctx=ctx) as A:
        # The attribute is uint8, so convert before writing.
        values = values.astype(np.uint8)
        A[n_idxs, m_idxs] = values
def _open_array(uri, tiledb_ctx):
    """Open ``uri`` for reading as a SparseArray or DenseArray, per its schema."""
    # Probe with the generic Array class to find out whether it is sparse.
    with tiledb.Array(uri, mode="r", ctx=tiledb_ctx) as probe:
        array_cls = tiledb.SparseArray if probe.schema.sparse else tiledb.DenseArray
        return array_cls(uri, mode="r", ctx=tiledb_ctx)
def write_array():
    """Write the values 1..10 to the diagonal cells (1, 1) .. (10, 10)."""
    ctx = tiledb.Ctx()
    diagonal = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    with tiledb.SparseArray(ctx, array_name, mode='w') as A:
        rows, cols = list(diagonal), list(diagonal)
        A[rows, cols] = np.array(diagonal)
def convert_matrix_to_cxg_array(matrix_name, matrix, encode_as_sparse_array, ctx, column_shift_for_sparse_encoding=None):
    """
    Write a matrix to a new TileDB SparseArray or DenseArray at `matrix_name`,
    depending on whether `encode_as_sparse_array` is true.

    For the sparse encoding only nonzero values are written, so the number of
    stored elements equals the number of nonzero entries rather than the total
    number of matrix elements. When `column_shift_for_sparse_encoding` is not
    None it is subtracted from each block of rows first, and only the nonzero
    values of the difference are written.
    """
    number_of_rows = matrix.shape[0]
    number_of_columns = matrix.shape[1]
    # Rows per write: a power of ten derived from 1e9 / columns, capped at 10,000.
    stride = min(int(np.power(10, np.around(np.log10(1e9 / number_of_columns)))), 10_000)

    # Tile extents and array class depend on the chosen encoding.
    if encode_as_sparse_array:
        row_tile_cap, column_tile_cap = 512, 2048
        array_cls = tiledb.SparseArray
    else:
        row_tile_cap, column_tile_cap = 50, 100
        array_cls = tiledb.DenseArray

    filters = tiledb.FilterList([tiledb.ZstdFilter()])
    attrs = [tiledb.Attr(dtype=np.float32, filters=filters)]
    domain = tiledb.Domain(
        tiledb.Dim(
            name="obs",
            domain=(0, number_of_rows - 1),
            tile=min(number_of_rows, row_tile_cap),
            dtype=np.uint32,
        ),
        tiledb.Dim(
            name="var",
            domain=(0, number_of_columns - 1),
            tile=min(number_of_columns, column_tile_cap),
            dtype=np.uint32,
        ),
    )
    schema = tiledb.ArraySchema(
        domain=domain,
        sparse=encode_as_sparse_array,
        attrs=attrs,
        cell_order="row-major",
        tile_order="col-major",
    )
    array_cls.create(matrix_name, schema)

    with array_cls(matrix_name, mode="w", ctx=ctx) as array:
        for first_row in range(0, number_of_rows, stride):
            last_row = min(first_row + stride, number_of_rows)
            block = matrix[first_row:last_row, :]
            if not isinstance(block, np.ndarray):
                block = block.toarray()

            if not encode_as_sparse_array:
                array[first_row:last_row, :] = block
                continue

            if column_shift_for_sparse_encoding is not None:
                block = block - column_shift_for_sparse_encoding

            nonzero = np.nonzero(block)
            block_rows, block_columns = nonzero[0], nonzero[1]
            # Row indexes are relative to the block; shift them to array rows.
            array[block_rows + first_row, block_columns] = block[block_rows, block_columns]
def testReadTileDBExecution(self):
    """
    Read TileDB arrays through ``fromtiledb`` and compare with what was written.

    Four cases run in sequence, each in its own temporary directory: a 3-d
    dense array, a 2-d sparse array, a 1-d sparse array, and a 3-d dense array
    created with column-major cell order.
    """
    ctx = tiledb.Ctx()

    def read_back(uri):
        # Build the tensor from the stored array and execute it.
        return self.executor.execute_tensor(fromtiledb(uri, ctx=ctx), concat=True)[0]

    def float_attrs():
        return [tiledb.Attr(ctx=ctx, dtype=np.float64)]

    def dense_3d_domain():
        return tiledb.Domain(
            tiledb.Dim(ctx=ctx, domain=(1, 100), tile=30, dtype=np.int32),
            tiledb.Dim(ctx=ctx, domain=(0, 90), tile=22, dtype=np.int32),
            tiledb.Dim(ctx=ctx, domain=(0, 9), tile=8, dtype=np.int32),
            ctx=ctx,
        )

    # Case 1: dense array.
    workdir = tempfile.mkdtemp()
    try:
        schema = tiledb.ArraySchema(ctx=ctx, domain=dense_3d_domain(),
                                    sparse=False, attrs=float_attrs())
        tiledb.DenseArray.create(workdir, schema)

        dense_src = np.random.rand(100, 91, 10)
        with tiledb.DenseArray(uri=workdir, ctx=ctx, mode='w') as arr:
            arr.write_direct(dense_src)

        np.testing.assert_allclose(dense_src, read_back(workdir))
    finally:
        shutil.rmtree(workdir)

    # Case 2: 2-d sparse array whose second dimension starts at 2.
    workdir = tempfile.mkdtemp()
    try:
        dom = tiledb.Domain(
            tiledb.Dim(ctx=ctx, domain=(0, 99), tile=30, dtype=np.int32),
            tiledb.Dim(ctx=ctx, domain=(2, 11), tile=8, dtype=np.int32),
            ctx=ctx,
        )
        schema = tiledb.ArraySchema(ctx=ctx, domain=dom,
                                    sparse=True, attrs=float_attrs())
        tiledb.SparseArray.create(workdir, schema)

        sparse_src = sps.rand(100, 10, density=0.01)
        with tiledb.SparseArray(uri=workdir, ctx=ctx, mode='w') as arr:
            rows, cols = sparse_src.row, sparse_src.col + 2
            arr[rows, cols] = {arr.attr(0).name: sparse_src.data}

        np.testing.assert_allclose(sparse_src.toarray(), read_back(workdir).toarray())
    finally:
        shutil.rmtree(workdir)

    # Case 3: 1-d sparse array whose dimension starts at 1.
    workdir = tempfile.mkdtemp()
    try:
        dom = tiledb.Domain(
            tiledb.Dim(ctx=ctx, domain=(1, 100), tile=30, dtype=np.int32),
            ctx=ctx,
        )
        schema = tiledb.ArraySchema(ctx=ctx, domain=dom,
                                    sparse=True, attrs=float_attrs())
        tiledb.SparseArray.create(workdir, schema)

        vector_src = sps.rand(1, 100, density=0.05)
        with tiledb.SparseArray(uri=workdir, ctx=ctx, mode='w') as arr:
            positions = vector_src.col + 1
            arr[positions] = vector_src.data

        np.testing.assert_allclose(vector_src.toarray()[0], read_back(workdir).toarray())
    finally:
        shutil.rmtree(workdir)

    # Case 4: dense array with column-major cell order.
    workdir = tempfile.mkdtemp()
    try:
        schema = tiledb.ArraySchema(ctx=ctx, domain=dense_3d_domain(),
                                    sparse=False, cell_order='F',
                                    attrs=float_attrs())
        tiledb.DenseArray.create(workdir, schema)

        fortran_src = np.asfortranarray(np.random.rand(100, 91, 10))
        with tiledb.DenseArray(uri=workdir, ctx=ctx, mode='w') as arr:
            arr.write_direct(fortran_src)

        result = read_back(workdir)
        np.testing.assert_allclose(fortran_src, result)
        self.assertTrue(result.flags['F_CONTIGUOUS'])
        self.assertFalse(result.flags['C_CONTIGUOUS'])
    finally:
        shutil.rmtree(workdir)
def save_X(container, xdata, ctx, sparse_threshold, expect_sparse=False):
    """Write the X matrix to ``<container>/X`` as a sparse or dense TileDB array.

    ``sparse_threshold`` selects the encoding: 100 forces sparse, 0 forces
    dense, and any other value is compared against the percentage of
    non-zero entries reported by ``evaluate_for_sparse_encoding``. If that
    comparison rejects sparse, a column-shift encoding is evaluated; when one
    is found, the shift vector is written to ``<container>/X_col_shift`` and X
    is stored sparse with the shift subtracted.

    Returns ``False`` without writing anything when ``expect_sparse`` is True
    and the dense encoding was chosen; otherwise returns whether the sparse
    encoding was used.
    """
    # Save X count matrix
    X_name = f"{container}/X"

    shape = xdata.shape
    log(1, "\t...shape:", str(shape))

    col_shift = None
    if sparse_threshold == 100:
        is_sparse = True
    elif sparse_threshold == 0:
        is_sparse = False
    else:
        is_sparse, nnz, nelem = evaluate_for_sparse_encoding(xdata, sparse_threshold)
        percent = 100.0 * nnz / nelem
        # nelem differing from the full element count is reported as an
        # estimate.
        if nelem != shape[0] * shape[1]:
            log(1, "\t...sparse=", is_sparse, "non-zeros percent (estimate): %6.2f" % percent)
        else:
            log(1, "\t...sparse=", is_sparse, "non-zeros:", nnz, "percent: %6.2f" % percent)

        # The decision actually used is recomputed from the percentage.
        is_sparse = percent < sparse_threshold
        # NOTE(review): this retry is placed inside the ``else`` branch
        # because it follows the ``percent`` computation; the collapsed
        # source does not show the nesting -- confirm.
        if not is_sparse:
            col_shift, nnz, nelem = evaluate_for_sparse_column_shift_encoding(xdata, sparse_threshold)
            is_sparse = col_shift is not None
            percent = 100.0 * nnz / nelem
            if nelem != shape[0] * shape[1]:
                log(1, "\t...sparse=", is_sparse, "col shift non-zeros percent (estimate): %6.2f" % percent)
            else:
                log(1, "\t...sparse=", is_sparse, "col shift non-zeros:", nnz, "percent: %6.2f" % percent)

    if expect_sparse is True and is_sparse is False:
        return False

    create_X(X_name, shape, is_sparse)
    # Rows per write: a power of ten derived from 1e9 / columns, capped at
    # 10,000.
    stride = min(int(np.power(10, np.around(np.log10(1e9 / shape[1])))), 10_000)
    if is_sparse:
        if col_shift is not None:
            log(1, "\t...output X as sparse matrix with column shift encoding")
            # Store the per-column shift in its own 1-d dense array.
            X_col_shift_name = f"{container}/X_col_shift"
            filters = tiledb.FilterList([tiledb.ZstdFilter()])
            attrs = [tiledb.Attr(dtype=np.float32, filters=filters)]
            domain = tiledb.Domain(tiledb.Dim(domain=(0, shape[1] - 1), tile=min(shape[1], 5000), dtype=np.uint32))
            schema = tiledb.ArraySchema(domain=domain, attrs=attrs)
            tiledb.DenseArray.create(X_col_shift_name, schema)
            with tiledb.DenseArray(X_col_shift_name, mode="w", ctx=ctx) as X_col_shift:
                X_col_shift[:] = col_shift
            tiledb.consolidate(X_col_shift_name, ctx=ctx)
        else:
            log(1, "\t...output X as sparse matrix")

        with tiledb.SparseArray(X_name, mode="w", ctx=ctx) as X:
            nnz = 0
            for row in range(0, shape[0], stride):
                lim = min(row + stride, shape[0])
                a = xdata[row:lim, :]
                if type(a) is not np.ndarray:
                    a = a.toarray()
                if col_shift is not None:
                    a = a - col_shift
                # Only non-zero entries are written; row indexes are shifted
                # from block-relative to array-relative.
                indices = np.nonzero(a)
                trow = indices[0] + row
                nnz += indices[0].shape[0]
                X[trow, indices[1]] = a[indices[0], indices[1]]
                log(2, "\t...rows", lim, "of", shape[0], "nnz", nnz, "sparse", nnz / (lim * shape[1]))

    else:
        log(1, "\t...output X as dense matrix")
        with tiledb.DenseArray(X_name, mode="w", ctx=ctx) as X:
            for row in range(0, shape[0], stride):
                lim = min(row + stride, shape[0])
                a = xdata[row:lim, :]
                if type(a) is not np.ndarray:
                    a = a.toarray()
                X[row:lim, :] = a
                log(2, "\t...rows", row, "to", lim)

    tiledb.consolidate(X_name, ctx=ctx)
    # vacuum is only called when this tiledb build provides it.
    # NOTE(review): unlike consolidate, vacuum is called without ``ctx`` --
    # confirm that is intended.
    if hasattr(tiledb, "vacuum"):
        tiledb.vacuum(X_name)

    return is_sparse
def load_sparse_array(path):
    """Open the sparse array at ``path`` for reading and return the open handle."""
    # ``ctx`` is not created here; it is resolved from the enclosing scope.
    reader = tiledb.SparseArray(path, mode="r", ctx=ctx)
    return reader