def uri(temp_rootdir):
    """Create a simple dense test array."""
    path = os.path.abspath(os.path.join(temp_rootdir, "test_array"))

    ctx = tiledb.default_ctx()
    rows_dim = tiledb.Dim(ctx=ctx, domain=(1, 25), dtype=np.int64)
    cols_dim = tiledb.Dim(ctx=ctx, domain=(1, 12), dtype=np.int64)
    dom = tiledb.Domain(rows_dim, cols_dim, ctx=ctx)
    att1 = tiledb.Attr(name="a", ctx=ctx, dtype=np.float64)
    att2 = tiledb.Attr(name="b", ctx=ctx, dtype=np.float64)
    schema = tiledb.ArraySchema(ctx=ctx, domain=dom, attrs=(att1, att2))
    tiledb.DenseArray.create(path, schema)

    data = np.reshape(np.arange(300), (25, 12))

    with tiledb.DenseArray(path, mode="w", timestamp=1) as A:
        A.meta["meta_int"] = 1
        A[:] = {"a": data, "b": data}

    with tiledb.DenseArray(path, mode="w", timestamp=2) as A:
        A.meta["meta_int"] = 2
        A[:] = {"a": data / 2, "b": data * 2}

    yield path

    shutil.rmtree(path)
def test_store_tiledb_execution(setup):
    ctx = tiledb.Ctx()

    tempdir = tempfile.mkdtemp()
    try:
        # store TileDB dense array
        expected = np.random.rand(8, 4, 3)
        a = tensor(expected, chunk_size=(3, 3, 2))
        save = totiledb(tempdir, a, ctx=ctx)
        save.execute()

        with tiledb.DenseArray(uri=tempdir, ctx=ctx) as arr:
            np.testing.assert_allclose(expected, arr.read_direct())
    finally:
        shutil.rmtree(tempdir)

    tempdir = tempfile.mkdtemp()
    try:
        # store tensor with 1 chunk to TileDB dense array
        a = arange(12)
        save = totiledb(tempdir, a, ctx=ctx)
        save.execute()

        with tiledb.DenseArray(uri=tempdir, ctx=ctx) as arr:
            np.testing.assert_allclose(np.arange(12), arr.read_direct())
    finally:
        shutil.rmtree(tempdir)

    tempdir = tempfile.mkdtemp()
    try:
        # store 2-d TileDB sparse array
        expected = sps.random(8, 7, density=0.1)
        a = tensor(expected, chunk_size=(3, 5))
        save = totiledb(tempdir, a, ctx=ctx)
        save.execute()

        with tiledb.SparseArray(uri=tempdir, ctx=ctx) as arr:
            data = arr[:, :]
            coords = data['coords']
            value = data[arr.attr(0).name]
            ij = tuple(coords[arr.domain.dim(k).name] for k in range(arr.ndim))
            result = sps.coo_matrix((value, ij), shape=arr.shape)
            np.testing.assert_allclose(expected.toarray(), result.toarray())
    finally:
        shutil.rmtree(tempdir)

    tempdir = tempfile.mkdtemp()
    try:
        # store TileDB dense array from a Fortran-ordered tensor
        expected = np.asfortranarray(np.random.rand(8, 4, 3))
        a = tensor(expected, chunk_size=(3, 3, 2))
        save = totiledb(tempdir, a, ctx=ctx)
        save.execute()

        with tiledb.DenseArray(uri=tempdir, ctx=ctx) as arr:
            np.testing.assert_allclose(expected, arr.read_direct())
            assert arr.schema.cell_order == 'col-major'
    finally:
        shutil.rmtree(tempdir)
def testStoreTileDB(self):
    ctx = tiledb.Ctx()

    tempdir = tempfile.mkdtemp()
    try:
        t = random.rand(50, 30, chunk_size=13)
        t2 = t + 1

        saved = totiledb(tempdir, t2)
        self.assertEqual(saved.shape, (0, 0))
        self.assertIsNone(saved.op.tiledb_config)
        self.assertEqual(saved.op.tiledb_uri, tempdir)

        with self.assertRaises(tiledb.TileDBError):
            tiledb.DenseArray(ctx=ctx, uri=tempdir)

        # the tiledb array is created during tiling
        saved.tiles()

        # no error
        tiledb.DenseArray(ctx=ctx, uri=tempdir)

        self.assertEqual(saved.chunks[0].op.axis_offsets, (0, 0))
        self.assertEqual(saved.chunks[1].op.axis_offsets, (0, 13))
        self.assertEqual(saved.cix[0, 2].op.axis_offsets, (0, 26))
        self.assertEqual(saved.cix[1, 2].op.axis_offsets, (13, 26))
        self.assertEqual(saved.cix[3, 2].op.axis_offsets, (39, 26))

        with self.assertRaises(ValueError):
            t3 = random.rand(30, 50)
            totiledb(tempdir, t3, ctx=ctx)  # shape incompatible
    finally:
        shutil.rmtree(tempdir)
def open(uri: Union[str, Path]) -> TileSegy:
    uri = Path(uri)
    headers = tiledb.DenseArray(str(uri / "headers"))
    data = tiledb.DenseArray(str(uri / "data"), attr="trace")
    if data.schema.domain.has_dim("traces"):
        cls = TileSegy
    else:
        cls = StructuredTileSegy
    return cls(uri, headers, data)
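# Hedged usage sketch for open() above: TileSegy / StructuredTileSegy and the
# "headers"/"data" sub-array layout come from the surrounding tilesegy
# project, and "converted.tsgy" is a hypothetical path.
def example_open_tilesegy():
    segy = open("converted.tsgy")
    # structured files lack the flat "traces" dimension checked for above
    print(type(segy).__name__)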
def write_array(args, updating, chunks_to_process):
    try:
        # config
        tdb_config = tiledb.Config(tdb_config_params)
        tdb_write_context = tiledb.Ctx(config=tdb_config)
        if updating is True:
            tdb_read_context = tiledb.Ctx(config=tdb_config)
            cur_array_toread = tiledb.DenseArray(args.array_name, ctx=tdb_read_context, mode='r')
        cur_array_towrite = tiledb.DenseArray(args.array_name, ctx=tdb_write_context, mode='w')

        chunks_processed = 0
        while chunks_processed < chunks_to_process:
            while write_queue.empty() is True:
                time.sleep(10)
            processed_chunk = write_queue.get()
            processed_chunk_unpickled = pickle.loads(processed_chunk)
            task_index = processed_chunk_unpickled[0]
            start_index = processed_chunk_unpickled[1]
            end_index = processed_chunk_unpickled[2]
            dict_to_write = processed_chunk_unpickled[3]

            if updating is True:
                # we are only updating some attributes in the array
                cur_vals = cur_array_toread[start_index:end_index, task_index]
                for key in dict_to_write:
                    cur_vals[key] = dict_to_write[key]
                dict_to_write = cur_vals
                print("updated data dict for writing:" + args.array_name)
            else:
                # first write: make sure all attributes are provided;
                # fill any missing attribute with a nan array
                required_attrib = list(get_attribute_info(args.attribute_config,
                                                          args.attribute_config_file).keys())
                for attrib in required_attrib:
                    if attrib not in dict_to_write:
                        print("augmenting")
                        dict_to_write[attrib] = np.full(end_index - start_index, np.nan)

            # write in chunks
            cur_array_towrite[start_index:end_index, task_index] = dict_to_write
            print('Gigs:', round(psutil.virtual_memory().used / (10**9), 2))
            gc.collect()
            chunks_processed += 1
            print("wrote to disk " + str(task_index) + " for " + str(start_index) + ":"
                  + str(end_index) + ";" + str(chunks_processed) + "/" + str(chunks_to_process))

        assert chunks_processed >= chunks_to_process
        print("closing arrays")
        if updating is True:
            cur_array_toread.close()
        cur_array_towrite.close()
        return
    except KeyboardInterrupt:
        # try to delete all tmp files
        kill_child_processes(os.getpid())
        raise
    except Exception as e:
        print(e)
        kill_child_processes(os.getpid())
        raise
def test_ingest_csv_dense_array(udf_uri, array_name, key, secret, namespace, bucket, config):
    """Create a dense array from a CSV file using ingest_csv()."""
    tiledb.cloud.udf.exec(
        "s3://{}/inputs/{}.csv".format(bucket, "increment"),
        "tiledb://{}/s3://{}/{}.tdb".format(namespace, bucket, array_name),
        key,
        secret,
        sparse=False,
        name=udf_uri,  # "unittest/test_ingest_csv" --> TileDB-Inc/ingest_csv
    )
    time.sleep(10)

    with tiledb.DenseArray(
        "tiledb://{}/{}.tdb".format(namespace, array_name), "r", ctx=tiledb.Ctx(config)
    ) as A:
        data = pd.DataFrame(A[:])
        for col, attribute in enumerate(("a", "b", "c"), 1):
            assert_array_equal(
                data[attribute],
                np.array([row * 10 + col for row in range(1, 21)]),
            )
def load_dense_array(self, arrayID):
    tile_array_id = os.path.join(self.root, arrayID)
    try:
        return tiledb.DenseArray(tile_array_id, ctx=self.ctx)
    except tiledb.TileDBError as e:
        print(e)
        return np.array([])
def create_tiledb_datetime_example(tmpdir):
    _data = np.linspace(-1.0, 20.0, num=16, endpoint=True, dtype=np.float64)
    _date = np.arange(np.datetime64("2000-01-01"), np.datetime64("2000-01-17"))

    # Create expected dataset
    expected = xr.Dataset(
        data_vars={"temperature": xr.DataArray(data=_data, dims="date")},
        coords={"date": _date},
    )

    # Create TileDB array
    array_uri = str(tmpdir.join("tiledb_example_2"))
    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(
            tiledb.Dim(
                name="date",
                domain=(np.datetime64("2000-01-01"), np.datetime64("2000-01-16")),
                tile=np.timedelta64(4, "D"),
                dtype=np.datetime64("", "D"),
            ),
        ),
        attrs=[tiledb.Attr(name="temperature", dtype=np.float64)],
    )
    tiledb.DenseArray.create(array_uri, schema)
    with tiledb.DenseArray(array_uri, mode="w") as array:
        array[:] = {"temperature": _data}
    return array_uri, expected
def execute(cls, ctx, op):
    tiledb_ctx = get_tiledb_ctx(op.tiledb_config)
    uri = op.tiledb_uri
    key = op.tiledb_key
    timestamp = op.tiledb_timestamp
    axis_offsets = op.axis_offsets

    chunk = op.outputs[0]
    if not chunk.issparse():
        # dense
        to_store = np.ascontiguousarray(ctx[op.input.key])
        slcs = []
        for axis in range(chunk.ndim):
            axis_offset = int(axis_offsets[axis])
            axis_length = int(op.input.shape[axis])
            slcs.append(slice(axis_offset, axis_offset + axis_length))
        with tiledb.DenseArray(uri=uri, ctx=tiledb_ctx, mode='w',
                               key=key, timestamp=timestamp) as arr:
            arr[tuple(slcs)] = to_store
        ctx[chunk.key] = np.empty((0,) * chunk.ndim, dtype=chunk.dtype)
    else:
        # sparse
        to_store = ctx[op.input.key].spmatrix.tocoo()
        if to_store.nnz > 0:
            with tiledb.SparseArray(uri=uri, ctx=tiledb_ctx, mode='w',
                                    key=key, timestamp=timestamp) as arr:
                if chunk.ndim == 1:
                    vec = to_store.col if to_store.shape[0] == 1 else to_store.row
                    vec += axis_offsets[0]
                    arr[vec] = to_store.data
                else:
                    i = to_store.row + axis_offsets[0]
                    j = to_store.col + axis_offsets[1]
                    arr[i, j] = to_store.data
        ctx[chunk.key] = SparseNDArray(sps.csr_matrix((0, 0), dtype=chunk.dtype),
                                       shape=chunk.shape)
def _initialize_stat_values_store_if_needed(self, shape: Tuple[int, ...]) -> None:
    """
    Initialize storage for the benchmark statistics if it wasn't created yet.

    :param shape: Shape of the stats map.
    """
    if self.__tiledb_stats_array is not None and tiledb.array_exists(
            self.__tiledb_stats_array):
        return

    # Create an array with two dense dimensions to store read statistics from
    # the latest benchmark run.
    dom = tiledb.Domain(
        tiledb.Dim(name='n', domain=(0, shape[0] - 1), tile=shape[0] - 1,
                   dtype=np.int64),
        tiledb.Dim(name='f', domain=(0, shape[1] - 1), tile=shape[1] - 1,
                   dtype=np.int64))

    # Schema contains one attribute for the READ count
    schema = tiledb.ArraySchema(
        domain=dom,
        sparse=False,
        attrs=[tiledb.Attr(name='read', dtype=np.int32)])

    # Create the (empty) array on disk.
    tiledb.DenseArray.create(self.__tiledb_stats_array, schema)

    # Fill with zeroes
    with tiledb.DenseArray(self.__tiledb_stats_array, mode='w') as rr:
        rr[:] = np.zeros(shape, dtype=np.int32)
def _write_stats(self, stats: np.ndarray) -> None:
    """
    Write benchmark stats to the local storage.

    :param stats: Expected array must have shape (num of nodes, num of fragments, 1)
    """
    with tiledb.DenseArray(self.__tiledb_stats_array, mode='w') as rr:
        rr[:] = stats
def _tiledb_array(self, uri: str, schema: tiledb.ArraySchema) -> Iterator[tiledb.Array]:
    tiledb.DenseArray.create(uri, schema)
    with tiledb.DenseArray(uri, mode="w") as tdb:
        yield tdb
    tiledb.consolidate(uri, config=self.config)
    tiledb.vacuum(uri, config=self.config)
def convert_ndarray_to_cxg_dense_array(ndarray_name, ndarray, ctx):
    """
    Saves contents of ndarray to the CXG output directory specified.

    Generally this function is used to convert dataset embeddings. Because
    embeddings are typically accessed with very large slices (or all of the
    embedding), they do not benefit from overly aggressive compression due to
    their format. Given this, we use a large tile size (1000) but only the
    default compression level.
    """

    def create_ndarray_array(ndarray_name, ndarray):
        filters = tiledb.FilterList([tiledb.ZstdFilter()])
        attrs = [tiledb.Attr(dtype=ndarray.dtype, filters=filters)]
        dimensions = [
            tiledb.Dim(domain=(0, ndarray.shape[dimension] - 1),
                       tile=min(ndarray.shape[dimension], 1000), dtype=np.uint32)
            for dimension in range(ndarray.ndim)
        ]
        domain = tiledb.Domain(*dimensions)
        schema = tiledb.ArraySchema(domain=domain, sparse=False, attrs=attrs,
                                    capacity=1_000_000, cell_order="row-major",
                                    tile_order="row-major")
        tiledb.DenseArray.create(ndarray_name, schema)

    create_ndarray_array(ndarray_name, ndarray)

    with tiledb.DenseArray(ndarray_name, mode="w", ctx=ctx) as array:
        array[:] = ndarray

    tiledb.consolidate(ndarray_name, ctx=ctx)
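# Hedged usage sketch for convert_ndarray_to_cxg_dense_array() above: write a
# small random embedding and read it back. The URI is a placeholder, and the
# read assumes tiledb-py returns a plain ndarray for a single anonymous
# attribute (worth verifying against your tiledb-py version).
def example_convert_embedding():
    ctx = tiledb.Ctx()
    embedding = np.random.rand(100, 2).astype(np.float32)
    convert_ndarray_to_cxg_dense_array("example_umap", embedding, ctx)
    with tiledb.DenseArray("example_umap", mode="r", ctx=ctx) as A:
        np.testing.assert_allclose(A[:], embedding)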
def write_array():
    # Open the array and write to it.
    with tiledb.DenseArray(array_name, mode='w') as A:
        data = np.array(([1, 2, 3, 4],
                         [5, 6, 7, 8],
                         [9, 10, 11, 12],
                         [13, 14, 15, 16]))
        A[:] = data
def threadtest_create_array(uri):
    data = np.random.rand(20)
    schema = tiledb.libtiledb.schema_like(data)
    tiledb.Array.create(uri, schema)
    with tiledb.DenseArray(uri, "w") as A:
        A[:] = data
def calculate_change(_input, bands, window, x, y, tile_x_size, tile_y_size,
                     output, config=None):
    # assuming average reflectivities in the two images are ~ equal
    # https://prod-ng.sandia.gov/techlib-noauth/access-control.cgi/2014/1418179.pdf
    # noise terms are known and are zero (uavsar; extend as we add additional sensors)
    cfg = tiledb.Config(config)
    ctx = tiledb.Ctx(config=cfg)

    with tiledb.DenseArray(output, 'w', ctx=ctx) as arr_output:
        with tiledb.DenseArray(_input, 'r', ctx=ctx) as arr:
            start_y = y * tile_y_size
            end_y = start_y + tile_y_size
            start_x = x * tile_x_size
            end_x = start_x + tile_x_size
            data = arr.query(attrs=['TDB_VALUES'])[:, start_y:end_y, start_x:end_x]
            tile = data["TDB_VALUES"]
            out_tile = np.ones((tile.shape[1], tile.shape[2]), dtype=np.float32)

            y1 = 0
            while y1 < tile_y_size:
                x1 = 0
                y1_end = y1 + window
                while x1 < tile_x_size:
                    x1_end = x1 + window
                    t1 = tile[0, y1:y1_end, x1:x1_end]
                    t2 = tile[1, y1:y1_end, x1:x1_end]
                    # compute change detection for this window into the in-memory tile
                    out_tile[y1:y1_end, x1:x1_end] = local_ccd(t1, t2)
                    x1 = x1 + window
                y1 = y1 + window

            # write out the result tile to the tiledb output array
            arr_output[start_y:end_y, start_x:end_x] = out_tile
    return True
def execute(cls, ctx, op):
    import tiledb

    from ..array_utils import array_module
    from ..utils import get_tiledb_ctx

    chunk = op.outputs[0]
    xp = array_module(op.gpu)

    axis_offsets = [
        offset + dim_start
        for offset, dim_start in zip(op.axis_offsets, op.tiledb_dim_starts)
    ]
    tiledb_ctx = get_tiledb_ctx(op.tiledb_config)
    uri = op.tiledb_uri
    key = op.tiledb_key
    timestamp = op.tiledb_timestamp

    slcs = []
    for axis in range(chunk.ndim):
        axis_offset = axis_offsets[axis]
        axis_length = chunk.shape[axis]
        slcs.append(slice(axis_offset, axis_offset + axis_length))

    if not op.sparse:
        # read dense array from tiledb
        with tiledb.DenseArray(uri=uri, ctx=tiledb_ctx,
                               key=key, timestamp=timestamp) as tiledb_arr:
            ctx[chunk.key] = tiledb_arr[tuple(slcs)]
    else:
        # read sparse array from tiledb
        with tiledb.SparseArray(uri=uri, ctx=tiledb_ctx,
                                key=key, timestamp=timestamp) as tiledb_arr:
            if tiledb_arr.ndim > 2:
                raise NotImplementedError(
                    'Does not support reading an array with more than 2 dimensions')

            data = tiledb_arr[tuple(slcs)]
            coords = data['coords']
            value = data[tiledb_arr.attr(0).name]
            if tiledb_arr.ndim == 2:
                # 2-d
                ij = tuple(coords[tiledb_arr.domain.dim(k).name] - axis_offsets[k]
                           for k in range(tiledb_arr.ndim))
                spmatrix = sps.coo_matrix((value, ij), shape=chunk.shape)
                ctx[chunk.key] = SparseNDArray(spmatrix)
            else:
                # 1-d
                ij = (xp.zeros(coords.shape),
                      coords[tiledb_arr.domain.dim(0).name] - axis_offsets[0])
                spmatrix = sps.coo_matrix((value, ij), shape=(1,) + chunk.shape)
                ctx[chunk.key] = SparseNDArray(spmatrix, shape=chunk.shape)
def test_ingest_csv_dense_array_append(udf_uri, array_name, key, secret, namespace, bucket, config):
    tiledb.cloud.udf.exec(
        "s3://{}/inputs/{}.csv".format(bucket, "increment"),
        "tiledb://{}/s3://{}/{}.tdb".format(namespace, bucket, array_name),
        key,
        secret,
        mode="ingest",
        full_domain=True,
        sparse=False,
        name=udf_uri,  # "unittest/test_ingest_csv" --> TileDB-Inc/ingest_csv
    )
    time.sleep(10)

    with tiledb.DenseArray(
        "tiledb://{}/{}.tdb".format(namespace, array_name), "r", ctx=tiledb.Ctx(config)
    ) as A:
        data = pd.DataFrame(A[:])
        number_of_rows = data.shape[0]
        assert number_of_rows == 20

    tiledb.cloud.udf.exec(
        "s3://{}/inputs/{}.csv".format(bucket, array_name),
        "tiledb://{}/s3://{}/{}.tdb".format(namespace, bucket, array_name),
        key,
        secret,
        mode="append",
        row_start_idx=number_of_rows,
        name=udf_uri,  # "unittest/test_ingest_csv" --> TileDB-Inc/ingest_csv
    )
    time.sleep(10)

    with tiledb.DenseArray(
        "tiledb://{}/{}.tdb".format(namespace, array_name), "r", ctx=tiledb.Ctx(config)
    ) as A:
        data = pd.DataFrame(A[:])
        for col, attribute in enumerate(("a", "b", "c"), 1):
            assert_array_equal(
                data[attribute],
                np.array([row * 10 + col for row in range(1, 21)] * 2),
            )
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="input cxg directory")
    parser.add_argument("output", help="output cxg directory")
    parser.add_argument("--overwrite", action="store_true", help="replace output cxg directory")
    parser.add_argument("--verbose", "-v", action="count", default=0, help="verbose output")
    parser.add_argument(
        "--sparse-threshold",
        "-s",
        type=float,
        default=5.0,  # default is 5% non-zero values
        help="The X array will be sparse if the percent of non-zeros falls below this value",
    )
    args = parser.parse_args()

    if os.path.exists(args.output):
        print("output dir exists:", args.output)
        if args.overwrite:
            print("output dir removed:", args.output)
            shutil.rmtree(args.output)
        else:
            print("use the overwrite option to remove the output directory")
            sys.exit(1)

    if not os.path.isdir(args.input):
        print("input is not a directory", args.input)
        sys.exit(1)

    shutil.copytree(args.input, args.output,
                    ignore=shutil.ignore_patterns("X", "X_col_shift"))

    ctx = tiledb.Ctx({
        "sm.num_reader_threads": 32,
        "sm.num_writer_threads": 32,
        "sm.consolidation.buffer_size": 1 * 1024 * 1024 * 1024,
    })

    with tiledb.DenseArray(os.path.join(args.input, "X"), mode="r", ctx=ctx) as X_in:
        is_sparse = cxgtool.save_X(args.output, X_in, ctx, args.sparse_threshold,
                                   expect_sparse=True)
        if is_sparse is False:
            print("The array is not sparse, cleaning up, abort.")
            shutil.rmtree(args.output)
            sys.exit(1)
def save_embeddings(container, adata, ctx):
    for (name, value) in adata.obsm.items():
        if is_valid_embedding(adata, name, value):
            e_name = f"{container}/{name[2:]}"
            create_emb(e_name, value)
            with tiledb.DenseArray(e_name, mode="w", ctx=ctx) as A:
                A[:] = value
            tiledb.consolidate(e_name, ctx=ctx)
            log(1, f"\t\t...{name} embedding created")
def get_loss_weights(tdb_path, chrom, label_attribute, ambig_attribute,
                     upsample_attribute, tdb_partition_thresh_for_upsample):
    import tiledb
    from kerasAC.tiledb_config import get_default_config

    tdb_config = get_default_config()
    ctx = tiledb.Ctx(tdb_config)
    tdb_array = tiledb.DenseArray(tdb_path + "." + chrom, mode='r', ctx=ctx)
    print("opened:" + tdb_path + "." + chrom + " for reading")
    vals = tdb_array[:]
    print("got tdb vals")
def exec_and_fetch(
    query,
    output_uri,
    output_schema=None,
    namespace=None,
    task_name=None,
    output_array_name=None,
    init_commands=None,
    parameters=None,
):
    """
    Run a sql query, store the results in the output array, and return the
    opened output array.

    :param str query: query to run
    :param str output_uri: array to store results to; must be either a tiledb://
        URI for an already registered array, or an s3:// URI if passing a new
        schema to create a new output array
    :param tiledb.ArraySchema output_schema: array schema to create the output array with
    :param str namespace: optional namespace to charge the query to
    :param str task_name: optional name to assign the task for logging and audit purposes
    :param str output_array_name: optional name for registering the new output array
        if output_schema is passed
    :param list init_commands: optional list of sql queries or commands to run before the main query
    :param list parameters: optional list of sql parameters for use in the query
    :return: TileDB Array with results
    """
    # If the namespace is not set, default to the user's namespace
    if namespace is None:
        # Fetch the client profile for username if it is not already cached
        if config.user is None:
            config.user = client.user_profile()
        namespace = config.user.username

    # Execute the sql query
    try:
        exec(
            query=query,
            output_uri=output_uri,
            output_schema=output_schema,
            namespace=namespace,
            task_name=task_name,
            output_array_name=output_array_name,
            init_commands=init_commands,
            parameters=parameters,
        )

        # Fetch the output schema to check whether it is sparse or dense
        schema = tiledb.ArraySchema.load(output_uri, ctx=client.Ctx())
        if schema.sparse:
            return tiledb.SparseArray(output_uri, ctx=client.Ctx())
        return tiledb.DenseArray(output_uri, ctx=client.Ctx())
    except GenApiException as exc:
        raise tiledb_cloud_error.check_exc(exc) from None
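# Hedged usage sketch for exec_and_fetch() above. The namespace, table and
# s3:// output URI are placeholders, not real TileDB Cloud resources; opening
# the returned array as a context manager ensures it gets closed.
def example_exec_and_fetch():
    with exec_and_fetch(
        query="SELECT AVG(`a`) AS avg_a FROM `tiledb://my-namespace/my-table`",
        output_uri="s3://my-bucket/results/avg_a",
        output_array_name="avg_a_results",
    ) as results:
        print(results[:])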
def process_chrom(data_dict, attribute_info, chrom, size, array_out_name, updating, args):
    attribute_config = args.attribute_config
    dict_to_write = OrderedDict()
    for attribute in data_dict:
        cur_parser = attribute_info[attribute]['parser']
        dict_to_write[attribute] = cur_parser(data_dict[attribute], chrom, 0, size,
                                              attribute_info[attribute])
        print("got:" + str(attribute) + " for chrom:" + str(chrom))

    if updating is True:
        # we are only updating some attributes in the array
        with tiledb.DenseArray(array_out_name, mode='r', ctx=tdb_Context) as cur_array:
            cur_vals = cur_array[:]
            del cur_array
        print('got cur vals for ' + array_out_name)
        for key in dict_to_write:
            cur_vals[key] = dict_to_write[key]
        dict_to_write = cur_vals
        print("updated data dict for writing:" + array_out_name)
    else:
        # first write: make sure all attributes are provided;
        # fill any missing attribute with a nan array
        required_attrib = list(get_attribute_info(attribute_config).keys())
        for attrib in required_attrib:
            if attrib not in dict_to_write:
                dict_to_write[attrib] = np.full(size, np.nan)

    with tiledb.DenseArray(array_out_name, ctx=tdb_Context, mode='w') as out_array:
        if args.write_chunk is None:
            # write the full chromosome
            out_array[:] = dict_to_write
        else:
            # write in chunks
            for chunk_index in range(0, size + args.write_chunk, args.write_chunk):
                start_pos = chunk_index
                if start_pos < size:
                    end_pos = min([size, chunk_index + args.write_chunk])
                    out_array[start_pos:end_pos] = get_subdict(dict_to_write, start_pos, end_pos)
                    print("wrote:" + str(start_pos) + "-" + str(end_pos) +
                          " for:" + array_out_name)

    del out_array
    gc.collect()
    print("wrote to disk:" + array_out_name)
def read_array():
    # Open the array and read from it.
    with tiledb.DenseArray(array_name, mode='r') as A:
        # Slice only rows 1, 2 and cols 2, 3, 4.
        data = A[1:3, 2:5]
        print(data["a"])
        sch = A.schema
        sch.dump()
def ccd(_input, bands, output=None, config=None, neighbourhood=7, overlap=1):
    if len(bands) == 2:
        if output is None or not os.path.exists(output):
            cfg = tiledb.Config(config)
            ctx = tiledb.Ctx(config=cfg)
            with tiledb.DenseArray(_input, 'r', ctx=ctx) as arr:
                y_dim = arr.schema.domain.dim(1)
                x_dim = arr.schema.domain.dim(2)
                height = y_dim.size
                width = x_dim.size
                tile_y_size = y_dim.tile
                tile_x_size = x_dim.tile
                dom = tiledb.Domain(
                    tiledb.Dim(domain=(0, height - 1), tile=tile_y_size, dtype=np.uint64),
                    tiledb.Dim(domain=(0, width - 1), tile=tile_x_size, dtype=np.uint64))
                schema = tiledb.ArraySchema(
                    domain=dom,
                    sparse=False,
                    attrs=[tiledb.Attr(name="c", dtype=np.float32)],
                    ctx=ctx)
                if output is None:
                    output = _input + '_result_' + ''.join(
                        random.choice(string.ascii_uppercase + string.digits)
                        for _ in range(4))
                tiledb.DenseArray.create(output, schema)

        x = da.from_tiledb(_input, storage_options=config)
        _, h, w = x.shape
        _, tile_y_size, tile_x_size = x.chunksize

        # w and h are an exact multiple of tile size
        n_tiles_x = w // tile_x_size
        n_tiles_y = h // tile_y_size

        # manually chunk and collect
        f = []
        for y in range(n_tiles_y):
            for x in range(n_tiles_x):
                f.append(
                    client.submit(calculate_change, _input, bands, neighbourhood,
                                  x, y, tile_x_size, tile_y_size, output, config))
        client.gather(f)
        return output
    else:
        raise IndexError('CCD function requires two band indexes')
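# Hedged usage sketch for ccd() above: run change detection over the two bands
# of a hypothetical TileDB stack. A dask.distributed `client` must already be
# in scope, as the function itself assumes.
def example_ccd():
    result_uri = ccd("/data/uavsar_stack", bands=[0, 1], neighbourhood=7)
    print("result written to", result_uri)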
def write_tiledb(arr, path, overwrite=True):
    """Write a tiledb array to disk."""
    if os.path.exists(path) and os.path.isdir(path) and overwrite:
        shutil.rmtree(path)
    if os.path.exists(path):
        raise FileExistsError("Output path {} already exists".format(path))

    ctx = tiledb.Ctx()
    n = arr.shape[0]
    n_tile_extent = min(DEFAULT_GENOME_TILE_EXTENT, n)
    d1 = tiledb.Dim(ctx, GENOME_DOMAIN_NAME, domain=(0, n - 1),
                    tile=n_tile_extent, dtype="uint32")

    if arr.ndim == 1:
        domain = tiledb.Domain(ctx, d1)
    elif arr.ndim == 2:
        m = arr.shape[1]
        d2 = tiledb.Dim(ctx, SECONDARY_DOMAIN_NAME, domain=(0, m - 1),
                        tile=m, dtype="uint32")
        domain = tiledb.Domain(ctx, d1, d2)
    else:
        raise ValueError("tiledb backend only supports 1D or 2D arrays")

    v = tiledb.Attr(
        ctx,
        GENOME_VALUE_NAME,
        compressor=(DEFAULT_COMPRESSOR, DEFAULT_COMPRESSOR_LEVEL),
        dtype="float32",
    )
    schema = tiledb.ArraySchema(ctx, domain=domain, attrs=(v,),
                                cell_order="row-major", tile_order="row-major")
    tiledb.DenseArray.create(path, schema)

    values = arr.astype(np.float32)
    with tiledb.DenseArray(ctx, path, mode="w") as A:
        A[:] = {GENOME_VALUE_NAME: values}
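# Hedged round-trip sketch for write_tiledb() above. The function targets the
# legacy tiledb-py API (Ctx passed positionally), so the read below follows the
# same convention; GENOME_VALUE_NAME is the module constant it writes under,
# and the output path is a placeholder.
def example_write_tiledb_roundtrip():
    signal = np.linspace(0.0, 1.0, num=1000)
    write_tiledb(signal, "/tmp/example_signal")
    ctx = tiledb.Ctx()
    with tiledb.DenseArray(ctx, "/tmp/example_signal", mode="r") as A:
        np.testing.assert_allclose(A[:][GENOME_VALUE_NAME], signal.astype(np.float32))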
def test_store_tile_db():
    ctx = tiledb.Ctx()

    tempdir = tempfile.mkdtemp()
    try:
        t = random.rand(50, 30, chunk_size=13)
        t2 = t + 1

        saved = totiledb(tempdir, t2)
        assert saved.shape == (0, 0)
        assert saved.op.tiledb_config is None
        assert saved.op.tiledb_uri == tempdir

        with pytest.raises(tiledb.TileDBError):
            tiledb.DenseArray(ctx=ctx, uri=tempdir)

        # the tiledb array is created during tiling
        saved = tile(saved)

        # no error
        tiledb.DenseArray(ctx=ctx, uri=tempdir)

        # TileDB consolidation
        assert len(saved.chunks) == 1
        assert saved.chunks[0].inputs[0].op.axis_offsets == (0, 0)
        assert saved.chunks[0].inputs[1].op.axis_offsets == (0, 13)
        assert saved.chunks[0].inputs[2].op.axis_offsets == (0, 26)    # input (0, 2)
        assert saved.chunks[0].inputs[5].op.axis_offsets == (13, 26)   # input (1, 2)
        assert saved.chunks[0].inputs[11].op.axis_offsets == (39, 26)  # input (3, 2)

        with pytest.raises(ValueError):
            t3 = random.rand(30, 50)
            totiledb(tempdir, t3, ctx=ctx)  # shape incompatible
    finally:
        shutil.rmtree(tempdir)
def fromtiledb(uri, ctx=None, key=None, timestamp=None, gpu=False):
    import tiledb

    raw_ctx = ctx
    if raw_ctx is None:
        ctx = tiledb.Ctx()

    # get metadata from tiledb
    try:
        tiledb_arr = tiledb.DenseArray(uri=uri, ctx=ctx, key=key, timestamp=timestamp)
        sparse = False
    except ValueError:
        # if the array is not dense, ValueError will be raised by tiledb
        tiledb_arr = tiledb.SparseArray(uri=uri, ctx=ctx, key=key, timestamp=timestamp)
        sparse = True

    if tiledb_arr.nattr > 1:
        raise NotImplementedError("Does not support TileDB array schema "
                                  "with more than 1 attr")

    tiledb_dim_starts = tuple(
        tiledb_arr.domain.dim(j).domain[0].item() for j in range(tiledb_arr.ndim))
    if any(isinstance(s, float) for s in tiledb_dim_starts):
        raise ValueError("Does not support TileDB array schema "
                         "whose dimensions have a float domain")

    dtype = tiledb_arr.attr(0).dtype
    tiledb_config = None if raw_ctx is None else ctx.config().dict()
    tensor_order = (TensorOrder.C_ORDER
                    if tiledb_arr.schema.cell_order == "row-major"
                    else TensorOrder.F_ORDER)
    op = TensorTileDBDataSource(
        tiledb_config=tiledb_config,
        tiledb_uri=uri,
        tiledb_key=key,
        tiledb_timestamp=timestamp,
        tiledb_dim_starts=tiledb_dim_starts,
        gpu=gpu,
        sparse=sparse,
        dtype=dtype,
    )
    chunk_size = tuple(
        int(tiledb_arr.domain.dim(i).tile) for i in range(tiledb_arr.domain.ndim))
    return op(tiledb_arr.shape, chunk_size=chunk_size, order=tensor_order)
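# Hedged usage sketch for fromtiledb() above, assuming the Mars-style deferred
# tensor API where execute() materializes the graph; the URI is a placeholder.
def example_fromtiledb():
    t = fromtiledb("/tmp/example_dense_array")
    result = t.execute()
    print(result.shape)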
def create_test_array_dense_25x12_mult(temp_rootdir):
    """Create a simple dense test array."""
    path = os.path.abspath(os.path.join(temp_rootdir, "dense_25x12_mult"))

    ctx = tiledb.default_ctx()
    rows_dim = tiledb.Dim("row", ctx=ctx, domain=(1, 25), dtype=np.int64)
    cols_dim = tiledb.Dim("col", ctx=ctx, domain=(1, 12), dtype=np.int64)
    dom = tiledb.Domain(rows_dim, cols_dim, ctx=ctx)
    att1 = tiledb.Attr(name="a", ctx=ctx, dtype=np.float64)
    att2 = tiledb.Attr(name="b", ctx=ctx, dtype=np.float64)
    schema = tiledb.ArraySchema(ctx=ctx, domain=dom, attrs=(att1, att2))
    tiledb.DenseArray.create(path, schema)

    data = np.reshape(np.arange(300), (25, 12))

    with tiledb.DenseArray(path, mode="w", timestamp=1) as A:
        A[:] = {"a": data, "b": data}

    with tiledb.DenseArray(path, mode="w", timestamp=2) as A:
        A[:] = {"a": data / 2, "b": data * 2}
def convert_dataframe_to_cxg_array(cxg_container, dataframe_name, dataframe, index_column_name, ctx):
    """
    Saves the contents of the dataframe to the CXG output directory specified.

    Current access patterns are oriented toward reading very large slices of the
    dataframe, one attribute at a time. Attribute data also tends to be (often)
    repetitive (bools, categories, strings). Given this, we use a large tile
    size (1000) and very aggressive compression levels.
    """

    def create_dataframe_array(array_name, dataframe):
        tiledb_filter = tiledb.FilterList([
            # Attempt aggressive compression as many of these dataframes are very
            # repetitive strings, bools and other non-float data.
            tiledb.ZstdFilter(level=22),
        ])
        attrs = [
            tiledb.Attr(name=column, dtype=get_dtype_of_array(dataframe[column]),
                        filters=tiledb_filter)
            for column in dataframe
        ]
        domain = tiledb.Domain(
            tiledb.Dim(domain=(0, dataframe.shape[0] - 1),
                       tile=min(dataframe.shape[0], 1000), dtype=np.uint32))
        schema = tiledb.ArraySchema(domain=domain, sparse=False, attrs=attrs,
                                    cell_order="row-major", tile_order="row-major")
        tiledb.DenseArray.create(array_name, schema)

    array_name = f"{cxg_container}/{dataframe_name}"
    create_dataframe_array(array_name, dataframe)

    with tiledb.DenseArray(array_name, mode="w", ctx=ctx) as array:
        value = {}
        schema_hints = {}
        for column_name, column_values in dataframe.items():
            dtype, hints = get_dtype_and_schema_of_array(column_values)
            value[column_name] = column_values.to_numpy(dtype=dtype)
            if hints:
                schema_hints.update({column_name: hints})

        schema_hints.update({"index": index_column_name})
        array[:] = value
        array.meta["cxg_schema"] = json.dumps(schema_hints)

    tiledb.consolidate(array_name, ctx=ctx)
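# Hedged usage sketch for convert_dataframe_to_cxg_array() above: store a tiny
# obs-style dataframe. The container path and index column name are
# placeholders, and the container directory is assumed to already exist.
def example_convert_dataframe():
    df = pd.DataFrame({"name": ["obs0", "obs1"], "n_genes": [120, 98]})
    convert_dataframe_to_cxg_array("example_cxg", "obs", df, "name", tiledb.Ctx())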