def test_tiledb_test():
    """Regression check against TileDB sparse reads.

    Writes random values into a sparse 2-D array, then verifies that a
    full-array read performed between two identical range queries does
    not change the number of results those queries return.
    """
    import tiledb

    n_rows = 1000
    n_cols = 1000
    num_vals = 1000
    row_idxs = np.sort(np.random.choice(n_rows, num_vals, replace=False))
    col_idxs = np.sort(np.random.choice(n_cols, num_vals, replace=False))
    data = np.random.randint(0, 100, num_vals, np.uint8)

    ctx = tiledb.Ctx()
    row_dim = tiledb.Dim(
        "ndom",
        domain=(0, n_rows - 1),
        tile=min(100, n_rows),
        dtype="uint32",
        ctx=ctx,
    )
    col_dim = tiledb.Dim(
        "mdom",
        domain=(0, n_cols - 1),
        tile=n_cols,
        dtype="uint32",
        ctx=ctx,
    )
    value_attr = tiledb.Attr(
        "v",
        filters=tiledb.FilterList([tiledb.LZ4Filter(level=-1)]),
        dtype="uint8",
        ctx=ctx,
    )
    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(row_dim, col_dim, ctx=ctx),
        attrs=(value_attr, ),
        capacity=10000,
        cell_order="row-major",
        tile_order="row-major",
        sparse=True,
        ctx=ctx,
    )

    with tempfile.TemporaryDirectory() as tdir:
        path = os.path.join(tdir, "arr.tiledb")
        tiledb.SparseArray.create(path, schema)
        with tiledb.SparseArray(path, mode="w", ctx=ctx) as A:
            A[row_idxs, col_idxs] = data

        # Read back through a fresh context, as a separate consumer would.
        reader = tiledb.SparseArray(path, mode="r", ctx=tiledb.Ctx())
        before = reader[1:10, 1:50]
        _ = reader[:, :]  # full scan in between must not perturb later queries
        after = reader[1:10, 1:50]
        assert before["v"].shape[0] == after["v"].shape[0]
def to_tiledb(self, uri: Union[str, PurePath]) -> None:
    """Export this SEG-Y file to a TileDB group at *uri*.

    Idempotently creates a group containing two dense arrays:
    ``headers`` (one attribute per trace-header field) and ``data``
    (the trace samples).  Components that already exist are left as-is.
    """
    # Coerce plain strings; PurePath inputs are used unchanged.
    # NOTE(review): ``URL`` is defined outside this chunk — presumably a
    # path-like wrapper supporting the ``/`` join operator; confirm.
    uri = URL(uri) if not isinstance(uri, PurePath) else uri
    if tiledb.object_type(str(uri)) != "group":
        tiledb.group_create(str(uri))

    # --- headers array: dense, one attribute per trace-header field ---
    headers_uri = str(uri / "headers")
    if tiledb.object_type(headers_uri) != "array":
        dims = self._get_dims(TRACE_FIELDS_SIZE)
        header_schema = tiledb.ArraySchema(
            domain=tiledb.Domain(*dims),
            sparse=False,
            attrs=[
                tiledb.Attr(f.name, f.dtype, filters=TRACE_FIELD_FILTERS)
                for f in TRACE_FIELDS
            ],
        )
        with self._tiledb_array(headers_uri, header_schema) as tdb:
            self._fill_headers(tdb)

    # --- data array: header dims plus a trailing "samples" dimension ---
    data_uri = str(uri / "data")
    if tiledb.object_type(data_uri) != "array":
        samples = len(self.segy_file.samples)
        sample_dtype = self.segy_file.dtype
        sample_size = sample_dtype.itemsize
        dims = list(self._get_dims(sample_size * samples))
        dims.append(
            tiledb.Dim(
                name="samples",
                domain=(0, samples - 1),
                dtype=dims[0].dtype,
                # Tile extent sized so each tile is ~self.tile_size bytes,
                # clamped to the valid range [1, samples].
                tile=np.clip(self.tile_size // sample_size, 1, samples),
            ))
        data_schema = tiledb.ArraySchema(
            domain=tiledb.Domain(*dims),
            sparse=False,
            attrs=[
                tiledb.Attr("trace",
                            sample_dtype,
                            filters=(tiledb.LZ4Filter(), ))
            ],
        )
        with self._tiledb_array(data_uri, data_schema) as tdb:
            self._fill_data(tdb)
def test_attr_filters_multi(self, runner, temp_rootdir, create_test_simple_csv):
    """Verify ``convert-from csv --attr-filters`` with multiple filters.

    The option accepts ``<attr>:<filter>[=level],...`` clauses joined by
    ``;``; attributes not listed must end up with no filters.
    """
    test_name, _ = create_test_simple_csv
    input_path = os.path.join(temp_rootdir, f"{test_name}.csv")
    uri = os.path.join(temp_rootdir, "test_attr_filters_multi.tdb")

    cli_args = [
        "convert-from",
        "csv",
        input_path,
        uri,
        "--attr-filters",
        ("a:LZ4Filter=10,BitShuffleFilter;"
         "b:DoubleDeltaFilter,PositiveDeltaFilter=3"),
    ]
    result = runner.invoke(root, cli_args)
    print(result.stdout)
    assert result.exit_code == 0

    with tiledb.open(uri) as array:
        schema = array.schema
        # "a": LZ4 at level 10, then bit-shuffle.
        a_filters = schema.attr("a").filters
        assert a_filters.nfilters == 2
        assert a_filters[0] == tiledb.LZ4Filter(10)
        assert a_filters[1] == tiledb.BitShuffleFilter()
        # "b": double-delta, then positive-delta with window 3.
        b_filters = schema.attr("b").filters
        assert b_filters.nfilters == 2
        assert b_filters[0] == tiledb.DoubleDeltaFilter()
        assert b_filters[1] == tiledb.PositiveDeltaFilter(3)
        # Unlisted attributes stay unfiltered.
        assert schema.attr("c").filters.nfilters == 0
        assert schema.attr("date").filters.nfilters == 0
def write_sparse_array(path, n, m, n_idxs, m_idxs, values, clip=True):
    """Write a sparse ``n x m`` uint8 array of vplot counts to *path*.

    Parameters
    ----------
    path : str
        Destination TileDB array directory; must not already exist.
    n, m : int
        Matrix dimensions (genome domain x insert domain).
    n_idxs, m_idxs : np.ndarray
        Row / column coordinates of the nonzero values.
    values : np.ndarray
        Values at the given coordinates.  Clipped to ``VPLOT_MAX_VALUE``
        when *clip* is true; otherwise validated to fit that range.

    Raises
    ------
    FileExistsError
        If *path* already exists.
    ValueError
        If any coordinate or value is out of range.
    """
    if os.path.exists(path):
        raise FileExistsError("{} already exists".format(path))
    if n_idxs.min() < 0 or n_idxs.max() >= n:
        raise ValueError("row indexes must be in range [0, n - 1]")
    if m_idxs.min() < 0 or m_idxs.max() >= m:
        # Fixed typo in message: "must in in range" -> "must be in range".
        raise ValueError("column indexes must be in range [0, m - 1]")

    # Round-trip through CSC to get coordinates in sorted order and to
    # merge duplicate entries (scipy sums duplicates on conversion).
    sparse = coo_matrix((values, (n_idxs, m_idxs)), dtype=np.int32)
    sparse = sparse.tocsc(copy=False).tocoo(copy=False)
    n_idxs = sparse.row
    m_idxs = sparse.col
    values = sparse.data

    if clip:
        values = np.minimum(values, VPLOT_MAX_VALUE)
    if values.min() < 0 or values.max() > VPLOT_MAX_VALUE:
        raise ValueError(
            "vplot values must be in range [0, {}]".format(VPLOT_MAX_VALUE))

    # Restored: `ctx` is referenced throughout the schema construction
    # below but its creation had been commented out, which raises
    # NameError unless a module-level `ctx` happens to exist.
    ctx = tiledb.Ctx()
    n_tile_extent = min(DEFAULT_GENOME_TILE_EXTENT, n)
    d1 = tiledb.Dim(
        GENOME_DOMAIN_NAME,
        domain=(0, n - 1),
        tile=n_tile_extent,
        dtype="uint32",
        ctx=ctx,
    )
    d2 = tiledb.Dim(INSERT_DOMAIN_NAME,
                    domain=(0, m - 1),
                    tile=m,
                    dtype="uint32",
                    ctx=ctx)
    domain = tiledb.Domain(d1, d2, ctx=ctx)
    v = tiledb.Attr(
        "v",
        filters=tiledb.FilterList([tiledb.LZ4Filter(level=-1)]),
        dtype="uint8",
        ctx=ctx,
    )
    schema = tiledb.ArraySchema(
        ctx=ctx,
        domain=domain,
        attrs=(v, ),
        capacity=1000,
        cell_order="row-major",
        tile_order="row-major",
        sparse=True,
    )
    tiledb.SparseArray.create(path, schema)
    with tiledb.SparseArray(path, mode="w", ctx=ctx) as A:
        # Values fit in uint8 after the range check above.
        A[n_idxs, m_idxs] = values.astype(np.uint8)
    # Tail of iter_typed_trace_fields (its `def` line is above this chunk).
    # Map header-field byte widths to numpy dtypes (2 -> int16, 4 -> int32).
    size2dtype = {2: np.dtype(np.int16), 4: np.dtype(np.int32)}
    # Pair each field with its successor; the difference of their enum
    # byte offsets gives the field's size in bytes.
    for f, f2 in zip(all_fields, all_fields[1:]):
        name = str(f)
        if name in include_names:
            yield TypedTraceField(name, f, size2dtype[int(f2) - int(f)])


# Materialized metadata for the selected trace-header fields.
TRACE_FIELDS = tuple(iter_typed_trace_fields())
TRACE_FIELD_ENUMS = tuple(int(f.enum) for f in TRACE_FIELDS)
TRACE_FIELD_NAMES = tuple(f.name for f in TRACE_FIELDS)
TRACE_FIELD_DTYPES = tuple(f.dtype for f in TRACE_FIELDS)
# Total byte size of one trace's header record.
TRACE_FIELDS_SIZE = sum(dtype.itemsize for dtype in TRACE_FIELD_DTYPES)
# Compression pipeline applied to every header attribute.
TRACE_FIELD_FILTERS = (
    tiledb.BitWidthReductionFilter(),
    tiledb.ByteShuffleFilter(),
    tiledb.LZ4Filter(),
)


class ExtendedSegyFile(segyio.SegyFile):
    """segyio.SegyFile with cached accessors for the fast direction."""

    @cached_property
    def trace_size(self) -> int:
        # Bytes per trace: sample count times per-sample width.
        return len(self._samples) * int(self._dtype.itemsize)

    @cached_property
    def fast_headerline(self) -> segyio.line.HeaderLine:
        # Header accessor along the fast (inline or crossline) direction.
        return self._header.iline if self.is_inline else self._header.xline

    @cached_property
    def fast_lines(self) -> np.ndarray:
        # Line numbers along the fast direction.
        return self._ilines if self.is_inline else self._xlines