def time_tiledb(dataset, batch_size=1, num_batches=1):
    if os.path.exists(dataset + "_tileDB"):
        ds_tldb = tiledb.open(dataset + "_tileDB", mode="w")
    else:
        y_dim = tiledb.Dim(
            name="y",
            domain=(0, batch_size * num_batches - 1),
            tile=batch_size * num_batches,
            dtype="uint64",
        )
        x_dim = tiledb.Dim(name="x", domain=(0, 784), tile=785, dtype="uint64")
        domain = tiledb.Domain(y_dim, x_dim)
        attr = tiledb.Attr(name="", dtype="int64", var=False)
        schema = tiledb.ArraySchema(
            domain=domain,
            attrs=[attr],
            cell_order="row-major",
            tile_order="row-major",
            sparse=False,
        )
        tiledb.Array.create(dataset + "_tileDB", schema)
        ds_tldb = tiledb.open(dataset + "_tileDB", mode="w")

    assert type(ds_tldb) == tiledb.array.DenseArray

    time_batches(ds_tldb, batch_size, num_batches)

def test_quickstart(self):
    with tiledb.open(
        "tiledb://TileDB-Inc/quickstart_dense", ctx=tiledb.cloud.Ctx()
    ) as A:
        print("quickstart_dense:")
        print(A[:])

    with tiledb.open(
        "tiledb://TileDB-Inc/quickstart_sparse", ctx=tiledb.cloud.Ctx()
    ) as A:
        print("quickstart_sparse:")
        print(A[:])

        with self.assertRaises(TypeError):
            A.apply(None, [(0, 1)])

        import numpy

        orig = A[:]
        self.assertEqual(
            A.apply(lambda x: numpy.sum(x["a"]), [(1, 4), (1, 4)]),
            numpy.sum(orig["a"]),
        )

        orig = A.multi_index[[1, slice(2, 4)], [slice(1, 2), 4]]
        self.assertEqual(
            A.apply(lambda x: numpy.sum(x["a"]), [[1, slice(2, 4)], [(1, 2), 4]]),
            numpy.sum(orig["a"]),
        )

def ingest_in_tiledb(
    tmpdir,
    x_data,
    y_data,
    x_sparse,
    y_sparse,
    batch_size,
    num_attrs,
    pass_attrs,
    buffer_size,
    batch_shuffle,
    within_batch_shuffle,
):
    """Context manager for ingesting data into TileDB.

    Yields the keyword arguments for instantiating a TiledbDataset.
    """
    array_uuid = str(uuid.uuid4())
    x_uri = os.path.join(tmpdir, "x_" + array_uuid)
    y_uri = os.path.join(tmpdir, "y_" + array_uuid)
    _ingest_in_tiledb(x_uri, x_data, x_sparse, batch_size, num_attrs)
    _ingest_in_tiledb(y_uri, y_data, y_sparse, batch_size, num_attrs)
    attrs = [f"features_{attr}" for attr in range(num_attrs)] if pass_attrs else []
    with tiledb.open(x_uri) as x_array, tiledb.open(y_uri) as y_array:
        yield dict(
            x_array=x_array,
            y_array=y_array,
            batch_size=batch_size,
            buffer_size=buffer_size,
            batch_shuffle=batch_shuffle,
            within_batch_shuffle=within_batch_shuffle,
            x_attrs=attrs,
            y_attrs=attrs,
        )

def test_timestamp(self, runner, temp_rootdir, create_test_simple_csv):
    """
    Test for command

        tiledb convert_from [csv_file] [uri] --timestamp <int>
    """
    test_name, expected_output = create_test_simple_csv
    input_path = os.path.join(temp_rootdir, f"{test_name}.csv")
    uri = os.path.join(temp_rootdir, "test_timestamp.tdb")

    result = runner.invoke(
        root,
        [
            "convert-from",
            "csv",
            input_path,
            uri,
            "--sparse",
            "True",
            "--mode",
            "ingest",
            "--timestamp",
            "1",
        ],
    )
    assert result.exit_code == 0

    result = runner.invoke(
        root,
        [
            "convert-from",
            "csv",
            input_path,
            uri,
            "--sparse",
            "True",
            "--mode",
            "append",
            "--timestamp",
            "2",
        ],
    )
    assert result.exit_code == 0

    with tiledb.open(uri, timestamp=1) as array:
        assert pd.DataFrame.equals(
            array.df[:].loc[:, array.df[:].columns != "__tiledb_rows"],
            expected_output,
        )

    with tiledb.open(uri, timestamp=2) as array:
        assert pd.DataFrame.equals(
            array.df[:].loc[:, array.df[:].columns != "__tiledb_rows"],
            expected_output.append(expected_output, ignore_index=True),
        )

def open2(
    data_uri: URI, headers_uri: URI, config: Optional[tiledb.Config] = None
) -> Segy:
    ctx = tiledb.Ctx(config)
    data = tiledb.open(str(data_uri), attr="trace", ctx=ctx)
    headers = tiledb.open(str(headers_uri), ctx=ctx)
    if data.schema.domain.has_dim("traces"):
        cls = Segy
    else:
        cls = StructuredSegy
    return cls(data, headers)

def test_duplicates(self, runner, temp_rootdir, create_test_simple_csv):
    """
    Test for command

        tiledb convert_from [csv_file] [uri] --allows-duplicates (False|True)
    """
    test_name, _ = create_test_simple_csv
    input_path = os.path.join(temp_rootdir, f"{test_name}.csv")

    uri = os.path.join(temp_rootdir, "test_no_duplicates.tdb")
    result = runner.invoke(
        root,
        [
            "convert-from",
            "csv",
            input_path,
            uri,
            "--sparse",
            "True",
            "--allows-duplicates",
            "False",
        ],
    )
    assert result.exit_code == 0

    with tiledb.open(uri) as array:
        assert array.schema.allows_duplicates == False

    uri = os.path.join(temp_rootdir, "test_allows_duplicates.tdb")
    result = runner.invoke(
        root,
        [
            "convert-from",
            "csv",
            input_path,
            uri,
            "--sparse",
            "True",
            "--allows-duplicates",
            "True",
        ],
    )
    assert result.exit_code == 0

    with tiledb.open(uri) as array:
        assert array.schema.allows_duplicates == True

def load(self, model: Module, optimizer: Optimizer) -> dict:
    """
    Loads a PyTorch model from a TileDB array.

    :param model: Pytorch Module. A defined PyTorch model.
    :param optimizer: PyTorch Optimizer. A defined PyTorch optimizer.
    :return: Dict. A dictionary with attributes other than model or optimizer
        state_dict.
    """
    model_array = tiledb.open(self.uri)
    model_array_results = model_array[:]
    schema = model_array.schema

    model_state_dict = pickle.loads(
        model_array_results["model_state_dict"].item(0)
    )
    optimizer_state_dict = pickle.loads(
        model_array_results["optimizer_state_dict"].item(0)
    )

    # Load model's state and optimizer dictionaries
    model.load_state_dict(model_state_dict)
    optimizer.load_state_dict(optimizer_state_dict)

    # Get the rest of the attributes
    out_dict = {}
    for idx in range(schema.nattr):
        attr_name = schema.attr(idx).name
        if (
            schema.attr(idx).name != "model_state_dict"
            and schema.attr(idx).name != "optimizer_state_dict"
        ):
            out_dict[attr_name] = pickle.loads(
                model_array_results[attr_name].item(0)
            )
    return out_dict

def nonempty_domain(uri):
    """
    Output the non-empty domain of a TileDB array located at uri.
    """
    with tiledb.open(uri) as array:
        pp = pprint.PrettyPrinter()
        click.echo(pp.pformat(array.nonempty_domain()))

def _from_tdb_array(self, array_path, naming_key,
                    array_name=None, to_dask=False, handle_nan=None):
    """Retrieve data and metadata from a specified TileDB array."""
    with tiledb.open(array_path, 'r', ctx=self.ctx) as A:
        metadata = {k: v for k, v in A.meta.items()}
        if array_name is None:
            array_name = metadata[naming_key]
        if to_dask:
            schema = A.schema
            dtype = schema.attr(array_name).dtype
            chunks = [schema.domain.dim(i).tile for i in range(schema.ndim)]
            array_shape = self._array_shape(A.nonempty_domain())
            proxy = TileDBDataProxy(array_shape, dtype, array_path, array_name,
                                    handle_nan=handle_nan, ctx=self.ctx)
            points = da.from_array(proxy, chunks, name=naming_key)
        else:
            array_inds = self._array_shape(A.nonempty_domain(), slices=True)
            points = A[array_inds][array_name]
    return metadata, points

def write_multiattr_array(array_filename, data_vars,
                          start_index=None, scalar=False, ctx=None):
    """Write to each attr in the array."""
    # Determine shape of items to be written.
    zeroth_key = list(data_vars.keys())[0]
    # All data vars *must* have the same shape for writing...
    shape = data_vars[zeroth_key].shape
    if scalar:
        shape = (1,) + shape

    # Get write indices.
    if start_index is None:
        start_index = 0
        write_indices = _array_indices(shape, start_index)
    else:
        write_indices = start_index

    # Check for attrs with no data.
    for name, data_var in data_vars.items():
        if data_var is None:
            # Handle missing data for this attr.
            missing_data = np.empty(shape)
            missing_data.fill(np.nan)
            data_vars[name] = missing_data

    # Write netcdf data var contents into array.
    with tiledb.open(array_filename, 'w', ctx=ctx) as A:
        A[write_indices] = {
            name: data_var[...] for name, data_var in data_vars.items()
        }

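# Hedged usage sketch (not part of the original source): creating a small dense
# multi-attr array and writing two "data vars" into it with write_multiattr_array
# above. The path, shapes and attribute names are illustrative assumptions; passing
# explicit slices as start_index avoids the private _array_indices helper.
import os
import tempfile

import numpy as np
import tiledb

example_uri = os.path.join(tempfile.mkdtemp(), "multiattr_example")
dom = tiledb.Domain(
    tiledb.Dim(name="y", domain=(0, 9), tile=10, dtype=np.int64),
    tiledb.Dim(name="x", domain=(0, 19), tile=20, dtype=np.int64),
)
schema = tiledb.ArraySchema(
    domain=dom,
    attrs=[
        tiledb.Attr(name="air_temperature", dtype=np.float64),
        tiledb.Attr(name="relative_humidity", dtype=np.float64),
    ],
)
tiledb.Array.create(example_uri, schema)

data_vars = {
    "air_temperature": np.random.rand(10, 20),
    "relative_humidity": None,  # no data for this attr; the helper fills it with NaNs
}
write_multiattr_array(
    example_uri,
    data_vars,
    start_index=(slice(0, 10), slice(0, 20)),
)
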
def test_coords_filters(self, runner, temp_rootdir, create_test_simple_csv):
    """
    Test for command

        tiledb convert_from [csv_file] [uri] --coords-filters <filter name>,<filter name>,...
    """
    test_name, _ = create_test_simple_csv
    input_path = os.path.join(temp_rootdir, f"{test_name}.csv")
    uri = os.path.join(temp_rootdir, "test_coords_filters.tdb")

    result = runner.invoke(
        root,
        [
            "convert-from",
            "csv",
            input_path,
            uri,
            "--coords-filters",
            "GzipFilter=9",
        ],
    )
    print(result.stdout)
    assert result.exit_code == 0

    with tiledb.open(uri) as array:
        assert array.schema.coords_filters.nfilters == 1
        assert array.schema.coords_filters[0] == tiledb.GzipFilter(9)

def test_row_start_idx(self, runner, temp_rootdir, create_test_simple_csv):
    """
    Test for command

        tiledb convert_from [csv_file] [uri] --row-start-idx <int>
    """
    test_name, _ = create_test_simple_csv
    input_path = os.path.join(temp_rootdir, f"{test_name}.csv")
    uri = os.path.join(temp_rootdir, "test_row_start_idx.tdb")

    result = runner.invoke(
        root,
        [
            "convert-from",
            "csv",
            input_path,
            uri,
            "--sparse",
            "False",
            "--row-start-idx",
            "5",
        ],
    )
    assert result.exit_code == 0

    with tiledb.open(uri) as array:
        assert array.df[:].index.to_numpy()[0] == 5
        assert array.df[:].index.to_numpy()[-1] == 9

def _tiledb_array(
    self, uri: str, schema: tiledb.ArraySchema
) -> Iterator[tiledb.Array]:
    tiledb.Array.create(uri, schema)
    with tiledb.open(uri, mode="w") as tdb:
        yield tdb
    tiledb.consolidate(uri, config=self.config)
    tiledb.vacuum(uri, config=self.config)

def test_int_dtypes(self, runner, temp_rootdir, sparse, dtype):
    uri = os.path.abspath(
        os.path.join(
            temp_rootdir,
            tempfile.mkdtemp(),
            "test_int_dtypes_"
            f"{'sparse' if sparse else 'dense'}_"
            f"{np.dtype(dtype).name}",
        )
    )

    dom = tiledb.Domain(tiledb.Dim(domain=(1, 10), dtype=dtype))
    att = tiledb.Attr(dtype=dtype)
    schema = tiledb.ArraySchema(domain=dom, attrs=(att,), sparse=sparse)
    tiledb.Array.create(uri, schema)

    with tiledb.open(uri, mode="w") as A:
        if sparse:
            A[np.arange(1, 11)] = np.random.randint(10, size=10, dtype=dtype)
        else:
            A[:] = np.random.randint(10, size=10, dtype=dtype)

    result = runner.invoke(root, ["dump", "array", uri, "5"])
    assert result.exit_code == 0

    result = runner.invoke(root, ["dump", "array", uri, "1:10"])
    assert result.exit_code == 0

def get_upsampled_indices_chrom(inputs):
    region_start = inputs[0]
    region_end = inputs[1]
    tdb_array_name = inputs[2]
    tdb_ambig_attribute = inputs[3]
    tdb_partition_attribute_for_upsample = inputs[4]
    dataset_indices = inputs[5]
    tdb_partition_thresh_for_upsample = inputs[6]

    print("starting getting indices to upsample in range:" +
          str(region_start) + "-" + str(region_end))
    with tiledb.open(tdb_array_name, 'r',
                     ctx=tiledb.Ctx(get_default_config())) as tdb_array:
        if tdb_ambig_attribute is not None:
            attr_vals = tdb_array.query(attrs=[
                tdb_ambig_attribute, tdb_partition_attribute_for_upsample
            ]).multi_index[region_start:region_end - 1, dataset_indices]
            ambig_attr_vals = np.sum(attr_vals[tdb_ambig_attribute], axis=1)
        else:
            attr_vals = tdb_array.query(
                attrs=[tdb_partition_attribute_for_upsample]
            ).multi_index[region_start:region_end - 1, dataset_indices]

        upsample_vals = np.sum(
            attr_vals[tdb_partition_attribute_for_upsample], axis=1)

        if tdb_ambig_attribute is not None:
            cur_upsampled_indices = region_start + np.argwhere(
                (upsample_vals >= tdb_partition_thresh_for_upsample)
                & (ambig_attr_vals == 0))
        else:
            cur_upsampled_indices = region_start + np.argwhere(
                upsample_vals >= tdb_partition_thresh_for_upsample)

    print("finished indices to upsample in range:" +
          str(region_start) + "-" + str(region_end))
    return cur_upsampled_indices

def read_labels(self, data_adaptor):
    user_id = self.get_user_id()
    if user_id is None:
        return
    dataset_name = data_adaptor.get_location()
    dataset_id = self.db.get_or_create_dataset(dataset_name)
    annotation_object = self.db.query_for_most_recent(
        Annotation,
        [Annotation.user_id == user_id, Annotation.dataset_id == dataset_id],
    )
    if annotation_object:
        if annotation_object.tiledb_uri == "":
            # this means the user has removed all the categories.
            return None
        try:
            df = tiledb.open(annotation_object.tiledb_uri)
        except tiledb.TileDBError:
            # don't crash if the annotations file is missing or can't be read.
            current_app.logger.warning(
                f"Cannot read annotation file: {annotation_object.tiledb_uri}"
            )
            return None
        pandas_df = self.convert_to_pandas_df(df, annotation_object.schema_hints)
        return pandas_df
    else:
        return None

def _write_bytes_to_array(self, uri, contents, mimetype=None, format=None, type=None):
    """
    Write given bytes to the array. Will create the array if it does not exist.

    :param uri: array to write to
    :param contents: bytes to write
    :param mimetype: mimetype to set in metadata
    :param format: format to set in metadata
    :param type: type to set in metadata
    :return:
    """
    tiledb_uri = self.tiledb_uri_from_path(uri)
    final_array_name = None
    if self._is_new:
        # if not self._array_exists(uri):
        tiledb_uri, final_array_name = self._create_array(tiledb_uri, 5)

    with tiledb.open(tiledb_uri, mode="w", ctx=tiledb.cloud.Ctx()) as A:
        A[range(len(contents))] = {"contents": contents}
        A.meta["file_size"] = len(contents)
        if mimetype is not None:
            A.meta["mimetype"] = mimetype
        if format is not None:
            A.meta["format"] = format
        if type is not None:
            A.meta["type"] = type

    return final_array_name

def test_date_spec(self, runner, temp_rootdir, create_test_simple_csv):
    """
    Test for command

        tiledb convert_from [csv_file] [uri] --date-spec <column>:<datetime format spec>,...
    """
    test_name, expected_output = create_test_simple_csv
    input_path = os.path.join(temp_rootdir, f"{test_name}.csv")
    uri = os.path.join(temp_rootdir, "test_date_spec.tdb")

    result = runner.invoke(
        root,
        ["convert-from", "csv", input_path, uri, "--date-spec", "date:%b/%d/%Y"],
    )
    assert result.exit_code == 0

    with tiledb.open(uri) as array:
        assert pd.DataFrame.equals(
            array.query(["date"]).df[:],
            pd.DataFrame(pd.to_datetime(expected_output["date"])),
        )

def spatial_index(self, group_name, array_name, spatial_inds):
    """
    Index a specified array in coordinate space rather than in index space.

    TileDB arrays are all described in index space, with named `Dim`s describing
    a `Domain` that encapsulates the array. Earth system data, however, is
    typically described as labelled arrays, with a named coordinate describing
    each dimension of the array.

    Practically, this provides a mapping from spatial indices (the input) to
    index space, which is used to index the array.

    NOTE: only spatial coordinate *values* are supported; datetimes in
    particular are not currently supported.

    """
    array_filepath = self.array_path.construct_path(group_name, array_name)
    array_dims = self._get_dim_coords(array_filepath)

    # Check that all the coords being spatially indexed are in the array's coords.
    coord_names = list(spatial_inds.keys())
    assert list(set(coord_names) & set(array_dims)) == coord_names

    indices = []
    for dim_name in array_dims:
        coord_vals = spatial_inds.get(dim_name, None)
        if coord_vals is None:
            indices.append(slice(None))
        else:
            dim_slice = self._map_coords_inds(group_name, dim_name, coord_vals)
            indices.append(dim_slice)

    with tiledb.open(array_filepath, 'r', ctx=self.ctx) as A:
        subarray = A[tuple(indices)]
    return subarray

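# Hedged usage sketch (not part of the original source). `spatial_index` above is a
# method, so it needs an instance of its (unspecified) reader class; `reader`, the
# group/array names and the coordinate values below are hypothetical placeholders.
#
#   subarray = reader.spatial_index(
#       "surface_fields",                        # group_name
#       "air_temperature",                       # array_name
#       {"latitude": 51.5, "longitude": -0.1},   # coordinate *values*, not indices
#   )
#
# Dims not named in `spatial_inds` are returned in full (slice(None)); datetime
# coordinates are not supported, per the docstring above.
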
def test_mode_schema_only(self, runner, temp_rootdir, create_test_simple_csv):
    """
    Test for command

        tiledb convert_from [csv_file] [uri] --mode (ingest|schema_only|append)
    """
    test_name, _ = create_test_simple_csv
    input_path = os.path.join(temp_rootdir, f"{test_name}.csv")
    uri = os.path.join(temp_rootdir, "test_mode_schema_only.tdb")

    result = runner.invoke(
        root,
        [
            "convert-from",
            "csv",
            input_path,
            uri,
            "--sparse",
            "True",
            "--mode",
            "schema_only",
        ],
    )
    assert result.exit_code == 0

    with tiledb.open(uri) as array:
        assert array.query(use_arrow=False).df[0].empty

def uri(temp_rootdir):
    """
    Create a simple dense test array.
    """
    path = os.path.abspath(os.path.join(temp_rootdir, "test_array"))

    ctx = tiledb.default_ctx()
    rows_dim = tiledb.Dim(ctx=ctx, domain=(1, 25), dtype=np.int64)
    cols_dim = tiledb.Dim(ctx=ctx, domain=(1, 12), dtype=np.int64)
    dom = tiledb.Domain(rows_dim, cols_dim, ctx=ctx)
    att1 = tiledb.Attr(name="a", ctx=ctx, dtype=np.float64)
    att2 = tiledb.Attr(name="b", ctx=ctx, dtype=np.float64)
    schema = tiledb.ArraySchema(ctx=ctx, domain=dom, attrs=(att1, att2))

    tiledb.Array.create(path, schema)

    data = np.reshape(np.arange(300), (25, 12))
    for ts in range(1, 4):
        with tiledb.open(path, mode="w", timestamp=ts) as A:
            A[:] = {"a": data, "b": data}

    yield path

    shutil.rmtree(path)

def test_datetime_dtype(self, runner, temp_rootdir, dtype):
    uri = os.path.abspath(
        os.path.join(
            temp_rootdir,
            tempfile.mkdtemp(),
            f"test_datetime_dtype_{np.dtype(dtype).name}",
        )
    )

    dom = tiledb.Domain(
        tiledb.Dim(
            domain=(np.datetime64("1970-01-01"), np.datetime64("1980-01-01")),
            dtype=dtype,
        )
    )
    att = tiledb.Attr(dtype=dtype)
    schema = tiledb.ArraySchema(domain=dom, attrs=(att,), sparse=True)
    tiledb.Array.create(uri, schema)

    with tiledb.open(uri, mode="w") as A:
        A[np.arange(1, 11)] = np.random.randint(low=1, high=10, size=10)

    result = runner.invoke(root, ["dump", "array", uri, "'1970-01-04'"])
    assert result.exit_code == 0

    result = runner.invoke(
        root, ["dump", "array", uri, "'1970-01-01':'1980-01-01'"]
    )
    assert result.exit_code == 0

def _get_grid_mapping(self, data_array_path):
    """
    Get the grid mapping (Iris coord_system) from the data array metadata.

    The grid mapping is stored as a JSON string in the array meta, which is
    translated by `.grid_mappings.GridMapping`.

    """
    grid_mapping = None
    with tiledb.open(data_array_path, 'r', ctx=self.ctx) as A:
        try:
            grid_mapping_str = A.meta['grid_mapping']
        except KeyError:
            grid_mapping_str = None

        if grid_mapping_str is not None and grid_mapping_str != 'none':
            # Cannot write NoneType into TileDB array meta, so `'none'` is a
            # stand-in that must be caught.
            translator = GridMapping(grid_mapping_str)
            try:
                grid_mapping = translator.get_grid_mapping()
            except Exception as e:
                exception_type = e.__class__.__name__
                warnings.warn(
                    f'Re-raised as warning: {exception_type}: {e}.\nGrid mapping will be None.'
                )
    return grid_mapping

def test_header_and_names(self, runner, temp_rootdir, create_test_simple_csv):
    """
    Test for command

        tiledb convert_from [csv_file] [uri] --header 0 --names <column name>,...
    """
    test_name, _ = create_test_simple_csv
    input_path = os.path.join(temp_rootdir, f"{test_name}.csv")
    uri = os.path.join(temp_rootdir, "test_names.tdb")

    result = runner.invoke(
        root,
        [
            "convert-from",
            "csv",
            input_path,
            uri,
            "--header",
            "0",
            "--names",
            "d,c,b,a",
        ],
    )
    assert result.exit_code == 0

    with tiledb.open(uri) as array:
        assert array.df[:].columns[0] == "d"
        assert array.df[:].columns[1] == "c"
        assert array.df[:].columns[2] == "b"
        assert array.df[:].columns[3] == "a"

def _ingest_in_tiledb(
    uri: str, data: np.ndarray, sparse: bool, batch_size: int, num_attrs: int
) -> None:
    dims = [
        tiledb.Dim(
            name=f"dim_{dim}",
            domain=(0, data.shape[dim] - 1),
            tile=np.random.randint(1, data.shape[dim] if dim > 0 else batch_size),
            dtype=np.int32,
        )
        for dim in range(data.ndim)
    ]

    # TileDB schema
    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(*dims),
        sparse=sparse,
        attrs=[
            tiledb.Attr(name=f"features_{attr}", dtype=np.float32)
            for attr in range(num_attrs)
        ],
    )

    # Create the (empty) array on disk.
    tiledb.Array.create(uri, schema)

    # Ingest
    with tiledb.open(uri, "w") as tiledb_array:
        idx = np.nonzero(data) if sparse else slice(None)
        tiledb_array[idx] = {
            f"features_{attr}": data[idx] for attr in range(num_attrs)
        }

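# Hedged usage sketch (not part of the original source): exercising _ingest_in_tiledb
# above with a small dense and a small sparse NumPy array. The directory, shapes and
# attribute count are illustrative assumptions.
import os
import tempfile

import numpy as np

example_dir = tempfile.mkdtemp()

# Dense ingest: every cell of the (100, 8) array is written to one float32 attribute.
dense_data = np.random.rand(100, 8).astype(np.float32)
_ingest_in_tiledb(
    os.path.join(example_dir, "x_dense"), dense_data,
    sparse=False, batch_size=32, num_attrs=1,
)

# Sparse ingest: only the non-zero cells are written, as coordinates plus values.
sparse_data = np.zeros((100, 8), dtype=np.float32)
sparse_data[::10, ::2] = 1.0
_ingest_in_tiledb(
    os.path.join(example_dir, "x_sparse"), sparse_data,
    sparse=True, batch_size=32, num_attrs=1,
)
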
def _extract(self, array_name):
    """
    Return the path to a named array, plus paths for all the associated
    dimension arrays.

    Handles multi-attr arrays by scanning all attrs in arrays that match the
    data array name passed to `self` at instantiation.

    """
    # Sanity check the requested array name is in this TileDB.
    assert array_name in self.arrays.keys()

    named_array_path = self.arrays[array_name]
    named_group_path, _ = os.path.split(named_array_path)
    named_group_arrays = self.groups[named_group_path]

    with tiledb.open(named_array_path, 'r', ctx=self.ctx) as A:
        dim_names = A.meta['dimensions'].split(',')

    dim_paths = []
    for dim_name in dim_names:
        for array_path in named_group_arrays:
            array_path = array_path[:-1] if array_path.endswith('/') else array_path
            if array_path.endswith(dim_name):
                dim_paths.append(array_path)
                break
    # Confirm we have an array path for each dim_name.
    assert len(dim_paths) == len(dim_names)

    return named_array_path, dim_paths

def load(
    self, compile_model: bool = False, custom_objects: Optional[dict] = None
) -> Model:
    """
    Loads a Tensorflow model from a TileDB array.

    :param compile_model: Boolean. Whether to compile the model after loading or not.
    :param custom_objects: Optional dictionary mapping names (strings) to custom
        classes or functions to be considered during deserialization.
    :return: Model. Tensorflow model.
    """
    model_array = tiledb.open(self.uri)
    model_array_results = model_array[:]
    model_weights = pickle.loads(model_array_results["model_weights"].item(0))
    model_config = json.loads(model_array.meta["model_config"])

    architecture = model_config["config"]
    model_class = model_config["class_name"]

    if model_class == "Sequential":
        model = tf.keras.Sequential.from_config(architecture)
    elif model_class == "Functional":
        model = tf.keras.Model.from_config(architecture)
    else:
        raise NotImplementedError(
            "No support for Subclassed models at the moment. Your "
            "model should be either Sequential or Functional."
        )

    model.set_weights(model_weights)

    if compile_model:
        optimizer_weights = pickle.loads(
            model_array_results["optimizer_weights"].item(0)
        )
        training_config = json.loads(model_array.meta["training_config"])

        # Compile model.
        model.compile(
            **saving_utils.compile_args_from_training_config(
                training_config, custom_objects
            )
        )
        saving_utils.try_build_compiled_arguments(model)

        # Set optimizer weights.
        if optimizer_weights:
            try:
                model.optimizer._create_all_weights(model.trainable_variables)
            except (NotImplementedError, AttributeError):
                logging.warning(
                    "Error when creating the weights of optimizer {}, making it "
                    "impossible to restore the saved optimizer state. As a result, "
                    "your model is starting with a freshly initialized optimizer."
                )

            try:
                model.optimizer.set_weights(optimizer_weights)
            except ValueError:
                logging.warning(
                    "Error in loading the saved optimizer "
                    "state. As a result, your model is "
                    "starting with a freshly initialized "
                    "optimizer."
                )

    return model

def time_tiledb(dataset, batch_size=1):
    ds = hub.Dataset(dataset)
    if os.path.exists(dataset.split("/")[1] + "_tileDB"):
        ds_tldb = tiledb.open(dataset.split("/")[1] + "_tileDB")
    else:
        if not os.path.exists(dataset.split("/")[1] + "_tileDB"):
            os.makedirs(dataset.split("/")[1] + "_tileDB")
        ds_numpy = np.concatenate(
            (
                ds["image"].compute().reshape(ds.shape[0], -1),
                ds["label"].compute().reshape(ds.shape[0], -1),
            ),
            axis=1,
        )
        ds_tldb = tiledb.from_numpy(dataset.split("/")[1] + "_tileDB", ds_numpy)

    assert type(ds_tldb) == tiledb.array.DenseArray

    with Timer("Time"):
        counter = 0
        t0 = time()
        for batch in range(ds.shape[0] // batch_size):
            x, y = (
                ds_tldb[batch * batch_size:(batch + 1) * batch_size, :-1],
                ds_tldb[batch * batch_size:(batch + 1) * batch_size, -1],
            )
            counter += 1
            t1 = time()
            print("Batch", counter, f"dt: {t1 - t0}")
            t0 = t1

def metadata(uri):
    """
    Output the metadata of a TileDB array located at uri.
    """
    with tiledb.open(uri) as array:
        pp = pprint.PrettyPrinter()
        click.echo(pp.pformat(array.meta.items()))

def test_quickstart_sql_async(self):
    with tiledb.open(
        "tiledb://TileDB-Inc/quickstart_sparse", ctx=tiledb.cloud.Ctx()
    ) as A:
        print("quickstart_sparse:")
        print(A[:])

        with self.assertRaises(TypeError):
            A.apply(None, [(0, 1)]).get()

        import numpy

        orig = A[:]
        task_name = "test_quickstart_sql_async"
        self.assertEqual(
            int(
                tiledb.cloud.sql.exec_async(
                    "select sum(a) as sum from `tiledb://TileDB-Inc/quickstart_sparse`",
                    task_name=task_name,
                ).get()["sum"]
            ),
            numpy.sum(orig["a"]),
        )
        # Validate task name was set
        self.assertEqual(tiledb.cloud.last_sql_task().name, task_name)

        orig = A.multi_index[[1, slice(2, 4)], [slice(1, 2), 4]]
        self.assertEqual(
            int(
                tiledb.cloud.sql.exec_async(
                    "select sum(a) as sum from `tiledb://TileDB-Inc/quickstart_sparse` WHERE (`rows`, `cols`) in ((1,1), (2,4))"
                ).get()["sum"]
            ),
            numpy.sum(orig["a"]),
        )

def from_tiledb(uri, attribute=None, chunks=None, storage_options=None, **kwargs):
    """Load array from the TileDB storage format

    See https://docs.tiledb.io for more information about TileDB.

    Parameters
    ----------
    uri: TileDB array or str
        Location to save the data
    attribute: str or None
        Attribute selection (single-attribute view on multi-attribute array)

    Returns
    -------
    A Dask Array

    Examples
    --------

    >>> # create a tiledb array
    >>> import tiledb, numpy as np, tempfile  # doctest: +SKIP
    >>> uri = tempfile.NamedTemporaryFile().name  # doctest: +SKIP
    >>> tiledb.from_numpy(uri, np.arange(0,9).reshape(3,3))  # doctest: +SKIP
    <tiledb.libtiledb.DenseArray object at 0x...>

    >>> # read back the array
    >>> import dask.array as da  # doctest: +SKIP
    >>> tdb_ar = da.from_tiledb(uri)  # doctest: +SKIP
    >>> tdb_ar.shape  # doctest: +SKIP
    (3, 3)
    >>> tdb_ar.mean().compute()  # doctest: +SKIP
    4.0
    """
    import tiledb

    tiledb_config = storage_options or dict()
    key = tiledb_config.pop('key', None)

    if isinstance(uri, tiledb.Array):
        tdb = uri
    else:
        tdb = tiledb.open(uri, attr=attribute, config=tiledb_config, key=key)

    if tdb.schema.sparse:
        raise ValueError("Sparse TileDB arrays are not supported")

    if not attribute:
        if tdb.schema.nattr > 1:
            raise TypeError(
                "keyword 'attribute' must be provided "
                "when loading a multi-attribute TileDB array"
            )
        else:
            attribute = tdb.schema.attr(0).name

    if tdb.iswritable:
        raise ValueError("TileDB array must be open for reading")

    chunks = chunks or _tiledb_to_chunks(tdb)

    assert len(chunks) == tdb.schema.ndim

    return core.from_array(tdb, chunks, name='tiledb-%s' % uri)