def time_tiledb(dataset, batch_size=1, num_batches=1):
    if os.path.exists(dataset + "_tileDB"):
        ds_tldb = tiledb.open(dataset + "_tileDB", mode="w")
    else:
        y_dim = tiledb.Dim(
            name="y",
            domain=(0, batch_size * num_batches - 1),
            tile=batch_size * num_batches,
            dtype="uint64",
        )
        x_dim = tiledb.Dim(name="x", domain=(0, 784), tile=785, dtype="uint64")
        domain = tiledb.Domain(y_dim, x_dim)
        attr = tiledb.Attr(name="", dtype="int64", var=False)
        schema = tiledb.ArraySchema(
            domain=domain,
            attrs=[attr],
            cell_order="row-major",
            tile_order="row-major",
            sparse=False,
        )
        tiledb.Array.create(dataset + "_tileDB", schema)
        ds_tldb = tiledb.open(dataset + "_tileDB", mode="w")

    assert type(ds_tldb) == tiledb.array.DenseArray
    time_batches(ds_tldb, batch_size, num_batches)
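`time_batches` is not included in this snippet; a minimal sketch of what such a helper might look like, assuming it simply slices one batch of rows at a time and prints the elapsed time per batch (only the call signature is taken from the code above, the body is illustrative):

from time import time

def time_batches(array, batch_size=1, num_batches=1):
    # Illustrative only: read `num_batches` slices of `batch_size` rows each
    # and report the elapsed time per batch.
    t0 = time()
    for batch in range(num_batches):
        rows = slice(batch * batch_size, (batch + 1) * batch_size)
        _ = array[rows, :]  # one batch of flattened images plus the label column
        t1 = time()
        print("Batch", batch + 1, f"dt: {t1 - t0}")
        t0 = t1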
Example No. 2
    def test_quickstart(self):
        with tiledb.open("tiledb://TileDB-Inc/quickstart_dense",
                         ctx=tiledb.cloud.Ctx()) as A:
            print("quickstart_dense:")
            print(A[:])

        with tiledb.open("tiledb://TileDB-Inc/quickstart_sparse",
                         ctx=tiledb.cloud.Ctx()) as A:
            print("quickstart_sparse:")
            print(A[:])

            with self.assertRaises(TypeError):
                A.apply(None, [(0, 1)])

            import numpy

            orig = A[:]
            self.assertEqual(
                A.apply(lambda x: numpy.sum(x["a"]), [(1, 4), (1, 4)]),
                numpy.sum(orig["a"]),
            )

            orig = A.multi_index[[1, slice(2, 4)], [slice(1, 2), 4]]
            self.assertEqual(
                A.apply(lambda x: numpy.sum(x["a"]),
                        [[1, slice(2, 4)], [(1, 2), 4]]),
                numpy.sum(orig["a"]),
            )
Example No. 3
def ingest_in_tiledb(
    tmpdir,
    x_data,
    y_data,
    x_sparse,
    y_sparse,
    batch_size,
    num_attrs,
    pass_attrs,
    buffer_size,
    batch_shuffle,
    within_batch_shuffle,
):
    """Context manager for ingest data into TileDB.

    Yield the keyword arguments for instantiating a TiledbDataset.
    """
    array_uuid = str(uuid.uuid4())
    x_uri = os.path.join(tmpdir, "x_" + array_uuid)
    y_uri = os.path.join(tmpdir, "y_" + array_uuid)
    _ingest_in_tiledb(x_uri, x_data, x_sparse, batch_size, num_attrs)
    _ingest_in_tiledb(y_uri, y_data, y_sparse, batch_size, num_attrs)
    attrs = [f"features_{attr}" for attr in range(num_attrs)] if pass_attrs else []
    with tiledb.open(x_uri) as x_array, tiledb.open(y_uri) as y_array:
        yield dict(
            x_array=x_array,
            y_array=y_array,
            batch_size=batch_size,
            buffer_size=buffer_size,
            batch_shuffle=batch_shuffle,
            within_batch_shuffle=within_batch_shuffle,
            x_attrs=attrs,
            y_attrs=attrs,
        )
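
A hedged usage sketch: the generator above is presumably wrapped with `contextlib.contextmanager` in its source module (the decorator is not shown here), and `TiledbDataset` is a stand-in name taken from the docstring.

import contextlib

import numpy as np

# Hypothetical usage; wrapping the generator gives the usual `with ... as kwargs` form.
ingest_cm = contextlib.contextmanager(ingest_in_tiledb)

x = np.random.rand(128, 10).astype(np.float32)
y = np.random.rand(128, 1).astype(np.float32)

with ingest_cm(
    tmpdir="/tmp/arrays",
    x_data=x,
    y_data=y,
    x_sparse=False,
    y_sparse=False,
    batch_size=32,
    num_attrs=1,
    pass_attrs=True,
    buffer_size=64,
    batch_shuffle=False,
    within_batch_shuffle=False,
) as kwargs:
    dataset = TiledbDataset(**kwargs)  # stand-in class name from the docstring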
Example No. 4
    def test_timestamp(self, runner, temp_rootdir, create_test_simple_csv):
        """
        Test for command

            tiledb convert_from [csv_file] [uri] --timestamp <int>
        """
        test_name, expected_output = create_test_simple_csv
        input_path = os.path.join(temp_rootdir, f"{test_name}.csv")
        uri = os.path.join(temp_rootdir, "test_timestamp.tdb")

        result = runner.invoke(
            root,
            [
                "convert-from",
                "csv",
                input_path,
                uri,
                "--sparse",
                "True",
                "--mode",
                "ingest",
                "--timestamp",
                "1",
            ],
        )

        assert result.exit_code == 0

        result = runner.invoke(
            root,
            [
                "convert-from",
                "csv",
                input_path,
                uri,
                "--sparse",
                "True",
                "--mode",
                "append",
                "--timestamp",
                "2",
            ],
        )

        assert result.exit_code == 0

        with tiledb.open(uri, timestamp=1) as array:
            assert pd.DataFrame.equals(
                array.df[:].loc[:, array.df[:].columns != "__tiledb_rows"],
                expected_output,
            )

        with tiledb.open(uri, timestamp=2) as array:
            assert pd.DataFrame.equals(
                array.df[:].loc[:, array.df[:].columns != "__tiledb_rows"],
                expected_output.append(expected_output, ignore_index=True),
            )
Example No. 5
def open2(
    data_uri: URI, headers_uri: URI, config: Optional[tiledb.Config] = None
) -> Segy:
    ctx = tiledb.Ctx(config)
    data = tiledb.open(str(data_uri), attr="trace", ctx=ctx)
    headers = tiledb.open(str(headers_uri), ctx=ctx)
    if data.schema.domain.has_dim("traces"):
        cls = Segy
    else:
        cls = StructuredSegy
    return cls(data, headers)
Example No. 6
    def test_duplicates(self, runner, temp_rootdir, create_test_simple_csv):
        """
        Test for command

            tiledb convert_from [csv_file] [uri] --allows-duplicates (False|True)
        """
        test_name, _ = create_test_simple_csv
        input_path = os.path.join(temp_rootdir, f"{test_name}.csv")

        uri = os.path.join(temp_rootdir, "test_no_duplicates.tdb")

        result = runner.invoke(
            root,
            [
                "convert-from",
                "csv",
                input_path,
                uri,
                "--sparse",
                "True",
                "--allows-duplicates",
                "False",
            ],
        )

        assert result.exit_code == 0

        with tiledb.open(uri) as array:
            assert array.schema.allows_duplicates == False

        uri = os.path.join(temp_rootdir, "test_allows_duplicates.tdb")

        result = runner.invoke(
            root,
            [
                "convert-from",
                "csv",
                input_path,
                uri,
                "--sparse",
                "True",
                "--allows-duplicates",
                "True",
            ],
        )

        assert result.exit_code == 0

        with tiledb.open(uri) as array:
            assert array.schema.allows_duplicates == True
Example No. 7
    def load(self, model: Module, optimizer: Optimizer) -> dict:
        """
        Loads a PyTorch model from a TileDB array.
        :param model: PyTorch Module. A defined PyTorch model.
        :param optimizer: PyTorch Optimizer. A defined PyTorch optimizer.
        :return: Dict. A dictionary with attributes other than model or optimizer
        state_dict.
        """

        model_array = tiledb.open(self.uri)
        model_array_results = model_array[:]
        schema = model_array.schema

        model_state_dict = pickle.loads(
            model_array_results["model_state_dict"].item(0))
        optimizer_state_dict = pickle.loads(
            model_array_results["optimizer_state_dict"].item(0))

        # Load model's state and optimizer dictionaries
        model.load_state_dict(model_state_dict)
        optimizer.load_state_dict(optimizer_state_dict)

        # Get the rest of the attributes
        out_dict = {}
        for idx in range(schema.nattr):
            attr_name = schema.attr(idx).name
            if (schema.attr(idx).name != "model_state_dict"
                    and schema.attr(idx).name != "optimizer_state_dict"):
                out_dict[attr_name] = pickle.loads(
                    model_array_results[attr_name].item(0))
        return out_dict
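
A hedged usage sketch, assuming the class that owns `load` (called `PyTorchTileDBModel` here purely for illustration) is constructed with the array `uri`:

import torch

# Illustrative only: the wrapper class name and its constructor are assumptions.
model = torch.nn.Linear(4, 2)
optimizer = torch.optim.Adam(model.parameters())

loader = PyTorchTileDBModel(uri="tiledb_models/linear_model")
extra_attrs = loader.load(model=model, optimizer=optimizer)
print(extra_attrs)  # any pickled attributes other than the two state dicts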
Example No. 8
def nonempty_domain(uri):
    """
    Output the non-empty domain of a TileDB array located at uri.
    """
    with tiledb.open(uri) as array:
        pp = pprint.PrettyPrinter()
        click.echo(pp.pformat(array.nonempty_domain()))
Example No. 9
    def _from_tdb_array(self,
                        array_path,
                        naming_key,
                        array_name=None,
                        to_dask=False,
                        handle_nan=None):
        """Retrieve data and metadata from a specified TileDB array."""
        with tiledb.open(array_path, 'r', ctx=self.ctx) as A:
            metadata = {k: v for k, v in A.meta.items()}
            if array_name is None:
                array_name = metadata[naming_key]
            if to_dask:
                schema = A.schema
                dtype = schema.attr(array_name).dtype
                chunks = [
                    schema.domain.dim(i).tile for i in range(schema.ndim)
                ]
                array_shape = self._array_shape(A.nonempty_domain())
                proxy = TileDBDataProxy(array_shape,
                                        dtype,
                                        array_path,
                                        array_name,
                                        handle_nan=handle_nan,
                                        ctx=self.ctx)
                points = da.from_array(proxy, chunks, name=naming_key)
            else:
                array_inds = self._array_shape(A.nonempty_domain(),
                                               slices=True)
                points = A[array_inds][array_name]
        return metadata, points
Example No. 10
def write_multiattr_array(array_filename,
                          data_vars,
                          start_index=None,
                          scalar=False,
                          ctx=None):
    """Write to each attr in the array."""
    # Determine shape of items to be written.
    zeroth_key = list(data_vars.keys())[0]
    # All data vars *must* have the same shape for writing.
    shape = data_vars[zeroth_key].shape
    if scalar:
        shape = (1, ) + shape

    # Get write indices.
    if start_index is None:
        start_index = 0
        write_indices = _array_indices(shape, start_index)
    else:
        write_indices = start_index

    # Check for attrs with no data.
    for name, data_var in data_vars.items():
        if data_var is None:
            # Handle missing data for this attr.
            missing_data = np.empty(shape)
            missing_data.fill(np.nan)
            data_vars[name] = missing_data

    # Write netcdf data var contents into array.
    with tiledb.open(array_filename, 'w', ctx=ctx) as A:
        A[write_indices] = {
            name: data_var[...]
            for name, data_var in data_vars.items()
        }
Example No. 11
    def test_coords_filters(self, runner, temp_rootdir,
                            create_test_simple_csv):
        """
        Test for command

            tiledb convert_from [csv_file] [uri] --coords-filters <filter name>,<filter name>,...
        """
        test_name, _ = create_test_simple_csv
        input_path = os.path.join(temp_rootdir, f"{test_name}.csv")
        uri = os.path.join(temp_rootdir, "test_coords_filters.tdb")

        result = runner.invoke(
            root,
            [
                "convert-from",
                "csv",
                input_path,
                uri,
                "--coords-filters",
                "GzipFilter=9",
            ],
        )

        print(result.stdout)
        assert result.exit_code == 0

        with tiledb.open(uri) as array:
            assert array.schema.coords_filters.nfilters == 1
            assert array.schema.coords_filters[0] == tiledb.GzipFilter(9)
Example No. 12
    def test_row_start_idx(self, runner, temp_rootdir, create_test_simple_csv):
        """
        Test for command

            tiledb convert_from [csv_file] [uri] --row-start-idx <int>
        """
        test_name, _ = create_test_simple_csv
        input_path = os.path.join(temp_rootdir, f"{test_name}.csv")
        uri = os.path.join(temp_rootdir, "test_row_start_idx.tdb")

        result = runner.invoke(
            root,
            [
                "convert-from",
                "csv",
                input_path,
                uri,
                "--sparse",
                "False",
                "--row-start-idx",
                "5",
            ],
        )

        assert result.exit_code == 0

        with tiledb.open(uri) as array:
            assert array.df[:].index.to_numpy()[0] == 5
            assert array.df[:].index.to_numpy()[-1] == 9
Example No. 13
    def _tiledb_array(self, uri: str,
                      schema: tiledb.ArraySchema) -> Iterator[tiledb.Array]:
        tiledb.Array.create(uri, schema)
        with tiledb.open(uri, mode="w") as tdb:
            yield tdb
        tiledb.consolidate(uri, config=self.config)
        tiledb.vacuum(uri, config=self.config)
Example No. 14
    def test_int_dtypes(self, runner, temp_rootdir, sparse, dtype):
        uri = os.path.abspath(
            os.path.join(
                temp_rootdir,
                tempfile.mkdtemp(),
                "test_int_dtypes_"
                f"{'sparse' if sparse else 'dense'}_"
                f"{np.dtype(dtype).name}",
            ))

        dom = tiledb.Domain(tiledb.Dim(domain=(1, 10), dtype=dtype))
        att = tiledb.Attr(dtype=dtype)
        schema = tiledb.ArraySchema(domain=dom, attrs=(att, ), sparse=sparse)
        tiledb.Array.create(uri, schema)

        with tiledb.open(uri, mode="w") as A:
            if sparse:
                A[np.arange(1, 11)] = np.random.randint(10,
                                                        size=10,
                                                        dtype=dtype)
            else:
                A[:] = np.random.randint(10, size=10, dtype=dtype)

        result = runner.invoke(root, ["dump", "array", uri, "5"])
        assert result.exit_code == 0

        result = runner.invoke(root, ["dump", "array", uri, "1:10"])
        assert result.exit_code == 0
Example No. 15
def get_upsampled_indices_chrom(inputs):
    region_start = inputs[0]
    region_end = inputs[1]
    tdb_array_name = inputs[2]
    tdb_ambig_attribute = inputs[3]
    tdb_partition_attribute_for_upsample = inputs[4]
    dataset_indices = inputs[5]
    tdb_partition_thresh_for_upsample = inputs[6]
    print("starting getting indices to upsample in range:" +
          str(region_start) + "-" + str(region_end))
    with tiledb.open(tdb_array_name, 'r',
                     ctx=tiledb.Ctx(get_default_config())) as tdb_array:
        if tdb_ambig_attribute is not None:
            attr_vals = tdb_array.query(attrs=[
                tdb_ambig_attribute, tdb_partition_attribute_for_upsample
            ]).multi_index[region_start:region_end - 1, dataset_indices]
            ambig_attr_vals = np.sum(attr_vals[tdb_ambig_attribute], axis=1)
        else:
            attr_vals = tdb_array.query(
                attrs=[tdb_partition_attribute_for_upsample]).multi_index[
                    region_start:region_end - 1, dataset_indices]
        upsample_vals = np.sum(attr_vals[tdb_partition_attribute_for_upsample],
                               axis=1)
    if tdb_ambig_attribute is not None:
        cur_upsampled_indices = region_start + np.argwhere(
            (upsample_vals >= tdb_partition_thresh_for_upsample)
            & (ambig_attr_vals == 0))
    else:
        cur_upsampled_indices = region_start + np.argwhere(
            upsample_vals >= tdb_partition_thresh_for_upsample)
    print("finished indices to upsample in range:" + str(region_start) + "-" +
          str(region_end))
    return cur_upsampled_indices
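
The single `inputs` tuple suggests this function is meant to be fanned out over regions with something like `multiprocessing.Pool`; a hedged sketch of how the argument tuples might be assembled (array URI, attribute names, dataset indices and threshold are all placeholders):

from multiprocessing import Pool

# Hypothetical driver: one argument tuple per region, matching the positional
# unpacking at the top of get_upsampled_indices_chrom.
regions = [(0, 100_000), (100_000, 200_000)]
jobs = [
    (start, end, "my_tdb_array", "ambig", "label", [0, 1, 2], 0.5)
    for start, end in regions
]
with Pool(processes=4) as pool:
    upsampled = pool.map(get_upsampled_indices_chrom, jobs)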
Example No. 16
    def read_labels(self, data_adaptor):
        user_id = self.get_user_id()
        if user_id is None:
            return
        dataset_name = data_adaptor.get_location()
        dataset_id = self.db.get_or_create_dataset(dataset_name)

        annotation_object = self.db.query_for_most_recent(
            Annotation,
            [Annotation.user_id == user_id, Annotation.dataset_id == dataset_id],
        )
        if annotation_object:
            if annotation_object.tiledb_uri == "":
                # this means the user has removed all the categories.
                return None
            try:
                df = tiledb.open(annotation_object.tiledb_uri)
            except tiledb.TileDBError:
                # don't crash if the annotations file is missing or can't be read.
                current_app.logger.warning(
                    f"Cannot read annotation file: {annotation_object.tiledb_uri}"
                )
                return None
            pandas_df = self.convert_to_pandas_df(
                df, annotation_object.schema_hints)
            return pandas_df
        else:
            return None
Example No. 17
    def _write_bytes_to_array(self,
                              uri,
                              contents,
                              mimetype=None,
                              format=None,
                              type=None):
        """
        Write the given bytes to the array, creating the array if it does not already exist.
        :param uri: array to write to
        :param contents: bytes to write
        :param mimetype: mimetype to set in metadata
        :param format: format to set in metadata
        :param type: type to set in metadata
        :return:
        """
        tiledb_uri = self.tiledb_uri_from_path(uri)
        final_array_name = None
        if self._is_new:
            # if not self._array_exists(uri):
            tiledb_uri, final_array_name = self._create_array(tiledb_uri, 5)

        with tiledb.open(tiledb_uri, mode="w", ctx=tiledb.cloud.Ctx()) as A:
            A[range(len(contents))] = {"contents": contents}
            A.meta["file_size"] = len(contents)
            if mimetype is not None:
                A.meta["mimetype"] = mimetype
            if format is not None:
                A.meta["format"] = format
            if type is not None:
                A.meta["type"] = type

        return final_array_name
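
The writer above assumes a dense array with one byte per cell and a single `contents` attribute; a hedged sketch of a schema that would satisfy those assumptions (dimension size and tiling are illustrative, not necessarily the layout TileDB Cloud file arrays actually use):

import numpy as np
import tiledb

# Illustrative schema only: one uint8 cell per byte of file content.
dom = tiledb.Domain(
    tiledb.Dim(name="position", domain=(0, 10_000_000), tile=65_536, dtype=np.uint64)
)
schema = tiledb.ArraySchema(
    domain=dom,
    sparse=False,
    attrs=[tiledb.Attr(name="contents", dtype=np.uint8)],
)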
Example No. 18
    def test_date_spec(self, runner, temp_rootdir, create_test_simple_csv):
        """
        Test for command

            tiledb convert_from [csv_file] [uri] --date-spec <column>:<datetime format spec>,...
        """
        test_name, expected_output = create_test_simple_csv
        input_path = os.path.join(temp_rootdir, f"{test_name}.csv")
        uri = os.path.join(temp_rootdir, "test_date_spec.tdb")

        result = runner.invoke(
            root,
            [
                "convert-from", "csv", input_path, uri, "--date-spec",
                "date:%b/%d/%Y"
            ],
        )

        assert result.exit_code == 0

        with tiledb.open(uri) as array:
            assert pd.DataFrame.equals(
                array.query(["date"]).df[:],
                pd.DataFrame(pd.to_datetime(expected_output["date"])),
            )
Example No. 19
    def spatial_index(self, group_name, array_name, spatial_inds):
        """
        Index a specified array in coordinate space rather than in index space.
        TileDB arrays are all described in index space, with named `Dim`s
        describing a `Domain` that encapsulates the array. Earth system data, however,
        typically is described as labelled arrays, with a named coordinate describing
        each dimension of the array.

        Practically, this provides a mapping from spatial indices (the input) to
        index space, which is used to index the array.

        NOTE: only spatial coordinate *values* are supported; datetimes in particular
        are not currently supported.

        """
        array_filepath = self.array_path.construct_path(group_name, array_name)
        array_dims = self._get_dim_coords(array_filepath)

        # Check that all the coords being spatially indexed are in the array's coords.
        coord_names = list(spatial_inds.keys())
        assert list(set(coord_names) & set(array_dims)) == coord_names

        indices = []
        for dim_name in array_dims:
            coord_vals = spatial_inds.get(dim_name, None)
            if coord_vals is None:
                indices.append(slice(None))
            else:
                dim_slice = self._map_coords_inds(group_name, dim_name, coord_vals)
                indices.append(dim_slice)

        with tiledb.open(array_filepath, 'r', ctx=self.ctx) as A:
            subarray = A[tuple(indices)]
        return subarray
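
A hedged usage sketch of `spatial_index`, assuming an array with `latitude` and `longitude` dimensions whose coordinate values can be mapped by `_map_coords_inds` (the reader instance, group, array and coordinate names are placeholders):

# Hypothetical call: select two latitude values and a single longitude value.
subarray = reader.spatial_index(
    group_name="surface",
    array_name="air_temperature",
    spatial_inds={"latitude": [30.0, 60.0], "longitude": 0.0},
)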
Example No. 20
    def test_mode_schema_only(self, runner, temp_rootdir,
                              create_test_simple_csv):
        """
        Test for command

            tiledb convert_from [csv_file] [uri] --mode (ingest|schema_only|append)
        """
        test_name, _ = create_test_simple_csv
        input_path = os.path.join(temp_rootdir, f"{test_name}.csv")
        uri = os.path.join(temp_rootdir, "test_mode_schema_only.tdb")

        result = runner.invoke(
            root,
            [
                "convert-from",
                "csv",
                input_path,
                uri,
                "--sparse",
                "True",
                "--mode",
                "schema_only",
            ],
        )

        assert result.exit_code == 0

        with tiledb.open(uri) as array:
            assert array.query(use_arrow=False).df[0].empty
Example No. 21
def uri(temp_rootdir):
    """
    Create a simple dense test array.
    """
    path = os.path.abspath(os.path.join(temp_rootdir, "test_array"))

    ctx = tiledb.default_ctx()
    rows_dim = tiledb.Dim(ctx=ctx, domain=(1, 25), dtype=np.int64)
    cols_dim = tiledb.Dim(ctx=ctx, domain=(1, 12), dtype=np.int64)
    dom = tiledb.Domain(rows_dim, cols_dim, ctx=ctx)
    att1 = tiledb.Attr(name="a", ctx=ctx, dtype=np.float64)
    att2 = tiledb.Attr(name="b", ctx=ctx, dtype=np.float64)
    schema = tiledb.ArraySchema(ctx=ctx, domain=dom, attrs=(att1, att2))

    tiledb.Array.create(path, schema)

    data = np.reshape(np.arange(300), (25, 12))

    for ts in range(1, 4):
        with tiledb.open(path, mode="w", timestamp=ts) as A:
            A[:] = {"a": data, "b": data}

    yield path

    shutil.rmtree(path)
Example No. 22
    def test_datetime_dtype(self, runner, temp_rootdir, dtype):
        uri = os.path.abspath(
            os.path.join(
                temp_rootdir,
                tempfile.mkdtemp(),
                f"test_datetime_dtype_{np.dtype(dtype).name}",
            ))

        dom = tiledb.Domain(
            tiledb.Dim(
                domain=(np.datetime64("1970-01-01"),
                        np.datetime64("1980-01-01")),
                dtype=dtype,
            ))
        att = tiledb.Attr(dtype=dtype)
        schema = tiledb.ArraySchema(domain=dom, attrs=(att, ), sparse=True)
        tiledb.Array.create(uri, schema)

        with tiledb.open(uri, mode="w") as A:
            A[np.arange(1, 11)] = np.random.randint(low=1, high=10, size=10)

        result = runner.invoke(root, ["dump", "array", uri, "'1970-01-04'"])
        assert result.exit_code == 0

        result = runner.invoke(
            root, ["dump", "array", uri, "'1970-01-01':'1980-01-01'"])
        assert result.exit_code == 0
Example No. 23
    def _get_grid_mapping(self, data_array_path):
        """
        Get the grid mapping (Iris coord_system) from the data array metadata.
        Grid mapping is stored as a JSON string in the array meta,
        which is translated by `.grid_mappings.GridMapping`.

        """
        grid_mapping = None
        with tiledb.open(data_array_path, 'r', ctx=self.ctx) as A:
            try:
                grid_mapping_str = A.meta['grid_mapping']
            except KeyError:
                grid_mapping_str = None
        if grid_mapping_str is not None and grid_mapping_str != 'none':
            # Cannot write NoneType into TileDB array meta, so `'none'` is a
            # stand-in that must be caught.
            translator = GridMapping(grid_mapping_str)
            try:
                grid_mapping = translator.get_grid_mapping()
            except Exception as e:
                exception_type = e.__class__.__name__
                warnings.warn(
                    f'Re-raised as warning: {exception_type}: {e}.\nGrid mapping will be None.'
                )
        return grid_mapping
Example No. 24
    def test_header_and_names(self, runner, temp_rootdir,
                              create_test_simple_csv):
        """
        Test for command

            tiledb convert_from [csv_file] [uri] --header 0 --names <column name>,...
        """
        test_name, _ = create_test_simple_csv
        input_path = os.path.join(temp_rootdir, f"{test_name}.csv")
        uri = os.path.join(temp_rootdir, "test_names.tdb")

        result = runner.invoke(
            root,
            [
                "convert-from",
                "csv",
                input_path,
                uri,
                "--header",
                "0",
                "--names",
                "d,c,b,a",
            ],
        )

        assert result.exit_code == 0
        with tiledb.open(uri) as array:
            assert array.df[:].columns[0] == "d"
            assert array.df[:].columns[1] == "c"
            assert array.df[:].columns[2] == "b"
            assert array.df[:].columns[3] == "a"
Example No. 25
def _ingest_in_tiledb(
    uri: str, data: np.ndarray, sparse: bool, batch_size: int, num_attrs: int
) -> None:
    dims = [
        tiledb.Dim(
            name=f"dim_{dim}",
            domain=(0, data.shape[dim] - 1),
            tile=np.random.randint(1, data.shape[dim] if dim > 0 else batch_size),
            dtype=np.int32,
        )
        for dim in range(data.ndim)
    ]

    # TileDB schema
    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(*dims),
        sparse=sparse,
        attrs=[
            tiledb.Attr(name=f"features_{attr}", dtype=np.float32)
            for attr in range(num_attrs)
        ],
    )

    # Create the (empty) array on disk.
    tiledb.Array.create(uri, schema)

    # Ingest
    with tiledb.open(uri, "w") as tiledb_array:
        idx = np.nonzero(data) if sparse else slice(None)
        tiledb_array[idx] = {f"features_{attr}": data[idx] for attr in range(num_attrs)}
    def _extract(self, array_name):
        """
        Return the path to a named array, plus paths for all the associated
        dimension arrays.

        Handles multi-attr arrays by scanning all attrs in arrays that match the data
        array name passed to `self` at instantiation.

        """
        # Sanity check the requested array name is in this TileDB.
        assert array_name in self.arrays.keys()
        named_array_path = self.arrays[array_name]

        named_group_path, _ = os.path.split(named_array_path)
        named_group_arrays = self.groups[named_group_path]

        with tiledb.open(named_array_path, 'r', ctx=self.ctx) as A:
            dim_names = A.meta['dimensions'].split(',')

        dim_paths = []
        for dim_name in dim_names:
            for array_path in named_group_arrays:
                array_path = array_path[:-1] if array_path.endswith('/') else array_path
                if array_path.endswith(dim_name):
                    dim_paths.append(array_path)
                    break
        # Confirm we have an array path for each dim_name.
        assert len(dim_paths) == len(dim_names)

        return named_array_path, dim_paths
Example No. 27
    def load(self,
             compile_model: bool = False,
             custom_objects: Optional[dict] = None) -> Model:
        """
        Loads a Tensorflow model from a TileDB array.
        :param compile_model: Boolean. Whether to compile the model after loading or not.
        :param custom_objects: Optional dictionary mapping names (strings) to
        custom classes or functions to be considered during deserialization.
        :return: Model. Tensorflow model.
        """
        model_array = tiledb.open(self.uri)
        model_array_results = model_array[:]
        model_weights = pickle.loads(
            model_array_results["model_weights"].item(0))
        model_config = json.loads(model_array.meta["model_config"])

        architecture = model_config["config"]
        model_class = model_config["class_name"]

        if model_class == "Sequential":
            model = tf.keras.Sequential.from_config(architecture)
        elif model_class == "Functional":
            model = tf.keras.Model.from_config(architecture)
        else:
            raise NotImplementedError(
                "No support for Subclassed models at the moment. Your "
                "model should be either Sequential or Functional.")

        model.set_weights(model_weights)

        if compile_model:
            optimizer_weights = pickle.loads(
                model_array_results["optimizer_weights"].item(0))
            training_config = json.loads(model_array.meta["training_config"])

            # Compile model.
            model.compile(**saving_utils.compile_args_from_training_config(
                training_config, custom_objects))
            saving_utils.try_build_compiled_arguments(model)

            # Set optimizer weights.
            if optimizer_weights:
                try:
                    model.optimizer._create_all_weights(
                        model.trainable_variables)
                except (NotImplementedError, AttributeError):
                    logging.warning(
                        "Error when creating the weights of the optimizer, making it "
                        "impossible to restore the saved optimizer state. As a result, "
                        "your model is starting with a freshly initialized optimizer."
                    )

                try:
                    model.optimizer.set_weights(optimizer_weights)
                except ValueError:
                    logging.warning("Error in loading the saved optimizer "
                                    "state. As a result, your model is "
                                    "starting with a freshly initialized "
                                    "optimizer.")
        return model
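
A hedged usage sketch, assuming the class that owns `load` (called `TensorflowTileDBModel` here purely for illustration) is constructed with the array `uri`:

# Illustrative only; the wrapper class name and its constructor are assumptions.
loader = TensorflowTileDBModel(uri="tiledb_models/my_keras_model")
model = loader.load(compile_model=True)
model.summary()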
Example No. 28
def time_tiledb(dataset, batch_size=1):
    ds = hub.Dataset(dataset)
    if os.path.exists(dataset.split("/")[1] + "_tileDB"):
        ds_tldb = tiledb.open(dataset.split("/")[1] + "_tileDB")
    else:
        if not os.path.exists(dataset.split("/")[1] + "_tileDB"):
            os.makedirs(dataset.split("/")[1] + "_tileDB")
        ds_numpy = np.concatenate(
            (
                ds["image"].compute().reshape(ds.shape[0], -1),
                ds["label"].compute().reshape(ds.shape[0], -1),
            ),
            axis=1,
        )
        ds_tldb = tiledb.from_numpy(
            dataset.split("/")[1] + "_tileDB", ds_numpy)

    assert type(ds_tldb) == tiledb.array.DenseArray

    with Timer("Time"):
        counter = 0
        t0 = time()
        for batch in range(ds.shape[0] // batch_size):
            x, y = (
                ds_tldb[batch * batch_size:(batch + 1) * batch_size, :-1],
                ds_tldb[batch * batch_size:(batch + 1) * batch_size, -1],
            )
            counter += 1
            t1 = time()
            print("Batch", counter, f"dt: {t1 - t0}")
            t0 = t1
Example No. 29
def metadata(uri):
    """
    Output the metadata of a TileDB array located at uri.
    """
    with tiledb.open(uri) as array:
        pp = pprint.PrettyPrinter()
        click.echo(pp.pformat(array.meta.items()))
Example No. 30
    def test_quickstart_sql_async(self):
        with tiledb.open("tiledb://TileDB-Inc/quickstart_sparse",
                         ctx=tiledb.cloud.Ctx()) as A:
            print("quickstart_sparse:")
            print(A[:])

            with self.assertRaises(TypeError):
                A.apply(None, [(0, 1)]).get()

            import numpy

            orig = A[:]
            task_name = "test_quickstart_sql_async"
            self.assertEqual(
                int(
                    tiledb.cloud.sql.exec_async(
                        "select sum(a) as sum from `tiledb://TileDB-Inc/quickstart_sparse`",
                        task_name=task_name,
                    ).get()["sum"]),
                numpy.sum(orig["a"]),
            )

            # Validate task name was set
            self.assertEqual(tiledb.cloud.last_sql_task().name, task_name)

            orig = A.multi_index[[1, slice(2, 4)], [slice(1, 2), 4]]
            self.assertEqual(
                int(
                    tiledb.cloud.sql.exec_async(
                        "select sum(a) as sum from `tiledb://TileDB-Inc/quickstart_sparse` WHERE (`rows`, `cols`) in ((1,1), (2,4))"
                    ).get()["sum"]),
                numpy.sum(orig["a"]),
            )
Example No. 31
def from_tiledb(uri, attribute=None, chunks=None,
                storage_options=None, **kwargs):
    """Load array from the TileDB storage format

    See https://docs.tiledb.io for more information about TileDB.

    Parameters
    ----------
    uri: TileDB array or str
        Location of the TileDB array to load
    attribute: str or None
        Attribute selection (single-attribute view on multi-attribute array)


    Returns
    -------

    A Dask Array

    Examples
    --------

    >>> # create a tiledb array
    >>> import tiledb, numpy as np, tempfile  # doctest: +SKIP
    >>> uri = tempfile.NamedTemporaryFile().name  # doctest: +SKIP
    >>> tiledb.from_numpy(uri, np.arange(0,9).reshape(3,3))  # doctest: +SKIP
    <tiledb.libtiledb.DenseArray object at 0x...>
    >>> # read back the array
    >>> import dask.array as da  # doctest: +SKIP
    >>> tdb_ar = da.from_tiledb(uri)  # doctest: +SKIP
    >>> tdb_ar.shape  # doctest: +SKIP
    (3, 3)
    >>> tdb_ar.mean().compute()  # doctest: +SKIP
    4.0
    """
    import tiledb
    tiledb_config = storage_options or dict()
    key = tiledb_config.pop('key', None)

    if isinstance(uri, tiledb.Array):
        tdb = uri
    else:
        tdb = tiledb.open(uri, attr=attribute, config=tiledb_config, key=key)

    if tdb.schema.sparse:
        raise ValueError("Sparse TileDB arrays are not supported")

    if not attribute:
        if tdb.schema.nattr > 1:
            raise TypeError("keyword 'attribute' must be provided"
                            "when loading a multi-attribute TileDB array")
        else:
            attribute = tdb.schema.attr(0).name

    if tdb.iswritable:
        raise ValueError("TileDB array must be open for reading")

    chunks = chunks or _tiledb_to_chunks(tdb)

    assert len(chunks) == tdb.schema.ndim

    return core.from_array(tdb, chunks, name='tiledb-%s' % uri)