Example #1
def create_test_array_sparse_25x12_mult(temp_rootdir):
    """
    Create a simple sparse test array.
    """
    path = os.path.abspath(os.path.join(temp_rootdir, "sparse_25x12_mult"))

    ctx = tiledb.default_ctx()
    rows_dim = tiledb.Dim("row", ctx=ctx, domain=(1, 25), dtype=np.int64)
    cols_dim = tiledb.Dim("col", ctx=ctx, domain=(1, 12), dtype=np.int64)
    dom = tiledb.Domain(rows_dim, cols_dim, ctx=ctx)
    att1 = tiledb.Attr(name="a", ctx=ctx, dtype=np.float64)
    att2 = tiledb.Attr(name="b", ctx=ctx, dtype=np.float64)
    schema = tiledb.ArraySchema(ctx=ctx, sparse=True, domain=dom, attrs=(att1, att2))

    tiledb.SparseArray.create(path, schema)

    coords = np.array(list(itertools.product(np.arange(1, 26), np.arange(1, 13))))
    rows = coords[:, 0]
    cols = coords[:, 1]
    data = np.arange(300)

    with tiledb.SparseArray(path, mode="w", timestamp=1) as A:
        A[rows, cols] = {"a": data, "b": data}

    with tiledb.SparseArray(path, mode="w", timestamp=2) as A:
        A[rows, cols] = {"a": data / 2, "b": data * 2}
Example #2
def test_tiledb_test():
    import tiledb

    n = 1000
    m = 1000
    num_vals = 1000

    n_idxs = np.sort(np.random.choice(n, num_vals, replace=False))
    m_idxs = np.sort(np.random.choice(m, num_vals, replace=False))
    values = np.random.randint(0, 100, num_vals, np.uint8)

    ctx = tiledb.Ctx()

    n_tile_extent = min(100, n)

    d1 = tiledb.Dim("ndom",
                    domain=(0, n - 1),
                    tile=n_tile_extent,
                    dtype="uint32",
                    ctx=ctx)
    d2 = tiledb.Dim("mdom", domain=(0, m - 1), tile=m, dtype="uint32", ctx=ctx)

    domain = tiledb.Domain(d1, d2, ctx=ctx)

    v = tiledb.Attr(
        "v",
        filters=tiledb.FilterList([tiledb.LZ4Filter(level=-1)]),
        dtype="uint8",
        ctx=ctx,
    )

    schema = tiledb.ArraySchema(
        domain=domain,
        attrs=(v, ),
        capacity=10000,
        cell_order="row-major",
        tile_order="row-major",
        sparse=True,
        ctx=ctx,
    )

    with tempfile.TemporaryDirectory() as tdir:

        path = os.path.join(tdir, "arr.tiledb")

        tiledb.SparseArray.create(path, schema)

        with tiledb.SparseArray(path, mode="w", ctx=ctx) as A:
            A[n_idxs, m_idxs] = values

        ctx2 = tiledb.Ctx()

        s = tiledb.SparseArray(path, mode="r", ctx=ctx2)
        vs1 = s[1:10, 1:50]

        _ = s[:, :]
        vs2 = s[1:10, 1:50]

        assert vs1["v"].shape[0] == vs2["v"].shape[0]
Example #3
def test_ingest_csv_sparse_array_apppend_header_mismatch(
    udf_uri, array_name, key, secret, namespace, bucket, config
):
    """
    Create a sparse array from a CSV file using ingest_csv() in the default
    ingest mode and then append additional data to it using the append mode.
    The appended data contains header names that do not match the data in the
    sparse array and must be renamed.
    """
    tiledb.cloud.udf.exec(
        "s3://{}/inputs/{}.csv".format(bucket, "increment_sparse1"),
        "tiledb://{}/s3://{}/{}.tdb".format(namespace, bucket, array_name),
        key,
        secret,
        mode="ingest",
        full_domain=True,
        index_col=("x"),
        name=udf_uri,  # "unittest/test_ingest_csv" --> TileDB-Inc/ingest_csv
    )

    time.sleep(10)

    with tiledb.SparseArray(
        "tiledb://{}/{}.tdb".format(namespace, array_name), "r", ctx=tiledb.Ctx(config)
    ) as A:
        data = pd.DataFrame(A[:])
        number_of_rows = data.shape[0]
        assert number_of_rows == 20

    tiledb.cloud.udf.exec(
        "s3://{}/inputs/{}.csv".format(bucket, "increment_sparse2_mismatch"),
        "tiledb://{}/s3://{}/{}.tdb".format(namespace, bucket, array_name),
        key,
        secret,
        mode="append",
        full_domain=True,
        index_col=("x"),
        header=0,
        names=["x", "c", "b", "a"],
        name=udf_uri,  # "unittest/test_ingest_csv" --> TileDB-Inc/ingest_csv
    )

    time.sleep(10)

    with tiledb.SparseArray(
        "tiledb://{}/{}.tdb".format(namespace, array_name), "r", ctx=tiledb.Ctx(config)
    ) as A:
        data = pd.DataFrame(A[:])

        for col, attribute in enumerate(("a", "b", "c"), 1):
            assert_array_equal(
                data[attribute],
                np.array([row * 10 + col for row in range(1, 21)] * 2),
            )
Example #4
def main():
    ctx = tiledb.Ctx()

    # Create dimensions
    d1 = tiledb.Dim(ctx, "d1", domain=(1, 4), tile=2, dtype="uint64")
    d2 = tiledb.Dim(ctx, "d2", domain=(1, 4), tile=2, dtype="uint64")

    # Create domain
    domain = tiledb.Domain(ctx, d1, d2)

    # Create attributes
    a1 = tiledb.Attr(ctx, "a1", compressor=('blosc-lz', -1), dtype="int32")
    a2 = tiledb.Attr(ctx, "a2", compressor=("gzip", -1), dtype="S10")
    a3 = tiledb.Attr(ctx,
                     "a3",
                     compressor=('zstd', -1),
                     dtype='float32,float32')

    # Create sparse array
    tiledb.SparseArray(ctx,
                       "my_sparse_array",
                       domain=domain,
                       attrs=(a1, a2, a3),
                       capacity=2,
                       cell_order='row-major',
                       tile_order='row-major')
Example #5
def test_ingest_csv_sparse_array(
    udf_uri, array_name, key, secret, namespace, bucket, config
):
    """
    Create a sparse array from a CSV file using ingest_csv().
    """
    tiledb.cloud.udf.exec(
        "s3://{}/inputs/{}.csv".format(bucket, "increment"),
        "tiledb://{}/s3://{}/{}.tdb".format(namespace, bucket, array_name),
        key,
        secret,
        name=udf_uri,  # unittest/test_ingest_csv --> TileDB-Inc/ingest_csv
    )

    time.sleep(10)

    with tiledb.SparseArray(
        "tiledb://{}/{}.tdb".format(namespace, array_name), "r", ctx=tiledb.Ctx(config)
    ) as A:
        data = pd.DataFrame(A[:])

        for col, attribute in enumerate(("a", "b", "c"), 1):
            assert_array_equal(
                data[attribute],
                np.array([row * 10 + col for row in range(1, 21)]),
            )
Example #6
def test_store_tiledb_execution(setup):
    ctx = tiledb.Ctx()

    tempdir = tempfile.mkdtemp()
    try:
        # store TileDB dense array
        expected = np.random.rand(8, 4, 3)
        a = tensor(expected, chunk_size=(3, 3, 2))
        save = totiledb(tempdir, a, ctx=ctx)
        save.execute()

        with tiledb.DenseArray(uri=tempdir, ctx=ctx) as arr:
            np.testing.assert_allclose(expected, arr.read_direct())
    finally:
        shutil.rmtree(tempdir)

    tempdir = tempfile.mkdtemp()
    try:
        # store tensor with 1 chunk to TileDB dense array
        a = arange(12)
        save = totiledb(tempdir, a, ctx=ctx)
        save.execute()

        with tiledb.DenseArray(uri=tempdir, ctx=ctx) as arr:
            np.testing.assert_allclose(np.arange(12), arr.read_direct())
    finally:
        shutil.rmtree(tempdir)

    tempdir = tempfile.mkdtemp()
    try:
        # store 2-d TileDB sparse array
        expected = sps.random(8, 7, density=0.1)
        a = tensor(expected, chunk_size=(3, 5))
        save = totiledb(tempdir, a, ctx=ctx)
        save.execute()

        with tiledb.SparseArray(uri=tempdir, ctx=ctx) as arr:
            data = arr[:, :]
            coords = data['coords']
            value = data[arr.attr(0).name]
            ij = tuple(coords[arr.domain.dim(k).name] for k in range(arr.ndim))
            result = sps.coo_matrix((value, ij), shape=arr.shape)

            np.testing.assert_allclose(expected.toarray(), result.toarray())
    finally:
        shutil.rmtree(tempdir)

    tempdir = tempfile.mkdtemp()
    try:
        # store TileDB dense array
        expected = np.asfortranarray(np.random.rand(8, 4, 3))
        a = tensor(expected, chunk_size=(3, 3, 2))
        save = totiledb(tempdir, a, ctx=ctx)
        save.execute()

        with tiledb.DenseArray(uri=tempdir, ctx=ctx) as arr:
            np.testing.assert_allclose(expected, arr.read_direct())
            assert arr.schema.cell_order == 'col-major'
    finally:
        shutil.rmtree(tempdir)
Example #7
    def test_sparse_schema(self):
        ctx = tiledb.Ctx()

        # create dimensions
        d1 = tiledb.Dim(ctx, "", domain=(1, 1000), tile=10, dtype="uint64")
        d2 = tiledb.Dim(ctx,
                        "d2",
                        domain=(101, 10000),
                        tile=100,
                        dtype="uint64")

        # create domain
        domain = tiledb.Domain(ctx, d1, d2)

        # create attributes
        a1 = tiledb.Attr(ctx, "", dtype="int32,int32,int32")
        a2 = tiledb.Attr(ctx, "a2", compressor=("gzip", -1), dtype="float32")

        # create sparse array with schema
        schema = tiledb.SparseArray(ctx,
                                    self.path("sparse_array_schema"),
                                    domain=domain,
                                    attrs=(a1, a2),
                                    capacity=10,
                                    cell_order='col-major',
                                    tile_order='row-major',
                                    coords_compressor=('zstd', 4),
                                    offsets_compressor=('blosc-lz', 5))
        self.assertEqual(schema.capacity, 10)
        self.assertEqual(schema.cell_order, "col-major")
        self.assertEqual(schema.tile_order, "row-major")
        self.assertEqual(schema.coords_compressor, ('zstd', 4))
        self.assertEqual(schema.offsets_compressor, ('blosc-lz', 5))
Example #8
def test_ingest_csv_sparse_array_null_replace(
    udf_uri, array_name, key, secret, namespace, bucket, config
):
    """
    From a CSV file containing NaNs, produce a sparse array using ingest_csv()
    where the NaNs are replaced with the value given by fillna.
    """
    tiledb.cloud.udf.exec(
        "s3://{}/inputs/{}.csv".format(bucket, "increment_nulls"),
        "tiledb://{}/s3://{}/{}.tdb".format(namespace, bucket, array_name),
        key,
        secret,
        fillna=123,
        name=udf_uri,  # "unittest/test_ingest_csv" --> TileDB-Inc/ingest_csv
    )

    time.sleep(10)

    with tiledb.SparseArray(
        "tiledb://{}/{}.tdb".format(namespace, array_name), "r", ctx=tiledb.Ctx(config)
    ) as A:
        data = pd.DataFrame(A[:])

        assert_array_equal(data["a"], np.array([1, 1, 1]))
        assert_array_equal(data["b"], np.array([2, 2, 123]))
        assert_array_equal(data["c"], np.array([3, 123, 123]))
Example #9
    def __init__(self, array, mode="r", subsample_rate=None):
        """subsample_rate is the probability of *keeping* an entry."""
        if array is None:
            # empty array -- this is used by DNAFragMultiArray
            self._arr = None
        elif isinstance(array, DNAFragArray):
            # copy constructor -- this is used by DNAFragMultiArray

            # TODO: this is "dangerous" shallow copy of a filehandleself.
            # We should probably open a new handle on the path, in case the
            # "other" array is destroyed and its file handle closes.
            self._arr = array._arr
            self.subsample_rate = array.subsample_rate
        elif isinstance(array, tiledb.SparseArray):
            self._arr = array
        elif isinstance(array, NumpyBackend):  # used for testing
            self._arr = array
        else:
            assert os.path.exists(array)
            self._arr = tiledb.SparseArray(array, ctx=ctx, mode="r")

        if self._arr is not None:
            (n, m) = self._arr.shape
            nonempty = self._arr.nonempty_domain()

            if nonempty is None:
                raise ValueError("Array is empty")

            # NB: this is not necessary with tiledb >= v0.2.1
            # NB: this seems to be necessary to prevent the array appearing empty
            # _ = self._arr.query(attrs=[COUNTS_RANGE_NAME], coords=True)[:, :]

        if subsample_rate is not None:
            assert subsample_rate >= 0. and subsample_rate <= 1.
        self.subsample_rate = subsample_rate
Example #10
    def execute(cls, ctx, op):
        tiledb_ctx = get_tiledb_ctx(op.tiledb_config)
        uri = op.tiledb_uri
        key = op.tiledb_key
        timestamp = op.tiledb_timestamp
        axis_offsets = op.axis_offsets

        chunk = op.outputs[0]
        if not chunk.issparse():
            # dense
            to_store = np.ascontiguousarray(ctx[op.input.key])
            slcs = []
            for axis in range(chunk.ndim):
                axis_offset = int(axis_offsets[axis])
                axis_length = int(op.input.shape[axis])
                slcs.append(slice(axis_offset, axis_offset + axis_length))
            with tiledb.DenseArray(uri=uri, ctx=tiledb_ctx, mode='w',
                                   key=key, timestamp=timestamp) as arr:
                arr[tuple(slcs)] = to_store
            ctx[chunk.key] = np.empty((0,) * chunk.ndim, dtype=chunk.dtype)
        else:
            # sparse
            to_store = ctx[op.input.key].spmatrix.tocoo()
            if to_store.nnz > 0:
                with tiledb.SparseArray(uri=uri, ctx=tiledb_ctx, mode='w',
                                        key=key, timestamp=timestamp) as arr:
                    if chunk.ndim == 1:
                        vec = to_store.col if to_store.shape[0] == 1 else to_store.row
                        vec += axis_offsets[0]
                        arr[vec] = to_store.data
                    else:
                        i, j = to_store.row + axis_offsets[0], to_store.col + axis_offsets[1]
                        arr[i, j] = to_store.data
            ctx[chunk.key] = SparseNDArray(sps.csr_matrix((0, 0), dtype=chunk.dtype),
                                           shape=chunk.shape)
Example #11
def read_array_cloud_local(rest_address, array_uri, token):
    tiledb.cloud.login(host=rest_address, token=token)

    from_time = int(get_timestamp_from_text("2020-02-17 00:00:00"))
    print(from_time)
    to_time = int(get_timestamp_from_text("2020-02-17 23:59:59"))
    print(to_time)

    with tiledb.SparseArray(array_uri, ctx=tiledb.cloud.Ctx()) as A:
        # res = np.mean(A[:]["importance"])
        # print(res)

        # Returns all articles; if extreme bounds are added to the regions,
        # there are fewer results
        res = A[:, from_time:to_time, :]
        # res = A[1:5000, from_time:to_time, 400000000:500000000]
        # <- for 2020-02-17 there are 217 results instead of 268

        # Returns all articles; +1 is needed because the upper bound of the
        # slice is not inclusive
        # res = A[:, :, 443392586:443557994+1]

        article_dict = {}
        for i in range(len(res["title"])):
            article_dict[res["coords"][i][2]] = (
                get_datetime_from_timestamp(res["coords"][i][1]),
                res["medium_text"][i],
                res["title"][i],
            )

        for i in sorted(article_dict.keys()):
            print("%s: %s" % (i, article_dict[i]))

        print(len(res["title"]))
Example #12
def write_array_sparse():
    # Open the array and write to it.
    with tiledb.SparseArray(array_name_sp, mode='w') as A:
        # Write some simple data to cells (1, 1), (2, 4) and (2, 3).
        I, J = [1, 2, 2], [1, 4, 3]
        data = np.array(([1, 2, 3]))
        A[I, J] = data
Example #13
def read_array():
    ctx = tiledb.Ctx()
    with tiledb.SparseArray(ctx, array_name, mode='r') as A:
        data = A[1:11]
        a_vals = data["a"]
        for i, coord in enumerate(data["coords"]):
            print("Cell (%d, %d) has data %d" %
                  (coord[0], coord[1], a_vals[i]))
Example #14
    def execute(cls, ctx, op):
        import tiledb
        chunk = op.outputs[0]
        from ..array_utils import array_module
        from ..utils import get_tiledb_ctx

        xp = array_module(op.gpu)

        axis_offsets = [
            offset + dim_start
            for offset, dim_start in zip(op.axis_offsets, op.tiledb_dim_starts)
        ]
        tiledb_ctx = get_tiledb_ctx(op.tiledb_config)
        uri = op.tiledb_uri
        key = op.tiledb_key
        timestamp = op.tiledb_timestamp

        slcs = []
        for axis in range(chunk.ndim):
            axis_offset = axis_offsets[axis]
            axis_length = chunk.shape[axis]
            slcs.append(slice(axis_offset, axis_offset + axis_length))

        if not op.sparse:
            # read dense array from tiledb
            with tiledb.DenseArray(uri=uri,
                                   ctx=tiledb_ctx,
                                   key=key,
                                   timestamp=timestamp) as tiledb_arr:
                ctx[chunk.key] = tiledb_arr[tuple(slcs)]
        else:
            # read sparse array from tiledb
            with tiledb.SparseArray(uri=uri,
                                    ctx=tiledb_ctx,
                                    key=key,
                                    timestamp=timestamp) as tiledb_arr:
                if tiledb_arr.ndim > 2:
                    raise NotImplementedError(
                        'Reading arrays with more than 2 dimensions is not supported'
                    )

                data = tiledb_arr[tuple(slcs)]
                coords = data['coords']

                value = data[tiledb_arr.attr(0).name]
                if tiledb_arr.ndim == 2:
                    # 2-d
                    ij = tuple(coords[tiledb_arr.domain.dim(k).name] -
                               axis_offsets[k] for k in range(tiledb_arr.ndim))
                    spmatrix = sps.coo_matrix((value, ij), shape=chunk.shape)
                    ctx[chunk.key] = SparseNDArray(spmatrix)
                else:
                    # 1-d
                    ij = xp.zeros(coords.shape), \
                         coords[tiledb_arr.domain.dim(0).name] - axis_offsets[0]
                    spmatrix = sps.coo_matrix((value, ij),
                                              shape=(1, ) + chunk.shape)
                    ctx[chunk.key] = SparseNDArray(spmatrix, shape=chunk.shape)
Example #15
def read_array_sparse():
    # Open the array and read from it.
    with tiledb.SparseArray(array_name_sp, mode='r') as A:
        # Slice only rows 1, 2 and cols 2, 3, 4.
        data = A[1:3, 2:5]
        a_vals = data["a"]
        for i, coord in enumerate(data["coords"]):
            print("Cell (%d, %d) has data %d" %
                  (coord[0], coord[1], a_vals[i]))
Example #16
def read_array_s3(rest_adress, array_uri, token):
    config = tiledb.Config()
    config["rest.token"] = token
    config["rest.server_address"] = rest_adress
    config["vfs.s3.region"] = "eu-central-1"

    ctx = tiledb.Ctx(config)
    with tiledb.SparseArray(array_uri, ctx=ctx) as A:
        print(A[:]["title"])
Example #17
def write_array():
    ctx = tiledb.Ctx()
    with tiledb.SparseArray(ctx, array_name, mode='w') as A:
        I, J = [1, 2, 2], [1, 4, 3]
        data = np.array(([1, 2, 3]))
        A[I, J] = data

        I, J = [4, 2], [1, 4]
        data = np.array(([4, 20]))
        A[I, J] = data
Example #18
def exec_and_fetch(
    query,
    output_uri,
    output_schema=None,
    namespace=None,
    task_name=None,
    output_array_name=None,
    init_commands=None,
    parameters=None,
):
    """
    Run a SQL query, store the results in the output array, and return that array opened for reading.
    :param str query: query to run
    :param str output_uri: array to store results to; must be either a tiledb:// URI for an already registered array or an s3:// URI if passing a new schema to create a new output array
    :param tiledb.ArraySchema output_schema: array schema to create output array with
    :param str namespace: optional namespace to charge the query to
    :param str task_name: optional name to assign the task for logging and audit purposes
    :param str output_array_name: optional name for registering the new output array if an output_schema is passed
    :param list init_commands: optional list of sql queries or commands to run before main query
    :param list parameters: optional list of sql parameters for use in query

    :return: TileDB Array with results
    """

    # If the namespace is not set, we will default to the user's namespace
    if namespace is None:
        # Fetch the client profile for username if it is not already cached
        if config.user is None:
            config.user = client.user_profile()

        namespace = config.user.username

    # Execute the sql query
    try:
        exec(
            query=query,
            output_uri=output_uri,
            output_schema=output_schema,
            namespace=namespace,
            task_name=task_name,
            output_array_name=output_array_name,
            init_commands=init_commands,
            parameters=parameters,
        )

        # Fetch the output schema to check if it's sparse or dense
        schema = tiledb.ArraySchema.load(output_uri, ctx=client.Ctx())

        if schema.sparse:
            return tiledb.SparseArray(output_uri, ctx=client.Ctx())

        return tiledb.DenseArray(output_uri, ctx=client.Ctx())

    except GenApiException as exc:
        raise tiledb_cloud_error.check_exc(exc) from None
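Below is a minimal usage sketch for exec_and_fetch; the query and output URI are placeholders, and it assumes the tiledb.cloud client is already authenticated.

import pandas as pd

# Hypothetical output URI; use a tiledb:// URI for an already registered
# array, or an s3:// URI together with an output_schema.
A = exec_and_fetch(
    query="SELECT 1 AS x",
    output_uri="tiledb://my-namespace/sql-output",
)
try:
    print(pd.DataFrame(A[:]))
finally:
    A.close()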
Example #19
def read_array(order):
    ctx = tiledb.Ctx()
    with tiledb.SparseArray(ctx, array_name, mode='r') as A:
        print("Non-empty domain: {}".format(A.nonempty_domain()))

        data = A.query(attrs=["a"], order=order, coords=True)[1:100]
        a_vals = data["a"]
        coords = data["coords"]

        for i in range(coords.shape[0]):
            print("Cell {} has data {}".format(str(coords[i]), str(a_vals[i])))
Example #20
def main():

    ctx = tiledb.Ctx()

    # create dimensions
    d1 = tiledb.Dim(ctx, "", domain=(1, 1000), tile=10, dtype="uint64")
    d2 = tiledb.Dim(ctx, "d2", domain=(101, 10000), tile=100, dtype="uint64")

    # create domain
    domain = tiledb.Domain(ctx, d1, d2)

    # create attributes
    a1 = tiledb.Attr(ctx, "", dtype="int32,int32,int32")
    a2 = tiledb.Attr(ctx, "a2", compressor=("gzip", -1), dtype="float32")

    # create sparse array with schema
    schema = tiledb.SparseArray(ctx,
                                "sparse_array_schema",
                                domain=domain,
                                attrs=(a1, a2),
                                capacity=10,
                                tile_order='row-major',
                                cell_order='col-major',
                                coords_compressor=('zstd', 4),
                                offsets_compressor=('blosc-lz', 5))
    schema.dump()

    # Print from schema
    print("From schema properties:")
    print("- Array type: ", "sparse" if schema.sparse else "dense")
    print("- Cell order: ", schema.cell_order)
    print("- Tile order: ", schema.tile_order)
    print("- Capacity: ", schema.capacity)
    print("- Coordinates compressor: ", schema.coords_compressor)
    print("- Offsets compressor: ", schema.offsets_compressor)
    print()

    # Print the attribute names:
    print("Array schema attribute names: ")
    for i in range(schema.nattr):
        print("* {!r}".format(schema.attr(i).name))
    print()

    # Print domain
    domain = schema.domain
    domain.dump()

    # print the dimension names
    print("Array schema dimension names: ")
    for i in range(schema.ndim):
        dim = domain.dim(i)
        print("* {!r}".format(dim.name))
    print()
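Example #20 above (like Examples #4 and #7) uses the legacy TileDB-Py API, in which the Ctx is passed as the first positional argument, tiledb.SparseArray(...) doubles as a schema constructor, and compression is set via compressor=. The sketch below shows roughly the same schema with the current ArraySchema/FilterList API; the anonymous dimension is given a name, and the multi-component attribute and the coordinate/offset compressors are left out, so treat it as an approximation rather than a drop-in replacement.

import numpy as np
import tiledb

ctx = tiledb.Ctx()
dom = tiledb.Domain(
    tiledb.Dim(name="d1", domain=(1, 1000), tile=10, dtype=np.uint64, ctx=ctx),
    tiledb.Dim(name="d2", domain=(101, 10000), tile=100, dtype=np.uint64, ctx=ctx),
    ctx=ctx,
)
a2 = tiledb.Attr(
    name="a2",
    dtype=np.float32,
    filters=tiledb.FilterList([tiledb.GzipFilter()]),  # replaces compressor=("gzip", -1)
    ctx=ctx,
)
schema = tiledb.ArraySchema(
    ctx=ctx,
    domain=dom,
    attrs=(a2,),
    sparse=True,
    capacity=10,
    cell_order="col-major",
    tile_order="row-major",
)
tiledb.SparseArray.create("sparse_array_schema", schema)
schema.dump()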
Example #21
def fromtiledb(uri, ctx=None, key=None, timestamp=None, gpu=False):
    import tiledb

    raw_ctx = ctx
    if raw_ctx is None:
        ctx = tiledb.Ctx()

    # get metadata from tiledb
    try:
        tiledb_arr = tiledb.DenseArray(uri=uri,
                                       ctx=ctx,
                                       key=key,
                                       timestamp=timestamp)
        sparse = False
    except ValueError:
        # if the array is not dense, ValueError will be raised by tiledb
        tiledb_arr = tiledb.SparseArray(uri=uri,
                                        ctx=ctx,
                                        key=key,
                                        timestamp=timestamp)
        sparse = True

    if tiledb_arr.nattr > 1:
        raise NotImplementedError("Does not supported TileDB array schema "
                                  "with more than 1 attr")
    tiledb_dim_starts = tuple(
        tiledb_arr.domain.dim(j).domain[0].item()
        for j in range(tiledb_arr.ndim))
    if any(isinstance(s, float) for s in tiledb_dim_starts):
        raise ValueError("Does not support TileDB array schema "
                         "whose dimensions has float domain")

    dtype = tiledb_arr.attr(0).dtype
    tiledb_config = None if raw_ctx is None else ctx.config().dict()
    tensor_order = (TensorOrder.C_ORDER if tiledb_arr.schema.cell_order
                    == "row-major" else TensorOrder.F_ORDER)
    op = TensorTileDBDataSource(
        tiledb_config=tiledb_config,
        tiledb_uri=uri,
        tiledb_key=key,
        tiledb_timestamp=timestamp,
        tiledb_dim_starts=tiledb_dim_starts,
        gpu=gpu,
        sparse=sparse,
        dtype=dtype,
    )
    chunk_size = tuple(
        int(tiledb_arr.domain.dim(i).tile)
        for i in range(tiledb_arr.domain.ndim))
    return op(tiledb_arr.shape, chunk_size=chunk_size, order=tensor_order)
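Below is a minimal usage sketch for fromtiledb; the array path is hypothetical, and how the returned tensor is materialized depends on the Mars version, so that step is only indicated in comments.

# Hypothetical path to an existing single-attribute TileDB array.
t = fromtiledb("/path/to/my_array")
print(t.shape, t.dtype)

# Within a Mars session the tensor can then be materialized, e.g.:
# result = t.execute()                                  # session-style API
# result = executor.execute_tensor(t, concat=True)[0]   # executor-style API (see Example #28)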
Example #22
def read_array_cloud(rest_address, array_uri, token):
    tiledb.cloud.login(host=rest_address, token=token)

    from_time = int(get_timestamp_from_text("2020-02-04 00:00:00"))
    print(from_time)
    to_time = int(get_timestamp_from_text("2020-02-04 23:59:59"))
    print(to_time)

    with tiledb.SparseArray(array_uri, ctx=tiledb.cloud.Ctx()) as A:
        res = A.apply(
            mean,
            [(1, 5000), (from_time, to_time), (400000000, 500000000)],
            attrs=["importance"],
        )
        print(res)
Example #23
def create_test_array_sparse_25x12(temp_rootdir):
    """
    Create a simple sparse test array.
    """
    path = os.path.abspath(os.path.join(temp_rootdir, "sparse_25x12"))

    ctx = tiledb.default_ctx()
    rows_dim = tiledb.Dim("row", ctx=ctx, domain=(1, 25), dtype=np.int64)
    cols_dim = tiledb.Dim("col", ctx=ctx, domain=(1, 12), dtype=np.int64)
    dom = tiledb.Domain(rows_dim, cols_dim, ctx=ctx)
    att = tiledb.Attr(ctx=ctx, dtype=np.int64)
    schema = tiledb.ArraySchema(ctx=ctx, sparse=True, domain=dom, attrs=(att,))

    tiledb.SparseArray.create(path, schema)

    with tiledb.SparseArray(path, mode="w") as A:
        coords = np.array(list(itertools.product(np.arange(1, 26), np.arange(1, 13))))
        rows = coords[:, 0]
        cols = coords[:, 1]
        A[rows, cols] = np.arange(300)
Example #24
def write_sparse_array(path, n, m, n_idxs, m_idxs, values, clip=True):
    if os.path.exists(path):
        raise FileExistsError("{} already exists".format(path))

    if n_idxs.min() < 0 or n_idxs.max() >= n:
        raise ValueError("row indexes must be in range [0, n - 1]")

    if m_idxs.min() < 0 or m_idxs.max() >= m:
        raise ValueError("column indexes must in in range [0, m - 1]")

    sparse = coo_matrix((values, (n_idxs, m_idxs)), dtype=np.int32)
    sparse = sparse.tocsc(copy=False).tocoo(copy=False)

    n_idxs = sparse.row
    m_idxs = sparse.col
    values = sparse.data

    if clip:
        values = np.minimum(values, VPLOT_MAX_VALUE)

    if values.min() < 0 or values.max() > VPLOT_MAX_VALUE:
        raise ValueError(
            "vplot values must be in range [0, {}]".format(VPLOT_MAX_VALUE))

    ctx = tiledb.Ctx()  # create a context here so the snippet is self-contained

    n_tile_extent = min(DEFAULT_GENOME_TILE_EXTENT, n)

    d1 = tiledb.Dim(
        GENOME_DOMAIN_NAME,
        domain=(0, n - 1),
        tile=n_tile_extent,
        dtype="uint32",
        ctx=ctx,
    )
    d2 = tiledb.Dim(INSERT_DOMAIN_NAME,
                    domain=(0, m - 1),
                    tile=m,
                    dtype="uint32",
                    ctx=ctx)

    domain = tiledb.Domain(d1, d2, ctx=ctx)

    v = tiledb.Attr(
        "v",
        filters=tiledb.FilterList([tiledb.LZ4Filter(level=-1)]),
        dtype="uint8",
        ctx=ctx,
    )

    schema = tiledb.ArraySchema(
        ctx=ctx,
        domain=domain,
        attrs=(v, ),
        capacity=1000,
        cell_order="row-major",
        tile_order="row-major",
        sparse=True,
    )

    tiledb.SparseArray.create(path, schema)

    with tiledb.SparseArray(path, mode="w", ctx=ctx) as A:
        values = values.astype(np.uint8)
        # A[n_idxs, m_idxs] = {"v": values}
        A[n_idxs, m_idxs] = values
Example #25
def _open_array(uri, tiledb_ctx):
    with tiledb.Array(uri, mode="r", ctx=tiledb_ctx) as array:
        if array.schema.sparse:
            return tiledb.SparseArray(uri, mode="r", ctx=tiledb_ctx)
        else:
            return tiledb.DenseArray(uri, mode="r", ctx=tiledb_ctx)
Example #26
def write_array():
    ctx = tiledb.Ctx()
    with tiledb.SparseArray(ctx, array_name, mode='w') as A:
        I, J = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        data = np.array(([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
        A[I, J] = data
Example #27
def convert_matrix_to_cxg_array(matrix_name,
                                matrix,
                                encode_as_sparse_array,
                                ctx,
                                column_shift_for_sparse_encoding=None):
    """
    Converts a numpy array matrix into a TileDB SparseArray or DenseArray based on whether `encode_as_sparse_array`
    is true or not. Note that when the matrix is encoded as a SparseArray, it only writes the values that are
    nonzero. This means that if you count the number of elements in the SparseArray, it will not equal the total
    number of elements in the matrix, only the number of nonzero elements.

    Furthermore, if the `column_shift_for_sparse_encoding` matrix is not None, this function will subtract that column
    shift from the given matrix and, as previously stated, only write the nonzero values to the TileDB
    SparseArray.
    """
    def create_matrix_array(matrix_name, number_of_rows, number_of_columns,
                            encode_as_sparse_array):
        filters = tiledb.FilterList([tiledb.ZstdFilter()])
        attrs = [tiledb.Attr(dtype=np.float32, filters=filters)]
        if encode_as_sparse_array:
            domain = tiledb.Domain(
                tiledb.Dim(name="obs",
                           domain=(0, number_of_rows - 1),
                           tile=min(number_of_rows, 512),
                           dtype=np.uint32),
                tiledb.Dim(name="var",
                           domain=(0, number_of_columns - 1),
                           tile=min(number_of_columns, 2048),
                           dtype=np.uint32),
            )
        else:
            domain = tiledb.Domain(
                tiledb.Dim(name="obs",
                           domain=(0, number_of_rows - 1),
                           tile=min(number_of_rows, 50),
                           dtype=np.uint32),
                tiledb.Dim(name="var",
                           domain=(0, number_of_columns - 1),
                           tile=min(number_of_columns, 100),
                           dtype=np.uint32),
            )
        schema = tiledb.ArraySchema(domain=domain,
                                    sparse=encode_as_sparse_array,
                                    attrs=attrs,
                                    cell_order="row-major",
                                    tile_order="col-major")
        if encode_as_sparse_array:
            tiledb.SparseArray.create(matrix_name, schema)
        else:
            tiledb.DenseArray.create(matrix_name, schema)

    number_of_rows = matrix.shape[0]
    number_of_columns = matrix.shape[1]
    stride = min(
        int(np.power(10, np.around(np.log10(1e9 / number_of_columns)))),
        10_000)

    create_matrix_array(matrix_name, number_of_rows, number_of_columns,
                        encode_as_sparse_array)

    if encode_as_sparse_array:
        with tiledb.SparseArray(matrix_name, mode="w", ctx=ctx) as array:
            for start_row_index in range(0, number_of_rows, stride):
                end_row_index = min(start_row_index + stride, number_of_rows)
                matrix_subset = matrix[start_row_index:end_row_index, :]
                if not isinstance(matrix_subset, np.ndarray):
                    matrix_subset = matrix_subset.toarray()
                if column_shift_for_sparse_encoding is not None:
                    matrix_subset = matrix_subset - column_shift_for_sparse_encoding
                indices = np.nonzero(matrix_subset)
                trow = indices[0] + start_row_index
                array[trow, indices[1]] = matrix_subset[indices[0], indices[1]]

    else:
        with tiledb.DenseArray(matrix_name, mode="w", ctx=ctx) as array:
            for start_row_index in range(0, number_of_rows, stride):
                end_row_index = min(start_row_index + stride, number_of_rows)
                matrix_subset = matrix[start_row_index:end_row_index, :]
                if not isinstance(matrix_subset, np.ndarray):
                    matrix_subset = matrix_subset.toarray()
                array[start_row_index:end_row_index, :] = matrix_subset
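Below is a minimal usage sketch for convert_matrix_to_cxg_array; the output array name and the matrix are made up for illustration, and a plain local tiledb.Ctx() stands in for whatever context the caller normally supplies.

import numpy as np
import tiledb

rng = np.random.default_rng(0)
matrix = rng.random((100, 20)).astype(np.float32)
matrix[matrix < 0.8] = 0.0   # mostly zeros, so sparse encoding is worthwhile

convert_matrix_to_cxg_array(
    matrix_name="X_sparse",   # hypothetical output array name (created as a local directory)
    matrix=matrix,
    encode_as_sparse_array=True,
    ctx=tiledb.Ctx(),
)

with tiledb.SparseArray("X_sparse", mode="r") as A:
    print(len(A[:][A.attr(0).name]), "non-zero cells written")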
Example #28
    def testReadTileDBExecution(self):
        ctx = tiledb.Ctx()

        tempdir = tempfile.mkdtemp()
        try:
            # create TileDB dense array
            dom = tiledb.Domain(
                tiledb.Dim(ctx=ctx, domain=(1, 100), tile=30, dtype=np.int32),
                tiledb.Dim(ctx=ctx, domain=(0, 90), tile=22, dtype=np.int32),
                tiledb.Dim(ctx=ctx, domain=(0, 9), tile=8, dtype=np.int32),
                ctx=ctx,
            )
            schema = tiledb.ArraySchema(
                ctx=ctx,
                domain=dom,
                sparse=False,
                attrs=[tiledb.Attr(ctx=ctx, dtype=np.float64)])
            tiledb.DenseArray.create(tempdir, schema)

            expected = np.random.rand(100, 91, 10)
            with tiledb.DenseArray(uri=tempdir, ctx=ctx, mode='w') as arr:
                arr.write_direct(expected)

            a = fromtiledb(tempdir, ctx=ctx)
            result = self.executor.execute_tensor(a, concat=True)[0]

            np.testing.assert_allclose(expected, result)
        finally:
            shutil.rmtree(tempdir)

        tempdir = tempfile.mkdtemp()
        try:
            # create 2-d TileDB sparse array
            dom = tiledb.Domain(
                tiledb.Dim(ctx=ctx, domain=(0, 99), tile=30, dtype=np.int32),
                tiledb.Dim(ctx=ctx, domain=(2, 11), tile=8, dtype=np.int32),
                ctx=ctx,
            )
            schema = tiledb.ArraySchema(
                ctx=ctx,
                domain=dom,
                sparse=True,
                attrs=[tiledb.Attr(ctx=ctx, dtype=np.float64)])
            tiledb.SparseArray.create(tempdir, schema)

            expected = sps.rand(100, 10, density=0.01)
            with tiledb.SparseArray(uri=tempdir, ctx=ctx, mode='w') as arr:
                I, J = expected.row, expected.col + 2
                arr[I, J] = {arr.attr(0).name: expected.data}

            a = fromtiledb(tempdir, ctx=ctx)
            result = self.executor.execute_tensor(a, concat=True)[0]

            np.testing.assert_allclose(expected.toarray(), result.toarray())
        finally:
            shutil.rmtree(tempdir)

        tempdir = tempfile.mkdtemp()
        try:
            # create 1-d TileDB sparse array
            dom = tiledb.Domain(
                tiledb.Dim(ctx=ctx, domain=(1, 100), tile=30, dtype=np.int32),
                ctx=ctx,
            )
            schema = tiledb.ArraySchema(
                ctx=ctx,
                domain=dom,
                sparse=True,
                attrs=[tiledb.Attr(ctx=ctx, dtype=np.float64)])
            tiledb.SparseArray.create(tempdir, schema)

            expected = sps.rand(1, 100, density=0.05)
            with tiledb.SparseArray(uri=tempdir, ctx=ctx, mode='w') as arr:
                I = expected.col + 1
                arr[I] = expected.data

            a = fromtiledb(tempdir, ctx=ctx)
            result = self.executor.execute_tensor(a, concat=True)[0]

            np.testing.assert_allclose(expected.toarray()[0], result.toarray())
        finally:
            shutil.rmtree(tempdir)

        tempdir = tempfile.mkdtemp()
        try:
            # create TileDB dense array with column-major
            dom = tiledb.Domain(
                tiledb.Dim(ctx=ctx, domain=(1, 100), tile=30, dtype=np.int32),
                tiledb.Dim(ctx=ctx, domain=(0, 90), tile=22, dtype=np.int32),
                tiledb.Dim(ctx=ctx, domain=(0, 9), tile=8, dtype=np.int32),
                ctx=ctx,
            )
            schema = tiledb.ArraySchema(
                ctx=ctx,
                domain=dom,
                sparse=False,
                cell_order='F',
                attrs=[tiledb.Attr(ctx=ctx, dtype=np.float64)])
            tiledb.DenseArray.create(tempdir, schema)

            expected = np.asfortranarray(np.random.rand(100, 91, 10))
            with tiledb.DenseArray(uri=tempdir, ctx=ctx, mode='w') as arr:
                arr.write_direct(expected)

            a = fromtiledb(tempdir, ctx=ctx)
            result = self.executor.execute_tensor(a, concat=True)[0]

            np.testing.assert_allclose(expected, result)
            self.assertTrue(result.flags['F_CONTIGUOUS'])
            self.assertFalse(result.flags['C_CONTIGUOUS'])
        finally:
            shutil.rmtree(tempdir)
Example #29
def save_X(container, xdata, ctx, sparse_threshold, expect_sparse=False):
    # Save X count matrix
    X_name = f"{container}/X"

    shape = xdata.shape
    log(1, "\t...shape:", str(shape))

    col_shift = None
    if sparse_threshold == 100:
        is_sparse = True
    elif sparse_threshold == 0:
        is_sparse = False
    else:
        is_sparse, nnz, nelem = evaluate_for_sparse_encoding(xdata, sparse_threshold)
        percent = 100.0 * nnz / nelem
        if nelem != shape[0] * shape[1]:
            log(1, "\t...sparse=", is_sparse, "non-zeros percent (estimate): %6.2f" % percent)
        else:
            log(1, "\t...sparse=", is_sparse, "non-zeros:", nnz, "percent: %6.2f" % percent)

        is_sparse = percent < sparse_threshold
        if not is_sparse:
            col_shift, nnz, nelem = evaluate_for_sparse_column_shift_encoding(xdata, sparse_threshold)
            is_sparse = col_shift is not None
            percent = 100.0 * nnz / nelem
            if nelem != shape[0] * shape[1]:
                log(1, "\t...sparse=", is_sparse, "col shift non-zeros percent (estimate): %6.2f" % percent)
            else:
                log(1, "\t...sparse=", is_sparse, "col shift non-zeros:", nnz, "percent: %6.2f" % percent)

    if expect_sparse is True and is_sparse is False:
        return False

    create_X(X_name, shape, is_sparse)
    stride = min(int(np.power(10, np.around(np.log10(1e9 / shape[1])))), 10_000)
    if is_sparse:
        if col_shift is not None:
            log(1, "\t...output X as sparse matrix with column shift encoding")
            X_col_shift_name = f"{container}/X_col_shift"
            filters = tiledb.FilterList([tiledb.ZstdFilter()])
            attrs = [tiledb.Attr(dtype=np.float32, filters=filters)]
            domain = tiledb.Domain(tiledb.Dim(domain=(0, shape[1] - 1), tile=min(shape[1], 5000), dtype=np.uint32))
            schema = tiledb.ArraySchema(domain=domain, attrs=attrs)
            tiledb.DenseArray.create(X_col_shift_name, schema)
            with tiledb.DenseArray(X_col_shift_name, mode="w", ctx=ctx) as X_col_shift:
                X_col_shift[:] = col_shift
            tiledb.consolidate(X_col_shift_name, ctx=ctx)
        else:
            log(1, "\t...output X as sparse matrix")

        with tiledb.SparseArray(X_name, mode="w", ctx=ctx) as X:
            nnz = 0
            for row in range(0, shape[0], stride):
                lim = min(row + stride, shape[0])
                a = xdata[row:lim, :]
                if type(a) is not np.ndarray:
                    a = a.toarray()
                if col_shift is not None:
                    a = a - col_shift
                indices = np.nonzero(a)
                trow = indices[0] + row
                nnz += indices[0].shape[0]
                X[trow, indices[1]] = a[indices[0], indices[1]]
                log(2, "\t...rows", lim, "of", shape[0], "nnz", nnz, "sparse", nnz / (lim * shape[1]))

    else:
        log(1, "\t...output X as dense matrix")
        with tiledb.DenseArray(X_name, mode="w", ctx=ctx) as X:
            for row in range(0, shape[0], stride):
                lim = min(row + stride, shape[0])
                a = xdata[row:lim, :]
                if type(a) is not np.ndarray:
                    a = a.toarray()
                X[row:lim, :] = a
                log(2, "\t...rows", row, "to", lim)

    tiledb.consolidate(X_name, ctx=ctx)
    if hasattr(tiledb, "vacuum"):
        tiledb.vacuum(X_name)

    return is_sparse
Example #30
def load_sparse_array(path):
    ctx = tiledb.Ctx()  # the original presumably relied on a module-level ctx; create one here so the snippet runs standalone
    return tiledb.SparseArray(path, mode="r", ctx=ctx)