예제 #1
0
def uri(temp_rootdir):
    """
    Build a small two-attribute dense array and yield its path.

    A 25 x 12 array with float64 attributes "a" and "b" is created at
    ``<temp_rootdir>/test_array`` and written twice, at timestamps 1 and 2.
    Each write also sets the "meta_int" metadata key to the timestamp value.
    The array directory is removed when the generator is resumed after the
    yield.
    """
    array_path = os.path.abspath(os.path.join(temp_rootdir, "test_array"))
    context = tiledb.default_ctx()

    domain = tiledb.Domain(
        tiledb.Dim(ctx=context, domain=(1, 25), dtype=np.int64),
        tiledb.Dim(ctx=context, domain=(1, 12), dtype=np.int64),
        ctx=context,
    )
    attributes = (
        tiledb.Attr(name="a", ctx=context, dtype=np.float64),
        tiledb.Attr(name="b", ctx=context, dtype=np.float64),
    )
    tiledb.DenseArray.create(
        array_path,
        tiledb.ArraySchema(ctx=context, domain=domain, attrs=attributes),
    )

    base = np.arange(300).reshape(25, 12)
    # One entry per write: (timestamp, values for "a", values for "b").
    writes = (
        (1, base, base),
        (2, base / 2, base * 2),
    )
    for stamp, a_values, b_values in writes:
        with tiledb.DenseArray(array_path, mode="w", timestamp=stamp) as arr:
            arr.meta["meta_int"] = stamp
            arr[:] = {"a": a_values, "b": b_values}

    yield array_path

    shutil.rmtree(array_path)
예제 #2
0
def test_store_tiledb_execution(setup):
    """
    Store tensors with ``totiledb`` and read them back through TileDB.

    Four cases are exercised, each in its own temporary directory that is
    removed afterwards: a chunked dense 3-d tensor, a single-chunk 1-d
    tensor, a chunked 2-d sparse matrix, and a Fortran-ordered dense tensor
    whose stored schema must report 'col-major' cell order.
    """
    tiledb_ctx = tiledb.Ctx()

    # Case 1: dense tensor split into several chunks.
    workdir = tempfile.mkdtemp()
    try:
        raw = np.random.rand(8, 4, 3)
        stored = totiledb(workdir, tensor(raw, chunk_size=(3, 3, 2)),
                          ctx=tiledb_ctx)
        stored.execute()

        with tiledb.DenseArray(uri=workdir, ctx=tiledb_ctx) as dense:
            np.testing.assert_allclose(raw, dense.read_direct())
    finally:
        shutil.rmtree(workdir)

    # Case 2: tensor that consists of a single chunk.
    workdir = tempfile.mkdtemp()
    try:
        stored = totiledb(workdir, arange(12), ctx=tiledb_ctx)
        stored.execute()

        with tiledb.DenseArray(uri=workdir, ctx=tiledb_ctx) as dense:
            np.testing.assert_allclose(np.arange(12), dense.read_direct())
    finally:
        shutil.rmtree(workdir)

    # Case 3: 2-d sparse matrix.
    workdir = tempfile.mkdtemp()
    try:
        raw = sps.random(8, 7, density=0.1)
        stored = totiledb(workdir, tensor(raw, chunk_size=(3, 5)),
                          ctx=tiledb_ctx)
        stored.execute()

        with tiledb.SparseArray(uri=workdir, ctx=tiledb_ctx) as sparse:
            cells = sparse[:, :]
            coordinates = cells['coords']
            values = cells[sparse.attr(0).name]
            # Rebuild a COO matrix from the per-dimension coordinate columns.
            indices = tuple(coordinates[sparse.domain.dim(axis).name]
                            for axis in range(sparse.ndim))
            rebuilt = sps.coo_matrix((values, indices), shape=sparse.shape)

            np.testing.assert_allclose(raw.toarray(), rebuilt.toarray())
    finally:
        shutil.rmtree(workdir)

    # Case 4: Fortran-ordered dense tensor.
    workdir = tempfile.mkdtemp()
    try:
        raw = np.asfortranarray(np.random.rand(8, 4, 3))
        stored = totiledb(workdir, tensor(raw, chunk_size=(3, 3, 2)),
                          ctx=tiledb_ctx)
        stored.execute()

        with tiledb.DenseArray(uri=workdir, ctx=tiledb_ctx) as dense:
            np.testing.assert_allclose(raw, dense.read_direct())
            assert dense.schema.cell_order == 'col-major'
    finally:
        shutil.rmtree(workdir)
예제 #3
0
    def testStoreTileDB(self):
        """
        Check that ``totiledb`` creates the TileDB array only when tiled.

        Before ``tiles()`` is called, opening the target directory as a dense
        array must raise ``tiledb.TileDBError``; afterwards it must succeed.
        The per-chunk ``axis_offsets`` are verified, and storing a tensor with
        an incompatible shape into the same directory must raise
        ``ValueError``.  The temporary directory is always removed.
        """
        ctx = tiledb.Ctx()
        tempdir = tempfile.mkdtemp()
        try:
            t = random.rand(50, 30, chunk_size=13)
            t2 = t + 1

            saved = totiledb(tempdir, t2)
            self.assertEqual(saved.shape, (0, 0))
            self.assertIsNone(saved.op.tiledb_config)
            # assertEqual rather than the deprecated assertEquals alias,
            # which was removed from unittest in Python 3.12.
            self.assertEqual(saved.op.tiledb_uri, tempdir)

            with self.assertRaises(tiledb.TileDBError):
                tiledb.DenseArray(ctx=ctx, uri=tempdir)

            # tiledb array is created in the tile
            saved.tiles()

            # no error
            tiledb.DenseArray(ctx=ctx, uri=tempdir)

            self.assertEqual(saved.chunks[0].op.axis_offsets, (0, 0))
            self.assertEqual(saved.chunks[1].op.axis_offsets, (0, 13))
            self.assertEqual(saved.cix[0, 2].op.axis_offsets, (0, 26))
            self.assertEqual(saved.cix[1, 2].op.axis_offsets, (13, 26))
            self.assertEqual(saved.cix[3, 2].op.axis_offsets, (39, 26))

            with self.assertRaises(ValueError):
                t3 = random.rand(30, 50)
                totiledb(tempdir, t3, ctx=ctx)  # shape incompatible
        finally:
            shutil.rmtree(tempdir)
예제 #4
0
def open(uri: Union[str, Path]) -> TileSegy:
    """
    Open the "headers" and "data" arrays below ``uri`` and wrap them.

    The "data" array is opened with its "trace" attribute.  When its domain
    has a dimension named "traces" the result is a ``TileSegy``; otherwise a
    ``StructuredTileSegy`` is returned.  The wrapper receives ``uri`` as a
    ``Path`` together with both opened arrays.
    """
    if not isinstance(uri, Path):
        uri = Path(uri)
    header_array = tiledb.DenseArray(str(uri / "headers"))
    trace_array = tiledb.DenseArray(str(uri / "data"), attr="trace")
    wrapper = (
        TileSegy
        if trace_array.schema.domain.has_dim("traces")
        else StructuredTileSegy
    )
    return wrapper(uri, header_array, trace_array)
예제 #5
0
def write_array(args, updating, chunks_to_process):
    """
    Drain processed chunks from ``write_queue`` and write them to the array.

    Each queue item is a pickled sequence
    ``(task_index, start_index, end_index, dict_to_write)``.  The dict is
    written to ``array[start_index:end_index, task_index]``.

    :param args: namespace providing ``array_name``, ``attribute_config`` and
        ``attribute_config_file``.
    :param updating: when ``True``, the current values of the target slice are
        read first and only the attributes present in the incoming dict are
        replaced; otherwise attributes missing from the dict are filled with
        NaN arrays.
    :param chunks_to_process: number of queue items to consume before
        returning.
    :raises KeyboardInterrupt: re-raised after killing child processes.
    :raises Exception: any other failure is printed, child processes are
        killed, and a plain ``Exception`` carrying the original message is
        raised.
    """
    cur_array_toread = None
    cur_array_towrite = None
    try:
        # config
        tdb_Config = tiledb.Config(tdb_config_params)
        tdb_write_Context = tiledb.Ctx(config=tdb_Config)

        if updating is True:
            tdb_read_Context = tiledb.Ctx(config=tdb_Config)
            cur_array_toread = tiledb.DenseArray(
                args.array_name, ctx=tdb_read_Context, mode='r')
        cur_array_towrite = tiledb.DenseArray(
            args.array_name, ctx=tdb_write_Context, mode='w')

        chunks_processed = 0
        while chunks_processed < chunks_to_process:
            # Poll until a worker has produced something to write.
            while write_queue.empty() is True:
                time.sleep(10)
            processed_chunk = write_queue.get()
            # NOTE(review): pickle.loads executes arbitrary code on crafted
            # input; this is only safe while the queue is fed exclusively by
            # this program's own workers.
            task_index, start_index, end_index, dict_to_write = \
                pickle.loads(processed_chunk)[:4]

            if updating is True:
                # we are only updating some attributes in the array
                cur_vals = cur_array_toread[start_index:end_index, task_index]
                for key in dict_to_write:
                    cur_vals[key] = dict_to_write[key]
                dict_to_write = cur_vals
                print("updated data dict for writing:" + args.array_name)
            else:
                # we are writing for the first time, make sure all attributes
                # are provided, if some are not, use a nan array
                required_attrib = list(get_attribute_info(
                    args.attribute_config, args.attribute_config_file).keys())
                for attrib in required_attrib:
                    if attrib not in dict_to_write:
                        print("augmenting")
                        dict_to_write[attrib] = np.full(
                            end_index - start_index, np.nan)

            # write in chunks
            cur_array_towrite[start_index:end_index, task_index] = dict_to_write
            print('Gigs:', round(psutil.virtual_memory().used / (10**9), 2))
            gc.collect()
            chunks_processed += 1
            print("wrote to disk " + str(task_index) + " for " + str(start_index)
                  + ":" + str(end_index) + ";" + str(chunks_processed) + "/"
                  + str(chunks_to_process))
        assert chunks_processed >= chunks_to_process
        print("closing arrays")
        return

    except KeyboardInterrupt:
        kill_child_processes(os.getpid())
        # try to delete all tmp files
        raise
    except Exception as e:
        print(e)
        kill_child_processes(os.getpid())
        # Python 3 exceptions have no ``.message`` attribute; using it here
        # would replace the real error with an AttributeError.
        raise Exception(str(e))
    finally:
        # Release the array handles on the error paths as well as on success.
        if cur_array_toread is not None:
            cur_array_toread.close()
        if cur_array_towrite is not None:
            cur_array_towrite.close()
예제 #6
0
def test_ingest_csv_dense_array(
    udf_uri, array_name, key, secret, namespace, bucket, config
):
    """
    Ingest the "increment" CSV into a dense array and verify its columns.

    The registered UDF named ``udf_uri`` is executed with ``sparse=False``;
    after a fixed wait the resulting array is read back and attributes "a",
    "b" and "c" are compared with ``row * 10 + col`` for rows 1..20.
    """
    source_csv = "s3://{}/inputs/{}.csv".format(bucket, "increment")
    target_array = "tiledb://{}/s3://{}/{}.tdb".format(
        namespace, bucket, array_name
    )
    tiledb.cloud.udf.exec(
        source_csv,
        target_array,
        key,
        secret,
        sparse=False,
        name=udf_uri,  # "unittest/test_ingest_csv" --> TileDB-Inc/ingest_csv
    )

    time.sleep(10)

    registered_uri = "tiledb://{}/{}.tdb".format(namespace, array_name)
    with tiledb.DenseArray(registered_uri, "r", ctx=tiledb.Ctx(config)) as arr:
        frame = pd.DataFrame(arr[:])

        for offset, column in enumerate(("a", "b", "c"), 1):
            actual = frame[column]
            wanted = np.array([row * 10 + offset for row in range(1, 21)])
            assert_array_equal(actual, wanted)
예제 #7
0
 def load_dense_array(self, arrayID):
     """
     Open the dense array located at ``self.root`` joined with ``arrayID``.

     Returns the opened ``tiledb.DenseArray``.  If TileDB raises a
     ``TileDBError`` the error is printed and an empty NumPy array is
     returned instead.
     """
     array_path = os.path.join(self.root, arrayID)
     try:
         opened = tiledb.DenseArray(self.ctx, array_path)
     except tiledb.TileDBError as err:
         print(err)
         return np.array([])
     return opened
예제 #8
0
def create_tiledb_datetime_example(tmpdir):
    """
    Create a 1-d dense array indexed by day and the matching xarray dataset.

    Sixteen evenly spaced temperatures between -1.0 and 20.0 are written to a
    "temperature" attribute over the "date" dimension 2000-01-01 through
    2000-01-16 (tile extent of four days).

    Returns a tuple ``(array_uri, expected)`` where ``expected`` is the
    ``xr.Dataset`` holding the same values and dates.
    """
    temperatures = np.linspace(
        -1.0, 20.0, num=16, endpoint=True, dtype=np.float64
    )
    # The upper bound of arange is exclusive, so this covers Jan 1 - Jan 16.
    dates = np.arange(np.datetime64("2000-01-01"), np.datetime64("2000-01-17"))

    expected = xr.Dataset(
        data_vars={"temperature": xr.DataArray(data=temperatures, dims="date")},
        coords={"date": dates},
    )

    array_uri = str(tmpdir.join("tiledb_example_2"))
    date_dim = tiledb.Dim(
        name="date",
        domain=(np.datetime64("2000-01-01"), np.datetime64("2000-01-16")),
        tile=np.timedelta64(4, "D"),
        dtype=np.datetime64("", "D"),
    )
    temperature_attr = tiledb.Attr(name="temperature", dtype=np.float64)
    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(date_dim),
        attrs=[temperature_attr],
    )
    tiledb.DenseArray.create(array_uri, schema)

    with tiledb.DenseArray(array_uri, mode="w") as writer:
        writer[:] = {"temperature": temperatures}
    return array_uri, expected
예제 #9
0
    def execute(cls, ctx, op):
        """
        Write the input chunk of ``op`` into the TileDB array at its offsets.

        Dense chunks are written to the hyper-rectangle that starts at
        ``op.axis_offsets`` and has the input chunk's shape.  Sparse chunks
        are converted to COO form and their coordinates are shifted by the
        offsets before writing; nothing is written when the chunk has no
        stored values.  In both cases an empty placeholder is stored in
        ``ctx`` under the output chunk's key.
        """
        open_kwargs = dict(uri=op.tiledb_uri,
                           ctx=get_tiledb_ctx(op.tiledb_config),
                           mode='w',
                           key=op.tiledb_key,
                           timestamp=op.tiledb_timestamp)
        offsets = op.axis_offsets
        out_chunk = op.outputs[0]

        if out_chunk.issparse():
            coo = ctx[op.input.key].spmatrix.tocoo()
            if coo.nnz > 0:
                with tiledb.SparseArray(**open_kwargs) as arr:
                    if out_chunk.ndim == 1:
                        # A 1-d chunk is held as a single-row or single-column
                        # matrix; take whichever axis carries the positions.
                        vec = coo.col if coo.shape[0] == 1 else coo.row
                        # In-place add, kept deliberately.
                        vec += offsets[0]
                        arr[vec] = coo.data
                    else:
                        rows = coo.row + offsets[0]
                        cols = coo.col + offsets[1]
                        arr[rows, cols] = coo.data
            ctx[out_chunk.key] = SparseNDArray(
                sps.csr_matrix((0, 0), dtype=out_chunk.dtype),
                shape=out_chunk.shape)
            return

        dense = np.ascontiguousarray(ctx[op.input.key])
        region = tuple(
            slice(int(offsets[axis]),
                  int(offsets[axis]) + int(op.input.shape[axis]))
            for axis in range(out_chunk.ndim))
        with tiledb.DenseArray(**open_kwargs) as arr:
            arr[region] = dense
        ctx[out_chunk.key] = np.empty((0,) * out_chunk.ndim,
                                      dtype=out_chunk.dtype)
예제 #10
0
    def _initialize_stat_values_store_if_needed(
            self, shape: Tuple[int, ...]) -> None:
        """
        Initialize storage for the benchmark statistics if it wasn't created yet.

        Returns immediately when the stats array URI is set and an array
        already exists there.  Otherwise a two-dimensional dense array with
        dimensions 'n' and 'f' and a single int32 attribute 'read' is created
        and filled with zeroes.

        :param shape: Shape of the stats map; its first two entries define the
            extents of the 'n' and 'f' dimensions.
        """

        if self.__tiledb_stats_array is not None and tiledb.array_exists(
                self.__tiledb_stats_array):
            return
        # Create a two-dimensional dense array to store read statistics from
        # the latest benchmark run.  The tile extent is clamped to at least 1:
        # a size-1 axis would otherwise ask for a zero extent, which is not a
        # valid tile size.
        dom = tiledb.Domain(
            tiledb.Dim(name='n',
                       domain=(0, shape[0] - 1),
                       tile=max(1, shape[0] - 1),
                       dtype=np.int64),
            tiledb.Dim(name='f',
                       domain=(0, shape[1] - 1),
                       tile=max(1, shape[1] - 1),
                       dtype=np.int64))
        # Schema contains one attribute for READ count
        schema = tiledb.ArraySchema(
            domain=dom,
            sparse=False,
            attrs=[tiledb.Attr(name='read', dtype=np.int32)])
        # Create the (empty) array on disk.
        tiledb.DenseArray.create(self.__tiledb_stats_array, schema)
        # Fill with zeroes
        with tiledb.DenseArray(self.__tiledb_stats_array, mode='w') as rr:
            zero_data = np.zeros(shape, dtype=np.int32)
            rr[:] = zero_data
예제 #11
0
 def _write_stats(self, stats: np.ndarray) -> None:
     """
     Overwrite the whole local stats array with ``stats``.

     The array is opened in write mode for the duration of the assignment.
     :param stats: Expected array must have shape (num of nodes, num of fragments, 1)
     """
     stats_uri = self.__tiledb_stats_array
     with tiledb.DenseArray(stats_uri, mode='w') as stats_array:
         stats_array[:] = stats
예제 #12
0
 def _tiledb_array(self, uri: str,
                   schema: tiledb.ArraySchema) -> Iterator[tiledb.Array]:
     """
     Create a dense array at ``uri`` and yield it opened for writing.

     Once the consumer resumes the generator, the array is closed and then
     consolidated and vacuumed using ``self.config``.  These two maintenance
     steps are skipped if an exception propagates out of the ``with`` body.
     """
     tiledb.DenseArray.create(uri, schema)
     with tiledb.DenseArray(uri, mode="w") as writable:
         yield writable
     # Merge the fragments written above, then delete the superseded ones.
     tiledb.consolidate(uri, config=self.config)
     tiledb.vacuum(uri, config=self.config)
예제 #13
0
def convert_ndarray_to_cxg_dense_array(ndarray_name, ndarray, ctx):
    """
    Store ``ndarray`` as a dense TileDB array at ``ndarray_name``.

    This is generally used for dataset embeddings.  Embeddings tend to be read
    in very large slices (or in full), so aggressive compression does not pay
    off: each dimension uses a tile extent of at most 1000 and the single
    attribute is compressed with a default-configured Zstd filter.  After the
    data has been written the array is consolidated.
    """
    value_attr = tiledb.Attr(
        dtype=ndarray.dtype,
        filters=tiledb.FilterList([tiledb.ZstdFilter()]),
    )

    # One uint32 dimension per axis, spanning 0 .. extent - 1.
    dims = []
    for axis in range(ndarray.ndim):
        extent = ndarray.shape[axis]
        dims.append(
            tiledb.Dim(domain=(0, extent - 1),
                       tile=min(extent, 1000),
                       dtype=np.uint32))

    tiledb.DenseArray.create(
        ndarray_name,
        tiledb.ArraySchema(domain=tiledb.Domain(*dims),
                           sparse=False,
                           attrs=[value_attr],
                           capacity=1_000_000,
                           cell_order="row-major",
                           tile_order="row-major"))

    with tiledb.DenseArray(ndarray_name, mode="w", ctx=ctx) as target:
        target[:] = ndarray

    tiledb.consolidate(ndarray_name, ctx=ctx)
예제 #14
0
def write_array():
    """
    Write a fixed 4 x 4 grid holding 1..16 (row by row) to ``array_name``.
    """
    grid = np.array([
        [1, 2, 3, 4],
        [5, 6, 7, 8],
        [9, 10, 11, 12],
        [13, 14, 15, 16],
    ])
    # The array is opened in write mode and the grid covers its whole domain.
    with tiledb.DenseArray(array_name, mode='w') as target:
        target[:] = grid
예제 #15
0
def threadtest_create_array(
    uri,
):
    """
    Create an array at ``uri`` shaped like 20 random floats and fill it.

    The schema is derived from the data with ``schema_like`` before the
    values are written.
    """
    values = np.random.rand(20)
    tiledb.Array.create(uri, tiledb.libtiledb.schema_like(values))
    with tiledb.DenseArray(uri, "w") as arr:
        arr[:] = values
예제 #16
0
def calculate_change(_input,
                     bands,
                     window,
                     x,
                     y,
                     tile_x_size,
                     tile_y_size,
                     output,
                     config=None):
    """
    Compute the change map for tile (``x``, ``y``) and store it in ``output``.

    The 'TDB_VALUES' attribute of the input array is read for the tile, and
    ``local_ccd`` is evaluated on ``window`` x ``window`` patches taken from
    the first two entries along the leading axis.  The finished tile is
    written to the same row/column range of the output array.  ``bands`` is
    accepted but not used here.  Always returns ``True``.
    """
    # assuming average reflectivities in the entire two images are ~ equal
    # https://prod-ng.sandia.gov/techlib-noauth/access-control.cgi/2014/1418179.pdf
    # noise terms are known and are zero (uavsar, extend as we add additional sensors)
    ctx = tiledb.Ctx(config=tiledb.Config(config))
    with tiledb.DenseArray(output, 'w', ctx=ctx) as arr_output, \
            tiledb.DenseArray(_input, 'r', ctx=ctx) as arr_input:
        row_start = y * tile_y_size
        row_stop = row_start + tile_y_size
        col_start = x * tile_x_size
        col_stop = col_start + tile_x_size

        values = arr_input.query(attrs=['TDB_VALUES'])[
            :, row_start:row_stop, col_start:col_stop]["TDB_VALUES"]
        result = np.ones((values.shape[1], values.shape[2]),
                         dtype=np.float32)

        row = 0
        while row < tile_y_size:
            row_end = row + window
            col = 0
            while col < tile_x_size:
                col_end = col + window
                first = values[0, row:row_end, col:col_end]
                second = values[1, row:row_end, col:col_end]

                # fill the matching patch of the in-memory result tile
                result[row:row_end, col:col_end] = local_ccd(first, second)

                col += window
            row += window

        # write out result tile
        arr_output[row_start:row_stop, col_start:col_stop] = result
    return True
예제 #17
0
    def execute(cls, ctx, op):
        """
        Read the region of a TileDB array covered by the output chunk.

        The region starts at ``op.axis_offsets`` shifted by
        ``op.tiledb_dim_starts`` and has the output chunk's shape.  Dense
        arrays are stored in ``ctx`` as returned by TileDB.  Sparse arrays are
        rebuilt as a ``SparseNDArray`` with coordinates made relative to the
        region start.

        Raises ``NotImplementedError`` for sparse arrays with more than two
        dimensions.
        """
        import tiledb
        from ..array_utils import array_module
        from ..utils import get_tiledb_ctx

        out_chunk = op.outputs[0]
        xp = array_module(op.gpu)

        starts = [
            offset + dim_start
            for offset, dim_start in zip(op.axis_offsets, op.tiledb_dim_starts)
        ]
        open_kwargs = dict(uri=op.tiledb_uri,
                           ctx=get_tiledb_ctx(op.tiledb_config),
                           key=op.tiledb_key,
                           timestamp=op.tiledb_timestamp)
        region = tuple(
            slice(starts[axis], starts[axis] + out_chunk.shape[axis])
            for axis in range(out_chunk.ndim))

        if not op.sparse:
            # read dense array from tiledb
            with tiledb.DenseArray(**open_kwargs) as tiledb_arr:
                ctx[out_chunk.key] = tiledb_arr[region]
            return

        # read sparse array from tiledb
        with tiledb.SparseArray(**open_kwargs) as tiledb_arr:
            if tiledb_arr.ndim > 2:
                raise NotImplementedError(
                    'Does not support to read array with more than 2 dimensions'
                )

            cells = tiledb_arr[region]
            coords = cells['coords']
            values = cells[tiledb_arr.attr(0).name]

            if tiledb_arr.ndim == 2:
                # 2-d: shift every coordinate column back to chunk-local indices
                indices = tuple(
                    coords[tiledb_arr.domain.dim(k).name] - starts[k]
                    for k in range(tiledb_arr.ndim))
                matrix = sps.coo_matrix((values, indices),
                                        shape=out_chunk.shape)
                ctx[out_chunk.key] = SparseNDArray(matrix)
            else:
                # 1-d: represent the vector as a single-row matrix
                row_index = xp.zeros(coords.shape)
                col_index = coords[tiledb_arr.domain.dim(0).name] - starts[0]
                matrix = sps.coo_matrix((values, (row_index, col_index)),
                                        shape=(1, ) + out_chunk.shape)
                ctx[out_chunk.key] = SparseNDArray(matrix,
                                                   shape=out_chunk.shape)
예제 #18
0
def test_ingest_csv_dense_array_apppend(
    udf_uri, array_name, key, secret, namespace, bucket, config
):
    """
    Ingest a CSV into a dense array, append a second CSV, and verify both.

    The first UDF call ingests the "increment" CSV with ``full_domain=True``
    and the array is expected to hold 20 rows.  The second call appends the
    CSV named after ``array_name`` starting at that row count, after which
    attributes "a", "b" and "c" must each contain the 20-row pattern twice.
    """
    target_array = "tiledb://{}/s3://{}/{}.tdb".format(
        namespace, bucket, array_name
    )
    registered_uri = "tiledb://{}/{}.tdb".format(namespace, array_name)

    tiledb.cloud.udf.exec(
        "s3://{}/inputs/{}.csv".format(bucket, "increment"),
        target_array,
        key,
        secret,
        mode="ingest",
        full_domain=True,
        sparse=False,
        name=udf_uri,  # "unittest/test_ingest_csv" --> TileDB-Inc/ingest_csv
    )

    time.sleep(10)

    with tiledb.DenseArray(registered_uri, "r", ctx=tiledb.Ctx(config)) as arr:
        frame = pd.DataFrame(arr[:])
        number_of_rows = frame.shape[0]
        assert number_of_rows == 20

    tiledb.cloud.udf.exec(
        "s3://{}/inputs/{}.csv".format(bucket, array_name),
        target_array,
        key,
        secret,
        mode="append",
        row_start_idx=number_of_rows,
        name=udf_uri,  # unittest/test_ingest_csv --> TileDB-Inc/ingest_csv
    )

    time.sleep(10)

    with tiledb.DenseArray(registered_uri, "r", ctx=tiledb.Ctx(config)) as arr:
        frame = pd.DataFrame(arr[:])

        for offset, column in enumerate(("a", "b", "c"), 1):
            actual = frame[column]
            wanted = np.array(
                [row * 10 + offset for row in range(1, 21)] * 2
            )
            assert_array_equal(actual, wanted)
예제 #19
0
def main():
    """
    Copy a CXG directory and re-save its X array through ``cxgtool.save_X``.

    Everything except "X" and "X_col_shift" is copied from the input to the
    output directory.  An existing output directory is only replaced when
    ``--overwrite`` is given.  If ``save_X`` reports that the array is not
    sparse, the output directory is removed and the process exits with
    status 1.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("input", help="input cxg directory")
    cli.add_argument("output", help="output cxg directory")
    cli.add_argument("--overwrite",
                     action="store_true",
                     help="replace output cxg directory")
    cli.add_argument("--verbose",
                     "-v",
                     action="count",
                     default=0,
                     help="verbose output")
    cli.add_argument(
        "--sparse-threshold",
        "-s",
        type=float,
        default=5.0,  # default is 5% non-zero values
        help=
        "The X array will be sparse if the percent of non-zeros falls below this value",
    )
    options = cli.parse_args()

    # The output directory is dealt with before the input is validated.
    if os.path.exists(options.output):
        print("output dir exists:", options.output)
        if not options.overwrite:
            print("use the overwrite option to remove the output directory")
            sys.exit(1)
        print("output dir removed:", options.output)
        shutil.rmtree(options.output)

    if not os.path.isdir(options.input):
        print("input is not a directory", options.input)
        sys.exit(1)

    shutil.copytree(options.input,
                    options.output,
                    ignore=shutil.ignore_patterns("X", "X_col_shift"))

    tiledb_settings = {
        "sm.num_reader_threads": 32,
        "sm.num_writer_threads": 32,
        "sm.consolidation.buffer_size": 1 * 1024 * 1024 * 1024,
    }
    ctx = tiledb.Ctx(tiledb_settings)

    source_x = os.path.join(options.input, "X")
    with tiledb.DenseArray(source_x, mode="r", ctx=ctx) as X_in:
        is_sparse = cxgtool.save_X(options.output,
                                   X_in,
                                   ctx,
                                   options.sparse_threshold,
                                   expect_sparse=True)

    if is_sparse is False:
        print("The array is not sparse, cleaning up, abort.")
        shutil.rmtree(options.output)
        sys.exit(1)
예제 #20
0
def save_embeddings(container, adata, ctx):
    """
    Write every valid embedding in ``adata.obsm`` to its own dense array.

    Each accepted entry is stored at ``<container>/<name without its first
    two characters>``: the array is created, filled with the embedding
    values, and consolidated.  Entries rejected by ``is_valid_embedding``
    are skipped.
    """
    for name, value in adata.obsm.items():
        if not is_valid_embedding(adata, name, value):
            continue
        e_name = f"{container}/{name[2:]}"
        create_emb(e_name, value)
        with tiledb.DenseArray(e_name, mode="w", ctx=ctx) as target:
            target[:] = value
        tiledb.consolidate(e_name, ctx=ctx)
        log(1, f"\t\t...{name} embedding created")
예제 #21
0
def get_loss_weights(tdb_path,chrom,label_attribute,ambig_attribute,upsample_attribute,tdb_partition_thresh_for_upsample):
    """
    Read every value of the per-chromosome array ``<tdb_path>.<chrom>``.

    The array is opened read-only with the default kerasAC TileDB config and
    is closed again as soon as the values have been loaded.

    NOTE(review): as visible here the function neither uses the attribute or
    threshold parameters nor returns the loaded values; the weight
    computation appears to be missing from this excerpt -- confirm against
    the full source.
    """
    import tiledb
    from kerasAC.tiledb_config import get_default_config
    tdb_config=get_default_config()
    ctx=tiledb.Ctx(tdb_config)
    array_uri=tdb_path+"."+chrom
    # Context manager so the array handle is released even if the read fails.
    with tiledb.DenseArray(array_uri,mode='r',ctx=ctx) as tdb_array:
        print("opened:"+array_uri+" for reading")
        vals=tdb_array[:]
    print("got tdb vals")
예제 #22
0
def exec_and_fetch(
    query,
    output_uri,
    output_schema=None,
    namespace=None,
    task_name=None,
    output_array_name=None,
    init_commands=None,
    parameters=None,
):
    """
    Run a sql query and open the array that holds its results
    :param str query: query to run
    :param str output_uri: array to store results to, must be either a tiledb:// for an already registered array or a s3:// if passing a new schema to create new output array
    :param tiledb.ArraySchema output_schema: array schema to create output array with
    :param str namespace: optional namespace to charge the query to
    :param str task_name: optional name to assign the task for logging and audit purposes
    :param str output_array_name: optional name for registering new output array if output_schema schema is passed
    :param list init_commands: optional list of sql queries or commands to run before main query
    :param list parameters: optional list of sql parameters for use in query

    :return: TileDB Array with results (sparse or dense, matching the output array's schema)
    """

    # Without an explicit namespace the query is charged to the current user;
    # the profile is fetched only when it has not been cached yet.
    if namespace is None:
        if config.user is None:
            config.user = client.user_profile()
        namespace = config.user.username

    try:
        # NOTE(review): presumably the SQL ``exec`` defined alongside this
        # function rather than the builtin -- confirm.
        exec(
            query=query,
            output_uri=output_uri,
            output_schema=output_schema,
            namespace=namespace,
            task_name=task_name,
            output_array_name=output_array_name,
            init_commands=init_commands,
            parameters=parameters,
        )

        # The stored schema tells us which array class to open the result with.
        result_schema = tiledb.ArraySchema.load(output_uri, ctx=client.Ctx())
        array_type = (
            tiledb.SparseArray if result_schema.sparse else tiledb.DenseArray
        )
        return array_type(output_uri, ctx=client.Ctx())

    except GenApiException as exc:
        raise tiledb_cloud_error.check_exc(exc) from None
예제 #23
0
def process_chrom(data_dict,attribute_info,chrom,size,array_out_name,updating,args):
    """
    Parse every attribute for one chromosome and write it to ``array_out_name``.

    Each attribute in ``data_dict`` is parsed with the parser registered in
    ``attribute_info`` over positions ``0..size``.  When ``updating`` is
    ``True`` the current array contents are read first and only the parsed
    attributes are replaced; otherwise attributes missing from the parsed set
    are filled with NaN arrays of length ``size``.  The result is written in
    one piece, or in pieces of ``args.write_chunk`` positions when that
    option is set.
    """
    attribute_config = args.attribute_config
    dict_to_write = OrderedDict()

    for attribute in data_dict:
        info = attribute_info[attribute]
        dict_to_write[attribute] = info['parser'](
            data_dict[attribute], chrom, 0, size, info)
        print("got:"+str(attribute)+" for chrom:"+str(chrom))

    if updating is True:
        # Only some attributes change: start from what is stored already.
        with tiledb.DenseArray(array_out_name, mode='r', ctx=tdb_Context) as cur_array:
            existing = cur_array[:]
        del cur_array
        print('got cur vals for'+array_out_name)
        for key, parsed in dict_to_write.items():
            existing[key] = parsed
        dict_to_write = existing
        print("updated data dict for writing:"+array_out_name)
    else:
        # First write: every required attribute must be present, so the
        # missing ones are filled with NaN.
        for attrib in list(get_attribute_info(attribute_config).keys()):
            if attrib not in dict_to_write:
                dict_to_write[attrib] = np.full(size, np.nan)

    with tiledb.DenseArray(array_out_name, ctx=tdb_Context, mode='w') as out_array:
        step = args.write_chunk
        if step is None:
            # write the full chromosome
            out_array[:] = dict_to_write
        else:
            # write in chunks
            for start_pos in range(0, size+step, step):
                if not start_pos < size:
                    continue
                end_pos = min([size, start_pos+step])
                out_array[start_pos:end_pos] = get_subdict(dict_to_write, start_pos, end_pos)
                print("wrote:"+str(start_pos)+"-"+str(end_pos)+ " for:"+array_out_name)
    del out_array
    gc.collect()
    print("wrote to disk:"+array_out_name)
예제 #24
0
파일: test_tiledb.py 프로젝트: ofnote/aos
def read_array():
    """
    Print attribute "a" of a sub-region of ``array_name`` and dump its schema.
    """
    with tiledb.DenseArray(array_name, mode='r') as arr:
        # Rows 1-2 and columns 2-4 (slice upper bounds are exclusive).
        window = arr[1:3, 2:5]
        print(window["a"])

        arr.schema.dump()
예제 #25
0
def ccd(_input, bands, output=None, config=None, neighbourhood=7, overlap=1):
    """
    Run change detection over every tile of ``_input`` and return the output URI.

    When ``output`` is ``None`` or does not exist on disk, a dense float32
    array with attribute "c" is created using the height, width and tile
    extents of dimensions 1 and 2 of the input array; a missing ``output``
    name is derived from ``_input`` plus a random four-character suffix.
    One ``calculate_change`` task per tile is then submitted to the
    module-level ``client`` and gathered.

    :param _input: URI of the input array.
    :param bands: the two band indexes to compare; must have length 2.
    :param output: URI of the result array, created if necessary.
    :param config: TileDB configuration, also passed to dask as storage options.
    :param neighbourhood: window size handed to ``calculate_change``.
    :param overlap: accepted but not used by this function.
    :return: URI of the output array.
    :raises IndexError: if ``bands`` does not contain exactly two entries.
    """
    if len(bands) != 2:
        raise IndexError('CCD function requires two band indexes')

    if output is None or not os.path.exists(output):
        cfg = tiledb.Config(config)
        ctx = tiledb.Ctx(config=cfg)
        with tiledb.DenseArray(_input, 'r', ctx=ctx) as arr:
            y_dim = arr.schema.domain.dim(1)
            x_dim = arr.schema.domain.dim(2)
            height = y_dim.size
            width = x_dim.size
            tile_y_size = y_dim.tile
            tile_x_size = x_dim.tile

        dom = tiledb.Domain(
            tiledb.Dim(domain=(0, height - 1),
                       tile=tile_y_size,
                       dtype=np.uint64),
            tiledb.Dim(domain=(0, width - 1),
                       tile=tile_x_size,
                       dtype=np.uint64))

        schema = tiledb.ArraySchema(
            domain=dom,
            sparse=False,
            attrs=[tiledb.Attr(name="c", dtype=np.float32)],
            ctx=ctx)
        if output is None:
            output = _input + '_result_' + ''.join(
                random.choice(string.ascii_uppercase + string.digits)
                for _ in range(4))  # noqa

        tiledb.DenseArray.create(output, schema)

    source = da.from_tiledb(_input, storage_options=config)
    _, h, w = source.shape
    _, tile_y_size, tile_x_size = source.chunksize

    # w and h are an exact multiple of tile size
    n_tiles_x = w // tile_x_size
    # Rows are counted with the row tile size; dividing by the column tile
    # size miscounts the tiles whenever the tiles are not square.
    n_tiles_y = h // tile_y_size

    # manually chunk and collect
    futures = [
        client.submit(calculate_change, _input, bands, neighbourhood,
                      tile_x, tile_y, tile_x_size, tile_y_size, output,
                      config)
        for tile_y in range(n_tiles_y)
        for tile_x in range(n_tiles_x)
    ]
    client.gather(futures)
    return output
예제 #26
0
def write_tiledb(arr, path, overwrite=True):
    """Store a 1-d or 2-d array as a dense float32 tiledb array at ``path``.

    An existing directory at ``path`` is removed first when ``overwrite`` is
    true.  If anything still exists at ``path`` afterwards (a file, or a
    directory while ``overwrite`` is false) ``FileExistsError`` is raised.
    Arrays with any other number of dimensions raise ``ValueError``.
    """
    if overwrite and os.path.isdir(path):
        shutil.rmtree(path)

    if os.path.exists(path):
        raise FileExistsError("Output path {} already exists".format(path))

    ctx = tiledb.Ctx()

    genome_length = arr.shape[0]
    dims = [
        tiledb.Dim(ctx,
                   GENOME_DOMAIN_NAME,
                   domain=(0, genome_length - 1),
                   tile=min(DEFAULT_GENOME_TILE_EXTENT, genome_length),
                   dtype="uint32")
    ]

    if arr.ndim == 2:
        # The second axis is kept in a single tile spanning its full width.
        width = arr.shape[1]
        dims.append(
            tiledb.Dim(ctx,
                       SECONDARY_DOMAIN_NAME,
                       domain=(0, width - 1),
                       tile=width,
                       dtype="uint32"))
    elif arr.ndim != 1:
        raise ValueError("tiledb backend only supports 1D or 2D arrays")

    domain = tiledb.Domain(ctx, *dims)

    value_attr = tiledb.Attr(
        ctx,
        GENOME_VALUE_NAME,
        compressor=(DEFAULT_COMPRESSOR, DEFAULT_COMPRESSOR_LEVEL),
        dtype="float32",
    )

    schema = tiledb.ArraySchema(ctx,
                                domain=domain,
                                attrs=(value_attr, ),
                                cell_order="row-major",
                                tile_order="row-major")
    tiledb.DenseArray.create(path, schema)

    values = arr.astype(np.float32)

    with tiledb.DenseArray(ctx, path, mode="w") as target:
        target[:] = {GENOME_VALUE_NAME: values}
예제 #27
0
def test_store_tile_db():
    """Check that ``totiledb`` creates the TileDB array only when tiled.

    Before tiling, the returned tensor has shape ``(0, 0)`` and opening the
    target URI fails with ``tiledb.TileDBError``. After tiling, the array
    can be opened and the result is a single chunk whose inputs carry the
    expected axis offsets. A tensor of incompatible shape written to the
    same URI must raise ``ValueError``.
    """
    ctx = tiledb.Ctx()
    tempdir = tempfile.mkdtemp()
    try:
        t = random.rand(50, 30, chunk_size=13)
        t2 = t + 1

        saved = totiledb(tempdir, t2)
        assert saved.shape == (0, 0)
        assert saved.op.tiledb_config is None
        assert saved.op.tiledb_uri == tempdir

        with pytest.raises(tiledb.TileDBError):
            tiledb.DenseArray(ctx=ctx, uri=tempdir)

        # tiledb array is created in the tile
        saved = tile(saved)

        # no error; close the handle immediately so it is not left open
        # while the directory is removed in the ``finally`` below
        with tiledb.DenseArray(ctx=ctx, uri=tempdir):
            pass

        # TileDB consolidation
        assert len(saved.chunks) == 1

        # 50x30 with chunk_size=13 gives a 4x3 chunk grid, so flat input
        # index i corresponds to grid position (i // 3, i % 3).
        inputs = saved.chunks[0].inputs
        assert inputs[0].op.axis_offsets == (0, 0)
        assert inputs[1].op.axis_offsets == (0, 13)
        assert inputs[2].op.axis_offsets == (0, 26)  # input (0, 2)
        assert inputs[5].op.axis_offsets == (13, 26)  # input (1, 2)
        assert inputs[11].op.axis_offsets == (39, 26)  # input (3, 2)

        # Build the tensor outside ``pytest.raises`` so only the store call
        # itself can satisfy the expected error.
        t3 = random.rand(30, 50)
        with pytest.raises(ValueError):
            totiledb(tempdir, t3, ctx=ctx)  # shape incompatible
    finally:
        shutil.rmtree(tempdir)
예제 #28
0
def fromtiledb(uri, ctx=None, key=None, timestamp=None, gpu=False):
    """Create a tensor whose data source is an existing TileDB array.

    The array at ``uri`` is opened only to read its schema metadata (dense
    or sparse, attribute dtype, dimension starts, tile extents, cell order
    and shape). The handle is closed again before the operand is built.

    Args:
        uri: URI of the TileDB array.
        ctx: Optional ``tiledb.Ctx``. When omitted, a fresh context is used
            for reading the metadata and the operand's ``tiledb_config`` is
            ``None``; otherwise the context's config dict is passed on.
        key: Passed through to the TileDB array constructor and the operand.
        timestamp: Passed through to the TileDB array constructor.
        gpu: Forwarded to the operand.

    Returns:
        The result of calling the ``TensorTileDBDataSource`` operand with
        the array's shape, a chunk size taken from each dimension's tile
        extent, and C or F order depending on the schema's cell order.

    Raises:
        NotImplementedError: If the array has more than one attribute.
        ValueError: If any dimension's domain start is a float.
    """
    import tiledb

    raw_ctx = ctx
    if raw_ctx is None:
        ctx = tiledb.Ctx()

    # get metadata from tiledb
    open_kwargs = dict(uri=uri, ctx=ctx, key=key, timestamp=timestamp)
    try:
        tiledb_arr = tiledb.DenseArray(**open_kwargs)
        sparse = False
    except ValueError:
        # if the array is not dense, ValueError will be raised by tiledb
        tiledb_arr = tiledb.SparseArray(**open_kwargs)
        sparse = True

    # Everything needed from the array is read inside this block so the
    # handle can be released on both the success and the error paths.
    try:
        if tiledb_arr.nattr > 1:
            raise NotImplementedError("Does not supported TileDB array schema "
                                      "with more than 1 attr")
        tiledb_dim_starts = tuple(
            tiledb_arr.domain.dim(j).domain[0].item()
            for j in range(tiledb_arr.ndim))
        if any(isinstance(s, float) for s in tiledb_dim_starts):
            raise ValueError("Does not support TileDB array schema "
                             "whose dimensions has float domain")

        dtype = tiledb_arr.attr(0).dtype
        tensor_order = (TensorOrder.C_ORDER if tiledb_arr.schema.cell_order
                        == "row-major" else TensorOrder.F_ORDER)
        chunk_size = tuple(
            int(tiledb_arr.domain.dim(i).tile)
            for i in range(tiledb_arr.domain.ndim))
        shape = tiledb_arr.shape
    finally:
        tiledb_arr.close()

    # Only forward a config when the caller supplied a context explicitly.
    tiledb_config = None if raw_ctx is None else ctx.config().dict()
    op = TensorTileDBDataSource(
        tiledb_config=tiledb_config,
        tiledb_uri=uri,
        tiledb_key=key,
        # NOTE(review): "tiledb_timstamp" looks like a misspelling of
        # "tiledb_timestamp"; confirm against TensorTileDBDataSource's
        # declared fields before renaming, since the class is defined
        # elsewhere.
        tiledb_timstamp=timestamp,
        tiledb_dim_starts=tiledb_dim_starts,
        gpu=gpu,
        sparse=sparse,
        dtype=dtype,
    )
    return op(shape, chunk_size=chunk_size, order=tensor_order)
예제 #29
0
def create_test_array_dense_25x12_mult(temp_rootdir):
    """
    Build a dense 25x12 test array that is written at two timestamps.

    The array is created in the "dense_25x12_mult" directory under
    ``temp_rootdir`` with int64 dimensions "row" (1..25) and "col" (1..12)
    and two float64 attributes, "a" and "b". The write at timestamp 1
    stores the values 0..299 in both attributes; the write at timestamp 2
    stores those values halved in "a" and doubled in "b".
    """
    array_path = os.path.abspath(
        os.path.join(temp_rootdir, "dense_25x12_mult"))
    context = tiledb.default_ctx()

    # Schema: two 1-based integer dimensions and two float attributes.
    dims = [
        tiledb.Dim(label, ctx=context, domain=(1, extent), dtype=np.int64)
        for label, extent in (("row", 25), ("col", 12))
    ]
    domain = tiledb.Domain(*dims, ctx=context)
    attributes = tuple(
        tiledb.Attr(name=label, ctx=context, dtype=np.float64)
        for label in ("a", "b"))
    array_schema = tiledb.ArraySchema(ctx=context,
                                      domain=domain,
                                      attrs=attributes)

    tiledb.DenseArray.create(array_path, array_schema)

    base = np.arange(300).reshape(25, 12)

    # One full-array write per timestamp: (timestamp, "a" values, "b" values).
    writes = (
        (1, base, base),
        (2, base / 2, base * 2),
    )
    for stamp, a_values, b_values in writes:
        with tiledb.DenseArray(array_path, mode="w", timestamp=stamp) as array:
            array[:] = {"a": a_values, "b": b_values}
예제 #30
0
def convert_dataframe_to_cxg_array(cxg_container, dataframe_name, dataframe,
                                   index_column_name, ctx):
    """
    Store a dataframe as a dense TileDB array inside a CXG container.

    The array is created at "<cxg_container>/<dataframe_name>" with one
    attribute per dataframe column and a single uint32 row dimension. Reads
    are expected to pull very large slices of one attribute at a time, and
    the column data is often repetitive (bools, categories, strings), so the
    row tile is large (up to 1000 rows) and compression is very aggressive.

    After the column data is written, a JSON document is stored under the
    "cxg_schema" metadata key. It holds the per-column schema hints plus an
    "index" entry naming ``index_column_name``. The array is consolidated
    at the end.
    """
    row_count = dataframe.shape[0]

    # Zstd at its highest level: these columns are frequently repetitive
    # strings, bools and other non-float data.
    compression = tiledb.FilterList([
        tiledb.ZstdFilter(level=22),
    ])

    column_attrs = []
    for column in dataframe:
        column_attrs.append(
            tiledb.Attr(name=column,
                        dtype=get_dtype_of_array(dataframe[column]),
                        filters=compression))

    row_dim = tiledb.Dim(domain=(0, row_count - 1),
                         tile=min(row_count, 1000),
                         dtype=np.uint32)
    array_schema = tiledb.ArraySchema(domain=tiledb.Domain(row_dim),
                                      sparse=False,
                                      attrs=column_attrs,
                                      cell_order="row-major",
                                      tile_order="row-major")

    target_uri = f"{cxg_container}/{dataframe_name}"
    tiledb.DenseArray.create(target_uri, array_schema)

    with tiledb.DenseArray(target_uri, mode="w", ctx=ctx) as array:
        columns = {}
        hints_by_column = {}
        for column_name, column_values in dataframe.items():
            dtype, hints = get_dtype_and_schema_of_array(column_values)
            columns[column_name] = column_values.to_numpy(dtype=dtype)
            # Only columns that actually produced hints are recorded.
            if hints:
                hints_by_column[column_name] = hints

        hints_by_column["index"] = index_column_name
        array[:] = columns
        array.meta["cxg_schema"] = json.dumps(hints_by_column)

    tiledb.consolidate(target_uri, ctx=ctx)