Example #1
def test_dataset_create_table(tmp_path, dataset_chunks, dtype):
    datasets = []
    names = []
    datas = []
    row_sum = 0

    for chunks in dataset_chunks:
        shapes = {k: sum(c) for k, c in chunks.items()}
        row_sum += shapes['row']

        # Make some visibilities
        dims = ("row", "chan", "corr")
        shape = tuple(shapes[d] for d in dims)
        data_chunks = tuple(chunks[d] for d in dims)
        data = da.random.random(shape, chunks=data_chunks).astype(dtype)
        data_var = Variable(dims, data, {})

        # Make some string names
        dims = ("row", )
        shape = tuple(shapes[d] for d in dims)
        str_chunks = tuple(chunks[d] for d in dims)
        np_str_array = np.asarray(["BOB"] * shape[0], dtype=object)
        da_str_array = da.from_array(np_str_array, chunks=str_chunks)
        str_array_var = Variable(dims, da_str_array, {})

        datasets.append(Dataset({"DATA": data_var, "NAMES": str_array_var}))
        datas.append(data)
        names.extend(np_str_array.tolist())

    freq = da.linspace(.856e9, 2 * .856e9, 64, chunks=16)
    sub_datasets = [Dataset({"FREQ": (("row", "chan"), freq[None, :])})]

    # Write the data to new tables
    table_name = os.path.join(str(tmp_path), 'test.table')
    writes = write_datasets(table_name, datasets, ["DATA", "NAMES"])
    subt_writes = write_datasets(table_name + "::SPW", sub_datasets, ["FREQ"])
    dask.compute(writes, subt_writes)

    # Check written data
    with pt.table(table_name, readonly=True, lockoptions='auto',
                  ack=False) as T:
        assert row_sum == T.nrows()
        assert_array_equal(T.getcol("DATA"), np.concatenate(datas))
        assert_array_equal(T.getcol("NAMES"), names)

    # Sub-table correctly linked and populated
    with pt.table(table_name + "::SPW",
                  readonly=True,
                  lockoptions='auto',
                  ack=False) as T:
        assert T.nrows() == 1
        assert_array_equal(T.getcol("FREQ")[0], freq)
Example #2
def test_variable_column_descriptor(chunks, dtype, tmp_path):
    column_meta = []
    shapes = {k: sum(c) for k, c in chunks.items()}

    # Make some visibilities
    dims = ("row", "chan", "corr")
    shape = tuple(shapes[d] for d in dims)
    data_chunks = tuple(chunks[d] for d in dims)
    data = da.random.random(shape, chunks=data_chunks).astype(dtype)
    data_var = Variable(dims, data, {})
    meta = variable_column_descriptor("DATA", data_var)
    column_meta.append({"name": "DATA", "desc": meta})

    # Make some string names
    dims = ("row", )
    shape = tuple(shapes[d] for d in dims)
    str_chunks = tuple(chunks[d] for d in dims)
    np_str_array = np.asarray(["BOB"] * shape[0], dtype=object)
    da_str_array = da.from_array(np_str_array, chunks=str_chunks)
    str_array_var = Variable(dims, da_str_array, {})
    meta = variable_column_descriptor("NAMES", str_array_var)
    column_meta.append({"name": "NAMES", "desc": meta})

    # Create a new table with the column metadata
    fn = os.path.join(str(tmp_path), "test.ms")
    tabdesc = pt.maketabdesc(column_meta)

    with pt.table(fn, tabdesc, readonly=False, ack=False) as T:
        # Add rows
        T.addrows(shapes['row'])

        str_list = np_str_array.tolist()

        # Put data
        T.putcol("DATA", data.compute())
        T.putcol("NAMES", str_list)

        # We get out what we put in
        assert_array_equal(T.getcol("NAMES"), str_list)
        assert_array_equal(T.getcol("DATA"), data)
Example #3
def test_ms_subtable_builder(tmp_path, table):
    A = da.zeros((10, 20, 30), chunks=(2, 20, 30), dtype=np.int32)
    variables = {"FOO": Variable(("row", "chan", "corr"), A, {})}
    var_names = set(variables.keys())

    builder = MSSubTableDescriptorBuilder(table)
    default_desc = builder.default_descriptor()
    tab_desc = builder.descriptor(variables, default_desc)
    dminfo = builder.dminfo(tab_desc)

    # These columns must always be present on an MS
    required_cols = {
        k
        for k in pt.required_ms_desc(table).keys() if not k.startswith('_')
    }

    filename = str(tmp_path / f"{table}.table")

    with pt.table(filename, tab_desc, dminfo=dminfo, ack=False) as T:
        T.addrows(10)

        # We got required + the extra columns we asked for
        assert set(T.colnames()) == set.union(var_names, required_cols)
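
The `table` argument is a pytest fixture naming the MS subtable being built.
A hypothetical parametrisation, with subtable names chosen purely for
illustration:

import pytest


@pytest.fixture(params=["SPECTRAL_WINDOW", "ANTENNA", "FEED"])
def table(request):
    return request.param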
Example #4
def _variable_factory(dims, dtype):
    shape = tuple(sum(chunks[d]) for d in dims)
    achunks = tuple(chunks[d] for d in dims)
    dask_array = da.random.random(shape, chunks=achunks).astype(dtype)
    return Variable(dims, dask_array, {})
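
`chunks` is a free variable captured from the enclosing test scope. A hedged
sketch of driving the factory above, with an illustrative `chunks` dict:

import numpy as np

# Assumed to exist in the enclosing scope of _variable_factory
chunks = {"row": (2, 2), "chan": (4,), "corr": (2, 2)}

vis = _variable_factory(("row", "chan", "corr"), np.complex64)
weight = _variable_factory(("row", "corr"), np.float32)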
Example #5
def xds_from_zarr(store, columns=None, chunks=None, **kwargs):
    """
    Reads the zarr data store in `store` and returns list of
    Dataset's containing the data.

    Parameters
    ----------
    store : str or Path
        Path containing the data
    columns : list of str or str or None
        Columns to read. `None` or `"ALL"` stores all columns on each dataset.
        Otherwise, a list of columns should be supplied.
    chunks: dict or list of dicts
        chunking schema for each dataset
    **kwargs: optional

    Returns
    -------
    writes : Dataset or list of Datasets
        Dataset(s) representing write operations
    """

    if isinstance(store, DaskMSStore):
        pass
    elif isinstance(store, (Path, str)):
        store = DaskMSStore(f"{store}", **kwargs.pop("storage_options", {}))
    else:
        raise TypeError(f"store '{store}' must be "
                        f"Path, str or DaskMSStore")

    # If any kwargs are added, they should be popped prior to this check.
    if len(kwargs) > 0:
        warnings.warn(
            f"The following unsupported kwargs were ignored in "
            f"xds_from_zarr: {kwargs}", UserWarning)

    columns = promote_columns(columns)

    if chunks is None:
        pass
    elif isinstance(chunks, (tuple, list)):
        if not all(isinstance(v, dict) for v in chunks):
            raise TypeError("chunks must be None, a dict or a list of dicts")
    elif isinstance(chunks, dict):
        chunks = [chunks]
    else:
        raise TypeError("chunks must be None, a dict or a list of dicts")

    datasets = []
    numpy_vars = []

    # NOTE(JSKenyon): Iterating over all the zarr groups/arrays is VERY
    # expensive if the metadata has not been consolidated.
    zc.consolidate_metadata(store.map)
    table_path = store.table if store.table else "MAIN"
    table_group = zarr.open_consolidated(store.map)[table_path]

    for g, (group_name,
            group) in enumerate(sorted(table_group.groups(),
                                       key=group_sortkey)):
        group_attrs = decode_attr(dict(group.attrs))
        dask_ms_attrs = group_attrs.pop(DASKMS_ATTR_KEY)
        natural_chunks = dask_ms_attrs["chunks"]
        group_chunks = {d: tuple(dc) for d, dc in natural_chunks.items()}

        if chunks:
            # Defer to user-supplied chunking strategy
            try:
                group_chunks.update(chunks[g])
            except IndexError:
                group_chunks.update(chunks[-1])  # Reuse last chunking.

        data_vars = {}
        coords = {}

        for name, zarray in column_iterator(group, columns):
            attrs = decode_attr(dict(zarray.attrs[DASKMS_ATTR_KEY]))
            dims = attrs["dims"]
            coordinate = attrs.get("coordinate", False)
            array_chunks = tuple(
                group_chunks.get(d, s) for d, s in zip(dims, zarray.shape))

            array_chunks = da.core.normalize_chunks(array_chunks, zarray.shape)
            ext_args = extent_args(dims, array_chunks)
            token_name = f"read~{name}-{tokenize(zarray, *ext_args)}"

            read = da.blockwise(zarr_getter,
                                dims,
                                zarray,
                                None,
                                *ext_args,
                                concatenate=False,
                                name=token_name,
                                meta=np.empty((0, ) * zarray.ndim,
                                              zarray.dtype))

            read = inlined_array(read, ext_args[::2])
            var = Variable(dims, read, attrs)
            (coords if coordinate else data_vars)[name] = var

            # Save numpy arrays for reification
            typ = decode_type(attrs["array_type"])

            if typ is np.ndarray:
                numpy_vars.append(var)
            elif typ is da.Array:
                pass
            else:
                raise TypeError(f"Unknown array_type '{attrs['array_type']}'")

        datasets.append(Dataset(data_vars, coords=coords, attrs=group_attrs))

    # Reify any numpy arrays directly into their variables
    for v, a in zip(numpy_vars, dask.compute(v.data for v in numpy_vars)[0]):
        v.data = a

    return datasets
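
A hedged usage sketch of the function above, assuming the returned Datasets
expose their variables via `data_vars`; the store path, column names and
chunking are illustrative only:

# Read two columns from a hypothetical zarr store, re-chunking rows
datasets = xds_from_zarr(
    "/data/example.zarr",
    columns=["DATA", "TIME"],
    chunks={"row": 10_000},
)

for ds in datasets:
    # Each variable wraps a dask array backed by the zarr store
    data = ds.data_vars["DATA"].data
    print(data.shape, data.chunks)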
Example #6
def xds_from_zarr(store, columns=None, chunks=None):
    """
    Reads the zarr data store in `store` and returns list of
    Dataset's containing the data.

    Parameters
    ----------
    store : str or Path
        Path containing the data
    columns : list of str or str or None
        Columns to read. `None` or `"ALL"` stores all columns on each dataset.
        Otherwise, a list of columns should be supplied.
    chunks: dict or list of dicts
        chunking schema for each dataset

    Returns
    -------
    writes : Dataset or list of Datasets
        Dataset(s) representing write operations
    """
    store, table = store_path_split(store)

    if isinstance(store, Path):
        store = str(store)

    if not isinstance(store, str):
        raise TypeError("store must be a Path, str")

    columns = promote_columns(columns)

    if chunks is None:
        pass
    elif isinstance(chunks, (tuple, list)):
        if not all(isinstance(v, dict) for v in chunks):
            raise TypeError("chunks must be None, a dict or a list of dicts")
    elif isinstance(chunks, dict):
        chunks = [chunks]
    else:
        raise TypeError("chunks must be None, a dict or a list of dicts")

    datasets = []
    numpy_vars = []

    table_group = zarr.open(store)[table]

    for g, (group_name, group) in enumerate(sorted(table_group.groups())):
        group_attrs = decode_attr(dict(group.attrs))
        dask_ms_attrs = group_attrs.pop(DASKMS_ATTR_KEY)
        natural_chunks = dask_ms_attrs["chunks"]
        group_chunks = {d: tuple(dc) for d, dc in natural_chunks.items()}

        if chunks:
            # Defer to user-supplied chunking strategy
            try:
                group_chunks.update(chunks[g])
            except IndexError:
                pass

        data_vars = {}
        coords = {}

        for name, zarray in column_iterator(group, columns):
            attrs = decode_attr(dict(zarray.attrs[DASKMS_ATTR_KEY]))
            dims = attrs["dims"]
            coordinate = attrs.get("coordinate", False)
            array_chunks = tuple(
                group_chunks.get(d, s) for d, s in zip(dims, zarray.shape))

            array_chunks = da.core.normalize_chunks(array_chunks, zarray.shape)
            ext_args = extent_args(dims, array_chunks)
            token_name = f"read~{name}-{tokenize(zarray, *ext_args)}"

            read = da.blockwise(zarr_getter,
                                dims,
                                zarray,
                                None,
                                *ext_args,
                                concatenate=False,
                                name=token_name,
                                meta=np.empty((0, ) * zarray.ndim,
                                              zarray.dtype))

            read = inlined_array(read, ext_args[::2])
            var = Variable(dims, read, attrs)
            (coords if coordinate else data_vars)[name] = var

            # Save numpy arrays for reification
            typ = decode_type(attrs["array_type"])

            if typ is np.ndarray:
                numpy_vars.append(var)
            elif typ is da.Array:
                pass
            else:
                raise TypeError(f"Unknown array_type '{attrs['array_type']}'")

        datasets.append(Dataset(data_vars, coords=coords, attrs=group_attrs))

    # Reify any numpy arrays directly into their variables
    for v, a in zip(numpy_vars, dask.compute(v.data for v in numpy_vars)[0]):
        v.data = a

    return datasets
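
Both versions accept `chunks` as either a single dict or a list of
per-dataset dicts. A hedged sketch with illustrative values; a single dict is
promoted to a one-element list, which Example #6 applies only to the first
group while Example #5 reuses the last entry for the remaining groups:

# Single chunking dict
datasets = xds_from_zarr("/data/example.zarr", chunks={"row": 5_000})

# Per-dataset chunking: entry g applies to group g
datasets = xds_from_zarr(
    "/data/example.zarr",
    chunks=[{"row": 5_000}, {"row": 2_500, "chan": 64}],
)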