Example #1
def test_dataset_create_table(tmp_path, dataset_chunks, dtype):
    datasets = []
    names = []
    datas = []
    row_sum = 0

    for chunks in dataset_chunks:
        shapes = {k: sum(c) for k, c in chunks.items()}
        row_sum += shapes['row']

        # Make some visibilities
        dims = ("row", "chan", "corr")
        shape = tuple(shapes[d] for d in dims)
        data_chunks = tuple(chunks[d] for d in dims)
        data = da.random.random(shape, chunks=data_chunks).astype(dtype)
        data_var = Variable(dims, data, {})

        # Make some string names
        dims = ("row", )
        shape = tuple(shapes[d] for d in dims)
        str_chunks = tuple(chunks[d] for d in dims)
        np_str_array = np.asarray(["BOB"] * shape[0], dtype=object)
        da_str_array = da.from_array(np_str_array, chunks=str_chunks)
        str_array_var = Variable(dims, da_str_array, {})

        datasets.append(Dataset({"DATA": data_var, "NAMES": str_array_var}))
        datas.append(data)
        names.extend(np_str_array.tolist())

    freq = da.linspace(.856e9, 2 * .856e9, 64, chunks=16)
    sub_datasets = [Dataset({"FREQ": (("row", "chan"), freq[None, :])})]

    # Write the data to new tables
    table_name = os.path.join(str(tmp_path), 'test.table')
    writes = write_datasets(table_name, datasets, ["DATA", "NAMES"])
    subt_writes = write_datasets(table_name + "::SPW", sub_datasets, ["FREQ"])
    dask.compute(writes, subt_writes)

    # Check written data
    with pt.table(table_name, readonly=True, lockoptions='auto',
                  ack=False) as T:
        assert row_sum == T.nrows()
        assert_array_equal(T.getcol("DATA"), np.concatenate(datas))
        assert_array_equal(T.getcol("NAMES"), names)

    # Sub-table correctly linked and populated
    with pt.table(table_name + "::SPW",
                  readonly=True,
                  lockoptions='auto',
                  ack=False) as T:
        assert T.nrows() == 1
        assert_array_equal(T.getcol("FREQ")[0], freq)
Example #2
def test_dataset_add_column(ms, dtype):
    datasets = read_datasets(ms, [], [], [])
    assert len(datasets) == 1
    ds = datasets[0]

    # Create the dask array
    bitflag = da.zeros_like(ds.DATA.data, dtype=dtype)
    # Assign keyword attribute
    col_kw = {
        "BITFLAG": {
            'FLAGSETS': 'legacy,cubical',
            'FLAGSET_legacy': 1,
            'FLAGSET_cubical': 2
        }
    }
    # Assign variable onto the dataset
    nds = ds.assign(BITFLAG=(("row", "chan", "corr"), bitflag))
    writes = write_datasets(ms,
                            nds, ["BITFLAG"],
                            descriptor='ratt_ms',
                            column_keywords=col_kw)

    dask.compute(writes)

    del datasets, ds, writes, nds
    assert_liveness(0, 0)

    with pt.table(ms, readonly=False, ack=False, lockoptions='auto') as T:
        bf = T.getcol("BITFLAG")
        assert T.getcoldesc("BITFLAG")['keywords'] == col_kw['BITFLAG']
        assert bf.dtype == dtype
Example #3
def test_dataset_updates(ms, select_cols,
                         group_cols, index_cols,
                         shapes, chunks):
    """ Test dataset writes """

    # Get original STATE_ID and DATA
    with pt.table(ms, ack=False, readonly=True, lockoptions='auto') as T:
        original_state_id = T.getcol("STATE_ID")
        original_data = T.getcol("DATA")

    try:
        datasets = read_datasets(ms, select_cols, group_cols,
                                 index_cols, chunks=chunks)
        assert_liveness(2, 1)

        # Test writes
        writes = []
        states = []
        datas = []

        # Create write operations and execute them
        for i, ds in enumerate(datasets):
            state_var = (("row",), ds.STATE_ID.data + 1)
            data_var = (("row", "chan", "corr"), ds.DATA.data + 1, {})
            states.append(state_var[1])
            datas.append(data_var[1])
            new_ds = ds.assign(STATE_ID=state_var, DATA=data_var)
            writes.append(write_datasets(ms, new_ds, ["STATE_ID", "DATA"]))

        _, states, datas = dask.compute(writes, states, datas)

        # NOTE(sjperkins)
        # Interesting behaviour here. If these objects are not
        # cleared up at this point, attempts to re-open the table below
        # can fail, reproducing https://github.com/ska-sa/dask-ms/issues/26
        # Adding auto-locking to the table opening command seems to fix
        # this somehow
        del ds, new_ds, datasets, writes, state_var, data_var
        assert_liveness(0, 0)

        datasets = read_datasets(ms, select_cols, group_cols,
                                 index_cols, chunks=chunks)

        for i, (ds, state, data) in enumerate(zip(datasets, states, datas)):
            assert_array_equal(ds.STATE_ID.data, state)
            assert_array_equal(ds.DATA.data, data)

        del ds, datasets
        assert_liveness(0, 0)
    finally:
        # Restore original STATE_ID
        with pt.table(ms, ack=False, readonly=False, lockoptions='auto') as T:
            state_id = T.getcol("STATE_ID")
            data = T.getcol("DATA")
            T.putcol("STATE_ID", original_state_id)
            T.putcol("DATA", original_data)

    # Compare against expected result
    assert_array_equal(original_state_id + 1, state_id)
    assert_array_equal(original_data + 1, data)
Example #4
def test_write_dict_data(tmp_path, chunks, dtype):
    rs = np.random.RandomState(42)
    row_sum = 0

    def _vis_factory(chan, corr):
        # Variably sized-channels per row, as in BDA data
        nchan = rs.randint(chan)
        return (rs.normal(size=(1, nchan, corr)) +
                rs.normal(size=(1, nchan, corr))*1j)

    shapes = {k: sum(c) for k, c in chunks.items()}
    row_sum += shapes['row']

    # assert len(chunks['chan']) == 1
    assert len(chunks['corr']) == 1

    # Make some visibilities
    dims = ("row", "chan", "corr")
    row, chan, corr = (shapes[d] for d in dims)
    name = "vis-data-" + uuid.uuid4().hex

    nchunks = (len(chunks[d]) for d in dims)
    keys = product((name,), *(range(c) for c in nchunks))
    chunk_sizes = product(*(chunks[d] for d in dims))

    layer = {k: {'r%d' % (i + 1): _vis_factory(chan, corr)
                 for i in range(r)}
             for k, (r, _, _) in zip(keys, chunk_sizes)}

    hlg = HighLevelGraph.from_collections(name, layer, [])
    chunks = tuple(chunks[d] for d in dims)
    meta = np.empty((0,)*len(chunks), dtype=np.complex128)
    vis = da.Array(hlg, name, chunks, meta=meta)
    ds = Dataset({"DATA": (dims, vis)})

    table_name = os.path.join(str(tmp_path), 'test.table')
    writes, table_proxy = write_datasets(table_name, ds, ["DATA"],
                                         table_proxy=True,
                                         # No fixed shape columns
                                         descriptor="ms(False)")

    dask.compute(writes)

    data = table_proxy.getvarcol("DATA").result()

    # First row chunk
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r1'], data['r1'])
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r2'], data['r2'])
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r3'], data['r3'])
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r4'], data['r4'])
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r5'], data['r5'])

    # Second row chunk
    assert_array_almost_equal(layer[(name, 1, 0, 0)]['r1'], data['r6'])
    assert_array_almost_equal(layer[(name, 1, 0, 0)]['r2'], data['r7'])
    assert_array_almost_equal(layer[(name, 1, 0, 0)]['r3'], data['r8'])

    # Third row chunk
    assert_array_almost_equal(layer[(name, 2, 0, 0)]['r1'], data['r9'])
    assert_array_almost_equal(layer[(name, 2, 0, 0)]['r2'], data['r10'])
Example #5
def test_stress(big_ms, iterations, chunks):
    datasets = read_datasets(big_ms, ["TIME", "DATA"],
                             ["FIELD_ID", "DATA_DESC_ID"], [],
                             chunks=chunks)

    assert len(datasets) == 1
    ds = datasets[0]

    writes = []

    for i in range(iterations):
        nds = ds.assign(TIME=(("row", ), ds.TIME.data + i),
                        DATA=(("row", "chan", "corr"), ds.DATA.data + i))
        writes.append(write_datasets(big_ms, nds, ["TIME", "DATA"]))

    dask.compute(writes)
Example #6
def test_antenna_table_string_names(ant_table, wsrt_antenna_positions):
    ds = read_datasets(ant_table, [], [], None)
    assert len(ds) == 1
    ds = ds[0]

    names = ["ANTENNA-%d" % i for i in range(wsrt_antenna_positions.shape[0])]

    assert_array_equal(ds.POSITION.data, wsrt_antenna_positions)
    assert_array_equal(ds.NAME.data, names)

    names = ds.NAME.data.compute()

    # Test that writing back string ndarrays works, as
    # they must be converted from ndarrays to lists
    # of strings internally
    write_cols = set(ds.data_vars.keys()) - set(["ROWID"])
    writes = write_datasets(ant_table, ds, write_cols)

    dask.compute(writes)
Example #7
def test_dataset_add_string_column(ms):
    datasets = read_datasets(ms, [], [], [])
    assert len(datasets) == 1
    ds = datasets[0]
    dims = ds.dims

    name_list = ["BOB"] * dims['row']
    names = np.asarray(name_list, dtype=object)
    names = da.from_array(names, chunks=ds.TIME.chunks)

    nds = ds.assign(NAMES=(("row", ), names))

    writes = write_datasets(ms, nds, ["NAMES"])
    dask.compute(writes)

    del datasets, ds, writes, nds
    assert_liveness(0, 0)

    with pt.table(ms, readonly=False, ack=False, lockoptions='auto') as T:
        assert name_list == T.getcol("NAMES")
Example #8
def test_dataset_multidim_string_column(tmp_path, chunks):
    row = sum(chunks['row'])

    name_list = [["X-%d" % i, "Y-%d" % i, "Z-%d" % i] for i in range(row)]
    np_names = np.array(name_list, dtype=object)
    names = da.from_array(np_names, chunks=(chunks['row'], np_names.shape[1]))

    ds = Dataset({"POLARIZATION_TYPE": (("row", "xyz"), names)})
    table_name = str(tmp_path / "test.table")
    writes = write_datasets(table_name, ds, ["POLARIZATION_TYPE"])
    dask.compute(writes)

    del writes
    assert_liveness(0, 0)

    datasets = read_datasets(table_name, [], [], [],
                             chunks={'row': chunks['row']})
    assert len(datasets) == 1
    assert_array_equal(datasets[0].POLARIZATION_TYPE.data, np_names)
    del datasets
    assert_liveness(0, 0)
Example #9
def xds_to_table(xds,
                 table_name,
                 columns="ALL",
                 descriptor=None,
                 table_keywords=None,
                 column_keywords=None,
                 table_proxy=False):
    """
    Generates a list of Datasets representing write operations from the
    specified arrays in :class:`xarray.Dataset`'s into
    the CASA table columns specified by ``table_name`` and ``columns``.
    This is a lazy operation -- it is only executed when a :meth:`dask.compute`
    or :meth:`xarray.Dataset.compute` method is called.

    Parameters
    ----------
    xds : :class:`xarray.Dataset` or list of :class:`xarray.Dataset`
        dataset(s) containing the specified columns. If a list of datasets
        is provided, the concatenation of the columns in
        sequential datasets will be written.

    table_name : str
        CASA table path

    columns : tuple or list or "ALL"
        list of column names to write to the table.

        "ALL" is a special marker which specifies that all columns
        should be written. If you wish to write an "ALL" array to
        a column, use :code:`columns=['ALL']`

    descriptor : None or \
        :class:`~daskms.descriptors.builder.AbstractBuilderFactory` or \
        str

        A class describing how CASA table descriptors and data managers
        are constructed. Some defaults, such as `ms` and `ms_subtable`,
        are available.

        If None, defaults are used.

    table_keywords : dict, optional
        Dictionary of table keywords to add to existing keywords.
        The operation is performed immediately, not lazily.

    column_keywords : dict, optional
        Dictionary of :code:`{column: keywords}` to add to existing
        column keywords.
        The operation is performed immediately, not lazily.

    table_proxy : {False, True}
        If True, also returns the table proxy

    Returns
    -------
    write_datasets : list of :class:`xarray.Dataset`
        Datasets containing arrays representing write operations
        into a CASA Table
    table_proxy : :class:`daskms.TableProxy`, optional
        The Table Proxy associated with the datasets
    """

    # Promote dataset to a list
    if not isinstance(xds, (tuple, list)):
        xds = [xds]

    if not isinstance(columns, (tuple, list)):
        if columns != "ALL":
            columns = [columns]

    # Write the datasets
    out_ds = write_datasets(table_name,
                            xds,
                            columns,
                            descriptor=descriptor,
                            table_keywords=table_keywords,
                            column_keywords=column_keywords,
                            table_proxy=table_proxy)

    # Unpack table proxy if it was requested
    if table_proxy is True:
        assert isinstance(out_ds, tuple)
        out_ds, tp = out_ds
        assert isinstance(tp, TableProxy)
    else:
        tp = None

    # Repack the Table Proxy
    if table_proxy is True:
        return out_ds, tp

    return out_ds
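
A minimal usage sketch of the API above, shown for reference. It assumes daskms exports Dataset and xds_to_table at the package level; the table path and the EXAMPLE_DATA column name are illustrative and not taken from the examples above.

import dask
import dask.array as da

from daskms import Dataset, xds_to_table  # import locations assumed

# Build a small dataset backed by a dask array
vis = da.zeros((10, 16, 4), chunks=(5, 16, 4), dtype="complex64")
ds = Dataset({"EXAMPLE_DATA": (("row", "chan", "corr"), vis)})

# Lazily construct the write operations, then execute them
writes = xds_to_table(ds, "example.table", ["EXAMPLE_DATA"])
dask.compute(writes)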
Example #10
File: dask_ms.py  Project: smasoka/dask-ms
def xds_to_table(xds,
                 table_name,
                 columns,
                 descriptor=None,
                 table_keywords=None,
                 column_keywords=None,
                 table_proxy=False):
    """
    Generates a list of Datasets representing write operations from the
    specified arrays in :class:`xarray.Dataset`'s into
    the CASA table columns specified by ``table_name`` and ``columns``.
    This is a lazy operation -- it is only executed when a :meth:`dask.compute`
    or :meth:`xarray.Dataset.compute` method is called.

    Parameters
    ----------
    xds : :class:`xarray.Dataset` or list of :class:`xarray.Dataset`
        dataset(s) containing the specified columns. If a list of datasets
        is provided, the concatenation of the columns in
        sequential datasets will be written.

    table_name : str
        CASA table path

    columns : tuple or list or "ALL"
        list of column names to write to the table.

        "ALL" is a special marker which specifies that all columns
        should be written. If you wish to write an "ALL" array to
        a column, use :code:`columns=['ALL']`

    descriptor : None or \
        :class:`~daskms.descriptors.builder.AbstractBuilderFactory` or \
        str

        A class describing how CASA table descriptors and data managers
        are constructed. Some defaults, such as `ms` and `ms_subtable`,
        are available.

        If None, defaults are used.

    table_keywords : dict, optional
        Dictionary of table keywords to add to existing keywords.
        The operation is performed immediately, not lazily.

    column_keywords : dict, optional
        Dictionary of :code:`{column: keywords}` to add to existing
        column keywords.
        The operation is performed immediately, not lazily.

    table_proxy : {False, True}
        If True, also returns the table proxy

    Returns
    -------
    write_datasets : list of :class:`xarray.Dataset`
        Datasets containing arrays representing write operations
        into a CASA Table
    table_proxy : :class:`daskms.TableProxy`, optional
        The Table Proxy associated with the datasets
    """

    # Promote dataset to a list
    if not isinstance(xds, (tuple, list)):
        xds = [xds]

    if not isinstance(columns, (tuple, list)):
        if columns != "ALL":
            columns = [columns]

    datasets = []

    # No xarray available, assume dask datasets
    if xr is None:
        datasets = xds
    else:
        for ds in xds:
            if isinstance(ds, Dataset):
                # Already a dask dataset
                datasets.append(ds)
            elif isinstance(ds, xr.Dataset):
                # Produce a list of internal variable and dataset types
                # from the xarray Dataset
                variables = {
                    k: (v.dims, v.data, v.attrs)
                    for k, v in ds.data_vars.items()
                }

                coords = {
                    k: (v.dims, v.data, v.attrs)
                    for k, v in ds.coords.items()
                }

                dds = Dataset(variables, attrs=ds.attrs, coords=coords)
                datasets.append(dds)
            else:
                raise TypeError("Invalid Dataset type '%s'" % type(ds))

    # Write the datasets
    out_ds = write_datasets(table_name,
                            datasets,
                            columns,
                            descriptor=descriptor,
                            table_keywords=table_keywords,
                            column_keywords=column_keywords,
                            table_proxy=table_proxy)

    # No xarray available, assume dask datasets
    if xr is None:
        return out_ds

    # Unpack table proxy if it was requested
    if table_proxy is True:
        assert isinstance(out_ds, tuple)
        out_ds, tp = out_ds
        assert isinstance(tp, TableProxy)
    else:
        tp = None

    if isinstance(out_ds, Dataset):
        out_ds = [out_ds]
    elif isinstance(out_ds, (tuple, list)):
        pass
    else:
        raise TypeError("Invalid Dataset type '%s'" % type(out_ds))

    xformed_out_ds = []

    for ds in out_ds:
        assert isinstance(ds, Dataset)

        variables = {
            k: (v.dims, v.data, v.attrs)
            for k, v in ds.data_vars.items()
        }

        coords = {k: (v.dims, v.data, v.attrs) for k, v in ds.coords.items()}

        xformed_out_ds.append(
            xr.Dataset(variables, coords=coords, attrs=ds.attrs))

    if len(xformed_out_ds) == 0:
        return xr.Dataset()
    elif len(xformed_out_ds) == 1:
        xformed_out_ds = xformed_out_ds[0]

    # Repack the Table Proxy
    if table_proxy is True:
        return xformed_out_ds, tp

    return xformed_out_ds
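
A brief sketch of the xarray path through this variant, assuming xarray is installed and that xds_to_table is importable from daskms; the table path and the FREQ column are illustrative.

import dask
import dask.array as da
import xarray as xr

from daskms import xds_to_table  # import location assumed

# An xarray Dataset whose variables are backed by dask arrays
freq = da.linspace(.856e9, 2 * .856e9, 64, chunks=16)
xds = xr.Dataset({"FREQ": (("row", "chan"), freq[None, :])})

# The xarray Dataset is converted to the internal Dataset type, the lazy
# write operations are built, and the result is converted back to xarray
writes = xds_to_table(xds, "example.table", ["FREQ"])
dask.compute(writes)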