def test_dataset_create_table(tmp_path, dataset_chunks, dtype):
    datasets = []
    names = []
    datas = []
    row_sum = 0

    for chunks in dataset_chunks:
        shapes = {k: sum(c) for k, c in chunks.items()}
        row_sum += shapes['row']

        # Make some visibilities
        dims = ("row", "chan", "corr")
        shape = tuple(shapes[d] for d in dims)
        data_chunks = tuple(chunks[d] for d in dims)
        data = da.random.random(shape, chunks=data_chunks).astype(dtype)
        data_var = Variable(dims, data, {})

        # Make some string names
        dims = ("row",)
        shape = tuple(shapes[d] for d in dims)
        str_chunks = tuple(chunks[d] for d in dims)
        np_str_array = np.asarray(["BOB"] * shape[0], dtype=object)
        da_str_array = da.from_array(np_str_array, chunks=str_chunks)
        str_array_var = Variable(dims, da_str_array, {})

        datasets.append(Dataset({"DATA": data_var, "NAMES": str_array_var}))
        datas.append(data)
        names.extend(np_str_array.tolist())

    freq = da.linspace(.856e9, 2 * .856e9, 64, chunks=16)
    sub_datasets = [Dataset({"FREQ": (("row", "chan"), freq[None, :])})]

    # Write the data to new tables
    table_name = os.path.join(str(tmp_path), 'test.table')
    writes = write_datasets(table_name, datasets, ["DATA", "NAMES"])
    subt_writes = write_datasets(table_name + "::SPW", sub_datasets, ["FREQ"])
    dask.compute(writes, subt_writes)

    # Check written data
    with pt.table(table_name, readonly=True,
                  lockoptions='auto', ack=False) as T:
        assert row_sum == T.nrows()
        assert_array_equal(T.getcol("DATA"), np.concatenate(datas))
        assert_array_equal(T.getcol("NAMES"), names)

    # Sub-table correctly linked and populated
    with pt.table(table_name + "::SPW", readonly=True,
                  lockoptions='auto', ack=False) as T:
        assert T.nrows() == 1
        assert_array_equal(T.getcol("FREQ")[0], freq)
def test_dataset_add_column(ms, dtype):
    datasets = read_datasets(ms, [], [], [])
    assert len(datasets) == 1
    ds = datasets[0]

    # Create the dask array
    bitflag = da.zeros_like(ds.DATA.data, dtype=dtype)

    # Assign keyword attribute
    col_kw = {
        "BITFLAG": {
            'FLAGSETS': 'legacy,cubical',
            'FLAGSET_legacy': 1,
            'FLAGSET_cubical': 2,
        }
    }

    # Assign variable onto the dataset
    nds = ds.assign(BITFLAG=(("row", "chan", "corr"), bitflag))

    writes = write_datasets(ms, nds, ["BITFLAG"],
                            descriptor='ratt_ms',
                            column_keywords=col_kw)
    dask.compute(writes)

    del datasets, ds, writes, nds
    assert_liveness(0, 0)

    with pt.table(ms, readonly=False, ack=False, lockoptions='auto') as T:
        bf = T.getcol("BITFLAG")
        assert T.getcoldesc("BITFLAG")['keywords'] == col_kw['BITFLAG']
        assert bf.dtype == dtype
def test_dataset_updates(ms, select_cols, group_cols,
                         index_cols, shapes, chunks):
    """ Test dataset writes """

    # Get original STATE_ID and DATA
    with pt.table(ms, ack=False, readonly=True, lockoptions='auto') as T:
        original_state_id = T.getcol("STATE_ID")
        original_data = T.getcol("DATA")

    try:
        datasets = read_datasets(ms, select_cols, group_cols,
                                 index_cols, chunks=chunks)
        assert_liveness(2, 1)

        # Test writes
        writes = []
        states = []
        datas = []

        # Create write operations and execute them
        for i, ds in enumerate(datasets):
            state_var = (("row",), ds.STATE_ID.data + 1)
            data_var = (("row", "chan", "corr"), ds.DATA.data + 1, {})
            states.append(state_var[1])
            datas.append(data_var[1])
            new_ds = ds.assign(STATE_ID=state_var, DATA=data_var)
            writes.append(write_datasets(ms, new_ds, ["STATE_ID", "DATA"]))

        _, states, datas = dask.compute(writes, states, datas)

        # NOTE(sjperkins)
        # Interesting behaviour here. If these objects are not
        # cleared up at this point, attempts to re-open the table below
        # can fail, reproducing https://github.com/ska-sa/dask-ms/issues/26
        # Adding auto-locking to the table opening command seems to fix
        # this somehow
        del ds, new_ds, datasets, writes, state_var, data_var
        assert_liveness(0, 0)

        datasets = read_datasets(ms, select_cols, group_cols,
                                 index_cols, chunks=chunks)

        for i, (ds, state, data) in enumerate(zip(datasets, states, datas)):
            assert_array_equal(ds.STATE_ID.data, state)
            assert_array_equal(ds.DATA.data, data)

        del ds, datasets
        assert_liveness(0, 0)
    finally:
        # Restore original STATE_ID and DATA
        with pt.table(ms, ack=False, readonly=False,
                      lockoptions='auto') as T:
            state_id = T.getcol("STATE_ID")
            data = T.getcol("DATA")
            T.putcol("STATE_ID", original_state_id)
            T.putcol("DATA", original_data)

    # Compare against expected result
    assert_array_equal(original_state_id + 1, state_id)
    assert_array_equal(original_data + 1, data)
def test_write_dict_data(tmp_path, chunks, dtype):
    rs = np.random.RandomState(42)
    row_sum = 0

    def _vis_factory(chan, corr):
        # Variably sized-channels per row, as in BDA data
        nchan = rs.randint(chan)
        return (rs.normal(size=(1, nchan, corr)) +
                rs.normal(size=(1, nchan, corr))*1j)

    shapes = {k: sum(c) for k, c in chunks.items()}
    row_sum += shapes['row']

    # assert len(chunks['chan']) == 1
    assert len(chunks['corr']) == 1

    # Make some visibilities
    dims = ("row", "chan", "corr")
    row, chan, corr = (shapes[d] for d in dims)
    name = "vis-data-" + uuid.uuid4().hex
    nchunks = (len(chunks[d]) for d in dims)
    keys = product((name,), *(range(c) for c in nchunks))
    chunk_sizes = product(*(chunks[d] for d in dims))

    layer = {k: {'r%d' % (i + 1): _vis_factory(chan, corr)
                 for i in range(r)}
             for k, (r, _, _) in zip(keys, chunk_sizes)}

    hlg = HighLevelGraph.from_collections(name, layer, [])
    chunks = tuple(chunks[d] for d in dims)
    meta = np.empty((0,)*len(chunks), dtype=np.complex128)
    vis = da.Array(hlg, name, chunks, meta=meta)
    ds = Dataset({"DATA": (dims, vis)})

    table_name = os.path.join(str(tmp_path), 'test.table')

    writes, table_proxy = write_datasets(table_name, ds, ["DATA"],
                                         table_proxy=True,
                                         # No fixed shape columns
                                         descriptor="ms(False)")

    dask.compute(writes)

    data = table_proxy.getvarcol("DATA").result()

    # First row chunk
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r1'], data['r1'])
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r2'], data['r2'])
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r3'], data['r3'])
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r4'], data['r4'])
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r5'], data['r5'])

    # Second row chunk
    assert_array_almost_equal(layer[(name, 1, 0, 0)]['r1'], data['r6'])
    assert_array_almost_equal(layer[(name, 1, 0, 0)]['r2'], data['r7'])
    assert_array_almost_equal(layer[(name, 1, 0, 0)]['r3'], data['r8'])

    # Third row chunk
    assert_array_almost_equal(layer[(name, 2, 0, 0)]['r1'], data['r9'])
    assert_array_almost_equal(layer[(name, 2, 0, 0)]['r2'], data['r10'])
def test_stress(big_ms, iterations, chunks):
    datasets = read_datasets(big_ms, ["TIME", "DATA"],
                             ["FIELD_ID", "DATA_DESC_ID"],
                             [], chunks=chunks)
    assert len(datasets) == 1
    ds = datasets[0]

    writes = []

    for i in range(iterations):
        nds = ds.assign(TIME=(("row",), ds.TIME.data + i),
                        DATA=(("row", "chan", "corr"), ds.DATA.data + i))
        writes.append(write_datasets(big_ms, nds, ["TIME", "DATA"]))

    dask.compute(writes)
def test_antenna_table_string_names(ant_table, wsrt_antenna_positions):
    ds = read_datasets(ant_table, [], [], None)
    assert len(ds) == 1
    ds = ds[0]

    names = ["ANTENNA-%d" % i
             for i in range(wsrt_antenna_positions.shape[0])]

    assert_array_equal(ds.POSITION.data, wsrt_antenna_positions)
    assert_array_equal(ds.NAME.data, names)

    names = ds.NAME.data.compute()

    # Test that writing back string ndarrays works, as
    # they must be converted from ndarrays to lists
    # of strings internally
    write_cols = set(ds.data_vars.keys()) - set(["ROWID"])
    writes = write_datasets(ant_table, ds, write_cols)
    dask.compute(writes)
def test_dataset_add_string_column(ms):
    datasets = read_datasets(ms, [], [], [])
    assert len(datasets) == 1
    ds = datasets[0]
    dims = ds.dims

    name_list = ["BOB"] * dims['row']
    names = np.asarray(name_list, dtype=object)
    names = da.from_array(names, chunks=ds.TIME.chunks)

    nds = ds.assign(NAMES=(("row",), names))

    writes = write_datasets(ms, nds, ["NAMES"])
    dask.compute(writes)

    del datasets, ds, writes, nds
    assert_liveness(0, 0)

    with pt.table(ms, readonly=False, ack=False, lockoptions='auto') as T:
        assert name_list == T.getcol("NAMES")
def test_dataset_multidim_string_column(tmp_path, chunks):
    row = sum(chunks['row'])
    name_list = [["X-%d" % i, "Y-%d" % i, "Z-%d" % i] for i in range(row)]
    np_names = np.array(name_list, dtype=object)
    names = da.from_array(np_names,
                          chunks=(chunks['row'], np_names.shape[1]))
    ds = Dataset({"POLARIZATION_TYPE": (("row", "xyz"), names)})

    table_name = str(tmp_path / "test.table")
    writes = write_datasets(table_name, ds, ["POLARIZATION_TYPE"])
    dask.compute(writes)
    del writes
    assert_liveness(0, 0)

    datasets = read_datasets(table_name, [], [], [],
                             chunks={'row': chunks['row']})
    assert len(datasets) == 1
    assert_array_equal(datasets[0].POLARIZATION_TYPE.data, np_names)
    del datasets
    assert_liveness(0, 0)
def xds_to_table(xds, table_name, columns="ALL",
                 descriptor=None,
                 table_keywords=None,
                 column_keywords=None,
                 table_proxy=False):
    """
    Generates a list of Datasets representing write operations from
    the specified arrays in :class:`xarray.Dataset`'s into the CASA
    table columns specified by ``table_name`` and ``columns``.
    This is a lazy operation -- it is only executed when a
    :meth:`dask.compute` or :meth:`xarray.Dataset.compute`
    method is called.

    Parameters
    ----------
    xds : :class:`xarray.Dataset` or list of :class:`xarray.Dataset`
        dataset(s) containing the specified columns. If a list of datasets
        is provided, the concatenation of the columns in
        sequential datasets will be written.
    table_name : str
        CASA table path
    columns : tuple or list or "ALL"
        list of column names to write to the table.
        "ALL" is a special marker which specifies that all columns
        should be written. If you wish to write an "ALL" array to
        a column, use :code:`columns=['ALL']`
    descriptor : None or \
        :class:`~daskms.descriptors.builder.AbstractBuilderFactory` or \
        str
        A class describing how CASA table descriptors and data managers
        are constructed. Some defaults are available such
        as `ms` and `ms_subtable`.
        If None, defaults are used.
    table_keywords : dict, optional
        Dictionary of table keywords to add to existing keywords.
        The operation is performed immediately, not lazily.
    column_keywords : dict, optional
        Dictionary of :code:`{column: keywords}` to add to existing
        column keywords.
        The operation is performed immediately, not lazily.
    table_proxy : {False, True}
        If True, returns the table_proxy

    Returns
    -------
    write_datasets : list of :class:`xarray.Dataset`
        Datasets containing arrays representing write operations
        into a CASA Table
    table_proxy : :class:`daskms.TableProxy`, optional
        The Table Proxy associated with the datasets
    """
    # Promote dataset to a list
    if not isinstance(xds, (tuple, list)):
        xds = [xds]

    if not isinstance(columns, (tuple, list)):
        if columns != "ALL":
            columns = [columns]

    # Write the datasets
    out_ds = write_datasets(table_name, xds, columns,
                            descriptor=descriptor,
                            table_keywords=table_keywords,
                            column_keywords=column_keywords,
                            table_proxy=table_proxy)

    # Unpack the table proxy if it was requested
    if table_proxy is True:
        assert isinstance(out_ds, tuple)
        out_ds, tp = out_ds
        assert isinstance(tp, TableProxy)
    else:
        tp = None

    # Repack the Table Proxy
    if table_proxy is True:
        return out_ds, tp

    return out_ds
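# A minimal usage sketch for the xds_to_table call above, kept as an uncalled
# helper so it never runs on import. The "example.ms" path, the xds_from_ms
# read and the FLAG update are illustrative assumptions for this sketch; only
# the xds_to_table signature documented above is taken from this module.
def _example_xds_to_table_usage(ms_path="example.ms"):
    import dask
    import dask.array as da

    from daskms import xds_from_ms, xds_to_table

    # Read the main table; by default this yields one dataset per
    # FIELD_ID/DATA_DESC_ID grouping
    datasets = xds_from_ms(ms_path)

    # Flag every visibility by assigning an all-True FLAG column
    updated = [ds.assign(FLAG=(("row", "chan", "corr"),
                               da.ones_like(ds.FLAG.data, dtype=bool)))
               for ds in datasets]

    # Lazily construct the write operations, then execute them
    writes = xds_to_table(updated, ms_path, ["FLAG"])
    dask.compute(writes)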
def xds_to_table(xds, table_name, columns,
                 descriptor=None,
                 table_keywords=None,
                 column_keywords=None,
                 table_proxy=False):
    """
    Generates a list of Datasets representing write operations from
    the specified arrays in :class:`xarray.Dataset`'s into the CASA
    table columns specified by ``table_name`` and ``columns``.
    This is a lazy operation -- it is only executed when a
    :meth:`dask.compute` or :meth:`xarray.Dataset.compute`
    method is called.

    Parameters
    ----------
    xds : :class:`xarray.Dataset` or list of :class:`xarray.Dataset`
        dataset(s) containing the specified columns. If a list of datasets
        is provided, the concatenation of the columns in
        sequential datasets will be written.
    table_name : str
        CASA table path
    columns : tuple or list or "ALL"
        list of column names to write to the table.
        "ALL" is a special marker which specifies that all columns
        should be written. If you wish to write an "ALL" array to
        a column, use :code:`columns=['ALL']`
    descriptor : None or \
        :class:`~daskms.descriptors.builder.AbstractBuilderFactory` or \
        str
        A class describing how CASA table descriptors and data managers
        are constructed. Some defaults are available such
        as `ms` and `ms_subtable`.
        If None, defaults are used.
    table_keywords : dict, optional
        Dictionary of table keywords to add to existing keywords.
        The operation is performed immediately, not lazily.
    column_keywords : dict, optional
        Dictionary of :code:`{column: keywords}` to add to existing
        column keywords.
        The operation is performed immediately, not lazily.
    table_proxy : {False, True}
        If True, returns the table_proxy

    Returns
    -------
    write_datasets : list of :class:`xarray.Dataset`
        Datasets containing arrays representing write operations
        into a CASA Table
    table_proxy : :class:`daskms.TableProxy`, optional
        The Table Proxy associated with the datasets
    """
    # Promote dataset to a list
    if not isinstance(xds, (tuple, list)):
        xds = [xds]

    if not isinstance(columns, (tuple, list)):
        if columns != "ALL":
            columns = [columns]

    datasets = []

    # No xarray available, assume dask datasets
    if xr is None:
        datasets = xds
    else:
        for ds in xds:
            if isinstance(ds, Dataset):
                # Already a dask dataset
                datasets.append(ds)
            elif isinstance(ds, xr.Dataset):
                # Produce a list of internal variable and dataset types
                # from the xarray Dataset
                variables = {k: (v.dims, v.data, v.attrs)
                             for k, v in ds.data_vars.items()}

                coords = {k: (v.dims, v.data, v.attrs)
                          for k, v in ds.coords.items()}

                dds = Dataset(variables, attrs=ds.attrs, coords=coords)
                datasets.append(dds)
            else:
                raise TypeError("Invalid Dataset type '%s'" % type(ds))

    # Write the datasets
    out_ds = write_datasets(table_name, datasets, columns,
                            descriptor=descriptor,
                            table_keywords=table_keywords,
                            column_keywords=column_keywords,
                            table_proxy=table_proxy)

    # No xarray available, assume dask datasets
    if xr is None:
        return out_ds

    # Unpack the table proxy if it was requested
    if table_proxy is True:
        assert isinstance(out_ds, tuple)
        out_ds, tp = out_ds
        assert isinstance(tp, TableProxy)
    else:
        tp = None

    if isinstance(out_ds, Dataset):
        out_ds = [out_ds]
    elif isinstance(out_ds, (tuple, list)):
        pass
    else:
        raise TypeError("Invalid Dataset type '%s'" % type(out_ds))

    xformed_out_ds = []

    for ds in out_ds:
        assert isinstance(ds, Dataset)

        variables = {k: (v.dims, v.data, v.attrs)
                     for k, v in ds.data_vars.items()}

        coords = {k: (v.dims, v.data, v.attrs)
                  for k, v in ds.coords.items()}

        xformed_out_ds.append(xr.Dataset(variables,
                                         coords=coords,
                                         attrs=ds.attrs))

    if len(xformed_out_ds) == 0:
        return xr.Dataset()
    elif len(xformed_out_ds) == 1:
        xformed_out_ds = xformed_out_ds[0]

    # Repack the Table Proxy
    if table_proxy is True:
        return xformed_out_ds, tp

    return xformed_out_ds
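# Hedged sketch of the column_keywords/table_proxy options documented above,
# also kept as an uncalled helper. The BITFLAG column name, the keyword values
# and the int32 dtype are illustrative assumptions, not a prescription from
# this module.
def _example_write_with_column_keywords(datasets, table_path):
    import dask
    import dask.array as da
    import numpy as np

    col_kw = {"BITFLAG": {"FLAGSETS": "legacy", "FLAGSET_legacy": 1}}

    # Add a zeroed BITFLAG column shaped like DATA to each dataset
    updated = [ds.assign(BITFLAG=(("row", "chan", "corr"),
                                  da.zeros_like(ds.DATA.data,
                                                dtype=np.int32)))
               for ds in datasets]

    # table_proxy=True additionally returns the TableProxy used for the write
    writes, proxy = xds_to_table(updated, table_path, ["BITFLAG"],
                                 column_keywords=col_kw,
                                 table_proxy=True)
    dask.compute(writes)
    return proxy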