def test_multiprocess_table(ms, nprocs):
    import time
    import threading

    import dask.threaded as dt

    # Don't fork threads
    # https://rachelbythebay.com/w/2011/06/07/forked/
    # Close and cleanup default dask threadpools
    with dt.pools_lock:
        if dt.default_pool is not None:
            dt.default_pool.close()
            dt.default_pool = None

        for thread in list(dt.pools.keys()):
            for p in dt.pools.pop(thread).values():
                p.close()

    # No TableProxies or Executors (with ThreadPools) live
    assert_liveness(0, 0)

    # Wait for other threads to die
    time.sleep(0.1)

    # Only main thread is alive
    assert len(threading.enumerate()) == 1

    from multiprocessing import Pool

    pool = Pool(nprocs)

    try:
        args = [(ms, i) for i in range(nprocs)]
        assert all(pool.map(_proc_map_fn, args))
    finally:
        pool.close()
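# A minimal sketch of the kind of worker that test_multiprocess_table above
# maps over the process pool. This is a hypothetical illustration, not the
# module's actual `_proc_map_fn`, which is defined elsewhere: it simply
# re-opens the Measurement Set inside the child process and returns a truthy
# value for the parent's `all(...)` assertion.
def _example_proc_map_fn(args):
    ms, i = args
    # Open the table in the child process (nothing is forked from the parent)
    with pt.table(ms, ack=False, readonly=True) as T:
        return T.nrows() > 0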
def test_dataset_add_column(ms, dtype):
    datasets = read_datasets(ms, [], [], [])
    assert len(datasets) == 1
    ds = datasets[0]

    # Create the dask array
    bitflag = da.zeros_like(ds.DATA.data, dtype=dtype)

    # Assign keyword attributes
    col_kw = {
        "BITFLAG": {
            'FLAGSETS': 'legacy,cubical',
            'FLAGSET_legacy': 1,
            'FLAGSET_cubical': 2,
        }
    }

    # Assign variable onto the dataset
    nds = ds.assign(BITFLAG=(("row", "chan", "corr"), bitflag))

    writes = write_datasets(ms, nds, ["BITFLAG"],
                            descriptor='ratt_ms',
                            column_keywords=col_kw)

    dask.compute(writes)

    del datasets, ds, writes, nds
    assert_liveness(0, 0)

    with pt.table(ms, readonly=False, ack=False, lockoptions='auto') as T:
        bf = T.getcol("BITFLAG")
        assert T.getcoldesc("BITFLAG")['keywords'] == col_kw['BITFLAG']
        assert bf.dtype == dtype
def test_embedding_table_proxy_in_taql(ms, reverse):
    """ Test using a TableProxy to create a TAQL TableProxy """
    proxy = TableProxy(pt.table, ms, ack=False, readonly=True)
    query = "SELECT UNIQUE ANTENNA1 FROM $1"
    taql_proxy = TableProxy(taql_factory, query, tables=[proxy])
    assert_array_equal(taql_proxy.getcol("ANTENNA1").result(), [0, 1, 2])

    # TAQL and original table
    assert_liveness(2, 1)

    if reverse:
        del proxy
        # TAQL still references the original table
        assert_liveness(2, 1)

        # Removing TAQL now results in everything clearing up
        del taql_proxy
        assert_liveness(0, 0)
    else:
        # Removing TAQL should leave the original table
        del taql_proxy
        assert_liveness(1, 1)

        # Removing the proxy removes the last reference
        del proxy
        assert_liveness(0, 0)
def test_taql_proxy_pickling(ms):
    """ Test taql pickling """
    proxy = TableProxy(pt.taql, f"SELECT UNIQUE ANTENNA1 FROM '{ms}'")
    proxy2 = pickle.loads(pickle.dumps(proxy))
    assert_liveness(1, 1)

    assert proxy is proxy2
    assert tokenize(proxy) == tokenize(proxy2)

    del proxy, proxy2
    assert_liveness(0, 0)
def test_multiprocess_table(ms, nprocs):
    # Check here so that we don't fork threads
    # https://rachelbythebay.com/w/2011/06/07/forked/
    assert_liveness(0, 0)

    from multiprocessing import Pool

    pool = Pool(nprocs)

    try:
        args = [(ms, i) for i in range(nprocs)]
        assert all(pool.map(_proc_map_fn, args))
    finally:
        pool.close()
def test_table_proxy_pickling(ms):
    """ Test table pickling """
    proxy = TableProxy(pt.table, ms, ack=False, readonly=False)
    proxy2 = pickle.loads(pickle.dumps(proxy))
    assert_liveness(1, 1)

    # Same object and tokens
    assert proxy is proxy2
    assert tokenize(proxy) == tokenize(proxy2)

    del proxy, proxy2
    assert_liveness(0, 0)
def test_row_grouping(spw_table, spw_chan_freqs, chunks):
    """ Test grouping on single rows """
    datasets = read_datasets(spw_table, [], ["__row__"], [], chunks=chunks)
    assert_liveness(2, 1)

    assert len(datasets) == len(spw_chan_freqs)

    for i, chan_freq in enumerate(spw_chan_freqs):
        assert_array_equal(datasets[i].CHAN_FREQ.data[0], chan_freq)
        assert_array_equal(datasets[i].NUM_CHAN.data[0], chan_freq.shape[0])

    del datasets
    assert_liveness(0, 0)
def test_ordering_multiple_groups(ms, group_cols, index_cols):
    group_taql = group_ordering_taql(table_proxy(ms), group_cols, index_cols)
    assert_liveness(2, 1)

    orders = group_row_ordering(group_taql, group_cols, index_cols,
                                [{'row': 2}])
    assert_liveness(2, 1)

    first_rows = group_taql.getcol("__firstrow__").result()
    assert_liveness(2, 1)

    assert len(first_rows) == len(orders) == 6
    assert_array_equal(first_rows, [0, 1, 3, 4, 7, 8])

    rowid_arrays = tuple(o[0] for o in orders)
    rowids = dask.compute(rowid_arrays)[0]

    assert_array_equal(rowids[0], [2, 0])
    assert_array_equal(rowids[1], [1])
    assert_array_equal(rowids[2], [5, 3])
    assert_array_equal(rowids[3], [6, 4])
    assert_array_equal(rowids[4], [9, 7])
    assert_array_equal(rowids[5], [8])

    del first_rows, orders, rowid_arrays, group_taql
    assert_liveness(0, 0)
def test_row_ordering_multiple_groups(ms, group_cols, index_cols, chunks):
    group_taql = group_ordering_taql(table_proxy(ms), group_cols, index_cols)
    assert_liveness(2, 1)

    orders = group_row_ordering(group_taql, group_cols, index_cols, chunks)
    assert_liveness(2, 1)

    first_rows = group_taql.getcol("__firstrow__").result()
    assert_liveness(2, 1)

    # We get two groups out
    assert len(orders) == len(first_rows) == 2
    assert_array_equal(first_rows, [0, 7])

    rowid_arrays = tuple(o[0] for o in orders)
    rowids = dask.compute(rowid_arrays)[0]

    # Check the two resulting groups

    # Normalise chunks to match those of the output array
    row_chunks = chunks[0]['row']
    expected_chunks = da.core.normalize_chunks(row_chunks, (7,))
    assert_array_equal(rowids[0], [6, 5, 4, 3, 2, 1, 0])
    assert rowid_arrays[0].chunks == expected_chunks

    # If chunks were only supplied for the first group, reuse its chunking
    row_chunks = chunks[0]['row'] if len(chunks) == 1 else chunks[1]['row']
    expected_chunks = da.core.normalize_chunks(row_chunks, (3,))
    assert_array_equal(rowids[1], [9, 8, 7])
    assert rowid_arrays[1].chunks == expected_chunks

    del first_rows, orders, rowid_arrays, group_taql
    assert_liveness(0, 0)
def test_table_proxy(ms):
    """ Base table proxy test """
    tp = TableProxy(pt.table, ms, ack=False, readonly=False)
    tq = TableProxy(pt.taql, f"SELECT UNIQUE ANTENNA1 FROM '{ms}'")

    assert_liveness(2, 1)

    assert tp.nrows().result() == 10
    assert tq.nrows().result() == 3

    # Different tokens
    assert tokenize(tp) != tokenize(tq)

    del tp, tq
    assert_liveness(0, 0)
def test_dataset_assign(ms): """ Test dataset assignment """ datasets = read_datasets(ms, [], [], []) assert len(datasets) == 1 ds = datasets[0] # Assign on an existing column is easier because we can # infer the dimension schema from it nds = ds.assign(TIME=(ds.TIME.dims, ds.TIME.data + 1)) assert ds.DATA.data is nds.DATA.data assert ds.TIME.data is not nds.TIME.data assert_array_equal(nds.TIME.data, ds.TIME.data + 1) # We have to explicitly supply a dimension schema nds = ds.assign(ANTENNA3=(("row", ), ds.ANTENNA1.data + 3)) assert_array_equal(ds.ANTENNA1.data + 3, nds.ANTENNA3.data) dims = ds.dims chunks = ds.chunks if have_xarray: match = "'row': length 9 on 'ANTENNA4'" else: match = ("Existing dimension size 9 for dimension 'row' " "is inconsistent with same dimension 10 of array ANTENNA4") with pytest.raises(ValueError, match=match): array = da.zeros(dims['row'] - 1, chunks['row']) nds = ds.assign(ANTENNA4=(("row", ), array)) nds.dims assert chunks['row'] == (10, ) if have_xarray: match = "Object has inconsistent chunks along dimension row." else: match = r"chunking \(4, 4, 2\) for dim" with pytest.raises(ValueError, match=match): array = da.zeros(dims['row'], chunks=4) nds = ds.assign(ANTENNA4=(("row", ), array)) nds.chunks del datasets, ds, nds assert_liveness(0, 0)
def test_column_metadata(ms, column, shape, chunks, table_schema, dtype):
    table_proxy = TableProxy(pt.table, ms, readonly=True, ack=False)
    assert_liveness(1, 1)

    try:
        dims = table_schema[column]['dims']
    except KeyError:
        dims = tuple("%s-%d" % (column, i) for i in range(1, len(shape) + 1))

    meta = column_metadata(column, table_proxy, table_schema, dict(chunks))

    assert meta.shape == shape
    assert meta.dims == dims
    assert meta.chunks == [c[1] for c in chunks[:len(meta.shape)]]
    assert meta.dtype == dtype

    del table_proxy
    assert_liveness(0, 0)
def test_dataset(ms, select_cols, group_cols, index_cols, shapes, chunks):
    """ Test dataset creation """
    datasets = read_datasets(ms, select_cols, group_cols, index_cols,
                             chunks=chunks)

    # (1) Read-only TableProxy
    # (2) Read-only TAQL TableProxy
    assert_liveness(2, 1)

    chans = shapes['chan']
    corrs = shapes['corr']

    # Expected output chunks
    echunks = {
        'chan': normalize_chunks(chunks.get('chan', chans),
                                 shape=(chans,))[0],
        'corr': normalize_chunks(chunks.get('corr', corrs),
                                 shape=(corrs,))[0],
    }

    for ds in datasets:
        compute_dict = {}

        for k, v in ds.data_vars.items():
            compute_dict[k] = v.data
            assert v.dtype == v.data.dtype

        res = dask.compute(compute_dict)[0]

        assert res['DATA'].shape[1:] == (chans, corrs)
        assert 'STATE_ID' in res
        assert 'TIME' in res

        chunks = ds.chunks
        assert chunks["chan"] == echunks['chan']
        assert chunks["corr"] == echunks['corr']

        dims = ds.dims
        dims.pop('row')  # row changes
        assert dims == {"chan": shapes['chan'], "corr": shapes['corr']}

    del ds, datasets, compute_dict, v
    assert_liveness(0, 0)
def test_dataset_add_string_column(ms):
    datasets = read_datasets(ms, [], [], [])
    assert len(datasets) == 1
    ds = datasets[0]
    dims = ds.dims

    name_list = ["BOB"] * dims['row']
    # Use the builtin object dtype (np.object was removed in NumPy 1.24)
    names = np.asarray(name_list, dtype=object)
    names = da.from_array(names, chunks=ds.TIME.chunks)

    nds = ds.assign(NAMES=(("row",), names))

    writes = write_datasets(ms, nds, ["NAMES"])
    dask.compute(writes)

    del datasets, ds, writes, nds
    assert_liveness(0, 0)

    with pt.table(ms, readonly=False, ack=False, lockoptions='auto') as T:
        assert name_list == T.getcol("NAMES")
def test_proxy_finalization(tmpdir_factory, epochs, iterations):
    """
    Test that we can create many TableProxy objects associated with
    multiple Executors in multiple threads, get some data, and that they,
    as well as their associated executors, are correctly finalized
    """
    data_path = tmpdir_factory.mktemp('data')
    ascii_desc = data_path.join('ascii.txt')

    with open(str(ascii_desc), 'w') as f:
        f.write(ASCII_TABLE)

    futures = []

    def _getcol(tp, column):
        return tp.result().getcol(column)

    with cf.ThreadPoolExecutor(8) as pool:
        for e in range(epochs):
            # Iterations
            for i in range(iterations):
                path = data_path.join("CASA-%d-%d.table" % (e, i))

                tab_fut = pool.submit(TableProxy, pt.tablefromascii,
                                      str(path), str(ascii_desc),
                                      ack=False,
                                      __executor_key__="epoch-%d" % i)
                data = pool.submit(_getcol, tab_fut, "DATA")
                u = pool.submit(_getcol, tab_fut, "U")

                futures.append(data)
                futures.append(u)

    futures, _ = cf.wait(futures)

    del futures, data, u, tab_fut
    assert_liveness(0, 0)
def test_proxy_dask_embedding(ms):
    """
    Test that an embedded proxy in the graph stays alive
    and dies at the appropriate times
    """
    def _ant1_factory(ms):
        proxy = TableProxy(pt.table, ms, ack=False, readonly=False)
        nrows = proxy.nrows().result()

        name = 'ant1'
        row_chunk = 2
        layers = {}
        chunks = []

        for c, sr in enumerate(range(0, nrows, row_chunk)):
            er = min(sr + row_chunk, nrows)
            chunk_size = er - sr
            chunks.append(chunk_size)
            layers[(name, c)] = (proxy.getcol, "ANTENNA1", sr, chunk_size)

        # Create array
        graph = HighLevelGraph.from_collections(name, layers, [])
        ant1 = da.Array(graph, name, (tuple(chunks),), dtype=np.int32)
        # Evaluate futures
        return ant1.map_blocks(lambda f: f.result(), dtype=ant1.dtype)

    ant1 = _ant1_factory(ms)

    # The proxy and executor are embedded in the graph
    assert_liveness(1, 1)

    a1 = ant1.compute()

    with pt.table(ms, readonly=False, ack=False) as T:
        assert_array_equal(a1, T.getcol("ANTENNA1"))

    # Delete the graph
    del ant1

    # Caches are now clear
    assert_liveness(0, 0)
def test_dataset_multidim_string_column(tmp_path, chunks):
    row = sum(chunks['row'])
    name_list = [["X-%d" % i, "Y-%d" % i, "Z-%d" % i] for i in range(row)]
    # Use the builtin object dtype (np.object was removed in NumPy 1.24)
    np_names = np.array(name_list, dtype=object)
    names = da.from_array(np_names,
                          chunks=(chunks['row'], np_names.shape[1]))

    ds = Dataset({"POLARIZATION_TYPE": (("row", "xyz"), names)})
    table_name = str(tmp_path / "test.table")

    writes = write_datasets(table_name, ds, ["POLARIZATION_TYPE"])
    dask.compute(writes)

    del writes
    assert_liveness(0, 0)

    datasets = read_datasets(table_name, [], [], [],
                             chunks={'row': chunks['row']})
    assert len(datasets) == 1
    assert_array_equal(datasets[0].POLARIZATION_TYPE.data, np_names)

    del datasets
    assert_liveness(0, 0)
def test_dataset_updates(ms, select_cols, group_cols,
                         index_cols, shapes, chunks):
    """ Test dataset writes """

    # Get the original STATE_ID and DATA
    with pt.table(ms, ack=False, readonly=True, lockoptions='auto') as T:
        original_state_id = T.getcol("STATE_ID")
        original_data = T.getcol("DATA")

    try:
        datasets = read_datasets(ms, select_cols, group_cols, index_cols,
                                 chunks=chunks)
        assert_liveness(2, 1)

        # Test writes
        writes = []
        states = []
        datas = []

        # Create write operations and execute them
        for i, ds in enumerate(datasets):
            state_var = (("row",), ds.STATE_ID.data + 1)
            data_var = (("row", "chan", "corr"), ds.DATA.data + 1, {})
            states.append(state_var[1])
            datas.append(data_var[1])
            new_ds = ds.assign(STATE_ID=state_var, DATA=data_var)
            writes.append(write_datasets(ms, new_ds, ["STATE_ID", "DATA"]))

        _, states, datas = dask.compute(writes, states, datas)

        # NOTE(sjperkins)
        # Interesting behaviour here. If these objects are not
        # cleared up at this point, attempts to re-open the table below
        # can fail, reproducing https://github.com/ska-sa/dask-ms/issues/26
        # Adding auto-locking to the table opening command seems to fix
        # this somehow
        del ds, new_ds, datasets, writes, state_var, data_var
        assert_liveness(0, 0)

        datasets = read_datasets(ms, select_cols, group_cols, index_cols,
                                 chunks=chunks)

        for i, (ds, state, data) in enumerate(zip(datasets, states, datas)):
            assert_array_equal(ds.STATE_ID.data, state)
            assert_array_equal(ds.DATA.data, data)

        del ds, datasets
        assert_liveness(0, 0)
    finally:
        # Restore the original STATE_ID and DATA
        with pt.table(ms, ack=False, readonly=False,
                      lockoptions='auto') as T:
            state_id = T.getcol("STATE_ID")
            data = T.getcol("DATA")
            T.putcol("STATE_ID", original_state_id)
            T.putcol("DATA", original_data)

        # Compare against the expected result
        assert_array_equal(original_state_id + 1, state_id)
        assert_array_equal(original_data + 1, data)
def test_dataset_assign(ms): """ Test dataset assignment """ datasets = read_datasets(ms, [], [], []) assert len(datasets) == 1 ds = datasets[0] # Assign on an existing column is easier because we can # infer the dimension schema from it nds = ds.assign(TIME=ds.TIME.data + 1) assert ds.DATA.data is nds.DATA.data assert ds.TIME.data is not nds.TIME.data assert_array_equal(nds.TIME.data, ds.TIME.data + 1) # This doesn't work for new columns with pytest.raises(ValueError, match="Couldn't find existing dimension"): ds.assign(ANTENNA3=ds.ANTENNA1.data + 3) # We have to explicitly supply a dimension schema nds = ds.assign(ANTENNA3=(("row", ), ds.ANTENNA1.data + 3)) assert_array_equal(ds.ANTENNA1.data + 3, nds.ANTENNA3.data) dims = ds.dims chunks = ds.chunks with pytest.raises(ValueError, match="size 9 for dimension 'row'"): array = da.zeros(dims['row'] - 1, chunks['row']) nds = ds.assign(ANTENNA4=(("row", ), array)) nds.dims assert chunks['row'] == (10, ) with pytest.raises(ValueError, match=r"chunking \(4, 4, 2\) for dim"): array = da.zeros(dims['row'], chunks=4) nds = ds.assign(ANTENNA4=(("row", ), array)) nds.chunks del datasets, ds, nds assert_liveness(0, 0)
def test_row_ordering_no_group(ms, index_cols, chunks):
    order_taql = ordering_taql(table_proxy(ms), index_cols)
    assert_liveness(2, 1)

    orders = row_ordering(order_taql, index_cols, chunks)
    assert_liveness(2, 1)

    # Normalise chunks to match that of the output array
    expected_chunks = da.core.normalize_chunks(chunks['row'], (10,))
    assert orders[0].chunks == expected_chunks

    rowids = dask.compute(orders[0])[0]
    assert_array_equal(rowids, [9, 8, 7, 6, 5, 4, 3, 2, 1, 0])

    del orders, order_taql
    assert_liveness(0, 0)