def test_row_ordering_multiple_groups(ms, group_cols, index_cols, chunks):
    group_taql = group_ordering_taql(table_proxy(ms), group_cols, index_cols)
    assert_liveness(2, 1)
    orders = group_row_ordering(group_taql, group_cols, index_cols, chunks)
    assert_liveness(2, 1)

    first_rows = group_taql.getcol("__firstrow__").result()
    assert_liveness(2, 1)

    # We get two groups out
    assert len(orders) == len(first_rows) == 2
    assert_array_equal(first_rows, [0, 7])

    rowid_arrays = tuple(o[0] for o in orders)
    rowids = dask.compute(rowid_arrays)[0]

    # Check the two resulting groups

    # Normalise chunks to match those of the output array
    row_chunks = chunks[0]['row']
    expected_chunks = da.core.normalize_chunks(row_chunks, (7,))
    assert_array_equal(rowids[0], [6, 5, 4, 3, 2, 1, 0])
    assert rowid_arrays[0].chunks == expected_chunks

    # If chunks were only supplied for the first group, re-use its chunking
    row_chunks = chunks[0]['row'] if len(chunks) == 1 else chunks[1]['row']
    expected_chunks = da.core.normalize_chunks(row_chunks, (3,))
    assert_array_equal(rowids[1], [9, 8, 7])
    assert rowid_arrays[1].chunks == expected_chunks

    del first_rows, orders, rowid_arrays, group_taql
    assert_liveness(0, 0)
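
# A minimal illustration (not part of the original suite) of the
# da.core.normalize_chunks call the test above relies on: it expands a
# chunk specification into explicit per-dimension chunk tuples for a
# given shape.
def _normalize_chunks_example():
    import dask.array as da

    # A scalar chunk size of 2 over 7 rows normalises to (2, 2, 2, 1)
    assert da.core.normalize_chunks(2, (7,)) == ((2, 2, 2, 1),)
    # An explicit 1-D chunk tuple is validated and passed through
    assert da.core.normalize_chunks((3, 4), (7,)) == ((3, 4),)
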
def test_ordering_multiple_groups(ms, group_cols, index_cols):
    group_taql = group_ordering_taql(table_proxy(ms), group_cols, index_cols)
    assert_liveness(2, 1)
    orders = group_row_ordering(group_taql, group_cols,
                                index_cols, [{'row': 2}])
    assert_liveness(2, 1)

    first_rows = group_taql.getcol("__firstrow__").result()
    assert_liveness(2, 1)

    assert len(first_rows) == len(orders) == 6
    assert_array_equal(first_rows, [0, 1, 3, 4, 7, 8])

    rowid_arrays = tuple(o[0] for o in orders)
    rowids = dask.compute(rowid_arrays)[0]

    assert_array_equal(rowids[0], [2, 0])
    assert_array_equal(rowids[1], [1])
    assert_array_equal(rowids[2], [5, 3])
    assert_array_equal(rowids[3], [6, 4])
    assert_array_equal(rowids[4], [9, 7])
    assert_array_equal(rowids[5], [8])

    del first_rows, orders, rowid_arrays, group_taql
    assert_liveness(0, 0)
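
# A small sketch (illustrative, not part of the original suite) of the
# dask.compute pattern used in these tests: computing a tuple of dask
# arrays in one call evaluates them in a single pass and returns one
# result per positional argument, hence the trailing [0].
def _compute_tuple_example():
    import dask
    import dask.array as da

    arrays = tuple(da.arange(n, chunks=2) for n in (3, 5))
    results = dask.compute(arrays)[0]
    assert tuple(r.shape for r in results) == ((3,), (5,))
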
def datasets(self):
    table_proxy = self._table_proxy()

    # No grouping case
    if len(self.group_cols) == 0:
        order_taql = ordering_taql(table_proxy, self.index_cols,
                                   self.taql_where)
        orders = row_ordering(order_taql, self.index_cols, self.chunks[0])
        datasets = [self._single_dataset(orders)]
    # Group by row
    elif len(self.group_cols) == 1 and self.group_cols[0] == "__row__":
        order_taql = ordering_taql(table_proxy, self.index_cols,
                                   self.taql_where)
        sorted_rows, row_runs = row_ordering(
            order_taql,
            self.index_cols,
            # chunk ordering on each row
            dict(self.chunks[0], row=1))

        # Produce a dataset for each chunk (block),
        # each containing a single row
        row_blocks = sorted_rows.blocks
        run_blocks = row_runs.blocks

        # Exemplar rows actually correspond to the sorted rows.
        # We reify them here so that they can be assigned to each
        # dataset as an attribute
        np_sorted_row = sorted_rows.compute()

        datasets = [
            self._single_dataset((row_blocks[r], run_blocks[r]),
                                 exemplar_row=er)
            for r, er in enumerate(np_sorted_row)
        ]
    # Grouping column case
    else:
        order_taql = group_ordering_taql(table_proxy, self.group_cols,
                                         self.index_cols, self.taql_where)
        orders = group_row_ordering(order_taql, self.group_cols,
                                    self.index_cols, self.chunks)

        groups = [order_taql.getcol(g).result() for g in self.group_cols]
        exemplar_rows = order_taql.getcol("__firstrow__").result()
        assert len(orders) == len(exemplar_rows)

        datasets = self._group_datasets(groups, exemplar_rows, orders)

    ret = (datasets,)

    if self.table_keywords is True:
        ret += (table_proxy.getkeywords().result(),)

    if self.column_keywords is True:
        keywords = table_proxy.submit(_col_keyword_getter, READLOCK)
        ret += (keywords.result(),)

    if len(ret) == 1:
        return ret[0]

    return ret
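
# A hypothetical usage sketch (the factory name and constructor
# arguments below are illustrative, not verbatim from this module):
# with neither keyword flag set, datasets() returns just the dataset
# list, while each truthy flag appends a further element to the result.
#
#   factory = DatasetFactory(ms, select_cols, group_cols, index_cols,
#                            chunks=[{'row': 1000}])
#   datasets = factory.datasets()
#
#   factory = DatasetFactory(ms, select_cols, group_cols, index_cols,
#                            chunks=[{'row': 1000}],
#                            table_keywords=True, column_keywords=True)
#   datasets, table_kw, column_kw = factory.datasets()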