def test_column_metadata(ms, column, shape, chunks, table_schema, dtype):
    """Verify column_metadata reports shape, dims, chunks and dtype."""
    proxy = TableProxy(pt.table, ms, readonly=True, ack=False)
    assert_liveness(1, 1)

    # Fall back to generated dimension names when the schema has none
    try:
        expected_dims = table_schema[column]['dims']
    except KeyError:
        expected_dims = tuple("%s-%d" % (column, d)
                              for d in range(1, len(shape) + 1))

    meta = column_metadata(column, proxy, table_schema, dict(chunks))

    assert meta.shape == shape
    assert meta.dims == expected_dims
    assert meta.chunks == [c[1] for c in chunks[:len(meta.shape)]]
    assert meta.dtype == dtype

    # Dropping the proxy should release the underlying table
    del proxy
    assert_liveness(0, 0)
def _dataset_variable_factory(table_proxy, table_schema, select_cols,
                              exemplar_row, orders, chunks, array_prefix):
    """
    Build a dictionary of dask arrays, one per selected column,
    representing a series of getcols on the appropriate table.

    The result is suitable for inclusion as Dataset variables.

    Parameters
    ----------
    table_proxy : :class:`daskms.table_proxy.TableProxy`
        Proxy wrapping the underlying table.
    table_schema : dict
        Schema describing the table's columns.
    select_cols : list of strings
        Names of the columns to read.
    exemplar_row : int
        Row id that may be used to extract an exemplar array when
        deriving a column's shape and dtype attributes.
    orders : tuple of :class:`dask.array.Array`
        A ``(sorted_rows, row_runs)`` tuple specifying the appropriate
        rows to extract from the table.
    chunks : dict
        Chunking strategy for the dataset.
    array_prefix : str
        Prefix used when naming the generated dask arrays.

    Returns
    -------
    dict
        A dictionary of the form :code:`{column: (dims, array)}`.
    """
    sorted_rows, row_runs = orders
    dataset_vars = {"ROWID": (("row", ), sorted_rows)}

    for column in select_cols:
        try:
            meta = column_metadata(column, table_proxy, table_schema,
                                   chunks, exemplar_row)
        except ColumnMetadataError as e:
            # Attach a traceback only when debug logging is active
            exc_info = logging.DEBUG >= log.getEffectiveLevel()
            log.warning("Ignoring '%s': %s", column, e, exc_info=exc_info)
            continue

        full_dims = ("row", ) + meta.dims
        args = [row_runs, ("row", )]

        # Dimension extent arrays are only needed when some non-row
        # dimension has multiple chunks; with a single chunk everywhere
        # a plain getcol suffices, otherwise getcolslice is required
        if any(len(c) != 1 for c in meta.chunks):
            for dim, dim_chunks in zip(meta.dims, meta.chunks):
                # Describe this dimension's chunk extents
                args.append(dim_extents_array(dim, dim_chunks))
                args.append((dim, ))

            new_axes = {}
        else:
            # No extent arrays carry the new dimensions, so blockwise
            # must be told their sizes explicitly
            new_axes = dict(zip(meta.dims, meta.shape))

        # Remaining positional arguments for the getter
        args.extend([
            table_proxy, None,
            column, None,
            meta.shape, None,
            meta.dtype, None
        ])

        # Unique dask name for this column's array
        token = dask.base.tokenize(args)
        name = "-".join((array_prefix, column, token))

        # Schedule the per-block reads
        dask_array = da.blockwise(getter_wrapper, full_dims,
                                  *args,
                                  name=name,
                                  new_axes=new_axes,
                                  dtype=meta.dtype)

        dask_array = inlined_array(dask_array)

        # Record the variable along with its dimension schema
        dataset_vars[column] = (full_dims, dask_array)

    return dataset_vars