def __init__(self, table, select_cols, group_cols, index_cols, **kwargs):
    if not table_exists(table):
        raise ValueError("'%s' does not appear to be a CASA Table" % table)

    chunks = kwargs.pop('chunks', [{'row': _DEFAULT_ROW_CHUNKS}])

    # Create or promote chunks to a list of dicts
    if isinstance(chunks, dict):
        chunks = [chunks]
    elif not isinstance(chunks, (tuple, list)):
        raise TypeError("'chunks' must be a dict or sequence of dicts")

    self.canonical_name = table
    self.table_path = str(Path(*table_path_split(table)))
    self.select_cols = select_cols
    self.group_cols = [] if group_cols is None else group_cols
    self.index_cols = [] if index_cols is None else index_cols
    self.chunks = chunks
    self.table_schema = kwargs.pop('table_schema', None)
    self.taql_where = kwargs.pop('taql_where', '')
    self.table_keywords = kwargs.pop('table_keywords', False)
    self.column_keywords = kwargs.pop('column_keywords', False)
    self.table_proxy = kwargs.pop('table_proxy', False)

    if len(kwargs) > 0:
        raise ValueError("Unhandled kwargs: %s" % kwargs)
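
# A minimal sketch (not part of the source) of the chunks promotion above,
# isolated for illustration. `normalise_chunks` is a hypothetical helper;
# the factory accepts either a single dict applied to every group, or a
# sequence of dicts, one per group.
def normalise_chunks(chunks):
    if isinstance(chunks, dict):
        chunks = [chunks]
    elif not isinstance(chunks, (tuple, list)):
        raise TypeError("'chunks' must be a dict or sequence of dicts")
    return chunks


assert normalise_chunks({"row": 10000}) == [{"row": 10000}]
assert normalise_chunks([{"row": 10}, {"row": 20}]) == [{"row": 10}, {"row": 20}]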
def executor_key(table_name):
    """ Produce an executor key from table_name """
    # Remove any path separators
    root, table_name, subtable = table_path_split(table_name)
    return str(Path(root, table_name))
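
# Hedged usage sketch: executor_key discards the subtable component, so a
# subtable shares its parent table's key (and therefore its executor).
# The path below is illustrative only.
assert (executor_key("/data/WSRT.MS::SOURCE")
        == executor_key("/data/WSRT.MS"))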
def test_read_array_names(ms):
    _, short_name, _ = table_path_split(ms)
    datasets = xds_from_ms(ms)

    for ds in datasets:
        for k, v in ds.data_vars.items():
            product = ("~[" + str(ds.FIELD_ID) + "," +
                       str(ds.DATA_DESC_ID) + "]")
            prefix = "".join(("read~", k, product))
            assert key_split(v.data.name) == prefix
def _group_datasets(self, table_proxy, groups, exemplar_rows, orders):
    _, t, s = table_path_split(self.canonical_name)
    short_table_name = '/'.join((t, s)) if s else t

    table_schema = self._table_schema()

    datasets = []
    group_ids = list(zip(*groups))

    assert len(group_ids) == len(orders)

    # Select columns, excluding grouping columns
    select_cols = set(self.select_cols or table_proxy.colnames().result())
    select_cols -= set(self.group_cols)

    # Create a dataset for each group
    it = enumerate(zip(group_ids, exemplar_rows, orders))

    for g, (group_id, exemplar_row, order) in it:
        # Extract group chunks
        try:
            group_chunks = self.chunks[g]   # Get group chunking strategy
        except IndexError:
            group_chunks = self.chunks[-1]  # Re-use last group's chunks

        # Suffix for dataset array names
        gid_str = ",".join(str(gid) for gid in group_id)
        array_suffix = f"[{gid_str}]-{short_table_name}"

        # Create dataset variables
        group_var_dims = _dataset_variable_factory(
            table_proxy, table_schema, select_cols,
            exemplar_row, order, group_chunks, array_suffix)

        # Extract ROWID
        try:
            rowid = group_var_dims.pop("ROWID")
        except KeyError:
            coords = None
        else:
            coords = {"ROWID": rowid}

        # Assign values for the dataset's grouping columns
        # as attributes
        partitions = tuple(
            (c, g.dtype.name) for c, g in zip(self.group_cols, group_id))
        attrs = {DASKMS_PARTITION_KEY: partitions}

        # Use python types which are json serializable
        group_id = [gid.item() for gid in group_id]
        attrs.update(zip(self.group_cols, group_id))

        datasets.append(Dataset(group_var_dims, attrs=attrs, coords=coords))

    return datasets
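
# Hedged sketch of the partition attribute construction above, in isolation.
# DASKMS_PARTITION_KEY is assumed importable from daskms.constants; numpy
# scalars carry the grouping values and .item() converts them to plain
# Python types so the attrs stay JSON serialisable.
import numpy as np

from daskms.constants import DASKMS_PARTITION_KEY

group_cols = ["FIELD_ID", "DATA_DESC_ID"]
group_id = [np.int32(0), np.int32(1)]

partitions = tuple((c, g.dtype.name) for c, g in zip(group_cols, group_id))
assert partitions == (("FIELD_ID", "int32"), ("DATA_DESC_ID", "int32"))

attrs = {DASKMS_PARTITION_KEY: partitions}
attrs.update((c, g.item()) for c, g in zip(group_cols, group_id))
assert attrs["FIELD_ID"] == 0 and type(attrs["FIELD_ID"]) is int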
def infer_table_type(table_name):
    """ Guess the schema from the table name """
    _, table, subtable = table_path_split(table_name)

    if not subtable and table[-3:].upper().endswith(".MS"):
        return "MS"

    if subtable in _SUBTABLE_SCHEMAS:
        return subtable

    return "TABLE"
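
# Hedged usage sketch (paths are illustrative; assumes _SUBTABLE_SCHEMAS
# contains the standard MS subtable names):
assert infer_table_type("/data/WSRT.MS") == "MS"
assert infer_table_type("/data/WSRT.MS::SPECTRAL_WINDOW") == "SPECTRAL_WINDOW"
assert infer_table_type("/data/custom.table") == "TABLE"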
def test_write_array_names(ms, tmp_path):
    _, short_name, _ = table_path_split(ms)
    datasets = xds_from_ms(ms)
    out_table = str(tmp_path / short_name)

    writes = xds_to_table(datasets, out_table, "ALL")

    for ds in writes:
        for k, v in ds.data_vars.items():
            prefix = "".join(("write~", k))
            assert key_split(v.data.name) == prefix
def _create_table(table_name, datasets, columns, descriptor):
    builder = descriptor_builder(table_name, descriptor)
    schemas = [DatasetSchema.from_dataset(ds, columns) for ds in datasets]
    table_desc, dminfo = builder.execute(schemas)

    root, table, subtable = table_path_split(table_name)
    table_path = root / table

    from daskms.descriptors.ms import MSDescriptorBuilder
    from daskms.descriptors.ms_subtable import MSSubTableDescriptorBuilder

    if not subtable and isinstance(builder, MSDescriptorBuilder):
        table_path = str(table_path)

        # Create the MS
        with pt.default_ms(table_path, tabdesc=table_desc, dminfo=dminfo):
            pass

        return _writable_table_proxy(table_path)
    elif subtable:
        # NOTE(sjperkins)
        # Recreate the subtable path with OS separator components
        # This avoids accessing the subtable via the main table
        # (e.g. WSRT.MS::SOURCE)
        # which can cause lock issues as the subtables seemingly
        # inherit the parent table lock
        subtable_path = str(table_path / subtable)

        # Create the subtable
        if isinstance(builder, MSSubTableDescriptorBuilder):
            with pt.default_ms_subtable(subtable, subtable_path,
                                        tabdesc=table_desc, dminfo=dminfo):
                pass
        else:
            with pt.table(subtable_path, table_desc,
                          dminfo=dminfo, ack=False):
                pass

        # Add subtable to the main table
        table_proxy = _writable_table_proxy(str(table_path))
        table_proxy.putkeywords({subtable: "Table: " + subtable_path}).result()
        del table_proxy

        # Return TableProxy
        return _writable_table_proxy(subtable_path)
    else:
        # Create the table
        with pt.table(str(table_path), table_desc, dminfo=dminfo, ack=False):
            pass

        return _writable_table_proxy(str(table_path))
def _group_datasets(self, groups, exemplar_rows, orders):
    _, t, s = table_path_split(self.canonical_name)
    short_table_name = '/'.join((t, s)) if s else t
    table_proxy = self._table_proxy()
    table_schema = self._table_schema()

    datasets = []
    group_ids = list(zip(*groups))

    assert len(group_ids) == len(orders)

    # Select columns, excluding grouping columns
    select_cols = set(self.select_cols or table_proxy.colnames().result())
    select_cols -= set(self.group_cols)

    # Create a dataset for each group
    it = enumerate(zip(group_ids, exemplar_rows, orders))

    for g, (group_id, exemplar_row, order) in it:
        # Extract group chunks
        try:
            group_chunks = self.chunks[g]   # Get group chunking strategy
        except IndexError:
            group_chunks = self.chunks[-1]  # Re-use last group's chunks

        # Prefix dataset array names
        gid_str = ",".join(str(gid) for gid in group_id)
        array_prefix = "%s-[%s]" % (short_table_name, gid_str)

        # Create dataset variables
        group_var_dims = _dataset_variable_factory(
            table_proxy, table_schema, select_cols,
            exemplar_row, order, group_chunks, array_prefix)

        # Extract ROWID
        try:
            rowid = group_var_dims.pop("ROWID")
        except KeyError:
            coords = None
        else:
            coords = {"ROWID": rowid}

        # Assign values for the dataset's grouping columns
        # as attributes
        attrs = dict(zip(self.group_cols, group_id))

        datasets.append(Dataset(group_var_dims, attrs=attrs, coords=coords))

    return datasets
def _single_dataset(self, table_proxy, orders, exemplar_row=0):
    _, t, s = table_path_split(self.canonical_name)
    short_table_name = "/".join((t, s)) if s else t

    table_schema = self._table_schema()
    select_cols = set(self.select_cols or table_proxy.colnames().result())
    variables = _dataset_variable_factory(table_proxy, table_schema,
                                          select_cols, exemplar_row,
                                          orders, self.chunks[0],
                                          short_table_name)

    try:
        rowid = variables.pop("ROWID")
    except KeyError:
        coords = None
    else:
        coords = {"ROWID": rowid}

    return Dataset(variables, coords=coords)
def filename_builder_factory(filename):
    """
    Returns a Table Descriptor Builder based on the filename.

    1. If the filename ends with '.ms' (case insensitive), it's assumed
       a Measurement Set is being created.
    2. If it ends in '::SUBTABLE', where SUBTABLE is a Measurement Set
       sub-table such as ANTENNA or SPECTRAL_WINDOW, it's assumed that
       sub-table is being created.
    3. Otherwise it's assumed a default CASA table is being created.

    Parameters
    ----------
    filename : str
        Table filename

    Returns
    -------
    builder : :class:`daskms.descriptors.builder.AbstractDescriptorBuilder`
        Table Descriptor builder based on the filename
    """
    _, table, subtable = table_path_split(filename)

    # Does this look like an MS?
    if not subtable and table[-3:].upper().endswith('.MS'):
        from daskms.descriptors.ms import MSDescriptorBuilder
        return MSDescriptorBuilder()

    # Perhaps it's an MS subtable?
    if subtable in SUBTABLES:
        from daskms.descriptors.ms_subtable import MSSubTableDescriptorBuilder
        return MSSubTableDescriptorBuilder(subtable)

    # Just a standard CASA Table, I guess
    from daskms.descriptors.builder import DefaultDescriptorBuilder
    return DefaultDescriptorBuilder()
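
# Hedged usage sketch of the dispatch above (filenames are illustrative;
# assumes SUBTABLES contains ANTENNA):
from daskms.descriptors.builder import DefaultDescriptorBuilder
from daskms.descriptors.ms import MSDescriptorBuilder
from daskms.descriptors.ms_subtable import MSSubTableDescriptorBuilder

assert isinstance(filename_builder_factory("obs.ms"), MSDescriptorBuilder)
assert isinstance(filename_builder_factory("obs.ms::ANTENNA"),
                  MSSubTableDescriptorBuilder)
assert isinstance(filename_builder_factory("plain.table"),
                  DefaultDescriptorBuilder)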
def test_table_path_split(path, root, table, subtable):
    assert (root, table, subtable) == table_path_split(path)
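
# Hedged sketch of how the test above might be parametrised -- the real
# cases live in a @pytest.mark.parametrize decorator in the test module;
# the tuples below only illustrate the (root, table, subtable) convention:
#
# @pytest.mark.parametrize("path, root, table, subtable", [
#     ("/data/WSRT.MS", "/data", "WSRT.MS", ""),
#     ("/data/WSRT.MS::SOURCE", "/data", "WSRT.MS", "SOURCE"),
# ])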
def _write_datasets(table, table_proxy, datasets, columns, descriptor,
                    table_keywords, column_keywords):
    _, table_name, subtable = table_path_split(table)
    table_name = '::'.join((table_name, subtable)) if subtable else table_name

    row_orders = []

    # Put table and column keywords
    table_proxy.submit(_put_keywords, WRITELOCK,
                       table_keywords, column_keywords).result()

    # Sort datasets on (not has "ROWID", index) such that
    # datasets with ROWID's are handled first, while
    # those without (which imply appends to the MS)
    # are handled last
    sorted_datasets = sorted(enumerate(datasets),
                             key=lambda t: ("ROWID" not in t[1].data_vars,
                                            t[0]))

    # Establish row orders for each dataset
    for di, ds in sorted_datasets:
        try:
            rowid = ds.ROWID.data
        except AttributeError:
            # Add operation
            # No ROWID's, assume they're missing from the table
            # and remaining datasets. Generate addrows
            # NOTE(sjperkins)
            # This could be somewhat brittle, but exists to
            # update empty MS subtables once they've been
            # created along with the main MS by a call to default_ms.
            # Users could also use it to append rows to an existing table.
            # An xds_append_to_table may be a better solution...
            last_datasets = datasets[di:]
            last_row_orders = add_row_order_factory(table_proxy,
                                                    last_datasets)

            # We don't inline the row ordering if it is derived
            # from the row sizes of provided arrays.
            # The range of possible dependencies is far too large to inline
            row_orders.extend([(False, lro) for lro in last_row_orders])

            # We have established row orders for all datasets
            # at this point, quit the loop
            break
        else:
            # Update operation
            # Generate row orderings from existing row IDs
            row_order = rowid.map_blocks(row_run_factory,
                                         sort_dir="write",
                                         dtype=object)

            # TODO(sjperkins)
            # There's an assumption here that rowid is an
            # operation with minimal dependencies
            # (i.e. derived from xds_from_{ms, table})
            # Caching flattens the graph into a single layer
            if len(row_order.__dask_graph__().layers) > 1:
                log.warning("Caching an update row ordering "
                            "with more than one layer")

            row_order = cached_array(row_order)

            # Inline the row ordering in the graph
            row_orders.append((True, row_order))

    assert len(row_orders) == len(datasets)

    datasets = []

    for (di, ds), (inline, row_order) in zip(sorted_datasets, row_orders):
        # Hold the variables representing array writes
        write_vars = {}

        # Generate a dask array for each column
        for column in columns:
            try:
                variable = ds.data_vars[column]
            except KeyError:
                log.warning("Ignoring '%s' not present "
                            "on dataset %d" % (column, di))
                continue
            else:
                full_dims = variable.dims
                array = variable.data

            if not isinstance(array, da.Array):
                raise TypeError("%s on dataset %d is not a dask Array "
                                "but a %s" % (column, di, type(array)))

            args = [row_order, ("row",)]

            # We only need to pass in dimension extent arrays if
            # there is more than one chunk in any of the non-row columns.
            # In that case putcolslice is required; otherwise putcol suffices
            if not all(len(c) == 1 for c in array.chunks[1:]):
                # Add extent arrays
                for d, c in zip(full_dims[1:], array.chunks[1:]):
                    args.append(dim_extents_array(d, c))
                    args.append((d,))

            # Add other variables
            args.extend([table_proxy, None, column, None, array, full_dims])

            # Name of the dask array representing this column
            token = dask.base.tokenize(di, args)
            name = "-".join((table_name, 'write', column, token))

            write_col = da.blockwise(
                putter_wrapper, full_dims,
                *args,
                # All dims shrink to 1,
                # a single bool is returned
                adjust_chunks={d: 1 for d in full_dims},
                name=name,
                align_arrays=False,
                dtype=bool)

            if inline:
                write_col = inlined_array(write_col, [row_order])

            write_vars[column] = (full_dims, write_col)

        # Append a dataset with the write operations
        datasets.append(Dataset(write_vars))

    # Return an empty dataset
    if len(datasets) == 0:
        return Dataset({})
    # Return singleton
    elif len(datasets) == 1:
        return datasets[0]

    return datasets
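
# Hedged sketch of the dataset ordering used above: datasets carrying a
# ROWID variable (updates) sort ahead of those without (appends), with the
# original index breaking ties. Plain dicts stand in for dataset data_vars.
datasets = [{"DATA": None}, {"ROWID": None, "DATA": None}, {"ROWID": None}]
order = sorted(enumerate(datasets),
               key=lambda t: ("ROWID" not in t[1], t[0]))
assert [i for i, _ in order] == [1, 2, 0]  # updates first, appends last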
def _write_datasets(table, table_proxy, datasets, columns, descriptor,
                    table_keywords, column_keywords):
    _, table_name, subtable = table_path_split(table)
    table_name = '::'.join((table_name, subtable)) if subtable else table_name

    writes = []
    row_orders = []

    # Put table and column keywords
    table_proxy.submit(_put_keywords, WRITELOCK,
                       table_keywords, column_keywords).result()

    # Sort datasets on (not has "ROWID", index) such that
    # datasets with ROWID's are handled first, while
    # those without (which imply appends to the MS)
    # are handled last
    sorted_datasets = sorted(enumerate(datasets),
                             key=lambda t: ("ROWID" not in t[1].data_vars,
                                            t[0]))

    # Establish row orders for each dataset
    for di, ds in sorted_datasets:
        try:
            rowid = ds.ROWID.data
        except AttributeError:
            # No ROWID's, assume they're missing from the table
            # and remaining datasets. Generate addrows
            # NOTE(sjperkins)
            # This could be somewhat brittle, but exists to
            # update MS subtables once they've been created
            # (empty) along with the main MS by a call to default_ms.
            # Users could also use it to append rows to an existing table.
            # An xds_append_to_table is probably the correct solution...
            last_datasets = datasets[di:]
            last_row_orders = add_row_order_factory(table_proxy,
                                                    last_datasets)
            row_orders.extend(last_row_orders)

            # We have established row orders for all datasets
            # at this point, quit the loop
            break
        else:
            # Generate row orderings from existing row IDs
            row_order = rowid.map_blocks(row_run_factory,
                                         sort_dir="write",
                                         dtype=object)
            row_orders.append(row_order)

    assert len(row_orders) == len(datasets)

    for (di, ds), row_order in zip(sorted_datasets, row_orders):
        data_vars = ds.data_vars

        # Generate a dask array for each column
        for column in columns:
            try:
                variable = data_vars[column]
            except KeyError:
                log.warning("Ignoring '%s' not present "
                            "on dataset %d" % (column, di))
                continue
            else:
                full_dims = variable.dims
                array = variable.data

            args = [row_order, ("row",)]

            # We only need to pass in dimension extent arrays if
            # there is more than one chunk in any of the non-row columns.
            # In that case putcolslice is required; otherwise putcol suffices
            if not all(len(c) == 1 for c in array.chunks[1:]):
                # Add extent arrays
                for d, c in zip(full_dims[1:], array.chunks[1:]):
                    args.append(dim_extents_array(d, c))
                    args.append((d,))

            # Add other variables
            args.extend([table_proxy, None, column, None, array, full_dims])

            # Name of the dask array representing this column
            token = dask.base.tokenize(di, args)
            name = "-".join((table_name, 'write', column, token))

            write_col = da.blockwise(
                putter_wrapper, full_dims,
                *args,
                # All dims shrink to 1,
                # a single bool is returned
                adjust_chunks={d: 1 for d in full_dims},
                name=name,
                align_arrays=False,
                dtype=bool)

            writes.append(write_col.ravel())

    if len(writes) == 0:
        return da.full(1, True, dtype=bool)

    return da.concatenate(writes)
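
# Hedged sketch of the row-run idea behind row_run_factory: consecutive row
# IDs collapse into (start, length) runs so a chunk can be written with a
# handful of putcol-style calls rather than row by row. This illustrates
# the concept only; it is not daskms's implementation.
import numpy as np


def row_runs(rowids):
    # Indices where consecutive row IDs break
    breaks = np.nonzero(np.diff(rowids) != 1)[0] + 1
    starts = np.concatenate(([0], breaks))
    lengths = np.diff(np.concatenate((starts, [len(rowids)])))
    return list(zip(rowids[starts].tolist(), lengths.tolist()))


assert row_runs(np.array([4, 5, 6, 10, 11, 20])) == [(4, 3), (10, 2), (20, 1)]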