def test_promotion(columns, default):
    # Lists stay as lists
    assert promote_columns(columns, default) == columns

    # Tuples promoted to lists
    assert promote_columns(tuple(columns), default) == columns

    # Singleton promoted to list
    assert promote_columns(columns[0], default) == [columns[0]]

    # None gives us the default
    assert promote_columns(None, default) == default
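# For context, a minimal sketch of an implementation that would satisfy the
# asserts in test_promotion above. It is inferred purely from the test and
# is not necessarily the library's actual promote_columns implementation.
def _promote_columns_sketch(columns, default):
    if columns is None:
        # None selects the caller-supplied default column set
        return default
    elif isinstance(columns, (tuple, list)):
        # Tuples (and lists) are normalised to plain lists
        return list(columns)
    else:
        # A single column name is promoted to a one-element list
        return [columns]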
def xds_from_ms(ms, columns=None, index_cols=None, group_cols=None, **kwargs):
    """
    Generator yielding a series of xarray datasets representing
    the contents of a Measurement Set.

    It defers to :func:`xds_from_table`, which should be
    consulted for more information.

    Parameters
    ----------
    ms : str
        Measurement Set filename
    columns : tuple or list, optional
        Columns present on the resulting dataset.
        Defaults to all if ``None``.
    index_cols : tuple or list, optional
        Sequence of indexing columns.
        Defaults to :code:`%(index)s`
    group_cols : tuple or list, optional
        Sequence of grouping columns.
        Defaults to :code:`%(parts)s`
    **kwargs : optional
        Keyword arguments passed on to :func:`xds_from_table`.

    Returns
    -------
    datasets : list of :class:`xarray.Dataset`
        xarray datasets for each group
    """
    columns = promote_columns(columns, [])
    index_cols = promote_columns(index_cols, _DEFAULT_INDEX_COLUMNS)
    group_cols = promote_columns(group_cols, _DEFAULT_GROUP_COLUMNS)

    kwargs.setdefault("table_schema", "MS")

    return xds_from_table(ms, columns=columns,
                          index_cols=index_cols,
                          group_cols=group_cols,
                          **kwargs)
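# A brief usage sketch of xds_from_ms above. The Measurement Set path and
# the column/index/grouping names are illustrative placeholders, not values
# mandated by this module.
def _example_xds_from_ms():
    datasets = xds_from_ms("WSRT.MS",  # placeholder MS path
                           columns=["TIME", "ANTENNA1", "ANTENNA2", "DATA"],
                           index_cols=["TIME"],
                           group_cols=["FIELD_ID", "DATA_DESC_ID"])

    for ds in datasets:
        # Each dataset corresponds to one grouping; DATA remains a lazy
        # dask array until explicitly computed
        print(ds.DATA.data)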
def xds_from_table(table_name, columns=None,
                   index_cols=None, group_cols=None,
                   **kwargs):
    """
    Create multiple :class:`xarray.Dataset` objects
    from CASA table ``table_name`` with the rows lexicographically
    sorted according to the columns in ``index_cols``.
    If ``group_cols`` is supplied, the table data is grouped into
    multiple :class:`xarray.Dataset` objects, each associated with a
    permutation of the unique values for the columns in ``group_cols``.

    Notes
    -----
    Both ``group_cols`` and ``index_cols`` should consist of
    columns that are part of the table index.

    However, this may not always be possible as CASA tables
    may not always contain indexing columns.
    The ``ANTENNA`` or ``SPECTRAL_WINDOW`` Measurement Set subtables
    are examples in which the ``row id`` serves as the index.

    Generally, calling

    .. code-block:: python

        antds = list(xds_from_table("WSRT.MS::ANTENNA"))

    is fine, since the data associated with each row of the ``ANTENNA``
    table has the same shape and so a dask or numpy array can be
    constructed around the contents of the table.

    This may not be the case for the ``SPECTRAL_WINDOW`` subtable.
    Here, each row defines a separate spectral window, but each
    spectral window may contain different numbers of frequencies.
    In this case, it is probably better to group the subtable by ``row``.

    There is a *special* group column :code:`"__row__"`
    that can be used to group the table by row.

    .. code-block:: python

        for spwds in xds_from_table("WSRT.MS::SPECTRAL_WINDOW",
                                    group_cols="__row__"):
            ...

    If :code:`"__row__"` is used for grouping, then no other
    column may be used. It should also only be used for *small*
    tables, as the number of datasets produced may be
    prohibitively large.

    Parameters
    ----------
    table_name : str
        CASA table
    columns : list or tuple, optional
        Columns present on the returned dataset.
        Defaults to all if ``None``
    index_cols : list or tuple, optional
        List of CASA table indexing columns. Defaults to :code:`()`.
    group_cols : list or tuple, optional
        List of columns on which to group the CASA table.
        Defaults to :code:`()`
    table_schema : dict or str or list of dict or str, optional
        A schema dictionary defining the dimension naming scheme for
        each column in the table. For example:

        .. code-block:: python

            {
                "UVW": {'dims': ('uvw',)},
                "DATA": {'dims': ('chan', 'corr')},
            }

        will result in the UVW and DATA arrays having dimensions
        :code:`('row', 'uvw')` and :code:`('row', 'chan', 'corr')`
        respectively.

        A string can be supplied, which will be matched
        against existing default schemas. Examples here include
        ``MS``, ``ANTENNA`` and ``SPECTRAL_WINDOW``, corresponding
        to ``Measurement Sets``, the ``ANTENNA`` subtable and the
        ``SPECTRAL_WINDOW`` subtable, respectively.

        By default, the end of ``table_name`` will be inspected
        to see if it matches any default schemas.

        It is also possible to supply a list of strings or dicts
        defining a sequence of schemas which are combined.
        Later elements in the list override previous elements.
        In the following example, the standard UVW MS component
        name scheme is overridden with "my-uvw".

        .. code-block:: python

            ["MS", {"UVW": {'dims': ('my-uvw',)}}]

    table_keywords : {False, True}, optional
        If True, returns table keywords.
        Changes the return type of the function into a tuple.
    column_keywords : {False, True}, optional
        If True, returns keywords for each column on the table.
        Changes the return type of the function into a tuple.
    table_proxy : {False, True}, optional
        If True, returns the Table Proxy associated with the Dataset.
    taql_where : str, optional
        TAQL where clause. For example, to exclude auto-correlations

        .. code-block:: python

            xds_from_table("WSRT.MS", taql_where="ANTENNA1 != ANTENNA2")

    chunks : list of dicts or dict, optional
        A :code:`{dim: chunk}` dictionary, specifying the chunking
        strategy of each dimension in the schema.
        Defaults to :code:`{'row': 100000}` which will partition
        the row dimension into chunks of 100000.

        * If a dict, the chunking strategy is applied to each group.
        * If a list of dicts, each element is applied
          to the associated group. The last element is
          extended over the remaining groups if there
          are insufficient elements.

        It's also possible to specify the individual chunks for
        multiple dimensions:

        .. code-block:: python

            {'row': (40000, 60000, 40000, 60000),
             'chan': (16, 16, 16, 16),
             'corr': (1, 2, 1)}

        The above chunks a 200,000 row, 64 channel and 4 correlation
        space into 4 x 4 x 3 = 48 chunks, but requires prior
        knowledge of dimensionality, probably obtained with an
        initial call to :func:`xds_from_table`.

    Returns
    -------
    datasets : list of :class:`xarray.Dataset`
        datasets for each group, each ordered by indexing columns
    table_keywords : dict, optional
        Returned if ``table_keywords is True``
    column_keywords : dict, optional
        Returned if ``column_keywords is True``
    table_proxy : :class:`daskms.TableProxy`, optional
        Returned if ``table_proxy is True``
    """
    columns = promote_columns(columns, [])
    index_cols = promote_columns(index_cols, [])
    group_cols = promote_columns(group_cols, [])

    return DatasetFactory(table_name, columns,
                          group_cols, index_cols,
                          **kwargs).datasets()
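# A short sketch exercising two behaviours documented above: per-group
# chunking and the tuple return triggered by table_keywords=True. The table
# path and column names are placeholders.
def _example_xds_from_table_keywords():
    # The first chunk dict applies to the first group; the last dict is
    # extended over any remaining groups.
    datasets, table_kw = xds_from_table("WSRT.MS",  # placeholder path
                                        index_cols=["TIME"],
                                        group_cols=["FIELD_ID"],
                                        chunks=[{"row": 50000},
                                                {"row": 100000}],
                                        table_keywords=True)

    print(table_kw)

    for ds in datasets:
        print(ds)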
def xds_from_table(table_name, columns=None,
                   index_cols=None, group_cols=None,
                   **kwargs):
    """
    Create multiple :class:`xarray.Dataset` objects
    from CASA table ``table_name`` with the rows lexicographically
    sorted according to the columns in ``index_cols``.
    If ``group_cols`` is supplied, the table data is grouped into
    multiple :class:`xarray.Dataset` objects, each associated with a
    permutation of the unique values for the columns in ``group_cols``.

    Notes
    -----
    Both ``group_cols`` and ``index_cols`` should consist of
    columns that are part of the table index.

    However, this may not always be possible as CASA tables
    may not always contain indexing columns.
    The ``ANTENNA`` or ``SPECTRAL_WINDOW`` Measurement Set subtables
    are examples in which the ``row id`` serves as the index.

    Generally, calling

    .. code-block:: python

        antds = list(xds_from_table("WSRT.MS::ANTENNA"))

    is fine, since the data associated with each row of the ``ANTENNA``
    table has the same shape and so a dask or numpy array can be
    constructed around the contents of the table.

    This may not be the case for the ``SPECTRAL_WINDOW`` subtable.
    Here, each row defines a separate spectral window, but each
    spectral window may contain different numbers of frequencies.
    In this case, it is probably better to group the subtable by ``row``.

    There is a *special* group column :code:`"__row__"`
    that can be used to group the table by row.

    .. code-block:: python

        for spwds in xds_from_table("WSRT.MS::SPECTRAL_WINDOW",
                                    group_cols="__row__"):
            ...

    If :code:`"__row__"` is used for grouping, then no other
    column may be used. It should also only be used for *small*
    tables, as the number of datasets produced may be
    prohibitively large.

    Parameters
    ----------
    table_name : str
        CASA table
    columns : list or tuple, optional
        Columns present on the returned dataset.
        Defaults to all if ``None``
    index_cols : list or tuple, optional
        List of CASA table indexing columns. Defaults to :code:`()`.
    group_cols : list or tuple, optional
        List of columns on which to group the CASA table.
        Defaults to :code:`()`
    table_schema : dict or str or list of dict or str, optional
        A schema dictionary defining the dimension naming scheme for
        each column in the table. For example:

        .. code-block:: python

            {
                "UVW": {'dims': ('uvw',)},
                "DATA": {'dims': ('chan', 'corr')},
            }

        will result in the UVW and DATA arrays having dimensions
        :code:`('row', 'uvw')` and :code:`('row', 'chan', 'corr')`
        respectively.

        A string can be supplied, which will be matched
        against existing default schemas. Examples here include
        ``MS``, ``ANTENNA`` and ``SPECTRAL_WINDOW``, corresponding
        to ``Measurement Sets``, the ``ANTENNA`` subtable and the
        ``SPECTRAL_WINDOW`` subtable, respectively.

        By default, the end of ``table_name`` will be inspected
        to see if it matches any default schemas.

        It is also possible to supply a list of strings or dicts
        defining a sequence of schemas which are combined.
        Later elements in the list override previous elements.
        In the following example, the standard UVW MS component
        name scheme is overridden with "my-uvw".

        .. code-block:: python

            ["MS", {"UVW": {'dims': ('my-uvw',)}}]

    table_keywords : {False, True}, optional
        If True, returns table keywords.
        Changes the return type of the function into a tuple.
    column_keywords : {False, True}, optional
        If True, returns keywords for each column on the table.
        Changes the return type of the function into a tuple.
    taql_where : str, optional
        TAQL where clause. For example, to exclude auto-correlations

        .. code-block:: python

            xds_from_table("WSRT.MS", taql_where="ANTENNA1 != ANTENNA2")

    chunks : list of dicts or dict, optional
        A :code:`{dim: chunk}` dictionary, specifying the chunking
        strategy of each dimension in the schema.
        Defaults to :code:`{'row': 100000}`.

        * If a dict, the chunking strategy is applied to each group.
        * If a list of dicts, each element is applied
          to the associated group. The last element is
          extended over the remaining groups if there
          are insufficient elements.

    Returns
    -------
    datasets : list of :class:`xarray.Dataset`
        datasets for each group, each ordered by indexing columns
    table_keywords : dict, optional
        Returned if ``table_keywords is True``
    column_keywords : dict, optional
        Returned if ``column_keywords is True``
    """
    columns = promote_columns(columns, [])
    index_cols = promote_columns(index_cols, [])
    group_cols = promote_columns(group_cols, [])

    dask_datasets = DatasetFactory(table_name, columns,
                                   group_cols, index_cols,
                                   **kwargs).datasets()

    # Return dask datasets if xarray is not available
    if xr is None:
        return dask_datasets

    xarray_datasets = []

    # Extract dataset list in case of table_keyword and column_keyword returns
    if isinstance(dask_datasets, tuple):
        extra = dask_datasets[1:]
        dask_datasets = dask_datasets[0]
    else:
        extra = ()

    # Convert each dask dataset into an xarray dataset
    for ds in dask_datasets:
        data_vars = collections.OrderedDict()
        coords = collections.OrderedDict()

        for k, v in sorted(ds.data_vars.items()):
            data_vars[k] = xr.DataArray(v.data, dims=v.dims, attrs=v.attrs)

        for k, v in sorted(ds.coords.items()):
            coords[k] = xr.DataArray(v.data, dims=v.dims, attrs=v.attrs)

        xarray_datasets.append(
            xr.Dataset(data_vars, attrs=dict(ds.attrs), coords=coords))

    if len(extra) > 0:
        return (xarray_datasets,) + extra

    return xarray_datasets
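# A small sketch of the fallback behaviour implemented above: with xarray
# installed, each returned dataset wraps the original dask arrays in an
# xarray.Dataset; without it, the dask-backed datasets are returned
# unchanged. The ANTENNA subtable path and column name are placeholders.
def _example_xds_from_table_xarray():
    datasets = xds_from_table("WSRT.MS::ANTENNA")  # placeholder path

    ds = datasets[0]
    # POSITION is a standard ANTENNA subtable column; its values remain a
    # lazy dask array until computed, whether or not xarray wrapping occurred.
    print(ds.POSITION.data)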