Example #1
File: convert.py Project: ska-sa/dask-ms
def _check_input_path(input: str):
    input_path = DaskMSStore(input)

    if not input_path.exists():
        raise ArgumentTypeError(f"{input} is an invalid path.")

    return input_path
Example #2
def test_storage_options_from_config(tmp_path, py_minio_client, minio_admin,
                                     minio_alias, minio_user_key, minio_url,
                                     s3_bucket_name):
    filename = "test.txt"
    payload = "How now brown cow"
    py_minio_client.make_bucket(s3_bucket_name)
    py_minio_client.put_object(s3_bucket_name, f"subdir/{filename}",
                               BytesIO(payload.encode("utf-8")), len(payload))

    url = f"s3://{s3_bucket_name}"
    config_file = tmp_path / "config.yaml"
    opts = {
        "key": minio_user_key,
        "secret": minio_user_key,
        "client_kwargs": {
            "endpoint_url": minio_url.geturl(),
            "region_name": "af-south-1",
            "verify": False,
        },
    }

    with open(config_file, "w") as f:
        yaml.safe_dump({"storage_options": {url: opts}}, f)

    config.refresh(paths=config.paths + [str(tmp_path)])

    try:
        store = DaskMSStore(f"{url}/subdir")
        assert store.storage_options == opts

        with store.open("test.txt", "rb") as f:
            assert f.read() == payload.encode("utf-8")
    finally:
        config.refresh()
Example #3
def test_xds_to_zarr_local(ms, spw_table, ant_table, tmp_path_factory):
    zarr_store = tmp_path_factory.mktemp("zarr_store") / "test.zarr"
    spw_store = zarr_store.parent / f"{zarr_store.name}::SPECTRAL_WINDOW"
    ant_store = zarr_store.parent / f"{zarr_store.name}::ANTENNA"

    return zarr_tester(ms, spw_table, ant_table,
                       DaskMSStore(zarr_store),
                       DaskMSStore(spw_store),
                       DaskMSStore(ant_store))
Example #4
def test_store_main_access(tmp_path_factory):
    store_dir = tmp_path_factory.mktemp("STORE0")

    store = DaskMSStore(f"file://{store_dir}")
    assert store.url == f"file://{store_dir}"
    assert store.full_path == str(store_dir)
    assert store.canonical_path == str(store_dir)
    assert store.table is None

    with store.open("foo.txt", "w") as f:
        f.write("How now brown cow")

    assert store.exists("foo.txt")
    assert (store_dir / "foo.txt").exists()
Example #5
def test_store_subtable_access(tmp_path_factory):
    store_dir = tmp_path_factory.mktemp("STORE0")
    table_dir = store_dir / "TABLE"
    table_dir.mkdir()

    store = DaskMSStore(f"file://{store_dir}::TABLE")
    assert store.url == f"file://{store_dir}::TABLE"
    assert store.full_path == f"{store_dir}{store.fs.sep}TABLE"
    assert store.canonical_path == f"{store_dir}::TABLE"
    assert store.table == "TABLE"

    with store.open("foo.txt", "w") as f:
        f.write("How now brown cow")

    assert store.exists("foo.txt")
    assert (table_dir / "foo.txt").exists()
Example #6
def xds_from_storage_ms(store, **kwargs):
    if not isinstance(store, DaskMSStore):
        store = DaskMSStore(store, **kwargs.pop("storage_options", {}))

    typ = store.type()

    if typ == "casa":
        return xds_from_ms(store, **kwargs)
    elif typ == "zarr":
        from daskms.experimental.zarr import xds_from_zarr
        return xds_from_zarr(store, **kwargs)
    elif typ == "parquet":
        from daskms.experimental.arrow import xds_from_parquet
        return xds_from_parquet(store, **kwargs)
    else:
        raise TypeError(f"Unknown dataset {typ}")
Example #7
def xds_to_storage_table(xds, store, **kwargs):
    if not isinstance(store, DaskMSStore):
        store = DaskMSStore(store, **kwargs.pop("storage_options", {}))

    typ = store.type()

    if typ == "casa":
        filter_kwargs(xds_to_table, kwargs)
        return xds_to_table(xds, store, **kwargs)
    elif typ == "zarr":
        from daskms.experimental.zarr import xds_to_zarr
        filter_kwargs(xds_to_zarr, kwargs)
        return xds_to_zarr(xds, store, **kwargs)
    elif typ == "parquet":
        from daskms.experimental.arrow import xds_to_parquet
        filter_kwargs(xds_to_parquet, kwargs)
        return xds_to_parquet(xds, store, **kwargs)
    else:
        raise TypeError(f"Unknown dataset {typ}")
Example #8
def test_store_pickle():
    store = DaskMSStore("s3://binface",
                        key="foo",
                        secret="bar",
                        client_kwargs={
                            "endpoint_url": "http://127.0.0.1:9000",
                            "region_name": "af-cpt"
                        })

    pstore = pickle.loads(pickle.dumps(store))
    assert pstore == store
Example #9
def test_xds_to_zarr_s3(ms, spw_table, ant_table,
                        py_minio_client, minio_user_key,
                        minio_url, s3_bucket_name):

    py_minio_client.make_bucket(s3_bucket_name)

    zarr_store = DaskMSStore(f"s3://{s3_bucket_name}/measurementset.MS",
                             key=minio_user_key,
                             secret=minio_user_key,
                             client_kwargs={
                                 "endpoint_url": minio_url.geturl(),
                                 "region_name": "af-cpt",
                             })

    # NOTE(sjperkins)
    # Review this interface
    spw_store = zarr_store.subtable_store("SPECTRAL_WINDOW")
    ant_store = zarr_store.subtable_store("ANTENNA")

    return zarr_tester(ms, spw_table, ant_table,
                       zarr_store, spw_store, ant_store)
Example #10
def test_local_store(tmp_path):
    zarr = pytest.importorskip("zarr")
    payload = "How now brown cow"
    filename = "cow.txt"
    (tmp_path / filename).write_text(payload)
    (tmp_path / "foo.txt").write_text(payload)
    (tmp_path / "bar.txt").write_text(payload)
    (tmp_path / "qux.txt").write_text(payload)
    store = DaskMSStore(str(tmp_path))
    store.fs.mkdir(f"{store.full_path}{store.fs.sep}bob", exist_ok=True)

    assert store.map[filename] == payload.encode("utf-8")

    root = zarr.group(store=store.map)
    data = root.require_dataset(
        "MODEL_DATA",  # noqa
        shape=1000,
        dtype=np.complex128)
Example #11
def xds_from_parquet(store, columns=None, chunks=None, **kwargs):
    if isinstance(store, DaskMSStore):
        pass
    elif isinstance(store, (str, Path)):
        store = DaskMSStore(f"{store}", **kwargs.pop("storage_options", {}))
    else:
        raise TypeError(f"store '{store}' must be "
                        f"Path, str or DaskMSStore")

    # If any kwargs are added, they should be popped prior to this check.
    if len(kwargs) > 0:
        warnings.warn(
            f"The following unsupported kwargs were ignored in "
            f"xds_from_parquet: {kwargs}", UserWarning)

    columns = promote_columns(columns)

    if chunks is None:
        pass
    elif isinstance(chunks, (tuple, list)):
        if len(chunks) == 0 or any(not isinstance(c, dict) for c in chunks):
            raise TypeError("chunks must be None or dict or list of dict")
    elif isinstance(chunks, dict):
        chunks = [chunks]
    else:
        raise TypeError("chunks must be None or dict or list of dict")

    table_path = "" if store.table else "MAIN"

    fragments = list(map(Path, store.rglob("*.parquet")))
    ds_cfg = defaultdict(list)

    # Iterate over all parquet files in the directory tree
    # and group them by partition
    partition_schemas = set()

    for fragment in fragments:
        *partitions, _ = fragment.relative_to(Path(table_path)).parts
        fragment = ParquetFileProxy(store, str(fragment))
        fragment_meta = fragment.metadata
        metadata = json.loads(fragment_meta.metadata[DASKMS_METADATA.encode()])
        partition_meta = metadata[DASKMS_PARTITION_KEY]
        partition_meta = tuple(tuple((f, v)) for f, v in partition_meta)
        partitions = _partition_values(partitions, partition_meta)
        partition_schemas.add(partition_meta)
        ds_cfg[partitions].append(fragment)

    # Sanity check partition schemas of all parquet files
    if len(partition_schemas) == 0:
        raise ValueError(f"No parquet files found in {store.path}")
    elif len(partition_schemas) != 1:
        raise ValueError(f"Multiple partitions discovered {partition_schemas}")

    partition_schemas = partition_schemas.pop()
    datasets = []

    # Now create a dataset per partition
    for p, (partition, fragments) in enumerate(sorted(ds_cfg.items())):
        fragments = list(sorted(fragments))
        column_arrays = defaultdict(list)
        fragment_rows = [f.metadata.num_rows for f in fragments]

        # Returns a dictionary of lists mapping fragments to partitions.
        partition_chunks = partition_chunking(p, fragment_rows, chunks)

        for pieces in partition_chunks.values():

            chunk_fragments = [fragments[i] for i, _ in pieces]
            chunk_ranges = [r for _, r in pieces]
            chunk_metas = [f.metadata for f in chunk_fragments]

            rows = sum(end - start for start, end in chunk_ranges)

            # NOTE(JSKenyon): This assumes that the schema/fields are
            # consistent between fragments. This should be ok.
            exemplar_schema = chunk_metas[0].schema.to_arrow_schema()
            exemplar_fields = {
                n: exemplar_schema.field(n)
                for n in exemplar_schema.names
            }

            for column, field in column_iterator(exemplar_fields, columns):
                field_metadata = field.metadata[DASKMS_METADATA.encode()]
                field_metadata = json.loads(field_metadata)
                dims = tuple(field_metadata["dims"])

                if isinstance(field.type, TensorType):
                    shape = (rows, ) + field.type.shape
                else:
                    shape = (rows, )

                assert len(shape) == len(dims)

                dtype = field.type.to_pandas_dtype()
                meta = np.empty((0, ) * len(dims), dtype)
                new_axes = {d: s for d, s in zip(dims, shape)}

                read = da.blockwise(fragment_reader,
                                    dims,
                                    chunk_fragments,
                                    None,
                                    chunk_ranges,
                                    None,
                                    column,
                                    None,
                                    shape,
                                    None,
                                    dtype,
                                    None,
                                    adjust_chunks={"row": rows},
                                    new_axes=new_axes,
                                    meta=meta)

                column_arrays[column].append((read, dims))

        data_vars = {}

        for column, values in column_arrays.items():
            arrays, array_dims = zip(*values)
            array_dims = set(array_dims)

            if not len(array_dims) == 1:
                raise ValueError(f"Inconsistent array dimensions "
                                 f"{array_dims} for {column}")

            data_vars[column] = (array_dims.pop(), da.concatenate(arrays))

        attrs = dict(partition)
        attrs[DASKMS_PARTITION_KEY] = partition_schemas
        datasets.append(Dataset(data_vars, attrs=attrs))

    return datasets
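A brief usage sketch for the parquet reader above, assuming a parquet tree previously written by xds_to_parquet at the hypothetical path below and that the named columns exist. A chunks dict is promoted to a one-element list, as in the validation at the top of the function.

from daskms.experimental.arrow import xds_from_parquet

# Hypothetical dataset written earlier by xds_to_parquet.
datasets = xds_from_parquet(
    "out.parquet",
    columns=["TIME", "DATA"],   # assumed column names
    chunks={"row": 10_000})     # row chunking applied when reading

for ds in datasets:
    print(ds)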
Example #12
def xds_from_table(table_name,
                   columns=None,
                   index_cols=None,
                   group_cols=None,
                   **kwargs):
    """
    Create multiple :class:`xarray.Dataset` objects
    from CASA table ``table_name`` with the rows lexicographically
    sorted according to the columns in ``index_cols``.
    If ``group_cols`` is supplied, the table data is grouped into
    multiple :class:`xarray.Dataset` objects, each associated with a
    permutation of the unique values for the columns in ``group_cols``.

    Notes
    -----
    Both ``group_cols`` and ``index_cols`` should consist of
    columns that are part of the table index.

    However, this may not always be possible as CASA tables
    may not always contain indexing columns.
    The ``ANTENNA`` or ``SPECTRAL_WINDOW`` Measurement Set subtables
    are examples in which the ``row id`` serves as the index.

    Generally, calling

    .. code-block:: python

        antds = list(xds_from_table("WSRT.MS::ANTENNA"))

    is fine, since the data associated with each row of the ``ANTENNA``
    table has the same shape and so a dask or numpy array can be
    constructed around the contents of the table.

    This may not be the case for the ``SPECTRAL_WINDOW`` subtable.
    Here, each row defines a separate spectral window, but each
    spectral window may contain different numbers of frequencies.
    In this case, it is probably better to group the subtable
    by ``row``.

    There is a *special* group column :code:`"__row__"`
    that can be used to group the table by row.

    .. code-block:: python

        for spwds in xds_from_table("WSRT.MS::SPECTRAL_WINDOW",
                                            group_cols="__row__"):
            ...

    If :code:`"__row__"` is used for grouping, then no other
    column may be used. It should also only be used for *small*
    tables, as the number of datasets produced may be prohibitively
    large.

    Parameters
    ----------
    table_name : str
        CASA table
    columns : list or tuple, optional
        Columns present on the returned dataset.
        Defaults to all if ``None``
    index_cols : list or tuple, optional
        List of CASA table indexing columns. Defaults to :code:`()`.
    group_cols : list or tuple, optional
        List of columns on which to group the CASA table.
        Defaults to :code:`()`
    table_schema : dict or str or list of dict or str, optional
        A schema dictionary defining the dimension naming scheme for
        each column in the table. For example:

        .. code-block:: python

            {
                "UVW": {'dims': ('uvw',)},
                "DATA": {'dims': ('chan', 'corr')},
            }

        will result in the UVW and DATA arrays having dimensions
        :code:`('row', 'uvw')` and :code:`('row', 'chan', 'corr')`
        respectively.

        A string can be supplied, which will be matched
        against existing default schemas. Examples here include
        ``MS``, ``ANTENNA`` and ``SPECTRAL_WINDOW``
        corresponding to ``Measurement Sets``, the ``ANTENNA`` subtable
        and the ``SPECTRAL_WINDOW`` subtable, respectively.

        By default, the end of ``table_name`` will be
        inspected to see if it matches any default schemas.

        It is also possible to supply a list of strings or dicts defining
        a sequence of schemas which are combined. Later elements in the
        list override previous elements. In the following
        example, the standard UVW MS component name scheme is overridden
        with "my-uvw".

        .. code-block:: python

            ["MS", {"UVW": {'dims': ('my-uvw',)}}]

    table_keywords : {False, True}, optional
        If True, returns table keywords.
        Changes the return type of the function into a tuple.

    column_keywords : {False, True}, optional
        If True, returns keywords for each column of the table.
        Changes the return type of the function into a tuple.

    table_proxy : {False, True}, optional
        If True, returns the Table Proxy associated with the Dataset.

    taql_where : str, optional
        TAQL where clause. For example, to exclude auto-correlations

        .. code-block:: python

            xds_from_table("WSRT.MS", taql_where="ANTENNA1 != ANTENNA2")

    chunks : list of dicts or dict, optional
        A :code:`{dim: chunk}` dictionary, specifying the chunking
        strategy of each dimension in the schema.
        Defaults to :code:`{'row': 100000 }` which will partition
        the row dimension into chunks of 100000.

        * If a dict, the chunking strategy is applied to each group.
        * If a list of dicts, each element is applied
          to the associated group. The last element is
          extended over the remaining groups if there
          are insufficient elements.

        It's also possible to specify the individual chunks for
        multiple dimensions:

        .. code-block:: python

            {'row': (40000, 60000, 40000, 60000),
             'chan': (16, 16, 16, 16),
             'corr': (1, 2, 1)}

        The above chunks a 200,000 row, 64 channel and 4 correlation
        space into 4 x 4 x 3 = 48 chunks, but requires prior
        knowledge of dimensionality, probably obtained with an
        initial call to :func:`xds_from_table`.

    Returns
    -------
    datasets : list of :class:`xarray.Dataset`
        datasets for each group, each ordered by indexing columns
    table_keywords : dict, optional
        Returned if ``table_keywords is True``
    column_keywords : dict, optional
        Returned if ``column_keywords is True``
    table_proxy : :class:`daskms.TableProxy`, optional
        Returned if ``table_proxy is True``


    """
    if isinstance(table_name, DaskMSStore):
        table_name = table_name.casa_path()
    else:
        store = DaskMSStore(table_name, **kwargs.pop("storage_options", {}))
        table_name = store.casa_path()

    columns = promote_columns(columns, [])
    index_cols = promote_columns(index_cols, [])
    group_cols = promote_columns(group_cols, [])

    return DatasetFactory(table_name, columns, group_cols, index_cols,
                          **kwargs).datasets()
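A short sketch tying the docstring parameters together, assuming the WSRT.MS path used in the docstring above and the usual Measurement Set grouping and indexing columns.

from daskms import xds_from_table

# Group the main table by the (assumed) FIELD_ID and DATA_DESC_ID columns,
# sort rows by TIME and chunk the row dimension.
datasets = xds_from_table(
    "WSRT.MS",
    group_cols=["FIELD_ID", "DATA_DESC_ID"],
    index_cols=["TIME"],
    chunks={"row": 100_000})

# Ragged subtables are best grouped by the special "__row__" column.
spw_datasets = xds_from_table("WSRT.MS::SPECTRAL_WINDOW",
                              group_cols="__row__")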
Example #13
def xds_from_zarr(store, columns=None, chunks=None, **kwargs):
    """
    Reads the zarr data store in `store` and returns a list of
    Datasets containing the data.

    Parameters
    ----------
    store : str or Path
        Path containing the data
    columns : list of str or str or None
        Columns to read. `None` or `"ALL"` reads all columns on each dataset.
        Otherwise, a list of columns should be supplied.
    chunks : dict or list of dicts
        chunking schema for each dataset
    **kwargs: optional

    Returns
    -------
    datasets : list of Datasets
        Datasets containing the zarr data
    """

    if isinstance(store, DaskMSStore):
        pass
    elif isinstance(store, (Path, str)):
        store = DaskMSStore(f"{store}", **kwargs.pop("storage_options", {}))
    else:
        raise TypeError(f"store '{store}' must be "
                        f"Path, str or DaskMSStore")

    # If any kwargs are added, they should be popped prior to this check.
    if len(kwargs) > 0:
        warnings.warn(
            f"The following unsupported kwargs were ignored in "
            f"xds_from_zarr: {kwargs}", UserWarning)

    columns = promote_columns(columns)

    if chunks is None:
        pass
    elif isinstance(chunks, (tuple, list)):
        if not all(isinstance(v, dict) for v in chunks):
            raise TypeError("chunks must be None, a dict or a list of dicts")
    elif isinstance(chunks, dict):
        chunks = [chunks]
    else:
        raise TypeError("chunks must be None, a dict or a list of dicts")

    datasets = []
    numpy_vars = []

    # NOTE(JSKenyon): Iterating over all the zarr groups/arrays is VERY
    # expensive if the metadata has not been consolidated.
    zc.consolidate_metadata(store.map)
    table_path = store.table if store.table else "MAIN"
    table_group = zarr.open_consolidated(store.map)[table_path]

    for g, (group_name,
            group) in enumerate(sorted(table_group.groups(),
                                       key=group_sortkey)):
        group_attrs = decode_attr(dict(group.attrs))
        dask_ms_attrs = group_attrs.pop(DASKMS_ATTR_KEY)
        natural_chunks = dask_ms_attrs["chunks"]
        group_chunks = {d: tuple(dc) for d, dc in natural_chunks.items()}

        if chunks:
            # Defer to user-supplied chunking strategy
            try:
                group_chunks.update(chunks[g])
            except IndexError:
                group_chunks.update(chunks[-1])  # Reuse last chunking.

        data_vars = {}
        coords = {}

        for name, zarray in column_iterator(group, columns):
            attrs = decode_attr(dict(zarray.attrs[DASKMS_ATTR_KEY]))
            dims = attrs["dims"]
            coordinate = attrs.get("coordinate", False)
            array_chunks = tuple(
                group_chunks.get(d, s) for d, s in zip(dims, zarray.shape))

            array_chunks = da.core.normalize_chunks(array_chunks, zarray.shape)
            ext_args = extent_args(dims, array_chunks)
            token_name = f"read~{name}-{tokenize(zarray, *ext_args)}"

            read = da.blockwise(zarr_getter,
                                dims,
                                zarray,
                                None,
                                *ext_args,
                                concatenate=False,
                                name=token_name,
                                meta=np.empty((0, ) * zarray.ndim,
                                              zarray.dtype))

            read = inlined_array(read, ext_args[::2])
            var = Variable(dims, read, attrs)
            (coords if coordinate else data_vars)[name] = var

            # Save numpy arrays for reification
            typ = decode_type(attrs["array_type"])

            if typ is np.ndarray:
                numpy_vars.append(var)
            elif typ is da.Array:
                pass
            else:
                raise TypeError(f"Unknown array_type '{attrs['array_type']}'")

        datasets.append(Dataset(data_vars, coords=coords, attrs=group_attrs))

    # Reify any numpy arrays directly into their variables
    for v, a in zip(numpy_vars, dask.compute(v.data for v in numpy_vars)[0]):
        v.data = a

    return datasets
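A minimal usage sketch for the zarr reader, assuming a store previously written with xds_to_zarr at the hypothetical path below and that the named columns exist.

from daskms.experimental.zarr import xds_from_zarr

# Read only DATA and UVW and override the on-disk row chunking; a single
# chunks dict is reused for every group, as in the IndexError fallback above.
datasets = xds_from_zarr(
    "measurementset.zarr",
    columns=["DATA", "UVW"],
    chunks={"row": 50_000})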
Example #14
def xds_to_zarr(xds, store, columns=None, rechunk=False, **kwargs):
    """
    Stores a dataset or list of datasets defined by `xds` in
    the file location `store`.

    Parameters
    ----------
    xds : Dataset or list of Datasets
        Data
    store : str or Path
        Path to store the data
    columns : list of str or str or None
        Columns to store. `None` or `"ALL"` stores all columns on each dataset.
        Otherwise, a list of columns should be supplied. All coordinates
        associated with a specified column will be written automatically.
    rechunk : bool
        Controls whether dask arrays should be automatically rechunked to be
        consistent with existing on-disk zarr arrays while writing to disk.
    **kwargs : optional

    Returns
    -------
    writes : Dataset
        A Dataset representing the write operations
    """
    if isinstance(store, DaskMSStore):
        pass
    elif isinstance(store, (Path, str)):
        store = DaskMSStore(f"{store}", **kwargs.pop("storage_options", {}))
    else:
        raise TypeError(f"store '{store}' must be "
                        f"Path, str or DaskMSStore")

    # If any kwargs are added, they should be popped prior to this check.
    if len(kwargs) > 0:
        warnings.warn(
            f"The following unsupported kwargs were ignored in "
            f"xds_to_zarr: {kwargs}", UserWarning)

    columns = promote_columns(columns)

    if isinstance(xds, Dataset):
        xds = [xds]
    elif isinstance(xds, (tuple, list)):
        if not all(isinstance(ds, Dataset) for ds in xds):
            raise TypeError("xds must be a Dataset or list of Datasets")
    else:
        raise TypeError("xds must be a Dataset or list of Datasets")

    write_datasets = []

    for di, ds in enumerate(xds):

        data_vars, coords = select_vars_and_coords(ds, columns)

        # Create a new ds which is consistent with what we want to write.
        ds = Dataset(data_vars, coords=coords, attrs=ds.attrs)

        ds, group = prepare_zarr_group(di, ds, store, rechunk=rechunk)

        data_vars = dict(_gen_writes(ds.data_vars, ds.chunks, group))
        # Include coords in the write dataset so they're reified
        data_vars.update(
            dict(_gen_writes(ds.coords, ds.chunks, group, indirect_dims=True)))

        # Transfer any partition information over to the write dataset
        partition = ds.attrs.get(DASKMS_PARTITION_KEY, False)

        if not partition:
            attrs = None
        else:
            attrs = {
                DASKMS_PARTITION_KEY: partition,
                **{k: getattr(ds, k)
                   for k, _ in partition}
            }

        write_datasets.append(Dataset(data_vars, attrs=attrs))

    return write_datasets
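A sketch of the corresponding write path, assuming an input Measurement Set at the hypothetical path below and that xds_from_ms is importable from the top-level daskms package. The returned write datasets are lazy and only touch disk on compute.

import dask
from daskms import xds_from_ms
from daskms.experimental.zarr import xds_to_zarr

datasets = xds_from_ms("WSRT.MS")               # hypothetical input MS
writes = xds_to_zarr(datasets, "WSRT.zarr",     # hypothetical output store
                     columns=["DATA", "UVW"])   # assumed column names

dask.compute(writes)                            # execute the writes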
Example #15
def xds_to_table(xds,
                 table_name,
                 columns="ALL",
                 descriptor=None,
                 table_keywords=None,
                 column_keywords=None,
                 table_proxy=False):
    """
    Generates a list of Datasets representing write operations from the
    specified arrays in :class:`xarray.Dataset`'s into
    the CASA table columns specified by ``table_name`` and ``columns``.
    This is a lazy operation -- it is only executed when a :meth:`dask.compute`
    or :meth:`xarray.Dataset.compute` method is called.

    Parameters
    ----------
    xds : :class:`xarray.Dataset` or list of :class:`xarray.Dataset`
        dataset(s) containing the specified columns. If a list of datasets
        is provided, the concatenation of the columns in
        sequential datasets will be written.

    table_name : str
        CASA table path

    columns : tuple or list or "ALL"
        list of column names to write to the table.

        "ALL" is a special marker which specifies that all columns
        should be written. If you wish to write an "ALL" array to
        a column, use :code:`columns=['ALL']`

    descriptor : None or \
        :class:`~daskms.descriptors.builder.AbstractBuilderFactory` or \
        str

        A class describing how CASA table descriptors and data managers
        are constructed. Some defaults are available, such
        as `ms` and `ms_subtable`.

        If None, defaults are used.

    table_keywords : dict, optional
        Dictionary of table keywords to add to existing keywords.
        The operation is performed immediately, not lazily.

    column_keywords : dict, optional
        Dictionary of :code:`{column: keywords}` to add to existing
        column keywords.
        The operation is performed immediately, not lazily.

    table_proxy : {False, True}
        If True, returns the table proxy.

    Returns
    -------
    write_datasets : list of :class:`xarray.Dataset`
        Datasets containing arrays representing write operations
        into a CASA Table
    table_proxy : :class:`daskms.TableProxy`, optional
        The Table Proxy associated with the datasets
    """
    if isinstance(table_name, DaskMSStore):
        store = table_name
    else:
        store = DaskMSStore(table_name)

    table_name = store.casa_path()

    # Promote dataset to a list
    if not isinstance(xds, (tuple, list)):
        xds = [xds]

    # Not writing to an existing dataset so we drop ROWID to ensure that rows
    # get added correctly. TODO: This may be a little brittle - we could
    # consider altering the functionality in writes.py.
    if not store.exists():
        xds = [ds.drop_vars("ROWID", errors="ignore") for ds in xds]

    if not isinstance(columns, (tuple, list)):
        if columns != "ALL":
            columns = [columns]

    # Write the datasets
    out_ds = write_datasets(table_name,
                            xds,
                            columns,
                            descriptor=descriptor,
                            table_keywords=table_keywords,
                            column_keywords=column_keywords,
                            table_proxy=table_proxy)

    # Unpack table proxy if it was requested
    if table_proxy is True:
        assert isinstance(out_ds, tuple)
        out_ds, tp = out_ds
        assert isinstance(tp, TableProxy)
    else:
        tp = None

    # Repack the Table Proxy
    if table_proxy is True:
        return out_ds, tp

    return out_ds
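A minimal sketch of the read-modify-write pattern this function supports, assuming the hypothetical WSRT.MS path and a boolean FLAG column; only the selected column is written and nothing happens until compute.

import dask
from daskms import xds_from_ms, xds_to_table

datasets = xds_from_ms("WSRT.MS")   # hypothetical MS path

# Set every flag by rewriting the (assumed) FLAG column.
updated = [ds.assign(FLAG=ds.FLAG | True) for ds in datasets]

# Only FLAG is written back; the writes are lazy until computed.
writes = xds_to_table(updated, "WSRT.MS", columns=["FLAG"])
dask.compute(writes)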
Example #16
File: convert.py Project: ska-sa/dask-ms
def _check_output_path(output: str):
    return DaskMSStore(output)
Example #17
def test_xds_to_parquet_local(ms, tmp_path_factory, spw_table, ant_table):
    store = tmp_path_factory.mktemp("parquet_store") / "out.parquet"
    # antenna_store = store.parent / f"{store.name}::ANTENNA"
    # spw_store = store.parent / f"{store.name}::SPECTRAL_WINDOW"

    return parquet_tester(ms, DaskMSStore(store))
Example #18
def xds_to_parquet(xds, store, columns=None, **kwargs):
    if isinstance(store, DaskMSStore):
        pass
    elif isinstance(store, (str, Path)):
        store = DaskMSStore(f"{store}", **kwargs.pop("storage_options", {}))
    else:
        raise TypeError(f"store '{store}' must be "
                        f"Path, str or DaskMSStore")

    # If any kwargs are added, they should be popped prior to this check.
    if len(kwargs) > 0:
        warnings.warn(
            f"The following unsupported kwargs were ignored in "
            f"xds_to_parquet: {kwargs}", UserWarning)

    columns = promote_columns(columns)

    if isinstance(xds, Dataset):
        xds = [xds]
    elif isinstance(xds, (tuple, list)):
        if not all(isinstance(ds, Dataset) for ds in xds):
            raise TypeError("xds must be a Dataset or list of Datasets")
    else:
        raise TypeError("xds must be a Dataset or list of Datasets")

    datasets = []
    base_schema = ArrowSchema.from_datasets(xds)

    for ds_id, ds in enumerate(xds):
        arrow_schema = base_schema.with_attributes(ds)
        fragment = ParquetFragment(store, store.table, arrow_schema, ds_id)
        chunk_ids = da.arange(len(ds.chunks["row"]), chunks=1)
        args = [chunk_ids, ("row", )]

        data_var_it = column_iterator(ds.data_vars, columns)
        coord_it = column_iterator(ds.coords, columns)

        for column, variable in itertools.chain(data_var_it, coord_it):
            if not isinstance(variable.data, da.Array):
                raise ValueError(f"Column {column} does not "
                                 f"contain a dask Array")

            if len(variable.dims) == 0 or variable.dims[0] != "row":
                raise ValueError(f"Column {column} dimensions "
                                 f"{variable.dims} don't start with 'row'")

            args.extend((column, None, variable.data, variable.dims))

            for dim, chunk in zip(variable.dims[1:], variable.data.chunks[1:]):
                if len(chunk) != 1:
                    raise ValueError(f"Chunking in {dim} is not yet "
                                     f"supported.")

        writes = da.blockwise(fragment.write, ("row", ),
                              *args,
                              align_arrays=False,
                              adjust_chunks={"row": 1},
                              meta=np.empty((0, ), bool))

        writes = inlined_array(writes, chunk_ids)

        # Transfer any partition information over to the write dataset
        partition = ds.attrs.get(DASKMS_PARTITION_KEY, False)

        if not partition:
            attrs = None
        else:
            attrs = {
                DASKMS_PARTITION_KEY: partition,
                **{k: getattr(ds, k)
                   for k, _ in partition}
            }

        datasets.append(Dataset({"WRITE": (("row", ), writes)}, attrs=attrs))

    return datasets
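Finally, a sketch of writing datasets out as parquet, optionally to object storage. The bucket URL and credentials are placeholders mirroring the S3 options used in the tests above, and xds_from_ms is assumed to be importable from the top-level daskms package.

import dask
from daskms import xds_from_ms
from daskms.experimental.arrow import xds_to_parquet

datasets = xds_from_ms("WSRT.MS")   # hypothetical input MS

writes = xds_to_parquet(
    datasets,
    "s3://my-bucket/out.parquet",                     # placeholder bucket
    storage_options={"key": "foo", "secret": "bar"})  # placeholder credentials

dask.compute(writes)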