Example #1: parquet_ms fixture
# Assumed to be declared as a parametrized pytest fixture, e.g.
# @pytest.fixture(params=[...]), so that request.param below supplies a
# row chunk size. The decorator is not shown in the original snippet.
def parquet_ms(ms, tmp_path_factory, request):
    parquet_store = tmp_path_factory.mktemp("parquet") / "test.parquet"

    # Chunk along rows so we can probe chunking behaviour on reads.
    xdsl = xds_from_ms(ms, chunks={"row": request.param})

    writes = xds_to_parquet(xdsl, parquet_store)

    dask.compute(writes)  # Write to parquet.

    return parquet_store
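
These snippets appear to come from dask-ms's Parquet round-trip tests and omit their module-level imports. A minimal sketch of the imports they rely on follows; the daskms.constants location of DASKMS_PARTITION_KEY is an assumption, while the remaining names are public dask-ms entry points:

import random

import dask
import dask.array as da
import numpy as np
from numpy.testing import assert_array_equal

from daskms import Dataset, xds_from_ms, xds_from_storage_ms
from daskms.constants import DASKMS_PARTITION_KEY  # assumed module path
from daskms.experimental.arrow import xds_from_parquet, xds_to_parquet

try:
    import xarray
except ImportError:
    xarray = None  # row chunking in Examples #2 and #4 is skipped without it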
Example #2: test_xds_to_parquet
def test_xds_to_parquet(ms, tmp_path_factory, spw_table, ant_table):
    store = tmp_path_factory.mktemp("parquet_store") / "out.parquet"
    # antenna_store = store.parent / f"{store.name}::ANTENNA"
    # spw_store = store.parent / f"{store.name}::SPECTRAL_WINDOW"

    datasets = xds_from_ms(ms)

    # We can test row chunking if xarray is installed
    if xarray is not None:
        datasets = [ds.chunk({"row": 1}) for ds in datasets]

    # spw_datasets = xds_from_table(spw_table, group_cols="__row__")
    # ant_datasets = xds_from_table(ant_table, group_cols="__row__")

    writes = []
    writes.extend(xds_to_parquet(datasets, store))
    # TODO(sjperkins)
    # Fix arrow shape unification errors
    # writes.extend(xds_to_parquet(spw_datasets, spw_store))
    # writes.extend(xds_to_parquet(ant_datasets, antenna_store))
    dask.compute(writes)

    pq_datasets = xds_from_parquet(store, chunks={"row": 1})
    assert len(datasets) == len(pq_datasets)

    for ds, pq_ds in zip(datasets, pq_datasets):
        for column, var in ds.data_vars.items():
            pq_var = getattr(pq_ds, column)
            assert_array_equal(var.data, pq_var.data)
            assert var.dims == pq_var.dims

        for column, var in ds.coords.items():
            pq_var = getattr(pq_ds, column)
            assert_array_equal(var.data, pq_var.data)
            assert var.dims == pq_var.dims

        partitions = ds.attrs[DASKMS_PARTITION_KEY]
        pq_partitions = pq_ds.attrs[DASKMS_PARTITION_KEY]
        assert partitions == pq_partitions

        # Partition entries are (column, dtype) pairs; each partitioning
        # column is also exposed as a scalar dataset attribute.
        for field, dtype in partitions:
            assert getattr(ds, field) == getattr(pq_ds, field)
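
The final loop in Example #2 relies on the structure of DASKMS_PARTITION_KEY: its value is a tuple of (column, dtype) pairs recording the columns the table was partitioned on, and each such column is also exposed as a scalar attribute on the dataset. A small illustrative sketch; the column names and dtypes here are assumed for illustration, not taken from the test:

# Hypothetical partition metadata for an MS grouped on FIELD_ID and
# DATA_DESC_ID; each entry is a (column, dtype) pair.
partitions = (("FIELD_ID", "int32"), ("DATA_DESC_ID", "int32"))

for field, dtype in partitions:
    # getattr(ds, field) would return the scalar partition value for
    # that dataset, e.g. ds.FIELD_ID == 0 for the first field.
    print(field, dtype)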
Example #3: test_xds_to_parquet_string
def test_xds_to_parquet_string(tmp_path_factory):
    store = tmp_path_factory.mktemp("parquet_store") / "string-dataset.parquet"

    datasets = []

    for i in range(3):
        names = random.choices([f"foo-{i}", f"bar-{i}", f"qux-{i}"], k=10)
        names = np.asarray(names, dtype=object)
        # Shuffle the chunk sizes; any ordering of [1, 2, 3, 4] sums to 10,
        # matching the k=10 names generated above.
        chunks = sorted([1, 2, 3, 4], key=lambda *a: random.random())
        names = da.from_array(names, chunks=chunks)
        datasets.append(Dataset({"NAME": (("row", ), names)}))

    writes = xds_to_parquet(datasets, store)
    dask.compute(writes)

    parquet_datasets = xds_from_parquet(store)
    assert len(datasets) == len(parquet_datasets)

    for ds, pq_ds in zip(datasets, parquet_datasets):
        assert_array_equal(ds.NAME.data, pq_ds.NAME.data)
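
Example #3 exercises a specific edge case: variable-length strings stored in object-dtype NumPy arrays must survive the NumPy-to-Arrow-to-NumPy conversion. A standalone sketch of the same round trip, assuming the imports sketched after Example #1; the /tmp path is illustrative:

# Object-dtype strings round-tripped through Arrow and back.
names = np.asarray(["foo", "bar", "qux", "foo", "bar"] * 2, dtype=object)
ds = Dataset({"NAME": (("row",), da.from_array(names, chunks=4))})

dask.compute(xds_to_parquet([ds], "/tmp/string-dataset.parquet"))
(pq_ds,) = xds_from_parquet("/tmp/string-dataset.parquet")
assert_array_equal(ds.NAME.data, pq_ds.NAME.data)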
Example #4: parquet_tester helper
def parquet_tester(ms, store):
    # Shared round-trip assertions, parametrised over a storage backend.
    datasets = xds_from_storage_ms(ms)

    # We can test row chunking if xarray is installed
    if xarray is not None:
        datasets = [ds.chunk({"row": 1}) for ds in datasets]

    # spw_datasets = xds_from_table(spw_table, group_cols="__row__")
    # ant_datasets = xds_from_table(ant_table, group_cols="__row__")

    writes = xds_to_parquet(datasets,
                            store.url,
                            storage_options=store.storage_options)
    # TODO(sjperkins)
    # Fix arrow shape unification errors
    # writes.extend(xds_to_parquet(spw_datasets, spw_store))
    # writes.extend(xds_to_parquet(ant_datasets, antenna_store))
    dask.compute(writes)

    pq_datasets = xds_from_parquet(store, chunks={"row": 1})
    assert len(datasets) == len(pq_datasets)

    for ds, pq_ds in zip(datasets, pq_datasets):
        for column, var in ds.data_vars.items():
            pq_var = getattr(pq_ds, column)
            assert_array_equal(var.data, pq_var.data)
            assert var.dims == pq_var.dims

        for column, var in ds.coords.items():
            pq_var = getattr(pq_ds, column)
            assert_array_equal(var.data, pq_var.data)
            assert var.dims == pq_var.dims

        partitions = ds.attrs[DASKMS_PARTITION_KEY]
        pq_partitions = pq_ds.attrs[DASKMS_PARTITION_KEY]
        assert partitions == pq_partitions

        # Partition entries are (column, dtype) pairs; each partitioning
        # column is also exposed as a scalar dataset attribute.
        for field, dtype in partitions:
            assert getattr(ds, field) == getattr(pq_ds, field)
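
Example #4 generalises Example #2: instead of a local path it receives a store object exposing url and storage_options, so the same assertions can run against local or remote (e.g. S3) filesystems through fsspec. A sketch of driving it with dask-ms's DaskMSStore; the import path and constructor signature are assumptions, so check them against your dask-ms version:

from daskms.fsspec_store import DaskMSStore  # assumed import path

# Local store; a remote run would use an s3:// URL and pass credentials
# as keyword arguments, which end up in store.storage_options.
store = DaskMSStore("file:///tmp/parquet_store/out.parquet")
parquet_tester("/path/to/test.ms", store)  # illustrative MS path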