def parquet_ms(ms, tmp_path_factory, request):
    parquet_store = tmp_path_factory.mktemp("parquet") / "test.parquet"
    # Chunk in row so we can probe chunk behaviour on reads.
    xdsl = xds_from_ms(ms, chunks={"row": request.param})
    writes = xds_to_parquet(xdsl, parquet_store)
    dask.compute(writes)  # Write to parquet.
    return parquet_store
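
# ``parquet_ms`` uses ``request.param`` for its row chunking and is therefore
# meant to be registered as a parametrised pytest fixture; the decorator and
# its parameter values are not shown in this module. Below is a minimal sketch
# of a test consuming that fixture -- the assertions are illustrative
# assumptions, not checks taken from the original suite.
def test_parquet_ms_fixture_roundtrip(parquet_ms):
    # Read back the store written by the fixture and sanity-check the result.
    pq_datasets = xds_from_parquet(parquet_ms, chunks={"row": 1})
    assert len(pq_datasets) > 0
    assert all(len(ds.data_vars) > 0 for ds in pq_datasets)
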
def test_xds_to_parquet(ms, tmp_path_factory, spw_table, ant_table):
    store = tmp_path_factory.mktemp("parquet_store") / "out.parquet"
    # antenna_store = store.parent / f"{store.name}::ANTENNA"
    # spw_store = store.parent / f"{store.name}::SPECTRAL_WINDOW"

    datasets = xds_from_ms(ms)

    # We can test row chunking if xarray is installed
    if xarray is not None:
        datasets = [ds.chunk({"row": 1}) for ds in datasets]

    # spw_datasets = xds_from_table(spw_table, group_cols="__row__")
    # ant_datasets = xds_from_table(ant_table, group_cols="__row__")

    writes = []
    writes.extend(xds_to_parquet(datasets, store))
    # TODO(sjperkins)
    # Fix arrow shape unification errors
    # writes.extend(xds_to_parquet(spw_datasets, spw_store))
    # writes.extend(xds_to_parquet(ant_datasets, antenna_store))
    dask.compute(writes)

    pq_datasets = xds_from_parquet(store, chunks={"row": 1})
    assert len(datasets) == len(pq_datasets)

    for ds, pq_ds in zip(datasets, pq_datasets):
        for column, var in ds.data_vars.items():
            pq_var = getattr(pq_ds, column)
            assert_array_equal(var.data, pq_var.data)
            assert var.dims == pq_var.dims

        for column, var in ds.coords.items():
            pq_var = getattr(pq_ds, column)
            assert_array_equal(var.data, pq_var.data)
            assert var.dims == pq_var.dims

        partitions = ds.attrs[DASKMS_PARTITION_KEY]
        pq_partitions = pq_ds.attrs[DASKMS_PARTITION_KEY]
        assert partitions == pq_partitions

        for field, dtype in partitions:
            assert getattr(ds, field) == getattr(pq_ds, field)
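
# The partition metadata asserted above is a tuple of (column, dtype) pairs
# stored under DASKMS_PARTITION_KEY, with the corresponding values attached as
# dataset attributes. A small helper sketch (not part of the original module)
# that collects those values for inspection:
def _partition_summary(ds):
    """Return a {column: value} mapping for the partitioning columns of ``ds``."""
    return {column: getattr(ds, column) for column, _ in ds.attrs[DASKMS_PARTITION_KEY]}
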
def test_xds_to_parquet_string(tmp_path_factory):
    store = tmp_path_factory.mktemp("parquet_store") / "string-dataset.parquet"
    datasets = []

    for i in range(3):
        names = random.choices([f"foo-{i}", f"bar-{i}", f"qux-{i}"], k=10)
        names = np.asarray(names, dtype=object)
        chunks = sorted([1, 2, 3, 4], key=lambda *a: random.random())
        names = da.from_array(names, chunks=chunks)
        datasets.append(Dataset({"NAME": (("row",), names)}))

    writes = xds_to_parquet(datasets, store)
    dask.compute(writes)

    parquet_datasets = xds_from_parquet(store)
    assert len(datasets) == len(parquet_datasets)

    for ds, pq_ds in zip(datasets, parquet_datasets):
        assert_array_equal(ds.NAME.data, pq_ds.NAME.data)
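
# A follow-on sketch, not in the original suite: the string datasets can also
# be read back with explicit row chunking, mirroring the chunked reads used
# elsewhere in this module. The chunk size of 2 is an arbitrary choice.
def test_xds_from_parquet_string_chunked(tmp_path_factory):
    store = tmp_path_factory.mktemp("parquet_store") / "string-chunks.parquet"
    names = da.from_array(np.asarray(["a", "b", "c", "d"], dtype=object), chunks=2)
    dask.compute(xds_to_parquet([Dataset({"NAME": (("row",), names)})], store))
    (pq_ds,) = xds_from_parquet(store, chunks={"row": 2})
    assert_array_equal(pq_ds.NAME.data, ["a", "b", "c", "d"])
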
def parquet_tester(ms, store):
    datasets = xds_from_storage_ms(ms)

    # We can test row chunking if xarray is installed
    if xarray is not None:
        datasets = [ds.chunk({"row": 1}) for ds in datasets]

    # spw_datasets = xds_from_table(spw_table, group_cols="__row__")
    # ant_datasets = xds_from_table(ant_table, group_cols="__row__")

    writes = xds_to_parquet(datasets, store.url, storage_options=store.storage_options)
    # TODO(sjperkins)
    # Fix arrow shape unification errors
    # writes.extend(xds_to_parquet(spw_datasets, spw_store))
    # writes.extend(xds_to_parquet(ant_datasets, antenna_store))
    dask.compute(writes)

    pq_datasets = xds_from_parquet(store, chunks={"row": 1})
    assert len(datasets) == len(pq_datasets)

    for ds, pq_ds in zip(datasets, pq_datasets):
        for column, var in ds.data_vars.items():
            pq_var = getattr(pq_ds, column)
            assert_array_equal(var.data, pq_var.data)
            assert var.dims == pq_var.dims

        for column, var in ds.coords.items():
            pq_var = getattr(pq_ds, column)
            assert_array_equal(var.data, pq_var.data)
            assert var.dims == pq_var.dims

        partitions = ds.attrs[DASKMS_PARTITION_KEY]
        pq_partitions = pq_ds.attrs[DASKMS_PARTITION_KEY]
        assert partitions == pq_partitions

        for field, dtype in partitions:
            assert getattr(ds, field) == getattr(pq_ds, field)
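
# ``parquet_tester`` expects a store object exposing ``.url`` and
# ``.storage_options``, as used above. A minimal sketch of how it might be
# driven from a test -- the ``local_store`` fixture named here is a
# hypothetical stand-in for such an object and is not defined in this module.
def test_xds_to_parquet_local(ms, local_store):
    parquet_tester(ms, local_store)
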