Example #1
def create_empty_dataset_header(
    store,
    dataset_uuid,
    table_meta,
    partition_on=None,
    metadata=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
):
    """
    Create a dataset header without any partitions. This may be used in combination
    with :func:`~kartothek.io.eager.write_single_partition` to create implicitly partitioned datasets.

    .. note::

        The created dataset will **always** have ``explicit_partitions==False``

    .. warning::

        This function should only be used in very rare occasions. Usually you're better off using
        full end-to-end pipelines.

    Parameters
    ----------
    store
        KV store (or a factory returning one) to which the dataset header is written.
    dataset_uuid
        UUID of the dataset to be created.
    table_meta
        Mapping of table name to schema; each schema is normalized via ``make_meta``
        before being stored.
    partition_on
        Column names the dataset is (implicitly) partitioned on.
    metadata
        Optional user metadata to add to the dataset header.
    overwrite
        If False, raise if a dataset with this UUID already exists.
    metadata_storage_format
        Either ``"json"`` or ``"msgpack"``.
    metadata_version
        Dataset metadata version to use.
    """
    store = _make_callable(store)()
    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    for table, schema in table_meta.items():
        table_meta[table] = make_meta(schema,
                                      origin=table,
                                      partition_keys=partition_on)
        store_schema_metadata(
            schema=table_meta[table],
            dataset_uuid=dataset_uuid,
            store=store,
            table=table,
        )
    dataset_builder = DatasetMetadataBuilder(
        uuid=dataset_uuid,
        metadata_version=metadata_version,
        partition_keys=partition_on,
        explicit_partitions=False,
        table_meta=table_meta,
    )
    if metadata:
        for key, value in metadata.items():
            dataset_builder.add_metadata(key, value)
    if metadata_storage_format.lower() == "json":
        store.put(*dataset_builder.to_json())
    elif metadata_storage_format.lower() == "msgpack":
        store.put(*dataset_builder.to_msgpack())
    else:
        raise ValueError(
            "Unknown metadata storage format encountered: {}".format(
                metadata_storage_format))
    return dataset_builder.to_dataset()
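A minimal usage sketch for the function above (assumptions: an in-memory simplekv DictStore stands in for a production store, the dataset/table names are made up, and make_meta is imported as in the surrounding examples):

import pandas as pd
from simplekv.memory import DictStore

store = DictStore()  # any simplekv-compatible store should work here (assumption)
df = pd.DataFrame({"location": ["L-0"], "value": [42]})
table_meta = {"core": make_meta(df, origin="sketch")}

dataset = create_empty_dataset_header(
    store=lambda: store,  # passed as a factory; _make_callable also accepts a plain store
    dataset_uuid="empty_header_sketch",
    table_meta=table_meta,
    partition_on=["location"],
)
# Per the note in the docstring, the resulting dataset is never explicitly partitioned.
assert dataset.explicit_partitions is False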
Example #2
def test_schema_roundtrip(df_all_types, store):
    expected_meta = make_meta(df_all_types, origin="df_all_types")
    store_schema_metadata(
        expected_meta, dataset_uuid="dataset_uuid", store=store, table="table"
    )
    result = read_schema_metadata(
        dataset_uuid="dataset_uuid", store=store, table="table"
    )
    assert result == expected_meta
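As a side note, the roundtrip above implies a fixed key layout; the check below mirrors the _common_metadata key asserted in Example #5 (the store fixture is assumed to behave like a simplekv store):

assert "dataset_uuid/table/_common_metadata" in store.keys()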
Example #3
def test_dynamic_partitions_quote(store, metadata_version):
    """
    Do not specify partitions in metadata, but read them dynamically from store
    """
    dataset_uuid = "uuid-namespace-attribute12_underscored"
    partition0_core = create_partition_key(dataset_uuid, "core",
                                           [("location", "München")],
                                           "data.parquet")
    partition1_core = create_partition_key(dataset_uuid, "core",
                                           [("location", "å\\ øß")],
                                           "data.parquet")
    metadata = {
        "dataset_metadata_version": metadata_version,
        "dataset_uuid": dataset_uuid,
    }
    expected_partitions = {
        "location=M%C3%BCnchen/data": {
            "files": {
                "core": partition0_core
            }
        },
        "location=%C3%A5%5C%20%C3%B8%C3%9F/data": {
            "files": {
                "core": partition1_core
            }
        },
    }
    expected_indices = {
        "location": {
            "München": ["location=M%C3%BCnchen/data"],
            "å\\ øß": ["location=%C3%A5%5C%20%C3%B8%C3%9F/data"],
        }
    }

    store.put(partition0_core, b"test")
    store.put(partition1_core, b"test")
    store_schema_metadata(
        make_meta(pd.DataFrame({"location": ["L-0"]}), origin="1"),
        dataset_uuid,
        store,
        "core",
    )

    dmd = DatasetMetadata.load_from_dict(metadata, store)
    dmd = dmd.load_partition_indices()

    dmd_dict = dmd.to_dict()
    assert dmd_dict["partitions"] == expected_partitions
    assert dmd_dict["indices"] == expected_indices
Example #4
def test_overlap_keyspace(store, metadata_version):
    dataset_uuid1 = "uuid+namespace-attribute12_underscored"
    dataset_uuid2 = "uuid+namespace-attribute12_underscored_ext"
    table = "core"

    for dataset_uuid in (dataset_uuid1, dataset_uuid2):
        partition0 = "location=L-0".format(dataset_uuid)
        partition0_key = "{}/{}/{}/data.parquet".format(
            dataset_uuid, table, partition0)
        metadata = {
            "dataset_metadata_version": metadata_version,
            "dataset_uuid": dataset_uuid,
        }

        # put the dataset metadata, one partition file and the table schema for each dataset into the store
        store.put(
            "{}{}.json".format(dataset_uuid, naming.METADATA_BASE_SUFFIX),
            simplejson.dumps(metadata).encode("utf-8"),
        )
        store.put(partition0_key, b"test")
        store_schema_metadata(
            make_meta(pd.DataFrame({"location": ["L-0"]}), origin="1"),
            dataset_uuid,
            store,
            "core",
        )

    for dataset_uuid in (dataset_uuid1, dataset_uuid2):
        partition0_label = "location=L-0/data"
        partition0_key = "{}/{}/{}.parquet".format(dataset_uuid, table,
                                                   partition0_label)
        expected_partitions = {
            "location=L-0/data": {
                "files": {
                    "core": partition0_key
                }
            }
        }
        expected_indices = {"location": {"L-0": ["location=L-0/data"]}}
        assert DatasetMetadata.storage_keys(dataset_uuid, store) == [
            "{}{}.json".format(dataset_uuid, naming.METADATA_BASE_SUFFIX),
            _get_common_metadata_key(dataset_uuid, "core"),
            partition0_key,
        ]
        dmd = DatasetMetadata.load_from_store(dataset_uuid, store)
        dmd = dmd.load_partition_indices()
        dmd_dict = dmd.to_dict()
        assert dmd_dict["partitions"] == expected_partitions
        assert dmd_dict["indices"] == expected_indices
Example #5
def test_store_schema_metadata(store, df_all_types):
    store_schema_metadata(
        schema=make_meta(df_all_types, origin="df_all_types"),
        dataset_uuid="some_uuid",
        store=store,
        table="some_table",
    )

    key = "some_uuid/some_table/_common_metadata"
    assert key in store.keys()
    pq_file = pq.ParquetFile(store.open(key))
    actual_schema = pq_file.schema.to_arrow_schema()
    fields = [
        pa.field("array_float32", pa.list_(pa.float64())),
        pa.field("array_float64", pa.list_(pa.float64())),
        pa.field("array_int16", pa.list_(pa.int64())),
        pa.field("array_int32", pa.list_(pa.int64())),
        pa.field("array_int64", pa.list_(pa.int64())),
        pa.field("array_int8", pa.list_(pa.int64())),
        pa.field("array_uint16", pa.list_(pa.uint64())),
        pa.field("array_uint32", pa.list_(pa.uint64())),
        pa.field("array_uint64", pa.list_(pa.uint64())),
        pa.field("array_uint8", pa.list_(pa.uint64())),
        pa.field("array_unicode", pa.list_(pa.string())),
        pa.field("bool", pa.bool_()),
        pa.field("byte", pa.binary()),
        pa.field("date", pa.date32()),
        pa.field("datetime64", pa.timestamp("us")),
        pa.field("float32", pa.float64()),
        pa.field("float64", pa.float64()),
        pa.field("int16", pa.int64()),
        pa.field("int32", pa.int64()),
        pa.field("int64", pa.int64()),
        pa.field("int8", pa.int64()),
        pa.field("null", pa.null()),
        pa.field("uint16", pa.uint64()),
        pa.field("uint32", pa.uint64()),
        pa.field("uint64", pa.uint64()),
        pa.field("uint8", pa.uint64()),
        pa.field("unicode", pa.string()),
    ]
    if not ARROW_LARGER_EQ_0130:
        fields.append(pa.field("__index_level_0__", pa.int64()))
    expected_schema = pa.schema(fields)

    assert actual_schema.remove_metadata() == expected_schema
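A compact sketch of the widening rules the expected_schema above encodes (illustrative only; the exact behaviour depends on the kartothek/Arrow versions in use, and the column names are made up):

import pandas as pd

df_narrow = pd.DataFrame({
    "int8": pd.Series([1], dtype="int8"),         # expected to surface as int64
    "uint16": pd.Series([1], dtype="uint16"),      # expected to surface as uint64
    "float32": pd.Series([1.0], dtype="float32"),  # expected to surface as float64
})
schema_narrow = make_meta(df_narrow, origin="widening_sketch")
# Inspecting schema_narrow should show the widened Arrow types from the field list above.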
Example #6
def test_load_from_store_with_indices(store):
    meta_dct = {
        "dataset_metadata_version": 4,
        "dataset_uuid": "uuid",
        "partitions": {
            "product_id=1/part_1": {
                "files": {
                    "core_data":
                    "dataset_uuid/table/location_id=1/part_1.parquet"
                }
            }
        },
        "indices": {
            "product_id": {
                "1": ["part_1"],
                "2": ["part_1"],
                "100": ["part_1"],
                "34": ["part_1"],
            }
        },
    }
    store.put("uuid.by-dataset-metadata.json",
              simplejson.dumps(meta_dct).encode("utf-8"))
    df = pd.DataFrame({"index": [1], "location_id": [1], "product_id": [1]})
    store_schema_metadata(make_meta(df, origin="core"), "uuid", store,
                          "core_data")

    storage_key = "uuid/some_index.parquet"
    index2 = ExplicitSecondaryIndex(
        column="location_id",
        index_dct={
            1: ["part_1", "part_2"],
            3: ["part_3"]
        },
        index_storage_key=storage_key,
        dtype=pa.int64(),
    )
    index2.store(store, "dataset_uuid")

    dmd = DatasetMetadata.load_from_store(store=store, uuid="uuid")
    assert "location_id" not in dmd.indices

    dmd = DatasetMetadata.load_from_store(store=store,
                                          uuid="uuid",
                                          load_all_indices=True)
    assert "location_id" in dmd.indices
Example #7
def test_load_partition_indices_types(store):
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    table = "table"
    index_name = "location_id"
    index_value = 1
    meta_dct = {
        "dataset_metadata_version": 4,
        "dataset_uuid": dataset_uuid,
        "partitions": {
            "{index_name}={index_value}/part_1".format(index_name=index_name,
                                                       index_value=index_value):
            {
                "files": {
                    table:
                    "{dataset_uuid}/{table}/location_id=1/part_1.parquet".
                    format(dataset_uuid=dataset_uuid, table=table)
                }
            }
        },
    }
    store.put(
        "{dataset_uuid}.by-dataset-metadata.json".format(
            dataset_uuid=dataset_uuid),
        simplejson.dumps(meta_dct).encode(),
    )
    store_schema_metadata(
        make_meta(
            pd.DataFrame({index_name: pd.Series([index_value], dtype=int)}),
            origin="core",
        ),
        dataset_uuid,
        store,
        table,
    )
    dmd = DatasetMetadata.load_from_store(store=store, uuid=dataset_uuid)

    dmd = dmd.load_partition_indices()
    assert len(dmd.indices) == 1

    assert "location_id" in dmd.indices
    assert isinstance(dmd.indices["location_id"], PartitionIndex)

    idx = dmd.indices["location_id"]
    assert idx.dtype == pa.int64()
    assert idx.query(1) == ["location_id=1/part_1"]
Example #8
def test_load_partition_keys(store):
    expected = {
        "dataset_metadata_version": 4,
        "dataset_uuid": "uuid",
        "partitions": {
            "part_1": {
                "files": {
                    "core_data": "uuid/table/index=1/index2=2/file.parquet"
                }
            },
            "part_2": {
                "files": {
                    "core_data": "uuid/table/index=1/index2=2/file2.parquet"
                }
            },
        },
        "indices": {
            "product_id": {
                "1": ["part_1"],
                "2": ["part_2"],
                "100": ["part_1", "part_2"],
                "34": ["part_1"],
            },
            "location_id": {
                "1": ["part_1"],
                "2": ["part_2"],
                "3": ["part_1"],
                "4": ["part_2"],
            },
        },
    }
    store.put("uuid.by-dataset-metadata.json",
              simplejson.dumps(expected).encode("utf-8"))
    df = pd.DataFrame({
        "index": [1],
        "index2": [1],
        "product_id": [1],
        "location_id": [1]
    })
    store_schema_metadata(make_meta(df, origin="core"), "uuid", store,
                          "core_data")
    dmd = DatasetMetadata.load_from_store("uuid", store)
    assert dmd.partition_keys == ["index", "index2"]
Example #9
File: write.py Project: x-malet/kartothek
def persist_common_metadata(partition_list, update_dataset, store,
                            dataset_uuid):
    # hash the schemas for quick equality check with possible false negatives
    # (e.g. other pandas version or null schemas)
    tm_dct = defaultdict(set)
    for mp in partition_list:
        for tab, tm in mp.table_meta.items():
            tm_dct[tab].add(tm)

    if update_dataset:
        if set(tm_dct.keys()) and set(update_dataset.tables) != set(
                tm_dct.keys()):
            raise ValueError((
                "Input partitions for update have different tables than dataset:\n"
                "Input partition tables: {}\n"
                "Tables of existing dataset: {}").format(
                    set(tm_dct.keys()), update_dataset.tables))
        for table in update_dataset.tables:
            tm_dct[table].add(
                read_schema_metadata(dataset_uuid=dataset_uuid,
                                     store=store,
                                     table=table))

    result = {}

    # sort tables and schemas to have reproducible error messages
    for table in sorted(tm_dct.keys()):
        schemas = sorted(tm_dct[table], key=lambda s: sorted(s.origin))
        try:
            result[table] = validate_compatible(schemas)
        except ValueError as e:
            raise ValueError(
                "Schemas for table '{table}' of dataset '{dataset_uuid}' are not compatible!\n\n{e}"
                .format(table=table, dataset_uuid=dataset_uuid, e=e))

    validate_shared_columns(list(result.values()))

    for table, schema in result.items():
        store_schema_metadata(schema=schema,
                              dataset_uuid=dataset_uuid,
                              store=store,
                              table=table)
    return result
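For context, a hedged sketch of the error path handled by the try/except above: two schemas that type the same column differently should make validate_compatible raise a ValueError (the column name "x" and the origins are hypothetical):

import pandas as pd
import pytest

schema_int = make_meta(pd.DataFrame({"x": pd.Series([1], dtype=int)}), origin="a")
schema_str = make_meta(pd.DataFrame({"x": pd.Series(["1"])}), origin="b")
with pytest.raises(ValueError):
    validate_compatible([schema_int, schema_str])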
Example #10
def test_compat_old_rw_path(df_all_types, store):
    # strip down the DF because some column types weren't supported before anyway
    df = df_all_types[
        [
            c
            for c in df_all_types.columns
            if (
                not c.startswith("array_")  # array types (always null)
                and c != "unicode"  # unicode type (alway null)
                and "8" not in c  # 8 bit types are casted to 64 bit
                and "16" not in c  # 16 bit types are casted to 64 bit
                and "32" not in c  # 32 bit types are casted to 64 bit
            )
        ]
    ]
    expected_meta = make_meta(df, origin="df")

    # old schema write path
    old_meta = dask_make_meta(df)
    pa_table = pa.Table.from_pandas(old_meta)
    buf = pa.BufferOutputStream()
    pq.write_table(pa_table, buf, version="2.0")
    key_old = _get_common_metadata_key("dataset_uuid_old", "table")
    store.put(key_old, buf.getvalue().to_pybytes())

    actual_meta = read_schema_metadata(
        dataset_uuid="dataset_uuid_old", store=store, table="table"
    )
    validate_compatible([actual_meta, expected_meta])

    store_schema_metadata(
        schema=make_meta(df, origin="df"),
        dataset_uuid="dataset_uuid_new",
        store=store,
        table="table",
    )
    key_new = _get_common_metadata_key("dataset_uuid_new", "table")
    actual_df = ParquetSerializer.restore_dataframe(key=key_new, store=store)
    actual_df["date"] = actual_df["date"].dt.date
    pdt.assert_frame_equal(actual_df, old_meta)
Example #11
def test_read_table_meta(store):
    meta_dct = {
        "dataset_metadata_version": 4,
        "dataset_uuid": "dataset_uuid",
        "partitions": {
            "location_id=1/part_1": {
                "files": {
                    "table1":
                    "dataset_uuid/table1/location_id=1/part_1.parquet"
                }
            }
        },
    }
    df1 = pd.DataFrame({
        "location_id": pd.Series([1], dtype=int),
        "x": pd.Series([True], dtype=bool)
    })
    schema1 = make_meta(df1, origin="1")
    store_schema_metadata(schema1, "dataset_uuid", store, "table1")

    dmd = DatasetMetadata.load_from_dict(meta_dct, store)

    assert dmd.schema == schema1
Example #12
def test_reconstruct_index_duplicates(store):
    ser = ParquetSerializer()
    df = pd.DataFrame({"index_col": [1, 1], "column": list("ab")})

    label = "dontcare"
    key_prefix = "uuid/table/index_col=2/{}".format(label)
    key = ser.store(store, key_prefix, df)

    schema = make_meta(df, origin="1", partition_keys="index_col")
    store_schema_metadata(schema, "uuid", store)

    mp = MetaPartition(
        label="dontcare",
        file=key,
        metadata_version=4,
        schema=schema,
        partition_keys=["index_col"],
    )
    mp = mp.load_dataframes(store)
    df_actual = mp.data
    df_expected = pd.DataFrame(
        OrderedDict([("index_col", [2, 2]), ("column", list("ab"))]))
    pdt.assert_frame_equal(df_actual, df_expected)
Example #13
def test_partition_on_roundtrip(store):
    original_df = pd.DataFrame(
        OrderedDict([("test", [1, 2, 3]), ("some_values", [1, 2, 3])]))
    mp = MetaPartition(
        label="label_1",
        data={"core": original_df},
        dataset_metadata={"dataset": "metadata"},
        metadata_version=4,
    )

    new_mp = mp.partition_on(["test"])
    new_mp = new_mp.store_dataframes(store=store, dataset_uuid="some_uuid")
    store_schema_metadata(new_mp.table_meta["core"], "some_uuid", store,
                          "core")
    # Check immediately after store_dataframes dropped the in-memory data, and again below with fresh metapartitions to verify the table meta is reloaded
    new_mp = new_mp.load_dataframes(store=store)
    assert len(new_mp.metapartitions) == 3
    dfs = []
    for internal_mp in new_mp:
        dfs.append(internal_mp.data["core"])
    actual_df = pd.concat(dfs).sort_values(by="test").reset_index(drop=True)
    pdt.assert_frame_equal(original_df, actual_df)

    for i in range(1, 4):
        # Check with fresh metapartitions
        new_mp = MetaPartition(
            label="test={}/label_1".format(i),
            files={"core": "some_uuid/core/test={}/label_1.parquet".format(i)},
            metadata_version=4,
        )
        new_mp = new_mp.load_dataframes(store=store)

        actual_df = new_mp.data["core"]

        expected_df = pd.DataFrame(
            OrderedDict([("test", [i]), ("some_values", [i])]))
        pdt.assert_frame_equal(expected_df, actual_df)
Example #14
def test_reconstruct_date_index(store, metadata_version, dates_as_object):
    ser = ParquetSerializer()
    # Even if the Parquet file contains the primary index column, the reconstructed index is used and the file's content for that column is ignored
    df = pd.DataFrame(
        {"index_col": [date(2018, 6, 1), date(2018, 6, 1)], "column": list("ab")}
    )

    label = "dontcare"
    key_prefix = "uuid/table/index_col=2018-06-02/{}".format(label)
    key = ser.store(store, key_prefix, df)

    schema = make_meta(df, origin="1", partition_keys="index_col")
    store_schema_metadata(schema, "uuid", store)

    mp = MetaPartition(
        label="dontcare",
        file=key,
        metadata_version=metadata_version,
        schema=schema,
        partition_keys=["index_col"],
    )

    mp = mp.load_dataframes(store, dates_as_object=dates_as_object)
    df_actual = mp.data
    if dates_as_object:
        dt_constructor = date
    else:
        dt_constructor = datetime
    df_expected = pd.DataFrame(
        OrderedDict(
            [
                ("index_col", [dt_constructor(2018, 6, 2), dt_constructor(2018, 6, 2)]),
                ("column", list("ab")),
            ]
        )
    )
    pdt.assert_frame_equal(df_actual, df_expected)
Example #15
def test_dynamic_partitions(store):
    """
    Do not specify partitions in metadata, but read them dynamically from store
    """
    partition_suffix = "suffix"
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    partition0_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-0")],
        "{}.parquet".format(partition_suffix),
    )
    partition1_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-1")],
        "{}.parquet".format(partition_suffix),
    )
    partition0_ext = create_partition_key(
        dataset_uuid,
        "extension",
        [("location", "L-0")],
        "{}.parquet".format(partition_suffix),
    )
    partition1_ext = create_partition_key(
        dataset_uuid,
        "extension",
        [("location", "L-1")],
        "{}.parquet".format(partition_suffix),
    )
    metadata = {"dataset_metadata_version": 4, "dataset_uuid": dataset_uuid}
    expected_partitions = {
        "location=L-0/{}".format(partition_suffix): {
            "files": {
                "core": partition0_core,
                "extension": partition0_ext
            }
        },
        "location=L-1/{}".format(partition_suffix): {
            "files": {
                "core": partition1_core,
                "extension": partition1_ext
            }
        },
    }
    expected_indices = {
        "location": {
            "L-0": ["location=L-0/{}".format(partition_suffix)],
            "L-1": ["location=L-1/{}".format(partition_suffix)],
        }
    }

    # put two partitions for two tables each to store
    store.put(
        "{}{}.json".format(dataset_uuid, naming.METADATA_BASE_SUFFIX),
        simplejson.dumps(metadata).encode("utf-8"),
    )
    store.put(partition0_core, b"test")
    store.put(partition1_core, b"test")
    store.put(partition0_ext, b"test")
    store.put(partition1_ext, b"test")
    store_schema_metadata(
        make_meta(
            pd.DataFrame({"location": ["L-0/{}".format(partition_suffix)]}),
            origin="stored",
        ),
        dataset_uuid,
        store,
        "core",
    )

    # instantiate metadata to write table metadata
    core_schema = make_meta(
        pd.DataFrame({
            "column_0": pd.Series([1], dtype=int),
            "column_1": pd.Series([1], dtype=int),
            "location": pd.Series(["str"]),
        }),
        origin="core",
    )
    extension_schema = make_meta(
        pd.DataFrame({
            "column_77": pd.Series([1], dtype=int),
            "column_78": pd.Series([1], dtype=int),
            "location": pd.Series(["str"]),
        }),
        origin="extension",
    )
    store_schema_metadata(core_schema, dataset_uuid, store, "core")
    store_schema_metadata(extension_schema, dataset_uuid, store, "extension")
    dmd = DatasetMetadata.load_from_store(dataset_uuid, store)
    # reload metadata to use table metadata
    dmd = DatasetMetadata.load_from_store(dataset_uuid, store)
    dmd = dmd.load_partition_indices()

    dmd_dict = dmd.to_dict()
    assert dmd_dict["partitions"] == expected_partitions
    assert dmd_dict["indices"] == expected_indices
Example #16
def test_query_indices_external(store, metadata_version):
    expected = {
        "dataset_metadata_version": metadata_version,
        "dataset_uuid": "uuid+namespace-attribute12_underscored",
        "partitions": {
            "part_1": {
                "files": {
                    "core_data": "file.parquest"
                }
            },
            "part_2": {
                "files": {
                    "core_data": "file2.parquest"
                }
            },
        },
        "indices": {
            "product_id":
            "uuid+namespace-attribute12_underscored.product_id.by-dataset-index.parquet",
            "location_id": {
                "1": ["part_1"],
                "2": ["part_2"],
                "3": ["part_1"],
                "4": ["part_2"],
            },
        },
    }
    store.put(
        "uuid+namespace-attribute12_underscored.by-dataset-metadata.json",
        simplejson.dumps(expected).encode("utf-8"),
    )
    df = pd.DataFrame({
        "product_id": [1, 2, 100, 34],
        "partition": [
            np.array(["part_1"], dtype=object),
            np.array(["part_2"], dtype=object),
            np.array(["part_1", "part_2"], dtype=object),
            np.array(["part_1"], dtype=object),
        ],
    })
    schema = pa.schema([
        pa.field("partition", pa.list_(pa.string())),
        pa.field("product_id", pa.int64()),
    ])
    table = pa.Table.from_pandas(df, schema=schema)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    store.put(
        "uuid+namespace-attribute12_underscored.product_id.by-dataset-index.parquet",
        buf.getvalue().to_pybytes(),
    )
    store_schema_metadata(
        make_meta(df, origin="core"),
        "uuid+namespace-attribute12_underscored",
        store,
        "core_data",
    )

    dmd = DatasetMetadata.load_from_store(
        "uuid+namespace-attribute12_underscored", store)

    dmd = dmd.load_index("product_id", store)
    assert dmd.query(product_id=2) == ["part_2"]
    dmd = dmd.load_all_indices(store)
    assert dmd.query(product_id=2, location_id=2) == ["part_2"]
    assert dmd.query(product_id=100, location_id=3) == ["part_1"]
    assert dmd.query(product_id=2, location_id=2,
                     something_else="bla") == ["part_2"]

    additional_index = ExplicitSecondaryIndex.from_v2(
        "another_column", {"1": ["part_2", "part_3"]})
    assert dmd.query(indices=[additional_index],
                     another_column="1",
                     product_id=2,
                     location_id=2) == ["part_2"]
Example #17
def test_dynamic_partitions_with_garbage(store):
    """
    In case there are unknown files, dataset and indices still load correctly
    """
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    partition_suffix = "suffix"
    partition0_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-0"), ("product", "P-0")],
        "{}.parquet".format(partition_suffix),
    )
    partition1_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-1"), ("product", "P-0")],
        "{}.parquet".format(partition_suffix),
    )
    metadata = {"dataset_metadata_version": 4, "dataset_uuid": dataset_uuid}
    expected_partitions = {
        "location=L-0/product=P-0/{}".format(partition_suffix): {
            "files": {
                "core": partition0_core
            }
        },
        "location=L-1/product=P-0/{}".format(partition_suffix): {
            "files": {
                "core": partition1_core
            }
        },
    }
    expected_indices = {
        "location": {
            "L-0": ["location=L-0/product=P-0/{}".format(partition_suffix)],
            "L-1": ["location=L-1/product=P-0/{}".format(partition_suffix)],
        },
        "product": {
            "P-0": [
                "location=L-0/product=P-0/{}".format(partition_suffix),
                "location=L-1/product=P-0/{}".format(partition_suffix),
            ]
        },
    }

    store.put(partition0_core, b"test")
    store.put(partition1_core, b"test")
    store_schema_metadata(
        make_meta(pd.DataFrame({
            "location": ["L-0"],
            "product": ["P-0"]
        }),
                  origin="1"),
        dataset_uuid,
        store,
        "core",
    )

    # the following files are garbage and should not interfere with the indices and/or partitions
    for suffix in ["", ".json", ".msgpack", ".my_own_file_format"]:
        store.put("this_should_not_exist{}".format(suffix), b"ignore me")
        store.put("{}/this_should_not_exist{}".format(dataset_uuid, suffix),
                  b"ignore me")
        store.put(
            "{}/{}/this_should_not_exist{}".format(dataset_uuid, "core",
                                                   suffix),
            b"ignore me",
        )
        store.put(
            "{}/{}/location=L-0/this_should_not_exist{}".format(
                dataset_uuid, "core", suffix),
            b"ignore me",
        )

    dmd = DatasetMetadata.load_from_dict(metadata, store)
    dmd = dmd.load_partition_indices()
    dmd_dict = dmd.to_dict()
    assert dmd_dict["partitions"] == expected_partitions
    # The order within each index list may differ. That is fine at runtime,
    # but it makes the test flaky, so sort before comparing.
    sorted_result = {
        column: {label: sorted(x)
                 for label, x in index.items()}
        for column, index in dmd_dict["indices"].items()
    }
    assert sorted_result == expected_indices
Example #18
def test_dynamic_partitions_multiple_indices(store):
    """
    Do not specify partitions in metadata, but read them dynamically from store
    """
    suffix = "suffix"
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    partition0_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-0"), ("product", "P-0")],
        "{}.parquet".format(suffix),
    )
    partition1_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-1"), ("product", "P-0")],
        "{}.parquet".format(suffix),
    )
    metadata = {"dataset_metadata_version": 4, "dataset_uuid": dataset_uuid}
    expected_partitions = {
        "location=L-0/product=P-0/{}".format(suffix): {
            "files": {
                "core": partition0_core
            }
        },
        "location=L-1/product=P-0/{}".format(suffix): {
            "files": {
                "core": partition1_core
            }
        },
    }
    expected_indices = {
        "location": {
            "L-0": ["location=L-0/product=P-0/{}".format(suffix)],
            "L-1": ["location=L-1/product=P-0/{}".format(suffix)],
        },
        "product": {
            "P-0": [
                "location=L-0/product=P-0/{}".format(suffix),
                "location=L-1/product=P-0/{}".format(suffix),
            ]
        },
    }

    store.put(partition0_core, b"test")
    store.put(partition1_core, b"test")
    store_schema_metadata(
        make_meta(pd.DataFrame({
            "location": ["L-0"],
            "product": ["P-0"]
        }),
                  origin="1"),
        dataset_uuid,
        store,
        "core",
    )

    dmd = DatasetMetadata.load_from_dict(metadata, store)
    dmd = dmd.load_partition_indices()
    dmd_dict = dmd.to_dict()
    assert dmd_dict["partitions"] == expected_partitions
    # The order within each index list may differ. That is fine at runtime,
    # but it makes the test flaky, so sort before comparing.
    sorted_result = {
        column: {label: sorted(x)
                 for label, x in index.items()}
        for column, index in dmd_dict["indices"].items()
    }
    assert sorted_result == expected_indices