def test_concat_metapartition_wrong_types(df_all_types):
    mp1 = MetaPartition(label="first", data=df_all_types, metadata_version=4)

    df_corrupt = df_all_types.copy()
    df_corrupt["int8"] = "NoInteger"
    mp2 = MetaPartition(label="second", data=df_corrupt, metadata_version=4)

    with pytest.raises(ValueError, match="Schema violation"):
        MetaPartition.concat_metapartitions([mp1, mp2])
def test_concat_metapartition_different_partitioning(df_all_types):
    mp1 = MetaPartition(
        label="int8=1/1234",
        data=df_all_types,
        metadata_version=4,
        partition_keys=["int8"],
    )
    mp2 = MetaPartition(
        label="float8=1.0/4321",
        data=df_all_types,
        metadata_version=4,
        partition_keys=["float8"],
    )

    with pytest.raises(ValueError, match="Schema violation"):
        MetaPartition.concat_metapartitions([mp1, mp2])
def read_dataset_as_metapartitions__iterator(
    dataset_uuid=None,
    store=None,
    columns=None,
    predicate_pushdown_to_io=True,
    categoricals=None,
    dates_as_object: bool = True,
    predicates=None,
    factory=None,
    dispatch_by=None,
):
    """
    A Python iterator to retrieve a dataset from store where each partition is
    loaded as a :class:`~kartothek.io_components.metapartition.MetaPartition`.

    .. seealso::

        :func:`~kartothek.io_components.read.read_dataset_as_dataframes__iterator`
    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
    )

    store = ds_factory.store
    mps = dispatch_metapartitions_from_factory(
        ds_factory,
        predicates=predicates,
        dispatch_by=dispatch_by,
    )

    for mp in mps:
        if dispatch_by is not None:
            # ``dispatch_by`` yields lists of metapartitions belonging to the
            # same group; load each one and concatenate them into a single
            # MetaPartition before yielding.
            mp = MetaPartition.concat_metapartitions(
                [
                    mp_inner.load_dataframes(
                        store=store,
                        columns=columns,
                        categoricals=categoricals,
                        predicate_pushdown_to_io=predicate_pushdown_to_io,
                        predicates=predicates,
                    )
                    for mp_inner in mp
                ]
            )
        else:
            mp = cast(MetaPartition, mp)
            mp = mp.load_dataframes(
                store=store,
                columns=columns,
                categoricals=categoricals,
                predicate_pushdown_to_io=predicate_pushdown_to_io,
                dates_as_object=dates_as_object,
                predicates=predicates,
            )
        yield mp
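# A minimal usage sketch for the iterator above. The dataset UUID
# ("my_dataset") and the predicate are illustrative placeholders, not part of
# the source; any store factory accepted by kartothek works for
# ``store_factory``.
def example_iterate_dataset(store_factory):
    for mp in read_dataset_as_metapartitions__iterator(
        dataset_uuid="my_dataset",  # assumed dataset name
        store=store_factory,
        predicates=[[("int8", "==", 1)]],  # optional predicate pushdown
    ):
        # Each yielded object is a fully loaded MetaPartition; its ``data``
        # attribute holds the partition's DataFrame.
        print(mp.label, len(mp.data))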
def test_concat_metapartition(df_all_types):
    mp1 = MetaPartition(label="first", data=df_all_types, metadata_version=4)
    mp2 = MetaPartition(label="second", data=df_all_types, metadata_version=4)

    new_mp = MetaPartition.concat_metapartitions([mp1, mp2])

    # The exact label doesn't matter much; it only has to be set.
    assert new_mp.label is not None

    df_expected = pd.concat([df_all_types, df_all_types])
    df_actual = new_mp.data

    pdt.assert_frame_equal(df_actual, df_expected)
def test_concat_metapartition_categoricals(df_all_types):
    mp1 = MetaPartition(
        label="first",
        data=pd.DataFrame({"a": [0, 0], "b": ["a", "a"]}, dtype="category"),
        metadata_version=4,
        partition_keys=["a"],
    )
    mp2 = MetaPartition(
        label="second",
        data=pd.DataFrame({"a": [1, 1], "b": ["a", "b"]}, dtype="category"),
        metadata_version=4,
        partition_keys=["a"],
    )

    new_mp = MetaPartition.concat_metapartitions([mp1, mp2])

    assert new_mp.table_name == "table"
    assert pd.api.types.is_categorical_dtype(new_mp.data["b"].dtype)
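# Why the test above is needed: a plain ``pd.concat`` of categoricals with
# differing category sets silently falls back to ``object`` dtype, so keeping
# the dtype categorical requires a union of the categories. The sketch below
# demonstrates the pandas behavior; it is an illustration of the pitfall, not
# kartothek's internal implementation.
import pandas as pd
from pandas.api.types import union_categoricals

s1 = pd.Series(["a", "a"], dtype="category")  # categories: ['a']
s2 = pd.Series(["a", "b"], dtype="category")  # categories: ['a', 'b']

# Differing category sets: concat degrades the result to object dtype.
assert pd.concat([s1, s2]).dtype == object

# Unioning the categories first preserves the categorical dtype.
merged = union_categoricals([s1, s2])
assert merged.categories.tolist() == ["a", "b"]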
def test_concat_metapartition_partitioned(df_all_types):
    mp1 = MetaPartition(
        label="int8=1/1234",
        data=df_all_types,
        metadata_version=4,
        partition_keys=["int8"],
    )
    mp2 = MetaPartition(
        label="int8=1/4321",
        data=df_all_types,
        metadata_version=4,
        partition_keys=["int8"],
    )

    new_mp = MetaPartition.concat_metapartitions([mp1, mp2])

    df_expected = pd.concat([df_all_types, df_all_types])
    df_actual = new_mp.data

    pdt.assert_frame_equal(df_actual, df_expected)

    assert new_mp.partition_keys == ["int8"]
def _load_and_concat_metapartitions_inner(mps, args, kwargs):
    # Load each metapartition's dataframes and concatenate the results into a
    # single MetaPartition; ``args``/``kwargs`` are forwarded verbatim to
    # ``load_dataframes``.
    return MetaPartition.concat_metapartitions(
        [mp.load_dataframes(*args, **kwargs) for mp in mps]
    )
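# Hypothetical sketch of how a caller might map the helper above over groups
# of metapartitions. The grouping and ``store_factory`` are assumptions for
# illustration; only the args/kwargs forwarding mirrors the helper's contract.
def example_load_groups(groups, store_factory):
    return [
        _load_and_concat_metapartitions_inner(
            mps, args=(), kwargs={"store": store_factory}
        )
        for mps in groups
    ]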
def read_dataset_as_metapartitions__iterator(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    load_dataset_metadata=False,
    predicates=None,
    factory=None,
):
    """
    A Python iterator to retrieve a dataset from store where each partition is
    loaded as a :class:`~kartothek.io_components.metapartition.MetaPartition`.

    .. seealso::

        :func:`~kartothek.io_components.read.read_dataset_as_dataframes__iterator`
    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=load_dataset_metadata,
    )

    store = ds_factory.store
    mps = dispatch_metapartitions_from_factory(
        ds_factory,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        label_filter=label_filter,
        predicates=predicates,
    )

    for mp in mps:
        if concat_partitions_on_primary_index:
            # Partitions sharing the same primary index value arrive as a
            # list; load and merge them into a single MetaPartition.
            mp = MetaPartition.concat_metapartitions(
                [
                    mp_inner.load_dataframes(
                        store=store,
                        tables=tables,
                        columns=columns,
                        categoricals=categoricals,
                        predicate_pushdown_to_io=predicate_pushdown_to_io,
                        predicates=predicates,
                    )
                    for mp_inner in mp
                ]
            )
        else:
            mp = mp.load_dataframes(
                store=store,
                tables=tables,
                columns=columns,
                categoricals=categoricals,
                predicate_pushdown_to_io=predicate_pushdown_to_io,
                dates_as_object=dates_as_object,
                predicates=predicates,
            )
        yield mp
def read_dataset_as_metapartitions__iterator(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    load_dataset_metadata=False,
    predicates=None,
    factory=None,
    dispatch_by=None,
    dispatch_metadata=True,
):
    """
    A Python iterator to retrieve a dataset from store where each partition is
    loaded as a :class:`~kartothek.io_components.metapartition.MetaPartition`.

    .. seealso::

        :func:`~kartothek.io_components.read.read_dataset_as_dataframes__iterator`
    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=load_dataset_metadata,
    )

    if len(ds_factory.tables) > 1:
        warnings.warn(
            "Trying to read a dataset with multiple internal tables. This functionality will be removed in the next "
            "major release. If you require a multi-tabled data format, we recommend switching to the kartothek Cube "
            "functionality. "
            "https://kartothek.readthedocs.io/en/stable/guide/cube/kartothek_cubes.html",
            DeprecationWarning,
        )

    store = ds_factory.store
    mps = dispatch_metapartitions_from_factory(
        ds_factory,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        label_filter=label_filter,
        predicates=predicates,
        dispatch_by=dispatch_by,
        dispatch_metadata=dispatch_metadata,
    )

    for mp in mps:
        if concat_partitions_on_primary_index or dispatch_by is not None:
            # Both options yield lists of metapartitions that belong together;
            # load them all and merge into a single MetaPartition.
            mp = MetaPartition.concat_metapartitions(
                [
                    mp_inner.load_dataframes(
                        store=store,
                        tables=tables,
                        columns=columns,
                        categoricals=categoricals,
                        predicate_pushdown_to_io=predicate_pushdown_to_io,
                        predicates=predicates,
                    )
                    for mp_inner in mp
                ]
            )
        else:
            mp = cast(MetaPartition, mp)
            mp = mp.load_dataframes(
                store=store,
                tables=tables,
                columns=columns,
                categoricals=categoricals,
                predicate_pushdown_to_io=predicate_pushdown_to_io,
                dates_as_object=dates_as_object,
                predicates=predicates,
            )
        yield mp
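# A minimal usage sketch for the variant above, assuming a dataset partitioned
# by an "int8" column. The dataset UUID and the partition column are
# illustrative placeholders. With ``dispatch_by``, all physical partitions that
# share the same value of the listed columns are loaded and concatenated into
# one MetaPartition before being yielded.
def example_iterate_grouped(store_factory):
    for mp in read_dataset_as_metapartitions__iterator(
        dataset_uuid="my_dataset",  # assumed dataset name
        store=store_factory,
        dispatch_by=["int8"],  # one MetaPartition per distinct "int8" value
    ):
        print(mp.label)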