def _map_ktk_mps_to_groups(cube, datasets, label2gp):
    """
    Sort the Kartothek metapartitions of all datasets into their groups.

    Every dataset is dispatched into metapartitions; each metapartition is
    then filed under all (group ID, partition ID) pairs registered for its
    label. Metapartitions whose label has no entry are skipped.

    Parameters
    ----------
    cube: Cube
        Cube specification.
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that are processed by the regrouper.
    label2gp: Dict[str, Dict[str, Tuple[int, int]]]
        Maps "dataset ID -> (label -> (group ID, partition ID))".

    Returns
    -------
    groups: Dict[int, Dict[int, Dict[str, Tuple[kartothek.io_components.metapartition.MetaPartition, ...]]]]
        Maps "group ID -> (partition ID -> (dataset ID -> list of MetaPartitions))"
    """
    # NOTE(review): `cube` is not used in this body.
    # NOTE(review): the per-label value is iterated and unpacked as
    # (group ID, partition ID) pairs, i.e. it behaves like a collection of
    # pairs rather than a single pair -- confirm against the documented type.
    grouped = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

    for dataset_id, dataset in datasets.items():
        assignments = label2gp[dataset_id]
        dispatched = dispatch_metapartitions_from_factory(
            dataset_factory=metadata_factory_from_dataset(dataset),
        )
        for metapartition in dispatched:
            # Labels without an assignment were filtered out by a
            # pre-condition; the empty default simply skips them.
            targets = assignments.get(metapartition.label, ())
            for group_id, partition_id in targets:
                grouped[group_id][partition_id][dataset_id].append(metapartition)

    return grouped
def delete_dataset(dataset_uuid=None, store=None, factory=None):
    """
    Delete a dataset: its indices, partitions and metadata files.

    The steps run in this order: garbage collection of unreferenced files,
    deletion of the indices, deletion of every dispatched metapartition,
    deletion of the common metadata and finally deletion of the top level
    metadata file.

    The dataset may be identified by ``dataset_uuid`` and ``store`` or by a
    ``factory``. When ``store`` or ``dataset_uuid`` is not passed explicitly,
    the value held by the resolved factory is used for deleting the
    partitions.

    Parameters
    ----------

    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        load_schema=False,
        store=_make_callable(store),
        factory=factory,
        load_dataset_metadata=False,
    )

    # When the caller only supplied `factory`, the raw arguments are None and
    # must not be forwarded to the partitions as-is. Explicitly passed values
    # are kept untouched.
    partition_store = store if store is not None else ds_factory.store
    # NOTE(review): assumes the factory exposes `dataset_uuid` -- confirm.
    partition_dataset_uuid = (
        dataset_uuid if dataset_uuid is not None else ds_factory.dataset_uuid
    )

    # Remove possibly unreferenced files
    garbage_collect_dataset(factory=ds_factory)

    # Delete indices first since they do not affect dataset integrity
    delete_indices(dataset_factory=ds_factory)

    for metapartition in dispatch_metapartitions_from_factory(ds_factory):
        metapartition = cast(MetaPartition, metapartition)
        metapartition.delete_from_store(
            dataset_uuid=partition_dataset_uuid, store=partition_store
        )

    # delete common metadata after partitions
    delete_common_metadata(dataset_factory=ds_factory)

    # Delete the top level metadata file
    delete_top_level_metadata(dataset_factory=ds_factory)
def delete_dataset__delayed(dataset_uuid=None, store=None, factory=None):
    """
    Build the delayed task that deletes a dataset.

    The returned task depends on the garbage collection task, on the
    per-partition deletion tasks and on the deletion of the additional
    metadata, whose delayed result is handed to the partition deletion as
    ``dataset_uuid``.

    The dataset may be identified by ``dataset_uuid`` and ``store`` or by a
    ``factory``. When ``store`` is not passed explicitly, the store factory
    of the resolved dataset factory is used for deleting the partitions.

    Parameters
    ----------

    """
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_schema=False,
        load_dataset_metadata=False,
    )

    # With a factory-only call `store` is None and must not be forwarded to
    # the partitions as-is. An explicitly passed store is kept untouched.
    # NOTE(review): assumes `delete_from_store` accepts the factory's
    # `store_factory` the same way the other delayed readers pass it on.
    partition_store = store if store is not None else dataset_factory.store_factory

    gc = garbage_collect_dataset__delayed(factory=dataset_factory)

    mps = dispatch_metapartitions_from_factory(dataset_factory)

    delayed_dataset_uuid = delayed(_delete_all_additional_metadata)(
        dataset_factory=dataset_factory
    )

    mps = map_delayed(
        mps,
        MetaPartition.delete_from_store,
        store=partition_store,
        dataset_uuid=delayed_dataset_uuid,
    )

    return delayed(_delete_tl_metadata)(dataset_factory, mps, gc)
def read_dataset_as_metapartitions__iterator(
    dataset_uuid=None,
    store=None,
    columns=None,
    predicate_pushdown_to_io=True,
    categoricals=None,
    dates_as_object: bool = True,
    predicates=None,
    factory=None,
    dispatch_by=None,
):
    """
    A Python iterator to retrieve a dataset from store where each
    partition is loaded as a
    :class:`~kartothek.io_components.metapartition.MetaPartition`.

    When ``dispatch_by`` is given, every dispatched item is iterated as a
    group of metapartitions; its members are loaded one by one and
    concatenated into a single metapartition before it is yielded. The same
    loading options, including ``dates_as_object``, are applied in both modes.

    .. seealso:

        :func:`~kartothek.io_components.read.read_dataset_as_dataframes__iterator`

    Parameters
    ----------

    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid, store=store, factory=factory,
    )

    store = ds_factory.store
    mps = dispatch_metapartitions_from_factory(
        ds_factory, predicates=predicates, dispatch_by=dispatch_by,
    )

    # One set of loading options for both branches, so that no option (in
    # particular `dates_as_object`) is silently dropped for grouped reads.
    load_kwargs = dict(
        store=store,
        columns=columns,
        categoricals=categoricals,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        dates_as_object=dates_as_object,
        predicates=predicates,
    )

    for mp in mps:
        if dispatch_by is not None:
            mp = MetaPartition.concat_metapartitions(
                [mp_inner.load_dataframes(**load_kwargs) for mp_inner in mp]
            )
        else:
            mp = cast(MetaPartition, mp)
            mp = mp.load_dataframes(**load_kwargs)
        yield mp
def build_dataset_indices__bag(
    store, dataset_uuid, columns, partition_size=None, factory=None
):
    """
    Function which builds a :class:`~kartothek.core.index.ExplicitSecondaryIndex`.

    This function loads the dataset, computes the requested indices and writes
    the indices to the dataset. The dataset partitions itself are not mutated.

    Only tables that contain at least one of the requested ``columns`` are
    loaded, and only those columns are read from them.

    Parameters
    ----------
    partition_size: Optional[int]
        Dask bag partition size. Use a larger numbers to decrease scheduler load and overhead, use smaller numbers for a
        fine-grained scheduling and better resilience against worker errors.

    Returns
    -------
    A dask.delayed computation object.
    """
    # NOTE(review): the object returned below is the result of the final
    # `map_partitions` call on a dask bag -- confirm the documented return type.
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    # Per table: the requested columns that actually exist in that table.
    # Tables without any requested column are left out entirely.
    cols_to_load = {}
    for table, meta in ds_factory.table_meta.items():
        present = set(columns) & set(meta.names)
        if present:
            cols_to_load[table] = present

    mps = dispatch_metapartitions_from_factory(ds_factory)

    bag = db.from_sequence(seq=mps, partition_size=partition_size)
    loaded = bag.map(
        MetaPartition.load_dataframes,
        store=ds_factory.store_factory,
        tables=list(cols_to_load.keys()),
        columns=cols_to_load,
    )
    indexed = loaded.map(MetaPartition.build_indices, columns=columns)
    stripped = indexed.map(MetaPartition.remove_dataframes)
    # Collect all metapartitions so that the index update sees them together.
    collected = stripped.reduction(list, list, split_every=False, out_type=db.Bag)
    gathered = collected.flatten().map_partitions(list)
    return gathered.map_partitions(
        update_indices_from_partitions, dataset_metadata_factory=ds_factory
    )
def build_dataset_indices__bag(
    store: Optional[StoreInput],
    dataset_uuid: Optional[str],
    columns: Sequence[str],
    partition_size: Optional[int] = None,
    factory: Optional[DatasetFactory] = None,
) -> Delayed:
    """
    Function which builds a :class:`~kartothek.core.index.ExplicitSecondaryIndex`.

    This function loads the dataset, computes the requested indices and writes
    the indices to the dataset. The dataset partitions itself are not mutated.

    Only tables that contain at least one of the requested ``columns`` are
    loaded, and only those columns are read from them.

    Parameters
    ----------

    """
    # NOTE(review): the object returned below is the result of the final
    # `map_partitions` call on a dask bag -- confirm the `Delayed` annotation.
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    # Per table: the requested columns that actually exist in that table.
    # Tables without any requested column are left out entirely.
    cols_to_load = {}
    for table, meta in ds_factory.table_meta.items():
        present = set(columns) & set(meta.names)
        if present:
            cols_to_load[table] = present

    mps = dispatch_metapartitions_from_factory(ds_factory)

    bag = db.from_sequence(seq=mps, partition_size=partition_size)
    loaded = bag.map(
        MetaPartition.load_dataframes,
        store=ds_factory.store_factory,
        tables=list(cols_to_load.keys()),
        columns=cols_to_load,
    )
    indexed = loaded.map(MetaPartition.build_indices, columns=columns)
    stripped = indexed.map(MetaPartition.remove_dataframes)
    # Collect all metapartitions so that the index update sees them together.
    collected = stripped.reduction(list, list, split_every=False, out_type=db.Bag)
    gathered = collected.flatten().map_partitions(list)
    return gathered.map_partitions(
        update_indices_from_partitions, dataset_metadata_factory=ds_factory
    )
def get_metapartitions_for_stats(datasets):
    """
    Get all metapartitions that need to be scanned to gather cube stats.

    Each dataset is dispatched by its own partition keys; every dispatched
    item is paired with the ID of the dataset it came from.

    Parameters
    ----------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that are present.

    Returns
    -------
    metapartitions: Tuple[Tuple[str, Tuple[kartothek.io_components.metapartition.MetaPartition, ...]], ...]
        Pre-aligned metapartitions (by primary index / physical partitions) and the ktk_cube dataset ID belonging to them.
        The outer collection is returned as a list of ``(dataset ID, dispatched item)`` pairs.
    """
    collected = []
    for dataset_id, dataset in datasets.items():
        factory = metadata_factory_from_dataset(dataset)
        dispatched = dispatch_metapartitions_from_factory(
            dataset_factory=factory, dispatch_by=factory.partition_keys
        )
        collected.extend((dataset_id, mp) for mp in dispatched)
    return collected
def build_dataset_indices(store, dataset_uuid, columns, factory=None):
    """
    Function which builds a :class:`~kartothek.core.index.ExplicitSecondaryIndex`.

    This function loads the dataset, computes the requested indices and writes
    the indices to the dataset. The dataset partitions itself are not mutated.

    Only tables that contain at least one of the requested ``columns`` are
    loaded, and only those columns are read from them.

    Parameters
    ----------

    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    # Per table: the requested columns that actually exist in that table.
    # Tables without any requested column are left out entirely.
    cols_to_load = {}
    for table, meta in ds_factory.table_meta.items():
        present = set(columns) & set(meta.names)
        if present:
            cols_to_load[table] = present

    indexed_partitions = []
    for metapartition in dispatch_metapartitions_from_factory(ds_factory):
        loaded = metapartition.load_dataframes(
            store=ds_factory.store,
            tables=list(cols_to_load.keys()),
            columns=cols_to_load,
        )
        indexed = loaded.build_indices(columns=columns)
        # Drop the dataframes again so only the index data is kept in memory.
        indexed_partitions.append(indexed.remove_dataframes())

    return update_indices_from_partitions(
        indexed_partitions, dataset_metadata_factory=ds_factory
    )
def read_dataset_as_metapartitions_bag(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    load_dataset_metadata=False,
    predicates=None,
    factory=None,
    dispatch_by=None,
    partition_size=None,
    dispatch_metadata=True,
):
    """
    Retrieve dataset as `dask.bag.Bag` of `MetaPartition` objects.

    A ``DeprecationWarning`` is emitted when the dataset has more than one
    internal table. If categoricals can be derived from the index, the
    matching tables are additionally cast via ``MetaPartition.apply``.

    Parameters
    ----------

    Returns
    -------
    dask.bag.Bag:
        A dask.bag object containing the metapartions.
    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=load_dataset_metadata,
    )

    if len(ds_factory.tables) > 1:
        warnings.warn(
            "Trying to read a dataset with multiple internal tables. This functionality will be removed in the next "
            "major release. If you require a multi tabled data format, we recommend to switch to the kartothek Cube "
            "functionality. "
            "https://kartothek.readthedocs.io/en/stable/guide/cube/kartothek_cubes.html",
            DeprecationWarning,
        )

    store = ds_factory.store_factory
    dispatched = dispatch_metapartitions_from_factory(
        dataset_factory=ds_factory,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        label_filter=label_filter,
        predicates=predicates,
        dispatch_by=dispatch_by,
        dispatch_metadata=dispatch_metadata,
    )
    bag = db.from_sequence(dispatched, partition_size=partition_size)

    # Grouped dispatch yields collections of metapartitions that have to be
    # loaded and concatenated; otherwise each item is loaded directly.
    if concat_partitions_on_primary_index or dispatch_by is not None:
        loader = _load_and_concat_metapartitions_inner
    else:
        loader = MetaPartition.load_dataframes
    bag = bag.map(
        loader,
        store=store,
        tables=tables,
        columns=columns,
        categoricals=categoricals,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        dates_as_object=dates_as_object,
        predicates=predicates,
    )

    categoricals_from_index = _maybe_get_categoricals_from_index(
        ds_factory, categoricals
    )
    if not categoricals_from_index:
        return bag

    # Tables without index categoricals fall back to the identity default.
    func_dict = defaultdict(_identity)
    for table, cats in categoricals_from_index.items():
        func_dict[table] = partial(_cast_categorical_to_index_cat, categories=cats)
    return bag.map(MetaPartition.apply, func_dict, type_safe=True)
def read_dataset_as_metapartitions_bag(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    load_dataset_metadata=False,
    predicates=None,
    factory=None,
    dispatch_by=None,
    partition_size=None,
):
    """
    Retrieve dataset as `dask.bag` of `MetaPartition` objects.

    If categoricals can be derived from the index, the matching tables are
    additionally cast via ``MetaPartition.apply``.

    Parameters
    ----------

    Returns
    -------
    A dask.bag object containing the metapartions.
    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=load_dataset_metadata,
    )
    store = ds_factory.store_factory
    dispatched = dispatch_metapartitions_from_factory(
        dataset_factory=ds_factory,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        label_filter=label_filter,
        predicates=predicates,
        dispatch_by=dispatch_by,
    )
    bag = db.from_sequence(dispatched, partition_size=partition_size)

    # Grouped dispatch is detected by truthiness here: an empty `dispatch_by`
    # takes the plain loading path.
    if concat_partitions_on_primary_index or dispatch_by:
        loader = _load_and_concat_metapartitions_inner
    else:
        loader = MetaPartition.load_dataframes
    bag = bag.map(
        loader,
        store=store,
        tables=tables,
        columns=columns,
        categoricals=categoricals,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        dates_as_object=dates_as_object,
        predicates=predicates,
    )

    categoricals_from_index = _maybe_get_categoricals_from_index(
        ds_factory, categoricals
    )
    if not categoricals_from_index:
        return bag

    # Tables without index categoricals fall back to the identity default.
    func_dict = defaultdict(_identity)
    for table, cats in categoricals_from_index.items():
        func_dict[table] = partial(_cast_categorical_to_index_cat, categories=cats)
    return bag.map(MetaPartition.apply, func_dict, type_safe=True)
def read_dataset_as_delayed_metapartitions(
    dataset_uuid=None,
    store=None,
    columns=None,
    predicate_pushdown_to_io=True,
    categoricals: Optional[Sequence[str]] = None,
    dates_as_object: bool = True,
    predicates=None,
    factory=None,
    dispatch_by=None,
):
    """
    A collection of dask.delayed objects to retrieve a dataset from store where each
    partition is loaded as a :class:`~kartothek.io_components.metapartition.MetaPartition`.

    If categoricals can be derived from the index, an additional cast step is
    mapped over the loaded partitions. The result is returned as a list.

    .. seealso:

        :func:`~kartothek.io.dask.read_dataset_as_delayed`

    Parameters
    ----------

    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid, store=store, factory=factory,
    )

    store = ds_factory.store_factory
    dispatched = dispatch_metapartitions_from_factory(
        dataset_factory=ds_factory, predicates=predicates, dispatch_by=dispatch_by,
    )

    # Both loading paths receive the very same options.
    load_kwargs = dict(
        store=store,
        columns=columns,
        categoricals=categoricals,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        dates_as_object=dates_as_object,
        predicates=predicates,
    )
    if dispatch_by is None:
        mps = map_delayed(MetaPartition.load_dataframes, dispatched, **load_kwargs)
    else:
        mps = _load_and_concat_metapartitions(dispatched, **load_kwargs)

    categoricals_from_index = _maybe_get_categoricals_from_index(
        ds_factory, categoricals
    )

    if categoricals_from_index:
        cast_to_index_cat = partial(  # type: ignore
            _cast_categorical_to_index_cat, categories=categoricals_from_index
        )
        apply_cast = partial(  # type: ignore
            MetaPartition.apply, func=cast_to_index_cat, type_safe=True
        )
        mps = map_delayed(apply_cast, mps)

    return list(mps)
def read_dataset_as_delayed_metapartitions(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    load_dataset_metadata=False,
    predicates=None,
    factory=None,
    dispatch_by=None,
    dispatch_metadata=True,
):
    """
    A collection of dask.delayed objects to retrieve a dataset from store where each
    partition is loaded as a :class:`~kartothek.io_components.metapartition.MetaPartition`.

    A ``DeprecationWarning`` is emitted when the dataset has more than one
    internal table. If categoricals can be derived from the index, the
    matching tables are additionally cast via ``MetaPartition.apply``. The
    result is returned as a list.

    .. seealso:

        :func:`~kartothek.io.dask.read_dataset_as_delayed`

    Parameters
    ----------

    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=load_dataset_metadata,
    )

    if len(ds_factory.tables) > 1:
        warnings.warn(
            "Trying to read a dataset with multiple internal tables. This functionality will be removed in the next "
            "major release. If you require a multi tabled data format, we recommend to switch to the kartothek Cube "
            "functionality. "
            "https://kartothek.readthedocs.io/en/stable/guide/cube/kartothek_cubes.html",
            DeprecationWarning,
        )

    store = ds_factory.store_factory
    dispatched = dispatch_metapartitions_from_factory(
        dataset_factory=ds_factory,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        label_filter=label_filter,
        predicates=predicates,
        dispatch_by=dispatch_by,
        dispatch_metadata=dispatch_metadata,
    )

    # Both loading paths receive the very same options.
    load_kwargs = dict(
        store=store,
        tables=tables,
        columns=columns,
        categoricals=categoricals,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        dates_as_object=dates_as_object,
        predicates=predicates,
    )
    if concat_partitions_on_primary_index or dispatch_by is not None:
        mps = _load_and_concat_metapartitions(dispatched, **load_kwargs)
    else:
        mps = map_delayed(MetaPartition.load_dataframes, dispatched, **load_kwargs)

    categoricals_from_index = _maybe_get_categoricals_from_index(
        ds_factory, categoricals
    )

    if categoricals_from_index:
        # Tables without index categoricals fall back to the identity default.
        func_dict = defaultdict(_identity)
        for table, cats in categoricals_from_index.items():
            func_dict[table] = partial(
                _cast_categorical_to_index_cat, categories=cats
            )
        apply_cast = partial(MetaPartition.apply, func=func_dict, type_safe=True)
        mps = map_delayed(apply_cast, mps)

    return list(mps)
def read_dataset_as_metapartitions__iterator(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    load_dataset_metadata=False,
    predicates=None,
    factory=None,
):
    """
    A Python iterator to retrieve a dataset from store where each
    partition is loaded as a
    :class:`~kartothek.io_components.metapartition.MetaPartition`.

    When ``concat_partitions_on_primary_index`` is set, every dispatched item
    is iterated as a group of metapartitions; its members are loaded one by
    one and concatenated into a single metapartition before it is yielded.
    The same loading options, including ``dates_as_object``, are applied in
    both modes.

    .. seealso:

        :func:`~kartothek.io_components.read.read_dataset_as_dataframes__iterator`

    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=load_dataset_metadata,
    )
    store = ds_factory.store
    mps = dispatch_metapartitions_from_factory(
        ds_factory,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        label_filter=label_filter,
        predicates=predicates,
    )

    # One set of loading options for both branches, so that no option (in
    # particular `dates_as_object`) is silently dropped for grouped reads.
    load_kwargs = dict(
        store=store,
        tables=tables,
        columns=columns,
        categoricals=categoricals,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        dates_as_object=dates_as_object,
        predicates=predicates,
    )

    for mp in mps:
        if concat_partitions_on_primary_index:
            mp = MetaPartition.concat_metapartitions(
                [mp_inner.load_dataframes(**load_kwargs) for mp_inner in mp]
            )
        else:
            mp = mp.load_dataframes(**load_kwargs)
        yield mp
def read_dataset_as_metapartitions__iterator(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    load_dataset_metadata=False,
    predicates=None,
    factory=None,
    dispatch_by=None,
    dispatch_metadata=True,
):
    """
    A Python iterator to retrieve a dataset from store where each
    partition is loaded as a
    :class:`~kartothek.io_components.metapartition.MetaPartition`.

    A ``DeprecationWarning`` is emitted when the dataset has more than one
    internal table. When ``concat_partitions_on_primary_index`` is set or
    ``dispatch_by`` is given, every dispatched item is iterated as a group of
    metapartitions; its members are loaded one by one and concatenated into a
    single metapartition before it is yielded. The same loading options,
    including ``dates_as_object``, are applied in both modes.

    .. seealso:

        :func:`~kartothek.io_components.read.read_dataset_as_dataframes__iterator`

    Parameters
    ----------

    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=load_dataset_metadata,
    )

    if len(ds_factory.tables) > 1:
        warnings.warn(
            "Trying to read a dataset with multiple internal tables. This functionality will be removed in the next "
            "major release. If you require a multi tabled data format, we recommend to switch to the kartothek Cube "
            "functionality. "
            "https://kartothek.readthedocs.io/en/stable/guide/cube/kartothek_cubes.html",
            DeprecationWarning,
        )

    store = ds_factory.store
    mps = dispatch_metapartitions_from_factory(
        ds_factory,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        label_filter=label_filter,
        predicates=predicates,
        dispatch_by=dispatch_by,
        dispatch_metadata=dispatch_metadata,
    )

    # One set of loading options for both branches, so that no option (in
    # particular `dates_as_object`) is silently dropped for grouped reads.
    load_kwargs = dict(
        store=store,
        tables=tables,
        columns=columns,
        categoricals=categoricals,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        dates_as_object=dates_as_object,
        predicates=predicates,
    )

    for mp in mps:
        if concat_partitions_on_primary_index or dispatch_by is not None:
            mp = MetaPartition.concat_metapartitions(
                [mp_inner.load_dataframes(**load_kwargs) for mp_inner in mp]
            )
        else:
            mp = cast(MetaPartition, mp)
            mp = mp.load_dataframes(**load_kwargs)
        yield mp
def read_dataset_as_delayed_metapartitions(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    load_dataset_metadata=False,
    predicates=None,
    factory=None,
    dispatch_by=None,
):
    """
    A collection of dask.delayed objects to retrieve a dataset from store where each
    partition is loaded as a :class:`~kartothek.io_components.metapartition.MetaPartition`.

    If categoricals can be derived from the index, the matching tables are
    additionally cast via ``MetaPartition.apply``.

    .. seealso:

        :func:`~kartothek.io.dask.read_dataset_as_delayed`

    Parameters
    ----------

    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=load_dataset_metadata,
    )
    store = ds_factory.store_factory
    dispatched = dispatch_metapartitions_from_factory(
        dataset_factory=ds_factory,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        label_filter=label_filter,
        predicates=predicates,
        dispatch_by=dispatch_by,
    )

    # Both loading paths receive the very same options.
    load_kwargs = dict(
        store=store,
        tables=tables,
        columns=columns,
        categoricals=categoricals,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        dates_as_object=dates_as_object,
        predicates=predicates,
    )
    # Grouped dispatch is detected by truthiness here: an empty `dispatch_by`
    # takes the plain loading path.
    if concat_partitions_on_primary_index or dispatch_by:
        mps = _load_and_concat_metapartitions(dispatched, **load_kwargs)
    else:
        mps = map_delayed(dispatched, MetaPartition.load_dataframes, **load_kwargs)

    categoricals_from_index = _maybe_get_categoricals_from_index(
        ds_factory, categoricals
    )
    if not categoricals_from_index:
        return mps

    # Tables without index categoricals fall back to the identity default.
    func_dict = defaultdict(_identity)
    for table, cats in categoricals_from_index.items():
        func_dict[table] = partial(_cast_categorical_to_index_cat, categories=cats)
    return map_delayed(mps, MetaPartition.apply, func_dict, type_safe=True)
def collect_dataset_metadata(
    store: Optional[StoreInput] = None,
    dataset_uuid: Optional[str] = None,
    table_name: str = SINGLE_TABLE,
    predicates: Optional[PredicatesType] = None,
    frac: float = 1.0,
    factory: Optional[DatasetFactory] = None,
) -> dd.DataFrame:
    """
    Collect parquet metadata of the dataset. The `frac` parameter can be used to select a subset of the data.

    The dispatched metapartitions are shuffled randomly and only the first
    ``max(1, int(len(mps) * frac))`` of them are inspected. If nothing is
    dispatched, an empty dataframe with the metadata schema is returned.

    .. warning::
      If the size of the partitions is not evenly distributed, e.g. some partitions might be larger than others,
      the metadata returned is not a good approximation for the whole dataset metadata.
    .. warning::
      Using the `frac` parameter is not encouraged for a small number of total partitions.

    Parameters
    ----------
    predicates
      Kartothek predicates to apply filters on the data for which to gather statistics

      .. warning::
          Filtering will only be applied for predicates on indices.
          The evaluation of the predicates will therefore only return an approximate result.

    frac
      Fraction of the total number of partitions to use for gathering statistics. `frac == 1.0` will use all partitions.

    Returns
    -------
    dask.dataframe.DataFrame:
        A dask.DataFrame containing the following information about dataset statistics:
        * `partition_label`: File name of the parquet file, unique to each physical partition.
        * `row_group_id`: Index of the row groups within one parquet file.
        * `row_group_compressed_size`: Byte size of the data within one row group.
        * `row_group_uncompressed_size`: Byte size (uncompressed) of the data within one row group.
        * `number_rows_total`: Total number of rows in one parquet file.
        * `number_row_groups`: Number of row groups in one parquet file.
        * `serialized_size`: Serialized size of the parquet file.
        * `number_rows_per_row_group`: Number of rows per row group.

    Raises
    ------
    ValueError
      If `frac` is not larger than 0.0 and smaller than or equal to 1.0.
      The original documentation also states that an error is raised if no
      metadata could be retrieved; that case is not raised by this body itself.

    """
    frac_is_valid = 0.0 < frac <= 1.0
    if not frac_is_valid:
        raise ValueError(
            f"Invalid value for parameter `frac`: {frac}."
            "Please make sure to provide a value larger than 0.0 and smaller than or equal to 1.0 ."
        )
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    dispatched = list(
        dispatch_metapartitions_from_factory(dataset_factory, predicates=predicates)
    )

    if not dispatched:
        # Nothing to inspect: hand back an empty frame with the right schema.
        empty = pd.DataFrame(columns=_METADATA_SCHEMA.keys())
        empty = empty.astype(_METADATA_SCHEMA)
        return dd.from_pandas(empty, npartitions=1)

    random.shuffle(dispatched)
    # ensure that even with sampling at least one metapartition is returned
    sample_size = max(1, int(len(dispatched) * frac))
    sampled = dispatched[:sample_size]

    metadata_tasks = []
    for mp in sampled:
        task = dask.delayed(MetaPartition.get_parquet_metadata)(
            mp, store=dataset_factory.store_factory, table_name=table_name
        )
        metadata_tasks.append(task)

    return dd.from_delayed(metadata_tasks, meta=_METADATA_SCHEMA)
def read_dataset_as_metapartitions_bag(
    dataset_uuid=None,
    store=None,
    columns=None,
    predicate_pushdown_to_io=True,
    categoricals=None,
    dates_as_object: bool = True,
    predicates=None,
    factory=None,
    dispatch_by=None,
    partition_size=None,
):
    """
    Retrieve dataset as `dask.bag.Bag` of `MetaPartition` objects.

    If categoricals can be derived from the index, an additional cast step is
    mapped over the loaded partitions via ``MetaPartition.apply``.

    Parameters
    ----------

    Returns
    -------
    dask.bag.Bag:
        A dask.bag object containing the metapartions.
    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid, store=store, factory=factory,
    )

    store = ds_factory.store_factory
    dispatched = dispatch_metapartitions_from_factory(
        dataset_factory=ds_factory, predicates=predicates, dispatch_by=dispatch_by,
    )
    bag = db.from_sequence(dispatched, partition_size=partition_size)

    # Grouped dispatch yields collections of metapartitions that have to be
    # loaded and concatenated; otherwise each item is loaded directly.
    if dispatch_by is None:
        loader = MetaPartition.load_dataframes
    else:
        loader = _load_and_concat_metapartitions_inner
    bag = bag.map(
        loader,
        store=store,
        columns=columns,
        categoricals=categoricals,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        dates_as_object=dates_as_object,
        predicates=predicates,
    )

    categoricals_from_index = _maybe_get_categoricals_from_index(
        ds_factory, categoricals
    )
    if not categoricals_from_index:
        return bag

    cast_to_index_cat = partial(
        _cast_categorical_to_index_cat, categories=categoricals_from_index
    )
    return bag.map(MetaPartition.apply, func=cast_to_index_cat, type_safe=True)