def build_dataset_indices(store, dataset_uuid, columns, factory=None):
    """
    Build :class:`~kartothek.core.index.ExplicitSecondaryIndex` objects for a dataset.

    Every metapartition of the dataset is read, the indices for the requested
    ``columns`` are built on it, and the resulting metapartitions (with their
    dataframes removed) are handed to ``update_indices_from_partitions`` so the
    indices get written to the dataset. The dataset partitions themselves are
    not mutated.

    Parameters
    ----------
    """
    # NOTE(review): the empty "Parameters" section above is kept as in the
    # original; presumably it is filled in by tooling outside this view -- confirm.
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    # Build the indices partition by partition and drop each dataframe right
    # away so only one partition's data is held in memory at a time.
    indexed_partitions = [
        metapartition.build_indices(columns=columns).remove_dataframes()
        for metapartition in read_dataset_as_metapartitions__iterator(
            factory=dataset_factory
        )
    ]

    # The result of the index update is passed straight back to the caller.
    return update_indices_from_partitions(
        indexed_partitions, dataset_metadata_factory=dataset_factory
    )
def _load_metapartitions(*args, **kwargs):
    """
    Eagerly collect the output of ``read_dataset_as_metapartitions__iterator``.

    All positional and keyword arguments are forwarded unchanged to the
    iterator; its items are returned as a list.
    """
    metapartition_iter = read_dataset_as_metapartitions__iterator(*args, **kwargs)
    return list(metapartition_iter)
def read_dataset_as_metapartitions(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    predicates=None,
    factory=None,
    dispatch_by=None,
):
    """
    Read a dataset as a list of
    :class:`kartothek.io_components.metapartition.MetaPartition`.

    This is the eager counterpart of
    ``read_dataset_as_metapartitions__iterator``: the iterator is fully
    consumed and its items are returned as a list. Every element of the list
    corresponds to a physical partition.

    Parameters
    ----------

    Returns
    -------
    List[kartothek.io_components.metapartition.MetaPartition]
        The metapartitions yielded by the underlying iterator.

    Examples
    --------
    Dataset in store contains two partitions with two files each

    .. code ::

        >>> import storefact
        >>> from kartothek.io.eager import read_dataset_as_metapartitions

        >>> store = storefact.get_store_from_url('s3://bucket_with_dataset')

        >>> list_mps = read_dataset_as_metapartitions('dataset_uuid', store, 'core')

    """
    # NOTE(review): the empty "Parameters" section above is kept as in the
    # original; presumably it is filled in by tooling outside this view -- confirm.
    resolved_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    # Imported at call time, exactly as in the original (presumably to avoid a
    # circular import between the eager and iterator modules -- confirm).
    from .iter import read_dataset_as_metapartitions__iterator

    # Everything except the dataset identification is forwarded verbatim; the
    # dataset itself is passed on via the resolved factory.
    reader_kwargs = {
        "tables": tables,
        "columns": columns,
        "concat_partitions_on_primary_index": concat_partitions_on_primary_index,
        "predicate_pushdown_to_io": predicate_pushdown_to_io,
        "categoricals": categoricals,
        "label_filter": label_filter,
        "dates_as_object": dates_as_object,
        "predicates": predicates,
        "factory": resolved_factory,
        "dispatch_by": dispatch_by,
    }
    return list(read_dataset_as_metapartitions__iterator(**reader_kwargs))