Exemplo n.º 1
0
def build_dataset_indices(store, dataset_uuid, columns, factory=None):
    """
    Function which builds a :class:`~kartothek.core.index.ExplicitSecondaryIndex`.

    This function loads the dataset, computes the requested indices and writes
    the indices to the dataset. The dataset partitions itself are not mutated.

    Parameters
    ----------
    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    new_partitions = []
    for mp in read_dataset_as_metapartitions__iterator(factory=ds_factory):
        mp = mp.build_indices(columns=columns)
        mp = mp.remove_dataframes()  # Remove dataframe from memory
        new_partitions.append(mp)

    return update_indices_from_partitions(new_partitions,
                                          dataset_metadata_factory=ds_factory)
Exemplo n.º 2
0
def build_dataset_indices(store, dataset_uuid, columns, factory=None):
    """
    Function which builds a :class:`~kartothek.core.index.ExplicitSecondaryIndex`.

    This function loads the dataset, computes the requested indices and writes
    the indices to the dataset. The dataset partitions itself are not mutated.

    Parameters
    ----------
    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    cols_to_load = {
        table: set(columns) & set(meta.names)
        for table, meta in ds_factory.table_meta.items()
    }
    cols_to_load = {
        table: cols
        for table, cols in cols_to_load.items() if cols
    }

    new_partitions = []
    for mp in dispatch_metapartitions_from_factory(ds_factory):
        mp = mp.load_dataframes(
            store=ds_factory.store,
            tables=list(cols_to_load.keys()),
            columns=cols_to_load,
        )
        mp = mp.build_indices(columns=columns)
        mp = mp.remove_dataframes()  # Remove dataframe from memory
        new_partitions.append(mp)

    return update_indices_from_partitions(new_partitions,
                                          dataset_metadata_factory=ds_factory)