Example #1
    def get_indices_as_dataframe(
        self,
        columns: Optional[List[str]] = None,
        date_as_object: bool = True,
        predicates: PredicatesType = None,
    ):
        """
        Converts the dataset indices to a pandas dataframe and filters the relevant index entries by `predicates`.

        For a dataset with indices on columns `column_a` and `column_b` and three partitions,
        the dataset output may look like

        .. code::

                    column_a column_b
            part_1         1        A
            part_2         2        B
            part_3         3     None

        Parameters
        ----------
        columns
            A subset of the index columns to return. Defaults to all
            available indices; pass `[]` to get only the partition labels.
        date_as_object
            Cast dates to objects.
        predicates
            Optional list of predicate conjunctions of the form
            `[[(column, op, value), ...], ...]` used to pre-filter the
            index entries.
        """
        if not self.primary_indices_loaded and columns != []:
            # load_partition_indices returns a new object instead of modifying self in place
            dm = self.load_partition_indices()
        else:
            dm = self

        if columns is None:
            columns = sorted(dm.indices.keys())

        if columns == []:
            return pd.DataFrame(index=dm.partitions)

        if predicates:
            predicate_columns = columns_in_predicates(predicates)
            columns_to_scan = sorted(
                (predicate_columns & dm.indices.keys()) | set(columns)
            )

            dfs = (
                dm._evaluate_conjunction(
                    columns=columns_to_scan,
                    predicates=[conjunction],
                    date_as_object=date_as_object,
                )
                for conjunction in predicates
            )

            df = pd.concat(dfs)
            index_name = df.index.name
            df = (
                df.loc[:, columns].reset_index().drop_duplicates().set_index(index_name)
            )
        else:
            df = dm._evaluate_conjunction(
                columns=columns, predicates=None, date_as_object=date_as_object,
            )
        return df
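
A minimal usage sketch for the method above. Everything below is illustrative: `dm` is assumed to be an already-loaded dataset metadata object with secondary indices on `column_a` and `column_b`.

# Hypothetical usage; `dm` and its index columns are assumptions,
# not defined in the snippet above.

# One row per (partition, index value) combination, all index columns:
df_all = dm.get_indices_as_dataframe()

# Return only `column_a` while filtering on `column_b`. The predicate
# column is scanned internally even though it is dropped from the output:
df_filtered = dm.get_indices_as_dataframe(
    columns=["column_a"],
    predicates=[[("column_b", "==", "A")]],
)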
Example #2
def dispatch_metapartitions_from_factory(
    dataset_factory,
    label_filter=None,
    concat_partitions_on_primary_index=False,
    predicates=None,
    store=None,
    dispatch_by=None,
) -> Union[Iterator[MetaPartition], Iterator[List[MetaPartition]]]:
    if not callable(dataset_factory) and not isinstance(
            dataset_factory, DatasetFactory):
        raise TypeError("Need to supply a dataset factory!")

    if dispatch_by and concat_partitions_on_primary_index:
        raise ValueError(
            "Both `dispatch_by` and `concat_partitions_on_primary_index` are provided, "
            "`concat_partitions_on_primary_index` is deprecated and will be removed in the next major release. "
            "Please only provide the `dispatch_by` argument. ")
    if concat_partitions_on_primary_index:
        warnings.warn(
            "The keyword `concat_partitions_on_primary_index` is deprecated and will be removed in the next major release. Use `dispatch_by=dataset_factory.partition_keys` to achieve the same behavior instead.",
            DeprecationWarning,
        )
        dispatch_by = dataset_factory.partition_keys

    if dispatch_by and not set(dispatch_by).issubset(
            set(dataset_factory.index_columns)):
        raise RuntimeError(
            f"Dispatch columns must be indexed.\nRequested index: {dispatch_by} but available index columns: {sorted(dataset_factory.index_columns)}"
        )
    check_predicates(predicates)

    # Determine which indices need to be loaded.
    index_cols: Set[str] = set()
    if dispatch_by:
        index_cols |= set(dispatch_by)

    if predicates:
        predicate_cols = set(columns_in_predicates(predicates))
        predicate_index_cols = predicate_cols & set(
            dataset_factory.index_columns)
        index_cols |= predicate_index_cols

    for col in index_cols:
        dataset_factory.load_index(col)

    base_df = dataset_factory.get_indices_as_dataframe(list(index_cols),
                                                       predicates=predicates)

    if label_filter:
        base_df = base_df[base_df.index.map(label_filter)]

    indices_to_dispatch = {
        name: ix.unload()
        for name, ix in dataset_factory.indices.items()
        if isinstance(ix, ExplicitSecondaryIndex)
    }

    if dispatch_by:
        base_df = cast(pd.DataFrame, base_df)

        # Group the resulting MetaPartitions by partition keys or a subset of those keys
        merged_partitions = base_df.groupby(by=list(dispatch_by),
                                            sort=False,
                                            as_index=False)
        for group_name, group in merged_partitions:
            if not isinstance(group_name, tuple):
                group_name = (group_name, )
            mps = []
            logical_conjunction = list(
                zip(dispatch_by, ["=="] * len(dispatch_by), group_name))
            for label in group.index.unique():
                mps.append(
                    MetaPartition.from_partition(
                        partition=dataset_factory.partitions[label],
                        dataset_metadata=dataset_factory.metadata,
                        indices=indices_to_dispatch,
                        metadata_version=dataset_factory.metadata_version,
                        table_meta=dataset_factory.table_meta,
                        partition_keys=dataset_factory.partition_keys,
                        logical_conjunction=logical_conjunction,
                    ))
            yield mps
    else:
        for part_label in base_df.index.unique():
            part = dataset_factory.partitions[part_label]

            yield MetaPartition.from_partition(
                partition=part,
                dataset_metadata=dataset_factory.metadata,
                indices=indices_to_dispatch,
                metadata_version=dataset_factory.metadata_version,
                table_meta=dataset_factory.table_meta,
                partition_keys=dataset_factory.partition_keys,
            )
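
A hedged sketch of how this generator is typically consumed; `factory`, `process`, and `process_group` are assumptions, not part of the snippet above.

# Without `dispatch_by`, each yielded item is a single MetaPartition:
for mp in dispatch_metapartitions_from_factory(factory):
    process(mp)  # hypothetical consumer

# With `dispatch_by`, each yielded item is a list of MetaPartitions that
# share one value of `column_a`, and every member carries a matching
# `logical_conjunction` such as [("column_a", "==", 1)]:
for mp_group in dispatch_metapartitions_from_factory(
        factory, dispatch_by=["column_a"]):
    process_group(mp_group)  # hypothetical consumer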
Example #3
def dispatch_metapartitions_from_factory(
    dataset_factory: DatasetFactory,
    label_filter: Optional[Callable] = None,
    concat_partitions_on_primary_index: bool = False,
    predicates: PredicatesType = None,
    store: Optional[StoreInput] = None,
    dispatch_by: Optional[List[str]] = None,
    dispatch_metadata: bool = False,
) -> Union[Iterator[MetaPartition], Iterator[List[MetaPartition]]]:
    """

    :meta private:
    """

    if dispatch_by is not None and concat_partitions_on_primary_index:
        raise ValueError(
            "Both `dispatch_by` and `concat_partitions_on_primary_index` are provided, "
            "`concat_partitions_on_primary_index` is deprecated and will be removed in the next major release. "
            "Please only provide the `dispatch_by` argument. "
        )
    if concat_partitions_on_primary_index:
        dispatch_by = dataset_factory.partition_keys

    if dispatch_by is not None and not set(dispatch_by).issubset(
        set(dataset_factory.index_columns)
    ):
        raise RuntimeError(
            f"Dispatch columns must be indexed.\nRequested index: {dispatch_by} but available index columns: {sorted(dataset_factory.index_columns)}"
        )
    check_predicates(predicates)

    # Determine which indices need to be loaded.
    index_cols: Set[str] = set()
    if dispatch_by:
        index_cols |= set(dispatch_by)

    if predicates:
        predicate_cols = set(columns_in_predicates(predicates))
        predicate_index_cols = predicate_cols & set(dataset_factory.index_columns)
        index_cols |= predicate_index_cols

    for col in index_cols:
        dataset_factory.load_index(col)

    base_df = dataset_factory.get_indices_as_dataframe(
        list(index_cols), predicates=predicates
    )

    if label_filter:
        base_df = base_df[base_df.index.map(label_filter)]

    indices_to_dispatch = {
        name: ix.unload()
        for name, ix in dataset_factory.indices.items()
        if isinstance(ix, ExplicitSecondaryIndex)
    }

    if dispatch_by is not None:
        base_df = cast(pd.DataFrame, base_df)

        if len(dispatch_by) == 0:
            merged_partitions = [("", base_df)]
        else:
            # Group the resulting MetaPartitions by partition keys or a subset of those keys
            merged_partitions = base_df.groupby(
                by=list(dispatch_by), sort=True, as_index=False
            )

        for group_name, group in merged_partitions:
            if not isinstance(group_name, tuple):
                group_name = (group_name,)  # type: ignore
            mps = []
            logical_conjunction = list(
                zip(dispatch_by, ["=="] * len(dispatch_by), group_name)
            )
            for label in group.index.unique():
                mps.append(
                    MetaPartition.from_partition(
                        partition=dataset_factory.partitions[label],
                        dataset_metadata=dataset_factory.metadata
                        if dispatch_metadata
                        else None,
                        indices=indices_to_dispatch if dispatch_metadata else None,
                        metadata_version=dataset_factory.metadata_version,
                        table_meta=dataset_factory.table_meta,
                        partition_keys=dataset_factory.partition_keys,
                        logical_conjunction=logical_conjunction,
                    )
                )
            yield mps
    else:
        for part_label in base_df.index.unique():
            part = dataset_factory.partitions[part_label]

            yield MetaPartition.from_partition(
                partition=part,
                dataset_metadata=dataset_factory.metadata
                if dispatch_metadata
                else None,
                indices=indices_to_dispatch if dispatch_metadata else None,
                metadata_version=dataset_factory.metadata_version,
                table_meta=dataset_factory.table_meta,
                partition_keys=dataset_factory.partition_keys,
            )
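
This variant distinguishes `dispatch_by=None` from `dispatch_by=[]`: the empty list is no longer swallowed by a truthiness check, so it groups all partitions together. A small sketch of the three cases, with `factory` again an assumed pre-built DatasetFactory:

# dispatch_by=None  -> yields individual MetaPartition objects
# dispatch_by=[]    -> yields a single list containing every MetaPartition
# dispatch_by=["c"] -> yields one list of MetaPartitions per value of "c"
all_in_one = next(
    dispatch_metapartitions_from_factory(factory, dispatch_by=[])
)
assert isinstance(all_in_one, list)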
Example #4
    def get_indices_as_dataframe(
        self,
        columns: Optional[List[str]] = None,
        date_as_object: bool = True,
        predicates: PredicatesType = None,
    ):
        """
        Converts the dataset indices to a pandas dataframe.

        For a dataset with indices on columns `column_a` and `column_b` and three partitions,
        the dataset output may look like

        .. code::

                    column_a column_b
            part_1         1        A
            part_2         2        B
            part_3         3     None

        Parameters
        ----------
        columns
            A subset of the index columns to return. Defaults to all
            available indices; pass `[]` to get only the partition labels.
        date_as_object
            Cast dates to objects.
        predicates
            Optional list of predicate conjunctions of the form
            `[[(column, op, value), ...], ...]` used to pre-filter the
            index entries.
        """
        if columns is None:
            columns = sorted(self.indices.keys())
        elif columns == []:
            return pd.DataFrame(index=self.partitions)

        dfs = []
        columns_to_scan = columns[:]
        if predicates:
            predicate_columns = columns_in_predicates(predicates)
            # Don't use set logic to preserve order
            for col in predicate_columns:
                if col not in columns_to_scan and col in self.indices:
                    columns_to_scan.append(col)

        for col in columns_to_scan:
            if col not in self.indices:
                if col in self.partition_keys:
                    raise RuntimeError(
                        "Partition indices not loaded. Please call `DatasetMetadata.load_partition_keys` first."
                    )
                raise ValueError("Index `{}` unknown.")
            df = pd.DataFrame(self.indices[col].as_flat_series(
                partitions_as_index=True,
                date_as_object=date_as_object,
                predicates=predicates,
            ))
            dfs.append(df)

        # start joining with the small ones
        sorted_dfs = sorted(dfs, key=lambda df: len(df))
        result = sorted_dfs.pop(0)
        for df in sorted_dfs:
            result = result.merge(df,
                                  left_index=True,
                                  right_index=True,
                                  copy=False)

        if predicates:
            index_name = result.index.name
            result = (result.loc[:, columns].reset_index().drop_duplicates().
                      set_index(index_name))
        return result
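
The merge loop above relies on pandas' default inner join on the index and starts from the smallest frame so that intermediate results stay small. A standalone sketch of that pattern with toy data (not taken from any real dataset):

import pandas as pd

# One toy frame per index column, keyed by partition label.
dfs = [
    pd.DataFrame({"column_a": [1, 2, 3]}, index=["p1", "p2", "p3"]),
    pd.DataFrame({"column_b": ["A", "B"]}, index=["p1", "p2"]),
]

# Smallest-first inner joins: only labels present in every frame survive.
sorted_dfs = sorted(dfs, key=lambda df: len(df))
result = sorted_dfs.pop(0)
for df in sorted_dfs:
    result = result.merge(df, left_index=True, right_index=True, copy=False)
# `result` now has rows for p1 and p2 only.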
Example #5
def dispatch_metapartitions_from_factory(
    dataset_factory: DatasetFactory,
    predicates: PredicatesType = None,
    dispatch_by: Optional[List[str]] = None,
) -> Union[Iterator[MetaPartition], Iterator[List[MetaPartition]]]:
    """

    :meta private:
    """

    if dispatch_by is not None and not set(dispatch_by).issubset(
        set(dataset_factory.index_columns)
    ):
        raise RuntimeError(
            f"Dispatch columns must be indexed.\nRequested index: {dispatch_by} but available index columns: {sorted(dataset_factory.index_columns)}"
        )
    check_predicates(predicates)

    # Determine which indices need to be loaded.
    index_cols: Set[str] = set()
    if dispatch_by:
        index_cols |= set(dispatch_by)

    if predicates:
        predicate_cols = set(columns_in_predicates(predicates))
        predicate_index_cols = predicate_cols & set(dataset_factory.index_columns)
        index_cols |= predicate_index_cols

    for col in index_cols:
        dataset_factory.load_index(col)

    base_df = dataset_factory.get_indices_as_dataframe(
        list(index_cols), predicates=predicates
    )

    if dispatch_by is not None:
        base_df = cast(pd.DataFrame, base_df)

        if len(dispatch_by) == 0:
            merged_partitions = [("", base_df)]
        else:
            # Group the resulting MetaPartitions by partition keys or a subset of those keys
            merged_partitions = base_df.groupby(
                by=list(dispatch_by), sort=True, as_index=False
            )

        for group_name, group in merged_partitions:
            if not isinstance(group_name, tuple):
                group_name = (group_name,)  # type: ignore
            mps = []
            logical_conjunction = list(
                zip(dispatch_by, ["=="] * len(dispatch_by), group_name)
            )
            for label in group.index.unique():
                mps.append(
                    MetaPartition.from_partition(
                        partition=dataset_factory.partitions[label],
                        metadata_version=dataset_factory.metadata_version,
                        schema=dataset_factory.schema,
                        partition_keys=dataset_factory.partition_keys,
                        logical_conjunction=logical_conjunction,
                        table_name=dataset_factory.table_name,
                    )
                )
            yield mps
    else:
        for part_label in base_df.index.unique():
            part = dataset_factory.partitions[part_label]

            yield MetaPartition.from_partition(
                partition=part,
                metadata_version=dataset_factory.metadata_version,
                schema=dataset_factory.schema,
                partition_keys=dataset_factory.partition_keys,
                table_name=dataset_factory.table_name,
            )
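
A closing sketch combining predicate pushdown with `dispatch_by` against this final signature; `factory` and the column name are assumptions:

# Only partitions whose indexed values can satisfy the predicate are
# dispatched at all; each group then carries its `column_a` value as a
# logical_conjunction for later evaluation against the actual data.
for mp_group in dispatch_metapartitions_from_factory(
    factory,
    predicates=[[("column_a", ">", 1)]],
    dispatch_by=["column_a"],
):
    for mp in mp_group:
        ...  # hypothetical per-partition work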