def as_flat_series(
    self,
    compact: bool = False,
    partitions_as_index: bool = False,
    date_as_object: bool = False,
    predicates: PredicatesType = None,
):
    """
    Convert the Index object to a pandas.Series.

    Parameters
    ----------
    compact:
        If True, ensures that the index of the returned series is unique. If
        there are multiple partition values per index value, these values are
        compacted into a list (see Examples section).
    partitions_as_index:
        If True, the relation between index values and partitions is inverted
        for the output: partition values are used as index and the index
        values are mapped to the partitions.
    date_as_object:
        Cast dates to objects.
    predicates:
        A list of predicates. If a literal within the provided predicates
        references a column which is not part of this index, this literal is
        interpreted as True.

    Examples:

    .. code::

        >>> index1 = ExplicitSecondaryIndex(
        ...     column="col", index_dct={1: ["part_1", "part_2"]}, dtype=pa.int64()
        ... )
        >>> index1
        col
        1    part_1
        1    part_2
        >>> index1.as_flat_series(compact=True)
        col
        1    [part_1, part_2]
        >>> index1.as_flat_series(partitions_as_index=True)
        partition
        part_1    1
        part_2    1
    """
    check_predicates(predicates)
    table = _index_dct_to_table(self.index_dct, column=self.column, dtype=self.dtype)
    df = table.to_pandas(date_as_object=date_as_object)

    if predicates is not None:
        # If there is a conjunction without any reference to the index
        # column, the entire predicates expression evaluates to True. In
        # this case we do not need to filter the dataframe at all.
        for conjunction in predicates:
            new_conjunction = filter_predicates_by_column([conjunction], [self.column])
            if new_conjunction is None:
                break
        else:
            filtered_predicates = filter_predicates_by_column(
                predicates, [self.column]
            )
            df = filter_df_from_predicates(df, predicates=filtered_predicates)

    result_column = _PARTITION_COLUMN_NAME
    # This is the way the dictionary is directly translated
    # value: [partition]
    if compact and not partitions_as_index:
        return df.set_index(self.column)[result_column]

    # In all other circumstances we need a flat series first
    # value: part_1
    # value: part_2
    # value2: part_1
    if partitions_as_index or not compact:
        if len(df) == 0:
            keys = np.array([], dtype=df[_PARTITION_COLUMN_NAME].values.dtype)
        else:
            keys = np.concatenate(df[_PARTITION_COLUMN_NAME].values)

        lengths = df[_PARTITION_COLUMN_NAME].apply(len).values
        lengths = lengths.astype(int)
        values_index = np.repeat(np.arange(len(df)), lengths)
        values = df[self.column].values[values_index]

        df = pd.DataFrame({_PARTITION_COLUMN_NAME: keys, self.column: values})

    # if it is not inverted and not compact, we're done
    if partitions_as_index:
        result_index = _PARTITION_COLUMN_NAME
        if compact:
            df = df.groupby(df[result_index]).apply(
                lambda x: x[self.column].tolist()
            )
            df.name = self.column
        else:
            df = df.set_index(result_index)[self.column]
    else:
        df = df.set_index(self.column)[_PARTITION_COLUMN_NAME]
    return df
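

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the source above): exercising the
# `predicates` argument of `as_flat_series`. The predicate format
# [[("col", "==", 1)]] (a list of AND-conjunctions of (column, op, value)
# literals) follows the `check_predicates`/`filter_df_from_predicates` helpers
# used above; the kartothek import path is an assumption and may need
# adjusting to your package layout.
# ---------------------------------------------------------------------------
import pyarrow as pa

from kartothek.core.index import ExplicitSecondaryIndex  # assumed import path

index1 = ExplicitSecondaryIndex(
    column="col",
    index_dct={1: ["part_1", "part_2"], 2: ["part_3"]},
    dtype=pa.int64(),
)
# Only rows whose "col" value satisfies the predicate survive:
print(index1.as_flat_series(predicates=[[("col", "==", 1)]]))
# Expected (approximately):
# col
# 1    part_1
# 1    part_2
# Name: partition, dtype: object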

def dispatch_metapartitions_from_factory(
    dataset_factory: DatasetFactory,
    label_filter: Optional[Callable] = None,
    concat_partitions_on_primary_index: bool = False,
    predicates: PredicatesType = None,
    store: Optional[StoreInput] = None,
    dispatch_by: Optional[List[str]] = None,
    dispatch_metadata: bool = False,
) -> Union[Iterator[MetaPartition], Iterator[List[MetaPartition]]]:
    """
    :meta private:
    """
    if dispatch_by is not None and concat_partitions_on_primary_index:
        raise ValueError(
            "Both `dispatch_by` and `concat_partitions_on_primary_index` are provided, "
            "`concat_partitions_on_primary_index` is deprecated and will be removed in the next major release. "
            "Please only provide the `dispatch_by` argument. "
        )
    if concat_partitions_on_primary_index:
        dispatch_by = dataset_factory.partition_keys

    if dispatch_by is not None and not set(dispatch_by).issubset(
        set(dataset_factory.index_columns)
    ):
        raise RuntimeError(
            f"Dispatch columns must be indexed.\nRequested index: {dispatch_by} but available index columns: {sorted(dataset_factory.index_columns)}"
        )
    check_predicates(predicates)

    # Determine which indices need to be loaded.
    index_cols: Set[str] = set()
    if dispatch_by:
        index_cols |= set(dispatch_by)

    if predicates:
        predicate_cols = set(columns_in_predicates(predicates))
        predicate_index_cols = predicate_cols & set(dataset_factory.index_columns)
        index_cols |= predicate_index_cols

    for col in index_cols:
        dataset_factory.load_index(col)

    base_df = dataset_factory.get_indices_as_dataframe(
        list(index_cols), predicates=predicates
    )

    if label_filter:
        base_df = base_df[base_df.index.map(label_filter)]

    indices_to_dispatch = {
        name: ix.unload()
        for name, ix in dataset_factory.indices.items()
        if isinstance(ix, ExplicitSecondaryIndex)
    }

    if dispatch_by is not None:
        base_df = cast(pd.DataFrame, base_df)

        if len(dispatch_by) == 0:
            merged_partitions = [("", base_df)]
        else:
            # Group the resulting MetaPartitions by partition keys or a subset of those keys
            merged_partitions = base_df.groupby(
                by=list(dispatch_by), sort=True, as_index=False
            )

        for group_name, group in merged_partitions:
            if not isinstance(group_name, tuple):
                group_name = (group_name,)  # type: ignore
            mps = []
            logical_conjunction = list(
                zip(dispatch_by, ["=="] * len(dispatch_by), group_name)
            )
            for label in group.index.unique():
                mps.append(
                    MetaPartition.from_partition(
                        partition=dataset_factory.partitions[label],
                        dataset_metadata=dataset_factory.metadata
                        if dispatch_metadata
                        else None,
                        indices=indices_to_dispatch if dispatch_metadata else None,
                        metadata_version=dataset_factory.metadata_version,
                        table_meta=dataset_factory.table_meta,
                        partition_keys=dataset_factory.partition_keys,
                        logical_conjunction=logical_conjunction,
                    )
                )
            yield mps
    else:
        for part_label in base_df.index.unique():
            part = dataset_factory.partitions[part_label]

            yield MetaPartition.from_partition(
                partition=part,
                dataset_metadata=dataset_factory.metadata
                if dispatch_metadata
                else None,
                indices=indices_to_dispatch if dispatch_metadata else None,
                metadata_version=dataset_factory.metadata_version,
                table_meta=dataset_factory.table_meta,
                partition_keys=dataset_factory.partition_keys,
            )
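

# ---------------------------------------------------------------------------
# Small self-contained sketch (not from the source) of how the per-group
# `logical_conjunction` above is assembled: one "==" literal per dispatch
# column, using the values that identify the group. The column names and
# group values below are made up for illustration.
# ---------------------------------------------------------------------------
dispatch_by = ["country", "year"]
group_name = ("DE", 2020)
logical_conjunction = list(zip(dispatch_by, ["=="] * len(dispatch_by), group_name))
print(logical_conjunction)
# [('country', '==', 'DE'), ('year', '==', 2020)]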

def dispatch_metapartitions_from_factory(
    dataset_factory,
    label_filter=None,
    concat_partitions_on_primary_index=False,
    predicates=None,
    store=None,
    dispatch_by=None,
) -> Union[Iterator[MetaPartition], Iterator[List[MetaPartition]]]:
    if not callable(dataset_factory) and not isinstance(
        dataset_factory, DatasetFactory
    ):
        raise TypeError("Need to supply a dataset factory!")

    if dispatch_by and concat_partitions_on_primary_index:
        raise ValueError(
            "Both `dispatch_by` and `concat_partitions_on_primary_index` are provided, "
            "`concat_partitions_on_primary_index` is deprecated and will be removed in the next major release. "
            "Please only provide the `dispatch_by` argument. "
        )
    if concat_partitions_on_primary_index:
        warnings.warn(
            "The keyword `concat_partitions_on_primary_index` is deprecated and will be removed in the next major release. Use `dispatch_by=dataset_factory.partition_keys` to achieve the same behavior instead.",
            DeprecationWarning,
        )
        dispatch_by = dataset_factory.partition_keys

    if dispatch_by and not set(dispatch_by).issubset(
        set(dataset_factory.index_columns)
    ):
        raise RuntimeError(
            f"Dispatch columns must be indexed.\nRequested index: {dispatch_by} but available index columns: {sorted(dataset_factory.index_columns)}"
        )
    check_predicates(predicates)

    # Determine which indices need to be loaded.
    index_cols: Set[str] = set()
    if dispatch_by:
        index_cols |= set(dispatch_by)

    if predicates:
        predicate_cols = set(columns_in_predicates(predicates))
        predicate_index_cols = predicate_cols & set(dataset_factory.index_columns)
        index_cols |= predicate_index_cols

    for col in index_cols:
        dataset_factory.load_index(col)

    base_df = dataset_factory.get_indices_as_dataframe(
        list(index_cols), predicates=predicates
    )

    if label_filter:
        base_df = base_df[base_df.index.map(label_filter)]

    indices_to_dispatch = {
        name: ix.unload()
        for name, ix in dataset_factory.indices.items()
        if isinstance(ix, ExplicitSecondaryIndex)
    }

    if dispatch_by:
        base_df = cast(pd.DataFrame, base_df)

        # Group the resulting MetaPartitions by partition keys or a subset of those keys
        merged_partitions = base_df.groupby(
            by=list(dispatch_by), sort=False, as_index=False
        )
        for group_name, group in merged_partitions:
            if not isinstance(group_name, tuple):
                group_name = (group_name,)
            mps = []
            logical_conjunction = list(
                zip(dispatch_by, ["=="] * len(dispatch_by), group_name)
            )
            for label in group.index.unique():
                mps.append(
                    MetaPartition.from_partition(
                        partition=dataset_factory.partitions[label],
                        dataset_metadata=dataset_factory.metadata,
                        indices=indices_to_dispatch,
                        metadata_version=dataset_factory.metadata_version,
                        table_meta=dataset_factory.table_meta,
                        partition_keys=dataset_factory.partition_keys,
                        logical_conjunction=logical_conjunction,
                    )
                )
            yield mps
    else:
        for part_label in base_df.index.unique():
            part = dataset_factory.partitions[part_label]

            yield MetaPartition.from_partition(
                partition=part,
                dataset_metadata=dataset_factory.metadata,
                indices=indices_to_dispatch,
                metadata_version=dataset_factory.metadata_version,
                table_meta=dataset_factory.table_meta,
                partition_keys=dataset_factory.partition_keys,
            )

def dispatch_metapartitions_from_factory(
    dataset_factory: DatasetFactory,
    predicates: PredicatesType = None,
    dispatch_by: Optional[List[str]] = None,
) -> Union[Iterator[MetaPartition], Iterator[List[MetaPartition]]]:
    """
    :meta private:
    """
    if dispatch_by is not None and not set(dispatch_by).issubset(
        set(dataset_factory.index_columns)
    ):
        raise RuntimeError(
            f"Dispatch columns must be indexed.\nRequested index: {dispatch_by} but available index columns: {sorted(dataset_factory.index_columns)}"
        )
    check_predicates(predicates)

    # Determine which indices need to be loaded.
    index_cols: Set[str] = set()
    if dispatch_by:
        index_cols |= set(dispatch_by)

    if predicates:
        predicate_cols = set(columns_in_predicates(predicates))
        predicate_index_cols = predicate_cols & set(dataset_factory.index_columns)
        index_cols |= predicate_index_cols

    for col in index_cols:
        dataset_factory.load_index(col)

    base_df = dataset_factory.get_indices_as_dataframe(
        list(index_cols), predicates=predicates
    )

    if dispatch_by is not None:
        base_df = cast(pd.DataFrame, base_df)

        if len(dispatch_by) == 0:
            merged_partitions = [("", base_df)]
        else:
            # Group the resulting MetaPartitions by partition keys or a subset of those keys
            merged_partitions = base_df.groupby(
                by=list(dispatch_by), sort=True, as_index=False
            )

        for group_name, group in merged_partitions:
            if not isinstance(group_name, tuple):
                group_name = (group_name,)  # type: ignore
            mps = []
            logical_conjunction = list(
                zip(dispatch_by, ["=="] * len(dispatch_by), group_name)
            )
            for label in group.index.unique():
                mps.append(
                    MetaPartition.from_partition(
                        partition=dataset_factory.partitions[label],
                        metadata_version=dataset_factory.metadata_version,
                        schema=dataset_factory.schema,
                        partition_keys=dataset_factory.partition_keys,
                        logical_conjunction=logical_conjunction,
                        table_name=dataset_factory.table_name,
                    )
                )
            yield mps
    else:
        for part_label in base_df.index.unique():
            part = dataset_factory.partitions[part_label]

            yield MetaPartition.from_partition(
                partition=part,
                metadata_version=dataset_factory.metadata_version,
                schema=dataset_factory.schema,
                partition_keys=dataset_factory.partition_keys,
                table_name=dataset_factory.table_name,
            )
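

# ---------------------------------------------------------------------------
# Hypothetical calling sketch (not from the source): how the generator above
# might be consumed. `my_factory` stands in for a DatasetFactory pointing at
# an existing dataset with a secondary index on "country"; the column name
# and predicate values are made up for illustration.
#
# Without `dispatch_by`, the generator yields one MetaPartition per partition:
#
#     for mp in dispatch_metapartitions_from_factory(
#         my_factory, predicates=[[("country", "==", "DE")]]
#     ):
#         ...  # mp is a single MetaPartition
#
# With `dispatch_by`, it yields one *list* of MetaPartitions per group of
# dispatch-column values, each carrying the matching `logical_conjunction`:
#
#     for mp_group in dispatch_metapartitions_from_factory(
#         my_factory, dispatch_by=["country"]
#     ):
#         ...  # mp_group is a list of MetaPartition objects for one country
# ---------------------------------------------------------------------------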