def test_filter_df_from_predicates_or_predicates():
    """Check that the conjunctions of a predicate list are OR-ed together.

    A row is expected in the output as soon as it satisfies at least one of
    the inner conjunctions; the original index labels are retained.
    """
    source = pd.DataFrame(
        {"A": range(10), "B": ["A", "B"] * 5, "C": range(-10, 0)}
    )

    # Case 1: the third conjunction matches no row at all, so the result is
    # simply the union of ``A < 3`` and ``A > 5``.
    result = filter_df_from_predicates(
        source,
        [[("A", "<", 3)], [("A", ">", 5)], [("B", "==", "non-existent")]],
    )
    wanted = pd.DataFrame(
        data={
            "A": [0, 1, 2, 6, 7, 8, 9],
            "B": ["A", "B", "A", "A", "B", "A", "B"],
            "C": [-10, -9, -8, -4, -3, -2, -1],
        },
        index=[0, 1, 2, 6, 7, 8, 9],
    )
    pdt.assert_frame_equal(result, wanted)

    # Case 2: ``B == "B"`` additionally pulls in the rows with A == 3 and
    # A == 5; only the row with A == 4 satisfies none of the conjunctions.
    result = filter_df_from_predicates(
        source,
        [[("A", "<", 3)], [("A", ">", 5)], [("B", "==", "B")]],
    )
    wanted = pd.DataFrame(
        data={
            "A": [0, 1, 2, 3, 5, 6, 7, 8, 9],
            "B": ["A", "B", "A", "B", "B", "A", "B", "A", "B"],
            "C": [-10, -9, -8, -7, -5, -4, -3, -2, -1],
        },
        index=[0, 1, 2, 3, 5, 6, 7, 8, 9],
    )
    pdt.assert_frame_equal(result, wanted)
def _apply_partition_key_predicates(self, indices, split_predicates):
    """
    Evaluate the partition-key part of every conjunction against this
    partition and collect the parts that still have to be pushed to the
    DataFrame serialiser.

    Parameters
    ----------
    indices
        Iterable of ``(column, value)`` pairs describing the partition keys.
    split_predicates
        Iterable of conjunctions, each exposing a ``key_part`` and a
        ``content_part``.

    Returns
    -------
    list or None
        The ``content_part`` of every conjunction whose ``key_part`` is empty
        or matches the partition keys. ``None`` is returned as soon as such a
        conjunction has an empty ``content_part``, since it then holds for the
        whole DataFrame and all data needs to be loaded.
    """
    schema = self.schema

    # Construct a single-row DataFrame holding the partition key values
    single_row = {}
    for column, value in indices:
        pa_dtype = schema[schema.get_field_index(column)].type
        value = IndexBase.normalize_value(pa_dtype, value)
        if pa.types.is_date(pa_dtype):
            as_timestamps = pd.to_datetime([value], infer_datetime_format=True)
            single_row[column] = pd.Series(as_timestamps).dt.date
        else:
            single_row[column] = pd.Series(
                [value], dtype=pa_dtype.to_pandas_dtype()
            )
    index_df = pd.DataFrame(single_row)

    # We assume that indices on the partition level have been filtered out
    # already in `dispatch_metapartitions`. `filtered_predicates` should only
    # contain predicates that can be evaluated on parquet level.
    filtered_predicates = []
    for conjunction in split_predicates:
        key_part = conjunction.key_part
        if len(key_part) != 0:
            matching_rows = filter_df_from_predicates(
                index_df, [key_part], strict_date_types=True
            )
            if len(matching_rows) == 0:
                # The partition keys rule this conjunction out entirely.
                continue

        content_part = conjunction.content_part
        if len(content_part) == 0:
            # A condition applies to the whole DataFrame, so we need to
            # load all data.
            return None
        filtered_predicates.append(content_part)

    return filtered_predicates
def test_filter_df_from_predicates_empty_in(value):
    """An ``in`` predicate with an empty list of candidates must select no rows."""
    frame = pd.DataFrame({"A": [value]})
    frame["B"] = range(len(frame))

    result = filter_df_from_predicates(frame, [[("A", "in", [])]])

    # Zero-row slice: same columns and dtypes as the input, but no data.
    no_rows = frame.iloc[[]]
    pdt.assert_frame_equal(result, no_rows, check_categorical=False)
def test_filter_df_from_predicates_bool(op, col):
    """Compare a single-literal predicate on a boolean column with plain pandas.

    ``op`` and ``col`` are presumably supplied by pytest parametrization (the
    decorator is not visible here). Column ``A`` holds only booleans, column
    ``B`` additionally contains ``None`` entries.
    """
    df = pd.DataFrame(
        {"A": [True, False] * 5, "B": [True, False, None, True, False] * 2}
    )

    value = True
    predicates = [[(col, op, value)]]
    actual = filter_df_from_predicates(df, predicates)

    # `pd.api.types.is_categorical` was deprecated in pandas 1.1 and removed
    # in 2.0; checking the dtype instance works on old and new versions alike.
    if isinstance(df[col].dtype, pd.api.types.CategoricalDtype):
        # Ordering comparisons are only defined for ordered categoricals.
        df[col] = df[col].astype(df[col].cat.as_ordered().dtype)

    # The expression refers to the locals `df`, `col` and `value` by name, so
    # these must not be renamed.
    expected = eval(f"df[df[col] {op} value]")
    pdt.assert_frame_equal(actual, expected, check_categorical=False)
def test_filter_df_from_predicates(op, data, value):
    """Compare a single-literal predicate on column ``A`` with plain pandas.

    ``op``, ``data`` and ``value`` are presumably supplied by pytest
    parametrization (the decorator is not visible here). ``data`` becomes
    column ``A``; column ``B`` is a running row counter.
    """
    df = pd.DataFrame({"A": data})
    df["B"] = range(len(df))

    predicates = [[("A", op, value)]]
    actual = filter_df_from_predicates(df, predicates)

    # `pd.api.types.is_categorical` was deprecated in pandas 1.1 and removed
    # in 2.0; checking the dtype instance works on old and new versions alike.
    if isinstance(df["A"].dtype, pd.api.types.CategoricalDtype):
        # Ordering comparisons are only defined for ordered categoricals.
        df["A"] = df["A"].astype(df["A"].cat.as_ordered().dtype)

    if isinstance(value, datetime.date) and (df["A"].dtype == "datetime64[ns]"):
        # silence pandas warning
        value = pd.Timestamp(value)

    # The expression refers to the locals `df` and `value` by name, so these
    # must not be renamed.
    expected = eval(f"df[df['A'] {op} value]")
    pdt.assert_frame_equal(actual, expected, check_categorical=False)
def test_filter_df_from_predicates(op, col):
    """Compare a single-literal predicate on one of several typed columns
    with plain pandas.

    ``op`` and ``col`` are presumably supplied by pytest parametrization (the
    decorator is not visible here). The frame offers an integer, a string, a
    categorical, a ``datetime.date`` and a ``datetime.datetime`` column; the
    comparison value is taken from row 4 of the selected column.
    """
    df = pd.DataFrame(
        {
            "A": range(10),
            "B": ["A", "B"] * 5,
            "C": pd.Series(["X", "Y"] * 5).astype("category"),
            "D": pd.Series(
                [datetime.date(2019, 1, 1), datetime.date(2019, 1, 2)] * 5
            ),
            "E": [datetime.datetime(2019, 1, 1), datetime.datetime(2019, 1, 2)] * 5,
        }
    )

    ix = 4
    value = df[col][ix]
    predicates = [[(col, op, value)]]
    actual = filter_df_from_predicates(df, predicates)

    # `pd.api.types.is_categorical` was deprecated in pandas 1.1 and removed
    # in 2.0; checking the dtype instance works on old and new versions alike.
    if isinstance(df[col].dtype, pd.api.types.CategoricalDtype):
        # Ordering comparisons are only defined for ordered categoricals.
        df[col] = df[col].astype(df[col].cat.as_ordered().dtype)

    # The expression refers to the locals `df`, `col` and `value` by name, so
    # these must not be renamed.
    expected = eval(f"df[df[col] {op} value]")
    pdt.assert_frame_equal(actual, expected, check_categorical=False)
def filter_df(self, df):
    """
    Filter given DataFrame w/ conjunction.

    Rows holding a NULL in any of the conjunction's columns are dropped
    first, i.e. NULL-values are always treated as non-matching.

    Parameters
    ----------
    df: pandas.DataFrame
        DataFrame to evaluate on, must contain required column.

    Returns
    -------
    result: pandas.DataFrame
        Part of the DataFrame for which the conjunction holds.
    """
    required_columns = list(self.columns)
    row_is_complete = df[required_columns].notnull().all(axis=1)
    df = df.loc[row_is_complete]

    predicate = self.predicate
    if predicate is not None:
        return filter_df_from_predicates(df, [predicate])

    # kartothek does not support empty predicate lists, so there is nothing
    # left to evaluate beyond the NULL filtering above.
    return df
def as_flat_series(
    self,
    compact: bool = False,
    partitions_as_index: bool = False,
    date_as_object: bool = False,
    predicates: PredicatesType = None,
):
    """
    Convert the Index object to a pandas.Series

    Parameters
    ----------
    compact:
        If True, ensures that the index will be unique. If there are
        multiple partition values per index, these values will be compacted
        into a list (see Examples section).
    partitions_as_index:
        If True, the relation between index values and partitions will be
        reverted for the output dataframe: partition values will be used as
        index and the indices will be mapped to the partitions.
    date_as_object:
        Forwarded to the ``to_pandas`` conversion of the intermediate index
        table.
    predicates:
        A list of predicates. If a literal within the provided predicates
        references a column which is not part of this index, this literal
        is interpreted as True.

    Examples:

    .. code::

        >>> index1 = ExplicitSecondaryIndex(
        ...     column="col", index_dct={1: ["part_1", "part_2"]}, dtype=pa.int64()
        ... )
        >>> index1
        col
        1    part_1
        1    part_2
        >>> index1.as_flat_series(compact=True)
        col
        1    [part_1, part_2]
        >>> index1.as_flat_series(partitions_as_index=True)
        partition
        part_1    1
        part_2    1

    """
    check_predicates(predicates)
    index_column = self.column
    table = _index_dct_to_table(
        self.index_dct, column=index_column, dtype=self.dtype
    )
    df = table.to_pandas(date_as_object=date_as_object)

    if predicates is not None:
        # If there is a conjunction without any reference to the index
        # column, the entire predicates expression evaluates to True and the
        # dataframe does not need to be filtered at all. `all` stops at the
        # first such conjunction.
        every_conjunction_uses_index = all(
            filter_predicates_by_column([conjunction], [index_column]) is not None
            for conjunction in predicates
        )
        if every_conjunction_uses_index:
            df = filter_df_from_predicates(
                df,
                predicates=filter_predicates_by_column(predicates, [index_column]),
            )

    # This is the way the dictionary is directly translated
    # value: [partition]
    if compact and not partitions_as_index:
        return df.set_index(index_column)[_PARTITION_COLUMN_NAME]

    # In all other circumstances we need a flat series first
    # value: part_1
    # value: part_2
    # value2: part_1
    partition_lists = df[_PARTITION_COLUMN_NAME]
    if len(df) > 0:
        keys = np.concatenate(partition_lists.values)
    else:
        # Nothing to concatenate for an empty frame; only keep the dtype.
        keys = np.array([], dtype=partition_lists.values.dtype)

    # Repeat every index value once per partition it is contained in.
    lengths = partition_lists.apply(len).values.astype(int)
    row_positions = np.repeat(np.arange(len(df)), lengths)
    values = df[index_column].values[row_positions]
    flat = pd.DataFrame({_PARTITION_COLUMN_NAME: keys, index_column: values})

    # if it is not inverted and not compact, we're done
    if not partitions_as_index:
        return flat.set_index(index_column)[_PARTITION_COLUMN_NAME]

    if not compact:
        return flat.set_index(_PARTITION_COLUMN_NAME)[index_column]

    # Inverted and compact: one list of index values per partition.
    grouped = flat.groupby(flat[_PARTITION_COLUMN_NAME]).apply(
        lambda group: group[index_column].tolist()
    )
    grouped.name = index_column
    return grouped