def expr(_when):
    """
    Build the column expression for one column.

    Rows matching the ``_when`` condition get the value computed by the
    abstract UDF; all other rows keep the original column value.

    :param _when: Spark Column condition (or None) selecting which rows
        receive the computed value.
    :return: Spark Column expression.
    """
    # Base expression from the abstract UDF. `c`, `func`, `func_return_type`,
    # `args`, `func_type` and `verbose` come from the enclosing scope.
    query = audf(c, func, func_return_type, args, func_type, verbose=verbose)

    # NOTE(review): the guard tests the enclosing-scope `when` while the
    # condition actually applied is the parameter `_when` — presumably both
    # refer to the same condition; confirm against the enclosing function.
    if when is not None:
        query = F.when(_when, query).otherwise(F.col(c))

    return query
def apply_expr(columns, func=None, args=None, filter_col_by_dtypes=None, verbose=True):
    """
    Apply an expression to one or more columns.

    :param columns: Columns in which the function is going to be applied
    :param func: function to be applied
    :type func: A plain expression or a function
    :param args: Argument passed to the function
    :param filter_col_by_dtypes: Only apply the filter to specific type of value ,integer, float, string or bool
    :param verbose: Print additional information about
    :return: Dataframe
    """

    # When `func` is already a plain Spark Column expression, wrap it in a
    # callable so the rest of the pipeline can treat both cases uniformly.
    def wrap_plain_expression(col_name, attr):
        return func

    _func = wrap_plain_expression if is_(func, F.Column) else func

    cols = parse_columns(self, columns,
                         filter_by_column_dtypes=filter_col_by_dtypes,
                         accepts_missing_cols=True)

    df = self
    for col_name in cols:
        df = df.withColumn(col_name,
                           audf(col_name, _func, attrs=args,
                                func_type="column_exp", verbose=verbose))
    return df
def test_drop_audf():
    """Dropping rows via an abstract UDF keeps only rows where num <= 1."""

    def keep_row(value, attr):
        # Rows for which this returns True are dropped.
        return value > 1

    actual_df = source_df.rows.drop(audf("num", keep_row, "boolean"))

    schema = [("words", "str", True),
              ("num", "int", True),
              ("animals", "str", True),
              ("thing", StringType(), True),
              ("second", "int", True),
              ("filter", StringType(), True)]
    rows = [(" I like fish ", 1, "dog dog", "housé", 5, "a")]
    expected_df = op.create.df(schema, rows)

    assert expected_df.collect() == actual_df.collect()
def _cast(cols, args):
    """
    Helper function to support the multiple params implementation.

    Casts each column in ``cols`` to the data type given in the matching
    entry of ``args``, dispatching either to a plain column expression
    (standard Spark data types) or to a UDF (Spark ML Vectors).

    :param cols: Column names to cast
    :param args: Per-column argument tuples; ``args[i][0]`` is the target
        data type for ``cols[i]``
    :return: Dataframe with the requested columns cast
    """

    def cast_factory(cls):
        """Return (func_return_type, cast function, func_type) for target type `cls`."""
        # Parse standard data types
        if get_spark_dtypes_object(cls):
            func_type = "column_exp"

            def cast_to_vectors(col_name, attr):
                return F.col(col_name).cast(get_spark_dtypes_object(cls))

            func_return_type = None

        # Parse to Vector
        elif is_type(cls, Vectors):
            func_type = "udf"

            def cast_to_vectors(val, attr):
                return Vectors.dense(val)

            func_return_type = VectorUDT()

        # Add here any other parse you want
        else:
            RaiseIfNot.value_error(cls)

        return func_return_type, cast_to_vectors, func_type

    df = self
    # Loop variable renamed from `args` to `arg`: the original shadowed the
    # `args` parameter inside the loop, which is confusing and error-prone.
    for col, arg in zip(cols, args):
        return_type, func, func_type = cast_factory(arg[0])
        df = df.withColumn(col,
                           audf(col, func, func_return_type=return_type,
                                attrs=arg[0], func_type=func_type,
                                verbose=False))
    return df
# Notebook-exported script (jupytext style): `# +` / `# -` mark cell
# boundaries and `# ###` lines are markdown headings. `df` and `op` are
# assumed to be defined in earlier cells — TODO confirm.
from optimus.functions import filter_row_by_data_type as fbdt

# Keep only the rows whose "filter" column value parses as an integer.
df.rows.select(fbdt("filter", "integer")).table()
# -

# ### Create an abstract dataframe to filter a rows where the value of column "num"> 1

# +
from optimus.functions import abstract_udf as audf


def func(val, attr):
    # Row predicate: True when the "num" value is greater than 1.
    return val > 1


df.rows.select(audf("num", func, "boolean")).table()
# -

# ### Create an abstract dataframe (Pandas UDF) to pass two arguments to a function a apply a sum operation

# +
from optimus.functions import abstract_udf as audf


def func(val, attr):
    # Sum the column value with the two extra arguments passed via `attr`
    # (here [10, 20], so num + 10 + 20).
    return val + attr[0] + attr[1]


df.withColumn("num_sum", audf("num", func, "int", [10, 20])).table()
# -