예제 #1
0
        def expr(_when):
            """
            Build the column expression for column ``c``.
            :param _when: Condition handed to ``F.when`` to decide where the UDF result is used
            :return: The ``audf`` expression, wrapped in ``F.when(...).otherwise(F.col(c))`` when ``when`` is set
            """
            udf_expr = audf(c, func, func_return_type, args, func_type, verbose=verbose)

            # NOTE(review): this guard reads the enclosing `when`, not the `_when`
            # parameter -- confirm that both always carry the same value.
            if when is None:
                return udf_expr

            # Use the condition to filter the query; the fallback is the original column
            return F.when(_when, udf_expr).otherwise(F.col(c))
예제 #2
0
    def apply_expr(columns, func=None, args=None, filter_col_by_dtypes=None, verbose=True):
        """
        Apply an expression to one or more columns.
        :param columns: Columns to which the expression is going to be applied
        :param func: Expression to be applied
        :type func: A plain column expression or a function returning an expression
        :param args: Arguments passed to the function
        :param filter_col_by_dtypes: Only apply to columns of a specific data type (integer, float, string or
        bool); forwarded to parse_columns
        :param verbose: Print additional information
        :return: Dataframe
        """

        def func_col_exp(col_name, attr):
            # Adapter that gives a plain expression the (col_name, attr) call signature
            return func

        # A plain expression goes through the adapter; a function is used as is
        _func = func_col_exp if is_(func, F.Column) else func

        selected = parse_columns(self, columns, filter_by_column_dtypes=filter_col_by_dtypes,
                                 accepts_missing_cols=True)

        df = self
        for name in selected:
            new_column = audf(name, _func, attrs=args, func_type="column_exp", verbose=verbose)
            df = df.withColumn(name, new_column)
        return df
예제 #3
0
    def test_drop_audf():
        """
        Check ``rows.drop`` when the condition is an abstract UDF over column "num".
        """

        def func_data_type(value, attr):
            # Condition evaluated on the values of "num"; attr is not used
            return value > 1

        condition = audf("num", func_data_type, "boolean")
        actual_df = source_df.rows.drop(condition)

        # Expected dataframe: column specs first, then the single expected row
        expected_schema = [
            ("words", "str", True), ("num", "int", True),
            ("animals", "str", True), ("thing", StringType(), True),
            ("second", "int", True), ("filter", StringType(), True),
        ]
        expected_rows = [("  I like     fish  ", 1, "dog dog", "housé", 5, "a")]
        expected_df = op.create.df(expected_schema, expected_rows)

        assert expected_df.collect() == actual_df.collect()
예제 #4
0
    def _cast(cols, args):
        """
        Helper function to support the multiple params implementation.
        Every column in ``cols`` is cast using the first element of its matching entry in ``args``.
        :param cols: Names of the columns to cast
        :param args: One sequence per column; its first element is the target type
        :return: Dataframe with the cast columns
        """

        def cast_factory(cls):
            """
            Pick the casting strategy for ``cls``.
            :param cls: Target type
            :return: Tuple (func_return_type, casting function, func_type) to hand to audf
            """
            # Look the Spark data type up once and reuse it inside the closure below
            spark_dtype = get_spark_dtypes_object(cls)

            # Parse standard data types
            if spark_dtype:
                def cast_to_dtype(col_name, attr):
                    return F.col(col_name).cast(spark_dtype)

                return None, cast_to_dtype, "column_exp"

            # Parse to Vector
            if is_type(cls, Vectors):
                def cast_to_vectors(val, attr):
                    return Vectors.dense(val)

                return VectorUDT(), cast_to_vectors, "udf"

            # Add here any other parse you want
            RaiseIfNot.value_error(cls)
            # Safeguard: presumably the call above raises; if it ever returns, fail
            # explicitly instead of handing an unusable result back to the caller.
            raise ValueError("Unsupported cast type: {}".format(cls))

        df = self
        # Distinct loop names so the `args` parameter is not rebound while iterating
        for col_name, col_args in zip(cols, args):
            target_type = col_args[0]
            return_type, func, func_type = cast_factory(target_type)
            df = df.withColumn(
                col_name,
                audf(col_name,
                     func,
                     func_return_type=return_type,
                     attrs=target_type,
                     func_type=func_type,
                     verbose=False))
        return df
예제 #5
0
from optimus.functions import filter_row_by_data_type as fbdt

# Select rows with the filter_row_by_data_type expression built for column "filter" and
# data type "integer", then call .table() on the result.
df.rows.select(fbdt("filter", "integer")).table()
# -

# ### Create an abstract dataframe to filter the rows where the value of column "num" > 1

# +
from optimus.functions import abstract_udf as audf


def func(val, attr):
    """Return whether ``val`` is greater than 1; ``attr`` is accepted but not used."""
    is_above_one = val > 1
    return is_above_one


# Select rows with the abstract UDF built from func over column "num" (third argument
# "boolean"), then call .table() on the result.
df.rows.select(audf("num", func, "boolean")).table()
# -

# ### Create an abstract dataframe (Pandas UDF) to pass two arguments to a function and apply a sum operation

# +
from optimus.functions import abstract_udf as audf


def func(val, attr):
    """Return ``val`` plus the first two entries of ``attr``."""
    # Same left-to-right evaluation as a single chained sum
    partial = val + attr[0]
    return partial + attr[1]


# Add column "num_sum" from the abstract UDF built from func over "num", with "int" as the
# third argument and [10, 20] as the values func receives in attr, then call .table().
df.withColumn("num_sum", audf("num", func, "int", [10, 20])).table()

# -