示例#1
0
文件: audf.py 项目: rubik-ai/Optimus
def filter_row_by_data_type_audf(col_name, data_type):
    """
    Build a boolean abstract UDF that filters a column by a reference data type.

    The data type is first normalized with parse_python_dtypes, then handed to
    abstract_udf together with filter_row_by_data_type and the "boolean"
    return type.
    :param col_name: Column the UDF is applied to
    :param data_type: Data type used as reference for the filter
    :return: Whatever abstract_udf returns for the given arguments
    """
    parsed_dtype = parse_python_dtypes(data_type)
    return abstract_udf(
        col_name,
        filter_row_by_data_type,
        "boolean",
        parsed_dtype,
    )
示例#2
0
文件: audf.py 项目: rubik-ai/Optimus
def filter_row_by_data_type(col_name, data_type=None, get_type=False):
    """
    Apply a Pandas UDF that runs Infer.func over every value of a column.

    With get_type set to True the UDF is declared with a "string" return type,
    otherwise with a "boolean" return type.
    :param col_name: Column to be processed (passed through one_list_to_val first)
    :param data_type: The data type to compare with; normalized with
    parse_python_dtypes when it is not None
    :param get_type: True to declare a string result, anything else for boolean
    :return: The result of calling the pandas UDF on the column
    """
    # Only normalize the reference type when the caller actually supplied one.
    parsed_dtype = None if data_type is None else parse_python_dtypes(data_type)

    # The inner function keeps its original name: Spark may derive the default
    # output column name from it.
    def pandas_udf_func(series):
        extra_args = (parsed_dtype, get_type)
        return series.apply(Infer.func, args=extra_args)

    return_data_type = "string" if get_type is True else "boolean"

    column = one_list_to_val(col_name)
    udf = F.pandas_udf(pandas_udf_func, return_data_type)
    return udf(column)
示例#3
0
def filter_row_by_data_type(col_name, data_type=None, get_type=False):
    """
    A Pandas UDF function that returns bool if the value match with the data_type param passed to the function.
    Also can return the data type.

    Each value is classified as one of "bool", "int", "float", "date", "array", "string" or "null".
    :param col_name: Column to be processed
    :param data_type: The data_type to be compared with; normalized with parse_python_dtypes when not None
    :param get_type: When False every value yields True/False (match against data_type);
    otherwise every value yields its inferred type name. The UDF is declared as "string"
    only when get_type is exactly True.
    :return: The result of applying the pandas UDF to the column
    """
    from ast import literal_eval

    if data_type is not None:
        data_type = parse_python_dtypes(data_type)

    def pandas_udf_func(v):
        def str_to_boolean(value):
            """
            Check if a str can be converted to boolean
            :param value: string to be checked
            :return: True if the string is "true" or "false" (case-insensitive)
            """
            return value.lower() in ("true", "false")

        def str_to_date(value):
            """
            Check if a str can be parsed as a date by dateutil
            :param value: string to be checked
            :return: True if dateutil parsed the string, False otherwise
            """
            try:
                dateutil.parser.parse(value)
            except (ValueError, OverflowError):
                return False
            return True

        def str_to_array(value):
            """
            Check if value can be parsed to a tuple or and array.
            Because Spark can handle tuples we will try to transform tuples to arrays
            :param value: string to be checked
            :return: True if the string evaluates to a list or tuple literal
            """
            # Non-ASCII characters are dropped before evaluation.
            ascii_text = value.encode('ascii', 'ignore').decode("utf-8")
            try:
                parsed = literal_eval(ascii_text)
            except (ValueError, SyntaxError, TypeError, MemoryError,
                    RecursionError):
                # literal_eval may raise any of these on malformed input
                # (e.g. "{[1]: 2}" raises TypeError). Such a value is simply
                # not an array and must not abort the whole UDF.
                return False
            return isinstance(parsed, (list, tuple))

        def func(value):
            """
            Check if a value can be casted to a specific
            :param value: value to be checked
            :return: True/False when get_type is False, otherwise the inferred type name
            """
            # bool is tested first so booleans are never classified as numbers.
            if isinstance(value, bool):
                _data_type = "bool"
            elif fastnumbers.isint(value):  # Check if value is integer
                _data_type = "int"
            elif fastnumbers.isfloat(value):
                _data_type = "float"
            # Remaining strings are tried as bool, date and array, in that order
            elif isinstance(value, str):
                if str_to_boolean(value):
                    _data_type = "bool"
                elif str_to_date(value):
                    _data_type = "date"
                elif str_to_array(value):
                    _data_type = "array"
                else:
                    _data_type = "string"
            else:
                _data_type = "null"

            if get_type is False:
                return _data_type == data_type
            return _data_type

        return v.apply(func)

    if get_type is True:
        return_data_type = "string"
    else:
        return_data_type = "boolean"

    col_name = one_list_to_val(col_name)
    return F.pandas_udf(pandas_udf_func, return_data_type)(col_name)