Example #1
    def apply_by_dtypes(columns,
                        func,
                        func_return_type,
                        args=None,
                        func_type=None,
                        data_type=None):
        """
        Apply a function using pandas udf or udf if apache arrow is not available
        :param columns: Columns in which the function is going to be applied
        :param func: Functions to be applied to a columns
        :param func_return_type
        :param args:
        :param func_type: pandas_udf or udf. If none try to use pandas udf (Pyarrow needed)
        :param data_type:
        :return:
        """
        columns = parse_columns(self, columns)

        df = self
        for c in columns:
            # Accumulate the result so every column's transformation is kept,
            # not just the last one
            df = df.cols.apply(c,
                               func,
                               func_return_type,
                               args=args,
                               func_type=func_type,
                               when=fbdt(c, data_type))
        return df
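
A minimal usage sketch (hypothetical: df is an Optimus DataFrame and "price" is a string column mixing integers and words; the helper name to_float is illustrative):

def to_float(val, attr):
    # Runs only on cells detected as integers; other rows keep their value
    return float(val)


df = df.cols.apply_by_dtypes("price", to_float, "float", data_type="integer")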
Example #2
        def _count_data_types(col_name):
            """
            Function for determine if register value is float or int or string.
            :param col_name:
            :return:
            """
            temp = col_name + "_type"
            # Count by data type
            types = df.withColumn(temp, fbdt(
                col_name, get_type=True)).groupBy(temp).count().collect()

            count_by_data_type = {}

            for row in types:
                count_by_data_type[row[0]] = row[1]

            # Fill missing data types with 0
            count_by_data_type = fill_missing_var_types(count_by_data_type)

            # Subtract empty strings from the total string count
            count_empty_strings = df.where(F.col(col_name) == '').count()
            count_by_data_type['string'] -= count_empty_strings

            # Counts per concrete data type (nulls and empty strings are reported separately)
            data_types_count = {
                "string": count_by_data_type['string'],
                "bool": count_by_data_type['bool'],
                "int": count_by_data_type['int'],
                "float": count_by_data_type['float'],
                "date": count_by_data_type['date']
            }

            null_missed_count = {
                "null": count_by_data_type['null'],
                "missing": count_empty_strings,
            }

            # Get the greatest count by column data type
            greatest_data_type_count = max(data_types_count,
                                           key=data_types_count.get)

            if greatest_data_type_count == "string":
                cat = "categorical"
            elif greatest_data_type_count in ("int", "float"):
                cat = "numeric"
            elif greatest_data_type_count == "date":
                cat = "date"
            elif greatest_data_type_count == "bool":
                cat = "bool"
            else:
                cat = "null"

            col = {}
            col['dtype'] = greatest_data_type_count
            col['type'] = cat
            col['details'] = {**data_types_count, **null_missed_count}

            return col
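
For illustration, a mostly-integer column could come back shaped like this (a hypothetical sketch; the counts are invented):

col = {
    'dtype': 'int',        # data type with the greatest count
    'type': 'numeric',     # category derived from that dtype
    'details': {'string': 2, 'bool': 0, 'int': 95, 'float': 3,
                'date': 0, 'null': 5, 'missing': 2}
}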
Example #3
    def drop_by_dtypes(col_name, data_type=None):
        """
        Drop rows by cell data type
        :param col_name: Column in which the filter is going to be applied
        :param data_type: Filter by string, integer, float or boolean
        :return: Spark DataFrame
        """
        validate_columns_names(self, col_name)
        return self.rows.drop(fbdt(col_name, data_type))
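
A usage sketch (hypothetical, assuming the method is exposed as df.rows.drop_by_dtypes):

# Drop every row whose "price" cell is detected as a string
df = df.rows.drop_by_dtypes("price", "string")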
Example #4
    def select_by_dtypes(col_name, data_type=None):
        """
        This function has built in order to filter some type of row depending of the var type detected by python
        for Example if you have a column with
        | a |
        | 1 |
        | b |

        and you filter by type = integer you will get

        | 1 |

        :param col_name: Column to be filtered
        :param data_type: Datatype use filter values
        :return: Spark DataFrame
        """
        col_name = parse_columns(self, col_name)

        return self.where(fbdt(col_name, data_type))
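
A usage sketch (hypothetical, assuming the method is exposed as df.rows.select_by_dtypes; it is the wrapped form of the raw fbdt call shown in Example #6):

# Keep only the rows whose "filter" cell is detected as an integer
df = df.rows.select_by_dtypes("filter", "integer")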
Example #5
        def _count_data_types(col_name):
            """
            Function for determine if register value is float or int or string.
            :param col_name:
            :return:
            """
            logger.print("Processing column '" + col_name + "'...")
            # If String, process the data to try to infer which data type is inside. This a kind of optimization.
            # We do not need to analyze the data if the column data type is integer or boolean.etc

            temp = col_name + "_type"
            col_data_type = df.cols.dtypes(col_name)

            count_by_data_type = {}
            count_empty_strings = 0

            if infer is True and col_data_type == "string":

                types = (df.h_repartition(col_name=col_name).withColumn(
                    temp, fbdt(col_name,
                               get_type=True)).groupBy(temp).count().to_json())

                for row in types:
                    count_by_data_type[row[temp]] = row["count"]

                count_empty_strings = df.where(F.col(col_name) == '').count()

            else:
                nulls = df.cols.count_na(col_name)
                count_by_data_type[col_data_type] = int(df.count()) - nulls
                count_by_data_type["null"] = nulls

            count_by_data_type = fill_missing_var_types(count_by_data_type)

            # Counts per concrete data type (empty strings are reported separately as "missing")
            data_types_count = {
                "string": count_by_data_type['string'],
                "bool": count_by_data_type['bool'],
                "int": count_by_data_type['int'],
                "float": count_by_data_type['float'],
                "double": count_by_data_type['double'],
                "date": count_by_data_type['date'],
                "array": count_by_data_type['array']
            }

            null_missed_count = {
                "null": count_by_data_type['null'],
                "missing": count_empty_strings,
            }
            # Get the greatest count by column data type
            greatest_data_type_count = max(data_types_count,
                                           key=data_types_count.get)

            if greatest_data_type_count == "string":
                cat = "categorical"
            elif greatest_data_type_count in ("int", "float", "double"):
                cat = "numeric"
            elif greatest_data_type_count == "date":
                cat = "date"
            elif greatest_data_type_count == "bool":
                cat = "bool"
            elif greatest_data_type_count == "array":
                cat = "array"
            else:
                cat = "null"

            col = {}
            col['dtype'] = greatest_data_type_count
            col['type'] = cat
            col['details'] = {**data_types_count, **null_missed_count}

            return col
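
A sketch of what the fast path buys (hypothetical numbers): for a column Spark already types as int, with 100 rows and 5 nulls, the else branch skips the full value scan and yields

count_by_data_type = {"int": 95, "null": 5}  # fill_missing_var_types then pads the other types with 0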
Example #6

# +
def func(val, attr):
    # attr receives the extra argument (10) passed through cols.apply below
    return val + attr


df.cols.apply(["num", "new_col_1"], func, "int", 10).table()
# -

# ### Select rows where column "filter" is "integer"

# +
from optimus.functions import filter_row_by_data_type as fbdt

df.rows.select(fbdt("filter", "integer")).table()
# -

# ### Use an abstract UDF to filter the rows where the value of column "num" > 1

# +
from optimus.functions import abstract_udf as audf


def func(val, attr):
    return val > 1


df.rows.select(audf("num", func, "boolean")).table()
# -
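
A variant sketch (an assumption: that audf forwards a fourth positional argument to func's attr, the same way df.cols.apply forwards the 10 above):

# +
def greater_than(val, attr):
    # Hypothetical: the threshold 1 arrives here as `attr`
    return val > attr


df.rows.select(audf("num", greater_than, "boolean", 1)).table()
# -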