Example #1
 def drop_by_dtypes(input_cols, data_type=None):
     """
     Drop rows by cell data type
     :param input_cols: Column in which the filter is going to be applied
     :param data_type: Filter by string, integer, float or boolean
     :return: Spark DataFrame
     """
     validate_columns_names(self, input_cols)
     return self.rows.drop(fbdt(input_cols, data_type))
Example #2
 def drop_by_dtypes(input_cols, data_type=None):
     """
     Drop rows by cell data type
     :param input_cols: Column in which the filter is going to be applied
     :param data_type: Filter by string, integer, float or boolean
     :return: Spark DataFrame
     """
     df = self
     validate_columns_names(df, input_cols)
     df = df.rows.drop(fbdt(input_cols, data_type))
     df = df.preserve_meta(self, Actions.DROP_ROW.value, df.cols.names())
     return df
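Both variants assume they run attached to an Optimus-patched Spark DataFrame, with `self` bound by the enclosing closure and `fbdt` aliasing `filter_row_by_data_type` (see the import in Example #4). A minimal usage sketch, assuming an Optimus 2.x session and its `op.create.df` helper; the toy column and rows are made up for illustration:

from optimus import Optimus

op = Optimus()

# Toy data: the "num" column mixes integer-like and string cells
df = op.create.df(
    [("num", "str", True)],
    [("1",), ("a",), ("2",)],
)

# Drop every row whose "num" cell is detected as a string;
# only "1" and "2" should survive
df.rows.drop_by_dtypes("num", "string").table()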
Example #3
    def select_by_dtypes(input_cols, data_type=None):
        """
        This function was built to filter rows by the data type Python detects in each cell.
        For example, if you have a column with
        | a |
        | 1 |
        | b |

        and you filter by type = integer, you will get

        | 1 |

        :param input_cols: Column to be filtered
        :param data_type: Data type used to filter values
        :return: Spark DataFrame
        """
        input_cols = parse_columns(self, input_cols)

        return self.where(fbdt(input_cols, data_type))
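A matching sketch for the select variant, assuming it is exposed under the rows namespace like drop_by_dtypes and reusing the hypothetical `df` from the previous sketch; per the docstring's own example, filtering the mixed column by "integer" keeps only the integer-like cells:

# Keep only rows whose "num" cell is detected as an integer;
# with the toy data above this leaves "1" and "2"
df.rows.select_by_dtypes("num", "integer").table()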
Example #4
# ### Create a Pandas UDF function that adds a value (10 in this case) to two columns

# +
def func(val, attr):
    return val + attr


df.cols.apply(["num", "new_col_1"], func, "int", 10).table()
# -

# ### Select rows where the detected type of column "filter" is "integer"

# +

df.rows.select(fbdt("filter", "integer")).table()
# -

# ### Create an abstract UDF to filter rows where the value of column "num" > 1

# +
from optimus.audf import abstract_udf as audf, filter_row_by_data_type as fbdt


def func(val, attr):
    return val > 1


df.rows.select(audf("num", func, "boolean")).table()
# -
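# ### Pass the comparison value through `attr` instead of hard-coding it

# +
# A sketch, assuming audf forwards a fourth attrs argument to func the same
# way df.cols.apply passed 10 above; the threshold 5 is arbitrary
def func(val, attr):
    return val > attr


df.rows.select(audf("num", func, "boolean", 5)).table()
# -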
Example #5
        def _count_data_types(col_name):
            """
            Function for determine if register value is float or int or string.
            :param col_name:
            :return:
            """
            logger.print("Processing column '" + col_name + "'...")
            # If the column is a string, process the data to try to infer which data type is inside.
            # This is an optimization: we do not need to analyze the data if the column
            # data type is already integer, boolean, etc.

            temp = col_name + "_type"
            col_data_type = df.cols.dtypes(col_name)

            count_by_data_type = {}
            count_empty_strings = 0

            if infer is True and col_data_type == "string":

                types = (df.h_repartition(col_name=col_name).withColumn(
                    temp, fbdt(col_name,
                               get_type=True)).groupBy(temp).count().to_json())

                for row in types:
                    count_by_data_type[row[temp]] = row["count"]

                count_empty_strings = df.where(F.col(col_name) == '').count()

            else:
                nulls = df.cols.count_na(col_name)
                count_by_data_type[col_data_type] = int(df.count()) - nulls
                count_by_data_type["null"] = nulls

            count_by_data_type = fill_missing_var_types(count_by_data_type)

            # Collect the count for every supported data type
            data_types_count = {
                "string": count_by_data_type['string'],
                "bool": count_by_data_type['bool'],
                "int": count_by_data_type['int'],
                "float": count_by_data_type['float'],
                "double": count_by_data_type['double'],
                "date": count_by_data_type['date'],
                "array": count_by_data_type['array']
            }

            null_missed_count = {
                "null": count_by_data_type['null'],
                "missing": count_empty_strings,
            }
            # Pick the data type with the greatest count
            greatest_data_type_count = max(data_types_count,
                                           key=data_types_count.get)

            if greatest_data_type_count == "string":
                cat = "categorical"
            elif greatest_data_type_count in ("int", "float", "double"):
                cat = "numeric"
            elif greatest_data_type_count == "date":
                cat = "date"
            elif greatest_data_type_count == "bool":
                cat = "bool"
            elif greatest_data_type_count == "array":
                cat = "array"
            else:
                cat = "null"

            col = {}
            col['dtype'] = greatest_data_type_count
            col['type'] = cat
            col['details'] = {**data_types_count, **null_missed_count}

            return col
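The category decision above reduces to taking the dict key with the largest value; a standalone, pure-Python sketch with made-up counts:

# Made-up counts for illustration
data_types_count = {"string": 2, "bool": 0, "int": 7, "float": 1,
                    "double": 0, "date": 0, "array": 0}

# max over a dict iterates its keys; key=dict.get compares by count
greatest = max(data_types_count, key=data_types_count.get)
assert greatest == "int"  # 7 is the largest count, so the column is "numeric"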
Example #6
        def _count_data_types(col_name):
            """
            Function for determine if register value is float or int or string.
            :param col_name:
            :return:
            """

            # If the column is a string, process the data to try to infer which data type is inside.
            # This is an optimization: we do not need to analyze the data if the column
            # data type is already integer, boolean, etc.

            temp = col_name + "_type"
            col_data_type = df.cols.dtypes(col_name)

            # Parse dtype
            if col_data_type == "smallint" or col_data_type == "tinyint":
                col_data_type = "int"
            elif col_data_type == "float" or col_data_type == "double":
                col_data_type = "decimal"
            elif col_data_type.find("array") >= 0:
                col_data_type = "array"

            count_by_data_type = {}
            count_empty_strings = 0

            if infer is True and col_data_type == "string":
                logger.print("Processing column '" + col_name + "'...")
                types = collect_as_dict(df
                                        .h_repartition(col_name=col_name)
                                        .withColumn(temp, fbdt(col_name, get_type=True))
                                        .groupBy(temp).count()
                                        )

                for row in types:
                    count_by_data_type[row[temp]] = row["count"]

                count_empty_strings = df.where(F.col(col_name) == '').count()

            else:
                # Boolean columns do not support count_na, so it may be absent from stats
                if "count_na" in stats[col_name]:
                    nulls = stats[col_name]["count_na"]
                    count_by_data_type[col_data_type] = int(df_count) - nulls
                    count_by_data_type["null"] = nulls

            count_by_data_type = fill_missing_var_types(count_by_data_type)

            # Track nulls and empty strings ("missing") separately
            null_missed_count = {"null": count_by_data_type['null'],
                                 "missing": count_empty_strings,
                                 }
            # Pick the data type with the greatest count
            greatest_data_type_count = max(count_by_data_type, key=count_by_data_type.get)

            if greatest_data_type_count == "string" or greatest_data_type_count == "boolean":
                cat = "categorical"
            elif greatest_data_type_count == "int" or greatest_data_type_count == "decimal":
                cat = "numeric"
            elif greatest_data_type_count == "date":
                cat = "date"
            elif greatest_data_type_count == "array":
                cat = "array"
            elif greatest_data_type_count == "binary":
                cat = "binary"
            elif greatest_data_type_count == "null":
                cat = "null"
            else:
                cat = None

            col = {}
            col['dtype'] = greatest_data_type_count
            col['type'] = cat
            col['details'] = {**count_by_data_type, **null_missed_count}

            return col
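The dtype normalization at the top of this variant is easy to check in isolation. A standalone sketch of the same mapping; the function name is ours, not the library's:

def parse_spark_dtype(col_data_type):
    # Collapse Spark's integer and floating-point variants and flag arrays,
    # mirroring the branch at the top of _count_data_types
    if col_data_type in ("smallint", "tinyint"):
        return "int"
    if col_data_type in ("float", "double"):
        return "decimal"
    if col_data_type.find("array") >= 0:
        return "array"
    return col_data_type


assert parse_spark_dtype("tinyint") == "int"
assert parse_spark_dtype("array<string>") == "array"
assert parse_spark_dtype("string") == "string"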