Example #1
    def columns_stats(self,
                      df,
                      columns,
                      buckets=10,
                      infer=False,
                      relative_error=RELATIVE_ERROR,
                      approx_count=True,
                      mismatch=None):
        """
        Return statistical information about the specified columns in json format
        :param df: Dataframe to be processed
        :param columns: Columns that you want to profile
        :param buckets: Number of equal-width buckets (bins) used to build histograms
        :param infer: try to infer the column datatype
        :param relative_error: relative error used when percentiles are calculated. 0 is more exact but slower; 1 allows more error but is faster
        :param approx_count: Use the function approx_count_distinct or countDistinct. approx_count_distinct is faster
        :param mismatch:
        :return: json object
        """
        if self.rows_count is None:
            self.rows_count = df.count()
        columns = parse_columns(df, columns)

        # Initialize Objects
        logger.print("Processing Stats For columns...")

        # Get columns data types. This is necessary to make the pertinent histogram calculations.
        type_details = self._count_data_types(df, columns, infer, mismatch)

        # Count the categorical, numerical, boolean and date columns
        count_types = {}
        for value in type_details.values():
            name = value["dtype"]
            if name in count_types:
                count_types[name] += 1
            else:
                count_types[name] = 1

        # List the data types this data set has
        total = 0
        dtypes = []
        for key, value in count_types.items():
            if value > 0:
                dtypes.append(key)
                total = total + 1

        count_types = fill_missing_col_types(count_types)

        columns_info = {}
        columns_info["count_types"] = count_types
        columns_info["total_count_dtypes"] = total
        columns_info["dtypes_list"] = dtypes
        columns_info["columns"] = type_details

        # Aggregation
        stats = Profiler.columns_agg(df, columns, buckets, relative_error,
                                     approx_count)

        # Calculate Frequency
        logger.print("Processing Frequency ...")
        df_freq = df.cols.select("*",
                                 data_type=PYSPARK_NUMERIC_TYPES,
                                 invert=True)
        freq = None
        if df_freq is not None:
            freq = df_freq.cols.frequency("*", buckets, True, self.rows_count)

        # Calculate percentage
        for col_name in columns:
            col_info = {}
            assign(col_info, "stats", stats[col_name], dict)

            if freq is not None:
                if col_name in freq:
                    assign(col_info, "frequency", freq[col_name])

            col_info["stats"].update(
                self.extra_columns_stats(df, col_name, stats))
            assign(col_info, "name", col_name)
            assign(col_info, "column_dtype",
                   columns_info["columns"][col_name]['dtype'])
            assign(col_info, "dtypes_stats",
                   columns_info["columns"][col_name]['stats'])
            assign(col_info, "column_type",
                   columns_info["columns"][col_name]['type'])
            assign(columns_info, "columns." + col_name, col_info, dict)

            assign(col_info, "id", df.cols.get_meta(col_name, "id"))

        return columns_info
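A minimal, self-contained sketch of the dtype tallying step above (the loop that builds count_types, dtypes and total), using made-up type_details values in place of the profiler's _count_data_types output:

from collections import Counter

# Hypothetical per-column details, mimicking the shape _count_data_types returns
type_details = {
    "name": {"dtype": "string", "type": "categorical"},
    "age": {"dtype": "int", "type": "numeric"},
    "height": {"dtype": "float", "type": "numeric"},
}

# Tally how many columns fall under each dtype
count_types = Counter(info["dtype"] for info in type_details.values())

# Data types present in this data set and how many distinct dtypes there are
dtypes = [dtype for dtype, count in count_types.items() if count > 0]
total = len(dtypes)

print(dict(count_types))  # {'string': 1, 'int': 1, 'float': 1}
print(dtypes, total)      # ['string', 'int', 'float'] 3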
Example #2
    def count_data_types(df, columns, infer=False):
        """
        Count the number of int, float, string, date and boolean values and output the counts in json format
        :param df: Dataframe to be processed
        :param columns: Columns to be processed
        :param infer: infer the column datatype
        :return: json
        """
        @time_it
        def _count_data_types(col_name):
            """
            Determine whether each record's value is a float, int or string.
            :param col_name:
            :return:
            """
            logger.print("Processing column '" + col_name + "'...")
            # If the column is a string, process the data to try to infer which data type is inside. This is a kind of optimization;
            # we do not need to analyze the data if the column data type is integer, boolean, etc.

            temp = col_name + "_type"
            col_data_type = df.cols.dtypes(col_name)

            count_by_data_type = {}
            count_empty_strings = 0

            if infer is True and col_data_type == "string":

                types = (df.h_repartition(col_name=col_name).withColumn(
                    temp, fbdt(col_name,
                               get_type=True)).groupBy(temp).count().to_json())

                for row in types:
                    count_by_data_type[row[temp]] = row["count"]

                count_empty_strings = df.where(F.col(col_name) == '').count()

            else:
                nulls = df.cols.count_na(col_name)
                count_by_data_type[col_data_type] = int(df.count()) - nulls
                count_by_data_type["null"] = nulls

            count_by_data_type = fill_missing_var_types(count_by_data_type)

            # Counts per supported data type
            data_types_count = {
                "string": count_by_data_type['string'],
                "bool": count_by_data_type['bool'],
                "int": count_by_data_type['int'],
                "float": count_by_data_type['float'],
                "double": count_by_data_type['double'],
                "date": count_by_data_type['date'],
                "array": count_by_data_type['array']
            }

            null_missed_count = {
                "null": count_by_data_type['null'],
                "missing": count_empty_strings,
            }
            # Get the greatest count by column data type
            greatest_data_type_count = max(data_types_count,
                                           key=data_types_count.get)

            if greatest_data_type_count == "string":
                cat = "categorical"
            elif greatest_data_type_count in ("int", "float", "double"):
                cat = "numeric"
            elif greatest_data_type_count == "date":
                cat = "date"
            elif greatest_data_type_count == "bool":
                cat = "bool"
            elif greatest_data_type_count == "array":
                cat = "array"
            else:
                cat = "null"

            col = {}
            col['dtype'] = greatest_data_type_count
            col['type'] = cat
            col['details'] = {**data_types_count, **null_missed_count}

            return col

        columns = parse_columns(df, columns)

        # Info from all the columns
        type_details = {c: _count_data_types(c) for c in columns}

        results = {}
        count_types = {}

        # Count the categorical, numerical, boolean and date columns
        for v in type_details.values():
            name = v["type"]
            if name in count_types:
                count_types[name] += 1
            else:
                count_types[name] = 1

        count_types = fill_missing_col_types(count_types)

        results["count_types"] = count_types
        results["columns"] = type_details
        return results
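The heart of _count_data_types is picking the data type with the greatest count and mapping it to a profiler category. A stand-alone sketch of that rule with hypothetical counts (note the use of == rather than is for string comparison):

# Hypothetical per-type counts for one column
data_types_count = {"string": 3, "bool": 0, "int": 120, "float": 2,
                    "double": 0, "date": 0, "array": 0}

# max() with key=dict.get returns the key whose value is greatest
greatest_data_type_count = max(data_types_count, key=data_types_count.get)

# Map the dominant dtype to the profiler category
categories = {"string": "categorical", "int": "numeric", "float": "numeric",
              "double": "numeric", "date": "date", "bool": "bool", "array": "array"}
cat = categories.get(greatest_data_type_count, "null")

print(greatest_data_type_count, cat)  # int numeric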
Example #3
    def columns_stats(self,
                      df,
                      columns,
                      buckets=10,
                      infer=False,
                      relative_error=RELATIVE_ERROR,
                      approx_count=True,
                      mismatch=None,
                      advanced_stats=True):
        """
        Return statistical information about the specified columns in json format
        :param df: Dataframe to be processed
        :param columns: Columns that you want to profile
        :param buckets: Number of equal-width buckets (bins) used to build histograms
        :param infer: try to infer the column dataType
        :param relative_error: relative error used when percentiles are calculated.
        0 is more precise but slower; 1 is less precise but faster
        :param approx_count: Use the function approx_count_distinct or countDistinct. approx_count_distinct is faster
        :param mismatch:
        :param advanced_stats: passed through to columns_agg
        :return: json object
        """

        columns = parse_columns(df, columns)

        # Initialize Objects
        logger.print("Processing Stats For columns...")

        # Get columns data types. This is necessary to make the pertinent histogram calculations.
        count_by_data_type = df.cols.count_by_dtypes(columns,
                                                     infer=infer,
                                                     mismatch=mismatch)

        count_by_data_type_no_mismatch = copy.deepcopy(count_by_data_type)

        # Info from all the columns
        type_details = {}

        for col_name in columns:
            # Not count mismatch
            if "mismatch" in count_by_data_type_no_mismatch[col_name]:
                count_by_data_type_no_mismatch[col_name].pop("mismatch")

            # Get the greatest count by column data type
            greatest_data_type_count = max(
                count_by_data_type_no_mismatch[col_name],
                key=count_by_data_type_no_mismatch[col_name].get)
            cat = PYTHON_TO_PROFILER.get(greatest_data_type_count)

            assign(type_details, col_name + ".dtype", greatest_data_type_count,
                   dict)
            assign(type_details, col_name + ".type", cat, dict)
            assign(type_details, col_name + ".stats",
                   count_by_data_type[col_name], dict)

        # Count the categorical, numerical, boolean and date columns
        count_types = {}
        for value in type_details.values():
            name = value["dtype"]
            if name in count_types:
                count_types[name] += 1
            else:
                count_types[name] = 1

        # List the data types this data set have
        dtypes = [key for key, value in count_types.items() if value > 0]

        columns_info = {}
        columns_info["count_types"] = fill_missing_col_types(count_types)
        columns_info["total_count_dtypes"] = len(dtypes)
        columns_info["dtypes_list"] = dtypes
        columns_info["columns"] = type_details

        # Aggregation
        stats = self.columns_agg(df, columns, buckets, relative_error,
                                 approx_count, advanced_stats)

        # Calculate Frequency
        logger.print("Processing Frequency ...")
        # print("COLUMNS",columns)
        df_freq = df.cols.select(columns,
                                 data_type=PYSPARK_NUMERIC_TYPES,
                                 invert=True)

        freq = None
        if df_freq is not None:
            freq = df_freq.cols.frequency("*", buckets, True, self.rows_count)
            # print("FREQUENCY1", freq)
        for col_name in columns:
            col_info = {}
            assign(col_info, "stats", stats[col_name], dict)

            if freq is not None:
                if col_name in freq:
                    # print("ASSIGN")
                    assign(col_info, "frequency", freq[col_name])

            assign(col_info, "name", col_name)
            assign(col_info, "column_dtype",
                   columns_info["columns"][col_name]['dtype'])
            assign(col_info, "dtypes_stats",
                   columns_info["columns"][col_name]['stats'])
            assign(col_info, "column_type",
                   columns_info["columns"][col_name]['type'])
            assign(columns_info, "columns." + col_name, col_info, dict)

            assign(col_info, "id", df.cols.get_meta(col_name, "id"))

        return columns_info
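assign and parse_columns above are Optimus helpers. The dotted-path calls such as assign(type_details, col_name + ".dtype", ...) write values into nested dicts; a hypothetical minimal stand-in (for illustration only, not the library's implementation) behaves like this:

def assign_path(target, path, value):
    """Write value into a nested dict, creating intermediate dicts along a dotted path."""
    keys = path.split(".")
    for key in keys[:-1]:
        target = target.setdefault(key, {})
    target[keys[-1]] = value

type_details = {}
assign_path(type_details, "age.dtype", "int")
assign_path(type_details, "age.type", "numeric")
assign_path(type_details, "age.stats", {"int": 120, "null": 3})

print(type_details)
# {'age': {'dtype': 'int', 'type': 'numeric', 'stats': {'int': 120, 'null': 3}}}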
Example #4
    def count_data_types(df, columns):
        """
        Count the number of int, float, string and bool values in each column and output the counts in json format
        :param df:
        :param columns:
        :return:
        """

        def _count_data_types(col_name):
            """
            Determine whether each record's value is a float, int or string.
            :param col_name:
            :return:
            """
            temp = col_name + "_type"

            types = df.withColumn(temp, fbdt(col_name, get_type=True)).groupBy(temp).count().collect()

            # Convert the collect result to a list
            # TODO: check if collect_to_dict function can be used here

            count_by_data_type = {}

            for row in types:
                count_by_data_type[row[0]] = row[1]

            # Fill missing data types with 0
            count_by_data_type = fill_missing_var_types(count_by_data_type)

            # Subtract empty strings from the total string count
            count_empty_strings = df.where(F.col(col_name) == '').count()
            count_by_data_type['string'] = count_by_data_type['string'] - count_empty_strings

            # Counts per supported data type
            data_types_count = {"string": count_by_data_type['string'],
                                "bool": count_by_data_type['bool'],
                                "int": count_by_data_type['int'],
                                "float": count_by_data_type['float'],
                                "date": count_by_data_type['date']
                                }

            null_missed_count = {"null": count_by_data_type['null'],
                                 "missing": count_empty_strings,
                                 }

            # Get the greatest count by column data type
            greatest_data_type_count = max(data_types_count, key=data_types_count.get)

            if greatest_data_type_count == "string":
                cat = "categorical"
            elif greatest_data_type_count in ("int", "float"):
                cat = "numeric"
            elif greatest_data_type_count == "date":
                cat = "date"
            elif greatest_data_type_count == "bool":
                cat = "bool"
            else:
                cat = "null"

            col = {}
            col['dtype'] = greatest_data_type_count
            col['type'] = cat
            col['details'] = {**data_types_count, **null_missed_count}

            return col

        columns = parse_columns(df, columns)

        type_details = {c: _count_data_types(c) for c in columns}

        results = {}
        count_types = {}

        # Count the categorical, numerical and date columns
        for v in type_details.values():
            name = v["type"]
            if name in count_types:
                count_types[name] += 1
            else:
                count_types[name] = 1

        count_types = fill_missing_col_types(count_types)

        results["count_types"] = count_types
        results["columns"] = type_details
        return results
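fbdt is Optimus' row-level type inference function. The withColumn/groupBy/count pattern above can be sketched with an ordinary UDF that classifies each string value, assuming a running SparkSession (the classifier below is a rough stand-in, not the library's inference logic):

from pyspark.sql import SparkSession, functions as F, types as T

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("1",), ("2.5",), ("hello",), ("",)], ["value"])

def infer_type(value):
    # Rough stand-in for fbdt: classify a string as int, float or string
    try:
        int(value)
        return "int"
    except ValueError:
        try:
            float(value)
            return "float"
        except ValueError:
            return "string"

infer_udf = F.udf(infer_type, T.StringType())

# Tag every value with its inferred type and count rows per type
types = (df.withColumn("value_type", infer_udf(F.col("value")))
           .groupBy("value_type").count().collect())
count_by_data_type = {row[0]: row[1] for row in types}

# Empty strings are counted separately as "missing"
count_empty_strings = df.where(F.col("value") == '').count()

print(count_by_data_type, count_empty_strings)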
Example #5
    def _count_data_types(self, df, columns, infer=False, stats=None):
        """
        Count the number of int, float, string, date and boolean values and output the counts in json format
        :param df: Dataframe to be processed
        :param columns: Columns to be processed
        :param infer: infer the column datatype
        :param stats:
        :return: json
        """

        df_count = self.rows_count

        def _count_data_types(col_name):
            """
            Determine whether each record's value is a float, int or string.
            :param col_name:
            :return:
            """

            # If the column is a string, process the data to try to infer which data type is inside. This is a kind of optimization;
            # we do not need to analyze the data if the column data type is integer, boolean, etc.

            temp = col_name + "_type"
            col_data_type = df.cols.dtypes(col_name)

            # Parse dtype
            if col_data_type == "smallint" or col_data_type == "tinyint":
                col_data_type = "int"
            elif col_data_type == "float" or col_data_type == "double":
                col_data_type = "decimal"
            elif col_data_type.find("array") >= 0:
                col_data_type = "array"

            count_by_data_type = {}
            count_empty_strings = 0

            if infer is True and col_data_type == "string":
                logger.print("Processing column '" + col_name + "'...")
                types = collect_as_dict(df
                                        .h_repartition(col_name=col_name)
                                        .withColumn(temp, fbdt(col_name, get_type=True))
                                        .groupBy(temp).count()
                                        )

                for row in types:
                    count_by_data_type[row[temp]] = row["count"]

                count_empty_strings = df.where(F.col(col_name) == '').count()

            else:
                # Boolean columns do not support count_na
                if "count_na" in stats[col_name]:
                    nulls = stats[col_name]["count_na"]
                    count_by_data_type[col_data_type] = int(df_count) - nulls
                    count_by_data_type["null"] = nulls

            count_by_data_type = fill_missing_var_types(count_by_data_type)

            # Null and missing (empty string) counts
            null_missed_count = {"null": count_by_data_type['null'],
                                 "missing": count_empty_strings,
                                 }
            # Get the greatest count by column data type
            greatest_data_type_count = max(count_by_data_type, key=count_by_data_type.get)

            if greatest_data_type_count == "string" or greatest_data_type_count == "boolean":
                cat = "categorical"
            elif greatest_data_type_count == "int" or greatest_data_type_count == "decimal":
                cat = "numeric"
            elif greatest_data_type_count == "date":
                cat = "date"
            elif greatest_data_type_count == "array":
                cat = "array"
            elif greatest_data_type_count == "binary":
                cat = "binary"
            elif greatest_data_type_count == "null":
                cat = "null"
            else:
                cat = None

            col = {}
            col['dtype'] = greatest_data_type_count
            col['type'] = cat
            col['details'] = {**count_by_data_type, **null_missed_count}

            return col

        columns = parse_columns(df, columns)

        # Info from all the columns
        type_details = {c: _count_data_types(c) for c in columns}

        results = {}
        count_types = {}

        # Count the categorical, numerical, boolean and date columns
        for v in type_details.values():
            name = v["type"]
            if name in count_types:
                count_types[name] += 1
            else:
                count_types[name] = 1

        count_types = fill_missing_col_types(count_types)

        results["count_types"] = count_types
        results["columns"] = type_details
        return results
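The dtype parsing at the top of this variant collapses Spark's integer and floating-point type names into the groups the profiler works with. A small stand-alone version of that normalization (pure Python, for illustration):

def normalize_dtype(col_data_type):
    """Collapse a Spark dtype name into the group used by the profiler."""
    if col_data_type in ("smallint", "tinyint"):
        return "int"
    if col_data_type in ("float", "double"):
        return "decimal"
    if col_data_type.find("array") >= 0:
        return "array"
    return col_data_type

print(normalize_dtype("tinyint"))        # int
print(normalize_dtype("double"))         # decimal
print(normalize_dtype("array<string>"))  # array
print(normalize_dtype("string"))         # string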