def columns_stats(self, df, columns, buckets=10, infer=False, relative_error=RELATIVE_ERROR, approx_count=True,
                  mismatch=None):
    """
    Return statistical information about a specific column in json format
    :param df: Dataframe to be processed
    :param columns: Columns that you want to profile
    :param buckets: Create buckets divided by range. Each bin is equal.
    :param infer: try to infer the column datatype
    :param relative_error: relative error when the percentile is calculated.
        0 is more exact but slower, 1 is less exact but faster
    :param approx_count: Use the function approx_count_distinct or countDistinct. approx_count_distinct is faster
    :param mismatch:
    :return: json object
    """
    if self.rows_count is None:
        self.rows_count = df.count()

    columns = parse_columns(df, columns)

    # Initialize Objects
    logger.print("Processing Stats For columns...")

    # Get columns data types. This is necessary to make the pertinent histogram calculations.
    type_details = self._count_data_types(df, columns, infer, mismatch)

    # Count the categorical, numerical, boolean and date columns
    count_types = {}
    for value in type_details.values():
        name = value["dtype"]
        if name in count_types:
            count_types[name] += 1
        else:
            count_types[name] = 1

    # List the data types this data set has
    total = 0
    dtypes = []
    for key, value in count_types.items():
        if value > 0:
            dtypes.append(key)
            total = total + 1

    count_types = fill_missing_col_types(count_types)

    columns_info = {}
    columns_info["count_types"] = count_types
    columns_info["total_count_dtypes"] = total
    columns_info["dtypes_list"] = dtypes
    columns_info["columns"] = type_details

    # Aggregation
    stats = Profiler.columns_agg(df, columns, buckets, relative_error, approx_count)

    # Calculate Frequency
    logger.print("Processing Frequency ...")
    df_freq = df.cols.select("*", data_type=PYSPARK_NUMERIC_TYPES, invert=True)
    freq = None
    if df_freq is not None:
        freq = df_freq.cols.frequency("*", buckets, True, self.rows_count)

    # Assemble the per-column result
    for col_name in columns:
        col_info = {}
        assign(col_info, "stats", stats[col_name], dict)

        if freq is not None and col_name in freq:
            assign(col_info, "frequency", freq[col_name])

        col_info["stats"].update(self.extra_columns_stats(df, col_name, stats))

        assign(col_info, "name", col_name)
        assign(col_info, "column_dtype", columns_info["columns"][col_name]['dtype'])
        assign(col_info, "dtypes_stats", columns_info["columns"][col_name]['stats'])
        assign(col_info, "column_type", columns_info["columns"][col_name]['type'])
        assign(columns_info, "columns." + col_name, col_info, dict)
        assign(col_info, "id", df.cols.get_meta(col_name, "id"))

    return columns_info
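# Illustrative sketch (not part of the profiler): the count_types loop above reduces
# per-column dtype labels to a histogram of dtypes. In pure Python, with hypothetical
# column details, the same reduction looks like this:

def tally_dtypes(type_details):
    """Count how many columns share each detected dtype."""
    count_types = {}
    for value in type_details.values():
        name = value["dtype"]
        count_types[name] = count_types.get(name, 0) + 1
    return count_types

# Example with made-up detections:
# tally_dtypes({"name": {"dtype": "string"}, "age": {"dtype": "int"},
#               "city": {"dtype": "string"}})
# -> {"string": 2, "int": 1}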
def count_data_types(df, columns, infer=False):
    """
    Count the number of ints, floats, strings, dates and booleans and output the count in json format
    :param df: Dataframe to be processed
    :param columns: Columns to be processed
    :param infer: infer the column datatype
    :return: json
    """

    @time_it
    def _count_data_types(col_name):
        """
        Determine whether the values in a column are floats, ints or strings.
        :param col_name:
        :return:
        """
        logger.print("Processing column '" + col_name + "'...")
        # If the column is a string, process the data to try to infer which data type is inside.
        # This is a kind of optimization: we do not need to analyze the data if the column
        # data type is integer, boolean, etc.
        temp = col_name + "_type"
        col_data_type = df.cols.dtypes(col_name)

        count_by_data_type = {}
        count_empty_strings = 0

        if infer is True and col_data_type == "string":
            types = (df
                     .h_repartition(col_name=col_name)
                     .withColumn(temp, fbdt(col_name, get_type=True))
                     .groupBy(temp).count()
                     .to_json())
            for row in types:
                count_by_data_type[row[temp]] = row["count"]

            count_empty_strings = df.where(F.col(col_name) == '').count()
        else:
            nulls = df.cols.count_na(col_name)
            count_by_data_type[col_data_type] = int(df.count()) - nulls
            count_by_data_type["null"] = nulls

        count_by_data_type = fill_missing_var_types(count_by_data_type)

        # Split the counts into data type counts and null/missing counts
        data_types_count = {
            "string": count_by_data_type['string'],
            "bool": count_by_data_type['bool'],
            "int": count_by_data_type['int'],
            "float": count_by_data_type['float'],
            "double": count_by_data_type['double'],
            "date": count_by_data_type['date'],
            "array": count_by_data_type['array']
        }

        null_missed_count = {
            "null": count_by_data_type['null'],
            "missing": count_empty_strings,
        }

        # Get the greatest count by column data type
        greatest_data_type_count = max(data_types_count, key=data_types_count.get)

        if greatest_data_type_count == "string":
            cat = "categorical"
        elif greatest_data_type_count in ("int", "float", "double"):
            cat = "numeric"
        elif greatest_data_type_count == "date":
            cat = "date"
        elif greatest_data_type_count == "bool":
            cat = "bool"
        elif greatest_data_type_count == "array":
            cat = "array"
        else:
            cat = "null"

        col = {}
        col['dtype'] = greatest_data_type_count
        col['type'] = cat
        col['details'] = {**data_types_count, **null_missed_count}

        return col

    columns = parse_columns(df, columns)

    # Info from all the columns
    type_details = {c: _count_data_types(c) for c in columns}

    results = {}
    count_types = {}

    # Count the categorical, numerical, boolean and date columns
    for v in type_details.values():
        name = v["type"]
        if name in count_types:
            count_types[name] += 1
        else:
            count_types[name] = 1

    count_types = fill_missing_col_types(count_types)
    results["count_types"] = count_types
    results["columns"] = type_details
    return results
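# Sketch of the dominant-dtype selection used above, in isolation. max() with
# key=dict.get returns the key with the largest count; ties resolve to the first
# key encountered in iteration order. The sample counts here are hypothetical:

def dominant_dtype(data_types_count):
    """Return the data type with the highest observed count."""
    return max(data_types_count, key=data_types_count.get)

# dominant_dtype({"string": 90, "int": 8, "float": 2, "bool": 0,
#                 "double": 0, "date": 0, "array": 0})
# -> "string", which the profiler then maps to the "categorical" category.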
def columns_stats(self, df, columns, buckets=10, infer=False, relative_error=RELATIVE_ERROR, approx_count=True,
                  mismatch=None, advanced_stats=True):
    """
    Return statistical information about a specific column in json format
    :param df: Dataframe to be processed
    :param columns: Columns that you want to profile
    :param buckets: Create buckets divided by range. Each bin is equal.
    :param infer: try to infer the column dataType
    :param relative_error: relative error when the percentile is calculated.
        0 means more precision but slower, 1 means less precision but faster
    :param approx_count: Use the function approx_count_distinct or countDistinct. approx_count_distinct is faster
    :param mismatch:
    :param advanced_stats:
    :return: json object
    """
    columns = parse_columns(df, columns)

    # Initialize Objects
    logger.print("Processing Stats For columns...")

    # Get columns data types. This is necessary to make the pertinent histogram calculations.
    count_by_data_type = df.cols.count_by_dtypes(columns, infer=infer, mismatch=mismatch)
    count_by_data_type_no_mismatch = copy.deepcopy(count_by_data_type)

    # Info from all the columns
    type_details = {}
    for col_name in columns:
        # Do not count mismatches
        if "mismatch" in count_by_data_type_no_mismatch[col_name]:
            count_by_data_type_no_mismatch[col_name].pop("mismatch")

        # Get the greatest count by column data type
        greatest_data_type_count = max(count_by_data_type_no_mismatch[col_name],
                                       key=count_by_data_type_no_mismatch[col_name].get)
        cat = PYTHON_TO_PROFILER.get(greatest_data_type_count)

        assign(type_details, col_name + ".dtype", greatest_data_type_count, dict)
        assign(type_details, col_name + ".type", cat, dict)
        assign(type_details, col_name + ".stats", count_by_data_type[col_name], dict)

    # Count the categorical, numerical, boolean and date columns
    count_types = {}
    for value in type_details.values():
        name = value["dtype"]
        if name in count_types:
            count_types[name] += 1
        else:
            count_types[name] = 1

    # List the data types this data set has
    dtypes = [key for key, value in count_types.items() if value > 0]

    columns_info = {}
    columns_info["count_types"] = fill_missing_col_types(count_types)
    columns_info["total_count_dtypes"] = len(dtypes)
    columns_info["dtypes_list"] = dtypes
    columns_info["columns"] = type_details

    # Aggregation
    stats = self.columns_agg(df, columns, buckets, relative_error, approx_count, advanced_stats)

    # Calculate Frequency
    logger.print("Processing Frequency ...")
    df_freq = df.cols.select(columns, data_type=PYSPARK_NUMERIC_TYPES, invert=True)

    freq = None
    if df_freq is not None:
        freq = df_freq.cols.frequency("*", buckets, True, self.rows_count)

    # Assemble the per-column result
    for col_name in columns:
        col_info = {}
        assign(col_info, "stats", stats[col_name], dict)

        if freq is not None and col_name in freq:
            assign(col_info, "frequency", freq[col_name])

        assign(col_info, "name", col_name)
        assign(col_info, "column_dtype", columns_info["columns"][col_name]['dtype'])
        assign(col_info, "dtypes_stats", columns_info["columns"][col_name]['stats'])
        assign(col_info, "column_type", columns_info["columns"][col_name]['type'])
        assign(columns_info, "columns." + col_name, col_info, dict)
        assign(col_info, "id", df.cols.get_meta(col_name, "id"))

    return columns_info
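# assign() is defined elsewhere in the code base; a minimal sketch of the
# dotted-path behavior relied on above might look like the helper below
# (assumption: the real helper also creates intermediate dicts per path segment):

def assign_sketch(target, path, value, default=dict):
    """Set target["a"]["b"] = value for path "a.b", creating levels as needed."""
    keys = path.split(".")
    for key in keys[:-1]:
        target = target.setdefault(key, default())
    target[keys[-1]] = value

# d = {}
# assign_sketch(d, "age.dtype", "int")   # -> {"age": {"dtype": "int"}}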
def count_data_types(df, columns):
    """
    Count the number of ints, floats, strings and bools and output the count in json format
    :param df:
    :param columns:
    :return:
    """

    def _count_data_types(col_name):
        """
        Determine whether the values in a column are floats, ints or strings.
        :param col_name:
        :return:
        """
        temp = col_name + "_type"
        types = df.withColumn(temp, fbdt(col_name, get_type=True)).groupBy(temp).count().collect()

        # Convert the collect result to a dict
        # TODO: check if collect_to_dict function can be used here
        count_by_data_type = {}
        for row in types:
            count_by_data_type[row[0]] = row[1]

        # Fill missing data types with 0
        count_by_data_type = fill_missing_var_types(count_by_data_type)

        # Subtract empty strings from the total string count
        count_empty_strings = df.where(F.col(col_name) == '').count()
        count_by_data_type['string'] = count_by_data_type['string'] - count_empty_strings

        data_types_count = {"string": count_by_data_type['string'],
                            "bool": count_by_data_type['bool'],
                            "int": count_by_data_type['int'],
                            "float": count_by_data_type['float'],
                            "date": count_by_data_type['date']
                            }

        null_missed_count = {"null": count_by_data_type['null'],
                             "missing": count_empty_strings,
                             }

        # Get the greatest count by column data type
        greatest_data_type_count = max(data_types_count, key=data_types_count.get)

        if greatest_data_type_count == "string":
            cat = "categorical"
        elif greatest_data_type_count in ("int", "float"):
            cat = "numeric"
        elif greatest_data_type_count == "date":
            cat = "date"
        elif greatest_data_type_count == "bool":
            cat = "bool"
        else:
            cat = "null"

        col = {}
        col['dtype'] = greatest_data_type_count
        col['type'] = cat
        col['details'] = {**data_types_count, **null_missed_count}

        return col

    columns = parse_columns(df, columns)

    type_details = {c: _count_data_types(c) for c in columns}

    results = {}
    count_types = {}

    # Count the categorical, numerical and date columns
    for v in type_details.values():
        name = v["type"]
        if name in count_types:
            count_types[name] += 1
        else:
            count_types[name] = 1

    count_types = fill_missing_col_types(count_types)
    results["count_types"] = count_types
    results["columns"] = type_details
    return results
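# fbdt() tags each cell with an inferred type via a Spark UDF. A pure-Python
# sketch of that per-value inference (simplified; the real function also detects
# dates, arrays and other types) could be:

def infer_value_type(value):
    """Classify a raw string value as null, bool, int, float or string."""
    if value is None:
        return "null"
    if value.lower() in ("true", "false"):
        return "bool"
    try:
        int(value)
        return "int"
    except ValueError:
        pass
    try:
        float(value)
        return "float"
    except ValueError:
        return "string"

# [infer_value_type(v) for v in ["1", "2.5", "true", "abc", None]]
# -> ["int", "float", "bool", "string", "null"]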
def _count_data_types(self, df, columns, infer=False, stats=None):
    """
    Count the number of ints, floats, strings, dates and booleans and output the count in json format
    :param df: Dataframe to be processed
    :param columns: Columns to be processed
    :param infer: infer the column datatype
    :param stats:
    :return: json
    """
    df_count = self.rows_count

    def _count_data_types(col_name):
        """
        Determine whether the values in a column are floats, ints or strings.
        :param col_name:
        :return:
        """
        # If the column is a string, process the data to try to infer which data type is inside.
        # This is a kind of optimization: we do not need to analyze the data if the column
        # data type is integer, boolean, etc.
        temp = col_name + "_type"
        col_data_type = df.cols.dtypes(col_name)

        # Parse dtype
        if col_data_type == "smallint" or col_data_type == "tinyint":
            col_data_type = "int"
        elif col_data_type == "float" or col_data_type == "double":
            col_data_type = "decimal"
        elif col_data_type.find("array") >= 0:
            col_data_type = "array"

        count_by_data_type = {}
        count_empty_strings = 0

        if infer is True and col_data_type == "string":
            logger.print("Processing column '" + col_name + "'...")
            types = collect_as_dict(df
                                    .h_repartition(col_name=col_name)
                                    .withColumn(temp, fbdt(col_name, get_type=True))
                                    .groupBy(temp).count()
                                    )
            for row in types:
                count_by_data_type[row[temp]] = row["count"]

            count_empty_strings = df.where(F.col(col_name) == '').count()
        else:
            # Boolean columns do not support count_na
            if "count_na" in stats[col_name]:
                nulls = stats[col_name]["count_na"]
                count_by_data_type[col_data_type] = int(df_count) - nulls
                count_by_data_type["null"] = nulls

        count_by_data_type = fill_missing_var_types(count_by_data_type)

        null_missed_count = {"null": count_by_data_type['null'],
                             "missing": count_empty_strings,
                             }

        # Get the greatest count by column data type
        greatest_data_type_count = max(count_by_data_type, key=count_by_data_type.get)

        if greatest_data_type_count == "string" or greatest_data_type_count == "boolean":
            cat = "categorical"
        elif greatest_data_type_count == "int" or greatest_data_type_count == "decimal":
            cat = "numeric"
        elif greatest_data_type_count == "date":
            cat = "date"
        elif greatest_data_type_count == "array":
            cat = "array"
        elif greatest_data_type_count == "binary":
            cat = "binary"
        elif greatest_data_type_count == "null":
            cat = "null"
        else:
            cat = None

        col = {}
        col['dtype'] = greatest_data_type_count
        col['type'] = cat
        col['details'] = {**count_by_data_type, **null_missed_count}

        return col

    columns = parse_columns(df, columns)

    # Info from all the columns
    type_details = {c: _count_data_types(c) for c in columns}

    results = {}
    count_types = {}

    # Count the categorical, numerical, boolean and date columns
    for v in type_details.values():
        name = v["type"]
        if name in count_types:
            count_types[name] += 1
        else:
            count_types[name] = 1

    count_types = fill_missing_col_types(count_types)

    results["count_types"] = count_types
    results["columns"] = type_details
    return results
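# The "Parse dtype" block above collapses Spark SQL type names into the
# profiler's coarser buckets. The same normalization as a standalone helper
# (mappings taken from the code above; inputs are Spark SQL dtype strings):

def normalize_dtype(col_data_type):
    """Map a Spark SQL dtype string to the profiler's coarse dtype."""
    if col_data_type in ("smallint", "tinyint"):
        return "int"
    if col_data_type in ("float", "double"):
        return "decimal"
    if "array" in col_data_type:
        return "array"
    return col_data_type

# normalize_dtype("tinyint")        -> "int"
# normalize_dtype("array<string>")  -> "array"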