def extra_stats(self, df, col_name, stats):
    """
    Specific stats for numeric columns
    :param df:
    :param col_name:
    :param stats:
    :return:
    """
    col_info = {}

    max_value = stats[col_name]["max"]
    min_value = stats[col_name]["min"]

    if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
        stddev = stats[col_name]['stddev']
        mean = stats[col_name]['mean']
        quantile = stats[col_name]["percentile"]

        col_info['range'] = max_value - min_value
        col_info['median'] = quantile["0.5"]
        col_info['interquartile_range'] = quantile["0.75"] - quantile["0.25"]

        if mean != 0:
            col_info['coef_variation'] = round((stddev / mean), 5)
        else:
            col_info['coef_variation'] = 0

        col_info['mad'] = round(df.cols.mad(col_name), 5)
        col_info['p_count_na'] = round((stats[col_name]['count_na'] * 100) / self.rows_count, 2)
        col_info['p_count_uniques'] = round((stats[col_name]['count_uniques'] * 100) / self.rows_count, 2)

    return col_info
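# Illustrative sanity check for the derived stats above (not part of the
# source). The stats dict below is made up; everything here is plain
# arithmetic on values that would already have been aggregated.
stats_example = {"max": 100.0, "min": 0.0, "stddev": 15.0, "mean": 50.0,
                 "percentile": {"0.25": 30.0, "0.5": 50.0, "0.75": 70.0}}

print(stats_example["max"] - stats_example["min"])  # range: 100.0
print(stats_example["percentile"]["0.75"] - stats_example["percentile"]["0.25"])  # interquartile_range: 40.0
print(round(stats_example["stddev"] / stats_example["mean"], 5))  # coef_variation: 0.3 (guard mean != 0 first)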
def percentile_agg(col_name, df, values, relative_error):
    """
    Return the percentiles of a dataframe
    :param col_name: '*', list of column names or a single column name
    :param df:
    :param values: list of percentiles to be calculated
    :param relative_error: If set to zero, the exact percentiles are computed, which could be very expensive. 0 to 1 accepted
    :return: percentiles per column
    """
    # Normalize the percentiles into a list of strings for the SQL expression
    if values is None:
        values = [0.05, 0.25, 0.5, 0.75, 0.95]
    values = val_to_list(values)
    values = list(map(str, values))

    if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
        # Get percentiles
        p = F.expr("percentile_approx(`{COLUMN}`, array({VALUES}), {ERROR})".format(
            COLUMN=col_name, VALUES=" , ".join(values), ERROR=relative_error))

        # Zip the percentile labels with their computed values into a map column
        expr = [[F.lit(v), p.getItem(i)] for i, v in enumerate(values)]
        expr = F.create_map(*list(itertools.chain(*expr)))
    else:
        expr = None

    return expr
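# Standalone sketch of the expression percentile_agg builds (not part of the
# source). The `price` column and DataFrame are invented; 10000 is Spark's
# default accuracy for percentile_approx and stands in for the {ERROR} slot.
import itertools
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(float(i),) for i in range(1, 101)], ["price"])

values = ["0.25", "0.5", "0.75"]
p = F.expr("percentile_approx(`price`, array({VALUES}), 10000)".format(VALUES=" , ".join(values)))

# Zip the labels with the computed percentiles into a single map column
pairs = [[F.lit(v), p.getItem(i)] for i, v in enumerate(values)]
percentile_map = F.create_map(*list(itertools.chain(*pairs)))

df.agg(percentile_map.alias("percentile")).show(truncate=False)
# roughly: {0.25 -> 25.0, 0.5 -> 50.0, 0.75 -> 75.0}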
def count_na_agg(col_name, df):
    # If the column is a Struct, cast it to String first. isnan/isNull can not handle Struct/Boolean
    # if is_column_a(df, col_name, ["struct", "boolean"]):
    #     df = df.cols.cast(col_name, "string")

    # Select the nan/null rows depending on the column's data type
    if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
        # If numeric
        expr = F.count(F.when(match_nulls_integers(col_name), col_name))
    elif is_column_a(df, col_name, PYSPARK_STRING_TYPES):
        # If string, also include the literal 'nan' string
        expr = F.count(F.when(match_nulls_strings(col_name), col_name))
        # print("Including 'nan' as Null in processing string type column '{}'".format(col_name))
    else:
        expr = F.count(F.when(match_null(col_name), col_name))

    return expr
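# Standalone sketch of what the match_nulls_* helpers presumably test, using
# only plain PySpark (not part of the source; DataFrame and columns invented).
# Numeric columns count NULL and NaN; string columns also treat 'nan' as missing.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(1.0, "a"), (float("nan"), None), (None, "nan")], ["num", "txt"])

num_na = F.count(F.when(F.col("num").isNull() | F.isnan("num"), "num"))
txt_na = F.count(F.when(F.col("txt").isNull() | (F.col("txt") == "nan"), "txt"))

df.agg(num_na.alias("num_na"), txt_na.alias("txt_na")).show()
# num_na = 2, txt_na = 2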
def extra_columns_stats(self, df, col_name, stats):
    """
    Specific stats for numeric columns
    :param df:
    :param col_name:
    :param stats:
    :return:
    """
    col_info = {}

    max_value = stats[col_name]["max"]
    min_value = stats[col_name]["min"]

    if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
        stddev = stats[col_name]['stddev']
        mean = stats[col_name]['mean']
        quantile = stats[col_name]["percentile"]

        if max_value is not None and min_value is not None:
            col_info['range'] = max_value - min_value
        else:
            col_info['range'] = None

        col_info['median'] = quantile["0.5"]

        q1 = quantile["0.25"]
        q3 = quantile["0.75"]
        if q1 is not None and q3 is not None:
            col_info['interquartile_range'] = q3 - q1
        else:
            col_info['interquartile_range'] = None

        # Check for None before comparing with 0, since None != 0 is True
        if mean is not None and mean != 0:
            col_info['coef_variation'] = round((stddev / mean), 5)
        else:
            col_info['coef_variation'] = None

        mad = df.cols.mad(col_name)
        if mad is not None:
            col_info['mad'] = round(mad, 5)
        else:
            col_info['mad'] = None

        if self.rows_count is None:
            self.rows_count = df.count()

        col_info['p_count_na'] = round((stats[col_name]['count_na'] * 100) / self.rows_count, 2)
        col_info['p_count_uniques'] = round((stats[col_name]['count_uniques'] * 100) / self.rows_count, 2)

    return col_info
def minimal_stats(df, columns, buckets=10, approx_count=True):
    columns = parse_columns(df, columns)

    # Spark has problems handling 100+ columns in one aggregation, so process in batches
    n = 60
    list_columns = [columns[i * n:(i + 1) * n] for i in range((len(columns) + n - 1) // n)]

    result = {}
    for i, cols in enumerate(list_columns):
        logger.print("Batch {BATCH_NUMBER}. Processing columns {COLUMNS}".format(BATCH_NUMBER=i, COLUMNS=cols))

        funcs = [count_uniques_agg]
        exprs = df.cols.create_exprs(cols, funcs, approx_count)

        funcs = [F.min, F.max]
        exprs.extend(df.cols.create_exprs(cols, funcs))

        funcs = [count_na_agg]
        exprs.extend(df.cols.create_exprs(cols, funcs, df))

        result.update(df.cols.exec_agg(exprs))

    n = 60  # batch-size timings: 40 -> 2:46, 50 -> 2:12
    list_columns = [columns[i * n:(i + 1) * n] for i in range((len(columns) + n - 1) // n)]

    exprs = []  # reset so the histogram pass does not re-run the stats expressions
    for i, cols in enumerate(list_columns):
        logger.print("Batch Histogram {BATCH_NUMBER}. Processing columns {COLUMNS}".format(BATCH_NUMBER=i, COLUMNS=cols))

        funcs = [hist_agg]
        min_max = {}
        for col_name in cols:
            if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
                min_max = {"min": result[col_name]["min"], "max": result[col_name]["max"]}

        exprs.extend(df.cols.create_exprs(cols, funcs, df, buckets, min_max))
        result.update(df.cols.exec_agg(exprs))

    return result
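# The batch-slicing idiom above, shown standalone (not part of the source):
# split a column list into chunks of n so no single aggregation carries too
# many expressions. The 130 column names are invented for the example.
columns = ["col_{}".format(i) for i in range(130)]
n = 60
batches = [columns[i * n:(i + 1) * n] for i in range((len(columns) + n - 1) // n)]
print([len(b) for b in batches])  # [60, 60, 10]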
def columns_agg(df, columns, buckets=10, relative_error=RELATIVE_ERROR, approx_count=True):
    columns = parse_columns(df, columns)

    # Spark has problems handling 100+ columns at the same time, so process in batches
    n = BATCH_SIZE
    list_columns = [columns[i * n:(i + 1) * n] for i in range((len(columns) + n - 1) // n)]

    result = {}
    for i, cols in enumerate(list_columns):
        logger.print("Batch Stats {BATCH_NUMBER}. Processing columns {COLUMNS}".format(BATCH_NUMBER=i, COLUMNS=cols))

        funcs = [count_uniques_agg]
        exprs = df.cols.create_exprs(cols, funcs, approx_count)

        # TODO: in basic calculations
        funcs = [F.min, F.max, F.stddev, F.kurtosis, F.mean, F.skewness, F.sum, F.variance, zeros_agg]
        exprs.extend(df.cols.create_exprs(cols, funcs))

        # TODO: None in basic calculation
        funcs = [percentile_agg]
        exprs.extend(df.cols.create_exprs(cols, funcs, df, [0.05, 0.25, 0.5, 0.75, 0.95], relative_error))

        funcs = [count_na_agg]
        exprs.extend(df.cols.create_exprs(cols, funcs, df))

        result.update(df.cols.exec_agg(exprs))

    exprs = []
    n = BATCH_SIZE
    result_hist = {}
    list_columns = [columns[i * n:(i + 1) * n] for i in range((len(columns) + n - 1) // n)]
    for i, cols in enumerate(list_columns):
        logger.print("Batch Histogram {BATCH_NUMBER}. Processing columns {COLUMNS}".format(BATCH_NUMBER=i, COLUMNS=cols))

        funcs = [hist_agg]
        for col_name in cols:
            # Only process histograms for numeric columns. Other data types use frequency instead
            if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
                min_max = {"min": result[col_name]["min"], "max": result[col_name]["max"]}

                # Aim for one bucket per gap between distinct values, clamped to a sane range
                buckets = result[col_name]["count_uniques"] - 1
                if buckets > MAX_BUCKETS:
                    buckets = MAX_BUCKETS
                elif buckets == 0:
                    buckets = 1

                exprs.extend(df.cols.create_exprs(col_name, funcs, df, buckets, min_max))

        agg_result = df.cols.exec_agg(exprs)
        if agg_result is not None:
            result_hist.update(agg_result)

    # Merge the histogram results into the per-column stats
    for col_name in result:
        if col_name in result_hist:
            result[col_name].update(result_hist[col_name])

    return result
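# Standalone sketch of the bucket clamping above (not part of the source).
# MAX_BUCKETS is assumed to be 33 purely for illustration; the real constant
# lives elsewhere in the source.
MAX_BUCKETS = 33

def clamp_buckets(count_uniques):
    # One bucket per gap between distinct values, clamped to [1, MAX_BUCKETS]
    buckets = count_uniques - 1
    if buckets > MAX_BUCKETS:
        buckets = MAX_BUCKETS
    elif buckets == 0:
        buckets = 1
    return buckets

print(clamp_buckets(1), clamp_buckets(2), clamp_buckets(1000))  # 1 1 33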
def hist_agg(col_name, df, buckets, min_max=None, dtype=None):
    """
    Create a column expression to calculate a column histogram
    :param col_name:
    :param df:
    :param buckets:
    :param min_max: Min and max values, necessary to calculate the buckets
    :param dtype: Column data type used to pick the histogram. Int, String and Date return different histograms
    :return:
    """

    def create_exprs(_input_col, _buckets, _func):
        def count_exprs(_exprs):
            return F.sum(F.when(_exprs, 1).otherwise(0))

        _exprs = []
        for i, b in enumerate(_buckets):
            lower = b["lower"]
            upper = b["upper"]

            if is_numeric(lower):
                lower = round(lower, 2)
            if is_numeric(upper):
                upper = round(upper, 2)

            if len(_buckets) == 1:
                count = count_exprs((_func(_input_col) == lower))
            else:
                # The last bucket is closed on both edges so boundary and max values are counted
                if i == len(_buckets) - 1:
                    count = count_exprs((_func(_input_col) >= lower) & (_func(_input_col) <= upper))
                else:
                    count = count_exprs((_func(_input_col) >= lower) & (_func(_input_col) < upper))

            info = F.create_map(F.lit("count"), count.cast("int"),
                                F.lit("lower"), F.lit(lower),
                                F.lit("upper"), F.lit(upper)).alias("hist_agg" + "_" + _input_col + "_" + str(b["bucket"]))
            _exprs.append(info)

        _exprs = F.array(*_exprs).alias("hist" + _input_col)
        return _exprs

    def hist_numeric(_min_max, _buckets):
        if _min_max is None:
            _min_max = df.agg(F.min(col_name).alias("min"), F.max(col_name).alias("max")).to_dict()[0]

        if _min_max["min"] is not None and _min_max["max"] is not None:
            _buckets = create_buckets(_min_max["min"], _min_max["max"], _buckets)
            _exprs = create_exprs(col_name, _buckets, F.col)
        else:
            _exprs = None
        return _exprs

    def hist_string(_buckets):
        _buckets = create_buckets(0, 50, _buckets)
        func = F.length
        return create_exprs(col_name, _buckets, func)

    def hist_date():
        now = datetime.datetime.now()
        current_year = now.year
        oldest_year = 1950

        # Year
        _buckets = create_buckets(oldest_year, current_year, current_year - oldest_year)
        func = F.year
        year = create_exprs(col_name, _buckets, func)

        # Month
        _buckets = create_buckets(1, 12, 11)
        func = F.month
        month = create_exprs(col_name, _buckets, func)

        # Day
        _buckets = create_buckets(1, 31, 31)
        func = F.dayofweek
        day = create_exprs(col_name, _buckets, func)

        # Hour
        _buckets = create_buckets(0, 23, 23)
        func = F.hour
        hour = create_exprs(col_name, _buckets, func)

        # Minute
        _buckets = create_buckets(0, 60, 60)
        func = F.minute
        minutes = create_exprs(col_name, _buckets, func)

        # Second
        _buckets = create_buckets(0, 60, 60)
        func = F.second
        second = create_exprs(col_name, _buckets, func)

        exprs = F.create_map(F.lit("years"), year, F.lit("months"), month, F.lit("weekdays"), day,
                             F.lit("hours"), hour, F.lit("minutes"), minutes, F.lit("seconds"), second)
        return exprs

    if dtype is not None:
        col_dtype = dtype[col_name]["dtype"]
        if col_dtype == "int" or col_dtype == "decimal":
            exprs = hist_numeric(min_max, buckets)
        elif col_dtype == "string":
            exprs = hist_string(buckets)
        elif col_dtype == "date":
            exprs = hist_date()
        else:
            exprs = None
    else:
        if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
            exprs = hist_numeric(min_max, buckets)
        elif is_column_a(df, col_name, "str"):
            exprs = hist_string(buckets)
        elif is_column_a(df, col_name, "date") or is_column_a(df, col_name, "timestamp"):
            exprs = hist_date()
        else:
            exprs = None

    return exprs
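# Minimal, self-contained sketch of the F.when/F.sum bucket counting that
# create_exprs builds (not part of the source; buckets and data invented).
# As above, the last bucket is closed on both edges so the max value counts.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(float(v),) for v in [1, 2, 2, 5, 9]], ["x"])

buckets = [{"lower": 0.0, "upper": 5.0, "bucket": 0},
           {"lower": 5.0, "upper": 10.0, "bucket": 1}]

exprs = []
for i, b in enumerate(buckets):
    if i == len(buckets) - 1:
        in_bucket = (F.col("x") >= b["lower"]) & (F.col("x") <= b["upper"])
    else:
        in_bucket = (F.col("x") >= b["lower"]) & (F.col("x") < b["upper"])
    exprs.append(F.sum(F.when(in_bucket, 1).otherwise(0)).cast("int").alias("bucket_" + str(b["bucket"])))

df.agg(*exprs).show()  # bucket_0 = 3, bucket_1 = 2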
def hist_agg(col_name, df, buckets, min_max=None):
    """
    Create a column expression to calculate a column histogram
    :param col_name:
    :param df:
    :param buckets:
    :param min_max: Min and max values, necessary to calculate the buckets
    :return:
    """

    def create_exprs(_input_col, _buckets, _func):
        def count_exprs(_exprs):
            return F.sum(F.when(_exprs, 1).otherwise(0))

        _exprs = []
        for i, b in enumerate(_buckets):
            lower = b["lower"]
            upper = b["upper"]

            if is_numeric(lower):
                lower = round(lower, 2)
            if is_numeric(upper):
                upper = round(upper, 2)

            # The last bucket is closed on both edges so boundary and max values are counted
            if i == len(_buckets) - 1:
                count = count_exprs((_func(_input_col) >= lower) & (_func(_input_col) <= upper))
            else:
                count = count_exprs((_func(_input_col) >= lower) & (_func(_input_col) < upper))

            info = F.create_map(F.lit("count"), count.cast("int"),
                                F.lit("lower"), F.lit(lower),
                                F.lit("upper"), F.lit(upper)).alias("hist_agg" + "_" + _input_col + "_" + str(b["bucket"]))
            _exprs.append(info)

        _exprs = F.array(*_exprs).alias("hist" + _input_col)
        return _exprs

    if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
        if min_max is None:
            min_max = df.agg(F.min(col_name).alias("min"), F.max(col_name).alias("max")).to_dict()[0]

        if min_max["min"] is not None and min_max["max"] is not None:
            buckets = create_buckets(min_max["min"], min_max["max"], buckets)
            func = F.col
            exprs = create_exprs(col_name, buckets, func)
        else:
            exprs = None

    elif is_column_a(df, col_name, "str"):
        buckets = create_buckets(0, 50, buckets)
        func = F.length
        exprs = create_exprs(col_name, buckets, func)

    elif is_column_a(df, col_name, "date"):
        now = datetime.datetime.now()
        current_year = now.year
        oldest_year = 1950

        # Year
        buckets = create_buckets(oldest_year, current_year, current_year - oldest_year)
        func = F.year
        year = create_exprs(col_name, buckets, func)

        # Month
        buckets = create_buckets(1, 12, 11)
        func = F.month
        month = create_exprs(col_name, buckets, func)

        # Day
        buckets = create_buckets(1, 31, 31)
        func = F.dayofweek
        day = create_exprs(col_name, buckets, func)

        # Hour
        buckets = create_buckets(0, 23, 23)
        func = F.hour
        hour = create_exprs(col_name, buckets, func)

        # Minute
        buckets = create_buckets(0, 60, 60)
        func = F.minute
        minutes = create_exprs(col_name, buckets, func)

        # Second
        buckets = create_buckets(0, 60, 60)
        func = F.second
        second = create_exprs(col_name, buckets, func)

        exprs = F.create_map(F.lit("years"), year, F.lit("months"), month, F.lit("weekdays"), day,
                             F.lit("hours"), hour, F.lit("minutes"), minutes, F.lit("seconds"), second)
    else:
        exprs = None

    return exprs
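# Quick runnable sketch of the date-part extraction the date histogram relies
# on (not part of the source; timestamps invented). Note that F.dayofweek
# returns 1 (Sunday) through 7 (Saturday), while F.dayofmonth returns 1-31.
import datetime
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(datetime.datetime(2001, 5, 17, 10, 30, 0),),
                            (datetime.datetime(2010, 5, 2, 23, 5, 0),)], ["ts"])

df.select(F.year("ts").alias("year"), F.month("ts").alias("month"),
          F.dayofweek("ts").alias("weekday"), F.hour("ts").alias("hour")).show()
# years 2001/2010, months 5/5, weekdays 5 (Thu)/1 (Sun), hours 10/23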
def columns_agg(self, df, columns, buckets=10, relative_error=RELATIVE_ERROR, approx_count=True,
                advanced_stats=True):
    columns = parse_columns(df, columns)

    # Spark has problems handling 100+ columns at the same time, so process in batches
    n = BATCH_SIZE
    list_columns = [columns[i * n:(i + 1) * n] for i in range((len(columns) + n - 1) // n)]

    result = {}
    for i, cols in enumerate(list_columns):
        logger.print("Batch Stats {BATCH_NUMBER}. Processing columns {COLUMNS}".format(BATCH_NUMBER=i, COLUMNS=cols))

        # Count uniques is necessary to calculate the histogram buckets
        funcs = [count_uniques_agg]
        exprs = df.cols.create_exprs(cols, funcs, approx_count)

        funcs = [F.min, F.max]
        exprs.extend(df.cols.create_exprs(cols, funcs))

        funcs = [count_na_agg]
        exprs.extend(df.cols.create_exprs(cols, funcs, df))

        if advanced_stats is True:
            funcs = [F.stddev, F.kurtosis, F.mean, F.skewness, F.sum, F.variance, zeros_agg]
            exprs.extend(df.cols.create_exprs(cols, funcs))

            # TODO: None in basic calculation
            funcs = [percentile_agg]
            exprs.extend(df.cols.create_exprs(cols, funcs, df, [0.05, 0.25, 0.5, 0.75, 0.95], relative_error))

        result.update(df.cols.exec_agg(exprs))

    exprs = []  # reset so the histogram pass does not re-run the stats expressions
    n = BATCH_SIZE
    result_hist = {}
    list_columns = [columns[i * n:(i + 1) * n] for i in range((len(columns) + n - 1) // n)]
    for i, cols in enumerate(list_columns):
        logger.print("Batch Histogram {BATCH_NUMBER}. Processing columns {COLUMNS}".format(BATCH_NUMBER=i, COLUMNS=cols))

        funcs = [hist_agg]
        for col_name in cols:
            # Only process histograms for numeric columns. Other data types use frequency instead
            if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
                min_max = {"min": result[col_name]["min"], "max": result[col_name]["max"]}

                buckets = result[col_name]["count_uniques"] - 1
                if buckets > MAX_BUCKETS:
                    buckets = MAX_BUCKETS
                elif buckets == 0:
                    buckets = 1

                exprs.extend(df.cols.create_exprs(col_name, funcs, df, buckets, min_max))

        agg_result = df.cols.exec_agg(exprs)
        if agg_result is not None:
            result_hist.update(agg_result)

    # Merge the histogram results into the per-column stats
    for col_name in result:
        if col_name in result_hist:
            result[col_name].update(result_hist[col_name])

    def extra_columns_stats(df, col_name, stats):
        """
        Specific stats for numeric columns
        :param df:
        :param col_name:
        :param stats:
        :return:
        """
        col_info = {}

        max_value = stats[col_name]["max"]
        min_value = stats[col_name]["min"]

        if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
            stddev = stats[col_name]['stddev']
            mean = stats[col_name]['mean']
            quantile = stats[col_name]["percentile"]

            if max_value is not None and min_value is not None:
                col_info['range'] = max_value - min_value
            else:
                col_info['range'] = None

            col_info['median'] = quantile["0.5"]

            q1 = quantile["0.25"]
            q3 = quantile["0.75"]
            if q1 is not None and q3 is not None:
                col_info['interquartile_range'] = q3 - q1
            else:
                col_info['interquartile_range'] = None

            # Check for None before comparing with 0, since None != 0 is True
            if mean is not None and mean != 0:
                col_info['coef_variation'] = round((stddev / mean), 5)
            else:
                col_info['coef_variation'] = None

            mad = df.cols.mad(col_name)
            if mad is not None:
                col_info['mad'] = round(mad, 5)
            else:
                col_info['mad'] = None

            if self.rows_count is None:
                self.rows_count = df.count()

            col_info['p_count_na'] = round((stats[col_name]['count_na'] * 100) / self.rows_count, 2)
            col_info['p_count_uniques'] = round((stats[col_name]['count_uniques'] * 100) / self.rows_count, 2)

        return col_info

    if advanced_stats is True:
        # Merge the derived stats into each column's result entry
        for col_name in columns:
            result[col_name].update(extra_columns_stats(df, col_name, result))

    return result