def describe_float_1d(df, column, current_result, nrows):
    """Compute numeric summary statistics for a float/double column.

    Runs one Spark aggregation for the basic moments, then derives
    approximate quantiles, range, IQR, CV, MAD and zero counts.
    Returns a pandas Series named after the column.
    """
    non_null = df.select(column).na.drop()
    stats_df = non_null.agg(
        mean(col(column)).alias("mean"),
        df_min(col(column)).alias("min"),
        df_max(col(column)).alias("max"),
        variance(col(column)).alias("variance"),
        kurtosis(col(column)).alias("kurtosis"),
        stddev(col(column)).alias("std"),
        skewness(col(column)).alias("skewness"),
        df_sum(col(column)).alias("sum"),
    ).toPandas()

    # Approximate quantiles — one Spark job per requested probability.
    for prob in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
        expr = "percentile_approx(`{col}`,CAST({n} AS DOUBLE))".format(
            col=column, n=prob)
        stats_df[pretty_name(prob)] = non_null.selectExpr(expr).toPandas().iloc[:, 0]

    summary = stats_df.iloc[0].copy()
    summary.name = column
    summary["range"] = summary["max"] - summary["min"]
    summary["iqr"] = summary[pretty_name(0.75)] - summary[pretty_name(0.25)]
    summary["cv"] = summary["std"] / float(summary["mean"])
    # Mean absolute deviation around the mean, normalized by non-null count.
    delta_sum = (non_null
                 .select(df_abs(col(column) - summary["mean"]).alias("delta"))
                 .agg(df_sum(col("delta")))
                 .toPandas().iloc[0, 0])
    summary["mad"] = delta_sum / float(current_result["count"])
    summary["type"] = "NUM"
    summary['n_zeros'] = df.select(column).where(col(column) == 0.0).count()
    summary['p_zeros'] = summary['n_zeros'] / float(nrows)
    # NOTE(review): hist_data is computed but never used in this version;
    # the call is kept to preserve behavior — confirm whether it can be dropped.
    hist_data = create_hist_data(df, column, summary["min"], summary["max"], bins)
    return summary
def describe_categorical_1d(df, column):
    """Summarize a categorical column: top class, its frequency, value counts.

    Keeps the 50 most frequent classes; everything else is folded into two
    synthetic "Other Values" entries appended to the value-count Series.
    """
    # Spark names the aggregate column "count(<col>)" by default.
    cnt_col = "count({c})".format(c=column)
    value_counts = (df.select(column).na.drop()
                    .groupBy(column)
                    .agg(count(col(column)))
                    .orderBy(cnt_col, ascending=False)).cache()

    top_50 = value_counts.limit(50).toPandas().sort_values(
        cnt_col, ascending=False)
    stats = top_50.take([0]).rename(
        columns={column: 'top', cnt_col: 'freq'}).iloc[0]

    others_count = 0
    others_distinct_count = 0
    unique_categories_count = value_counts.count()
    if unique_categories_count > 50:
        # Everything beyond the top 50 is aggregated into the "others" bucket.
        grand_total = value_counts.select(df_sum(cnt_col)).toPandas().iloc[0, 0]
        others_count = grand_total - top_50[cnt_col].sum()
        others_distinct_count = unique_categories_count - 50
    value_counts.unpersist()

    top = top_50.set_index(column)[cnt_col]
    top["***Other Values***"] = others_count
    top["***Other Values Distinct Count***"] = others_distinct_count
    stats["value_counts"] = top
    stats["type"] = "CAT"

    # Collect the distinct JSON-ish types that failed to parse as JSON.
    unparsed_valid_jsons = (df.select(column).na.drop().rdd
                            .map(lambda x: guess_json_type(x[column]))
                            .filter(lambda x: x)
                            .distinct()
                            .collect())
    stats["unparsed_json_types"] = unparsed_valid_jsons
    return stats
def kurtosis_custom(column, mean, count):
    """Excess kurtosis of `column` as a Spark aggregate expression.

    Fallback for Spark versions lacking a built-in kurtosis aggregate.
    """
    dev2 = df_sum(df_pow(column - mean, int(2)))
    dev4 = df_sum(df_pow(column - mean, int(4)))
    return (count * dev4) / df_pow(dev2, 2) - 3
def skewness_custom(column, mean, count):
    """Skewness of `column` as a Spark aggregate expression.

    Fallback for Spark versions lacking a built-in skewness aggregate.
    """
    dev3 = df_sum(df_pow(column - mean, int(3)))
    dev2 = df_sum(df_pow(column - mean, int(2)))
    return (np.sqrt(count) * dev3) / df_pow(sqrt(dev2), 3)
def variance_custom(column, mean, count):
    """Sample (n-1) variance of `column` as a Spark aggregate expression.

    Fallback for Spark versions lacking a built-in variance aggregate.
    """
    squared_dev = df_pow(column - mean, int(2))
    return df_sum(squared_dev) / float(count - 1)
def describe_float_1d(df, column, current_result, nrows):
    """Compute numeric summary statistics and histograms for a float column.

    Parameters:
        df: Spark DataFrame.
        column: name of the column to describe.
        current_result: mapping holding at least "count" (non-null rows).
        nrows: total row count of df.

    Returns a pandas Series with moments, quantiles, zero counts and two
    base64-encoded histogram images.
    """
    non_null = df.select(column).na.drop()
    # FIX: the original used count(col(column) == 0.0), but Spark's count()
    # counts non-null values — and the boolean expression is non-null for
    # every non-null row — so n_zeros equaled the total non-null count.
    # Summing a 1/0 indicator counts actual zeros.
    n_zeros_expr = df_sum(when(col(column) == 0.0, 1).otherwise(0)).alias('n_zeros')
    if spark_version == "1.6+":
        stats_df = non_null.agg(
            mean(col(column)).alias("mean"),
            df_min(col(column)).alias("min"),
            df_max(col(column)).alias("max"),
            variance(col(column)).alias("variance"),
            kurtosis(col(column)).alias("kurtosis"),
            stddev(col(column)).alias("std"),
            skewness(col(column)).alias("skewness"),
            df_sum(col(column)).alias("sum"),
            n_zeros_expr,
        ).toPandas()
    else:
        # Older Spark lacks the higher-moment aggregates; compute them with
        # the custom fallback expressions in separate jobs.
        stats_df = non_null.agg(
            mean(col(column)).alias("mean"),
            df_min(col(column)).alias("min"),
            df_max(col(column)).alias("max"),
            df_sum(col(column)).alias("sum"),
            n_zeros_expr,
        ).toPandas()
        stats_df["variance"] = non_null.agg(variance_custom(
            col(column), stats_df["mean"].iloc[0],
            current_result["count"])).toPandas().iloc[0][0]
        stats_df["std"] = np.sqrt(stats_df["variance"])
        stats_df["skewness"] = non_null.agg(skewness_custom(
            col(column), stats_df["mean"].iloc[0],
            current_result["count"])).toPandas().iloc[0][0]
        stats_df["kurtosis"] = non_null.agg(kurtosis_custom(
            col(column), stats_df["mean"].iloc[0],
            current_result["count"])).toPandas().iloc[0][0]

    # Approximate quantiles — one Spark job per requested probability.
    for x in [0.05, 0.25, 0.5, 0.75, 0.95]:
        stats_df[pretty_name(x)] = (
            non_null.selectExpr(
                "percentile_approx(`{col}`,CAST({n} AS DOUBLE))".format(
                    col=column, n=x)).toPandas().iloc[:, 0])

    stats = stats_df.iloc[0].copy()
    stats.name = column
    stats["range"] = stats["max"] - stats["min"]
    stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
    stats["cv"] = stats["std"] / float(stats["mean"])
    # Mean absolute deviation around the mean.
    stats["mad"] = (non_null
                    .select(df_abs(col(column) - stats["mean"]).alias("delta"))
                    .agg(df_sum(col("delta"))).toPandas().iloc[0, 0]
                    / float(current_result["count"]))
    stats["type"] = "NUM"
    stats['p_zeros'] = stats['n_zeros'] / float(nrows)

    # Large histogram rendered to a base64 PNG data URI.
    imgdata = BytesIO()
    hist_data = create_hist_data(df, column, stats["min"], stats["max"], bins)
    figure = plt.figure(figsize=(6, 4))
    plot = plt.subplot()
    plt.bar(hist_data["left_edge"], hist_data["count"],
            width=hist_data["width"], facecolor='#337ab7')
    plot.set_ylabel("Frequency")
    plot.figure.subplots_adjust(left=0.15, right=0.95, top=0.9,
                                bottom=0.1, wspace=0, hspace=0)
    plot.figure.savefig(imgdata)
    imgdata.seek(0)
    stats['histogram'] = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
    # TODO Think about writing this to disk instead of caching them in strings
    plt.close(plot.figure)
    stats['mini_histogram'] = mini_histogram(hist_data)
    return stats
def kurtosis_custom(column, mean, count):
    """Build an excess-kurtosis aggregate expression over `column`."""
    centered = column - mean
    return ((count * df_sum(df_pow(centered, int(4))))
            / df_pow(df_sum(df_pow(centered, int(2))), 2)) - 3
def skewness_custom(column, mean, count):
    """Build a skewness aggregate expression over `column`."""
    centered = column - mean
    numerator = np.sqrt(count) * df_sum(df_pow(centered, int(3)))
    denominator = df_pow(sqrt(df_sum(df_pow(centered, int(2)))), 3)
    return numerator / denominator
def variance_custom(column, mean, count):
    """Build a sample-variance (n-1 denominator) aggregate over `column`."""
    total_sq_dev = df_sum(df_pow(column - mean, int(2)))
    return total_sq_dev / float(count - 1)
def describe_float_1d(df, column, current_result, nrows):
    """Compute numeric summary statistics and histograms for a float column.

    Parameters:
        df: Spark DataFrame.
        column: name of the column to describe.
        current_result: mapping holding at least "count" (non-null rows).
        nrows: total row count of df.

    Returns a pandas Series with moments, quantiles, zero counts and two
    base64-encoded histogram images.
    """
    # FIX: replaced the .ix indexer (deprecated in pandas 0.20, removed in
    # 1.0) with .iloc; the frames here have default integer indexes, so the
    # positional lookup is behaviorally identical.
    if spark_version == "1.6+":
        stats_df = df.select(column).na.drop().agg(
            mean(col(column)).alias("mean"),
            df_min(col(column)).alias("min"),
            df_max(col(column)).alias("max"),
            variance(col(column)).alias("variance"),
            kurtosis(col(column)).alias("kurtosis"),
            stddev(col(column)).alias("std"),
            skewness(col(column)).alias("skewness"),
            df_sum(col(column)).alias("sum"),
        ).toPandas()
    else:
        # Older Spark lacks the higher-moment aggregates; compute them with
        # the custom fallback expressions in separate jobs.
        stats_df = df.select(column).na.drop().agg(
            mean(col(column)).alias("mean"),
            df_min(col(column)).alias("min"),
            df_max(col(column)).alias("max"),
            df_sum(col(column)).alias("sum"),
        ).toPandas()
        stats_df["variance"] = df.select(column).na.drop().agg(variance_custom(
            col(column), stats_df["mean"].iloc[0],
            current_result["count"])).toPandas().iloc[0][0]
        stats_df["std"] = np.sqrt(stats_df["variance"])
        stats_df["skewness"] = df.select(column).na.drop().agg(skewness_custom(
            col(column), stats_df["mean"].iloc[0],
            current_result["count"])).toPandas().iloc[0][0]
        stats_df["kurtosis"] = df.select(column).na.drop().agg(kurtosis_custom(
            col(column), stats_df["mean"].iloc[0],
            current_result["count"])).toPandas().iloc[0][0]

    # Approximate quantiles — one Spark job per requested probability.
    for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
        stats_df[pretty_name(x)] = (
            df.select(column)
            .na.drop()
            .selectExpr("percentile_approx(`{col}`,CAST({n} AS DOUBLE))"
                        .format(col=column, n=x)).toPandas().iloc[:, 0])

    stats = stats_df.iloc[0].copy()
    stats.name = column
    stats["range"] = stats["max"] - stats["min"]
    stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
    stats["cv"] = stats["std"] / float(stats["mean"])
    # Mean absolute deviation around the mean.
    stats["mad"] = (df.select(column)
                    .na.drop()
                    .select(df_abs(col(column) - stats["mean"]).alias("delta"))
                    .agg(df_sum(col("delta"))).toPandas().iloc[0, 0]
                    / float(current_result["count"]))
    stats["type"] = "NUM"
    stats['n_zeros'] = df.select(column).where(col(column) == 0.0).count()
    stats['p_zeros'] = stats['n_zeros'] / float(nrows)

    # Large histogram rendered to a base64 PNG data URI.
    imgdata = BytesIO()
    hist_data = create_hist_data(df, column, stats["min"], stats["max"], bins)
    figure = plt.figure(figsize=(6, 4))
    plot = plt.subplot()
    plt.bar(hist_data["left_edge"], hist_data["count"],
            width=hist_data["width"], facecolor='#337ab7')
    plot.set_ylabel("Frequency")
    plot.figure.subplots_adjust(left=0.15, right=0.95, top=0.9,
                                bottom=0.1, wspace=0, hspace=0)
    plot.figure.savefig(imgdata)
    imgdata.seek(0)
    stats['histogram'] = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
    # TODO Think about writing this to disk instead of caching them in strings
    plt.close(plot.figure)
    stats['mini_histogram'] = mini_histogram(hist_data)
    return stats
def describe_1d(df, column, nrows, lookup_config=None):
    """Describe a single column, dispatching on its Spark dtype.

    Computes generic counts (distinct, missing, uniqueness) and appends the
    type-specific statistics from the describe_*_1d helpers.

    Raises:
        NotImplementedError: for complex types (array, struct, map).
    """
    column_type = df.select(column).dtypes[0][1]
    # TODO: think about implementing analysis for complex data types.
    # FIX: "stuct" was a typo, so struct columns slipped past this guard
    # and failed later with a less helpful error.
    if ("array" in column_type) or ("struct" in column_type) or ("map" in column_type):
        raise NotImplementedError(
            "Column {c} is of type {t} and cannot be analyzed".format(
                c=column, t=column_type))

    results_data = df.select(
        countDistinct(col(column)).alias("distinct_count"),
        df_sum(when(col(column).isNotNull(), 1).otherwise(0)).alias('count')
    ).toPandas()
    results_data["p_unique"] = results_data["distinct_count"] / float(
        results_data["count"])
    results_data["is_unique"] = results_data["distinct_count"] == nrows
    results_data["n_missing"] = nrows - results_data["count"]
    results_data["p_missing"] = results_data["n_missing"] / float(nrows)
    results_data["p_infinite"] = 0
    results_data["n_infinite"] = 0
    result = results_data.iloc[0].copy()
    result["memorysize"] = 0
    result.name = column

    # NOTE(review): Series.append was removed in pandas 2.0; this module
    # assumes an older pandas — confirm against the pinned version.
    if result["distinct_count"] <= 1:
        result = result.append(describe_constant_1d(df, column))
    elif column_type in {"tinyint", "smallint", "int", "bigint"}:
        result = result.append(describe_integer_1d(df, column, result, nrows))
    elif column_type in {"float", "double", "decimal"}:
        result = result.append(describe_float_1d(df, column, result, nrows))
    elif column_type in {"date", "timestamp"}:
        result = result.append(describe_date_1d(df, column))
    elif result["is_unique"]:
        result = result.append(describe_unique_1d(df, column))
    else:
        result = result.append(describe_categorical_1d(df, column))

    # Fix to also count MISSING value in the distinct_count field:
    if result["n_missing"] > 0:
        result["distinct_count"] = result["distinct_count"] + 1

    # TODO: check whether it is worth it to implement the "real" mode:
    if result["count"] > result["distinct_count"] > 1:
        try:
            result["mode"] = result["top"]
        except KeyError:
            result["mode"] = 0
    else:
        try:
            result["mode"] = result["value_counts"].index[0]
        except KeyError:
            result["mode"] = 0
        # An IndexError here means the column contains only NULLs:
        except IndexError:
            result["mode"] = "MISSING"

    if lookup_config:
        lookup_object = lookup_config['object']
        col_name_in_db = lookup_config.get('col_name_in_db')
        try:
            matched, unmatched = lookup_object.lookup(
                df.select(column), col_name_in_db)
            result['lookedup_values'] = str(matched.count()) + "/" + str(
                df.select(column).count())
        # Best-effort external lookup: record the failure rather than abort
        # the whole profile. Narrowed from a bare `except:` so that
        # KeyboardInterrupt/SystemExit still propagate.
        except Exception:
            result['lookedup_values'] = 'FAILED'
    else:
        result['lookedup_values'] = ''
    return result