def column_summary(series, column_props, delta=0.01):
    """Summarise a numeric column.

    Parameters
    ----------
    series : pd.Series
        Numeric column.
    column_props : dict
        Per-column properties; the entries ``numeric``, ``notnulls`` and
        ``is_categorical`` are read for ``series.name``.
    delta : float
        Passed to ``TDigest`` and re-applied before compression —
        presumably the digest's accuracy/compression parameter; confirm
        against the tdigest library's documentation.

    Returns
    -------
    dict or None
        ``{col: colresult, "_columns": [col]}`` where ``colresult``
        contains basic statistics (mean/min/max/std/sum/n), percentiles,
        median and IQR, log-transform diagnostics, the compressed
        t-digest centroids, a histogram and a KDE.  ``None`` when the
        column is not numeric or contains no non-null values.
    """
    col = series.name
    if not column_props[col]["numeric"] or column_props[col]["notnulls"] == 0:
        # Series is not numeric or is all NaNs.
        return None

    logger.debug("column_summary - %s", col)

    # Select non-nulls from column.
    data = series.dropna()
    colresult = {}
    for m in ["mean", "min", "max", "std", "sum"]:
        val = getattr(data, m)()
        # Cast NumPy integer scalars to plain int (keeps the summary
        # JSON-serialisable).  isinstance(np.integer) covers every
        # integer width — e.g. np.int32, which pandas can return on
        # Windows — not just np.int64.
        if isinstance(val, np.integer):
            colresult[m] = int(val)
        else:
            colresult[m] = val
    colresult["n"] = column_props[col]["notnulls"]

    # Percentiles are taken over the raw series; nanpercentile ignores
    # the NaNs that dropna() removed above, so the two views agree.
    percentiles = [0.1, 1, 10, 25, 50, 75, 90, 99, 99.9]
    colresult["percentiles"] = {
        perc: np.nanpercentile(series, perc) for perc in percentiles
    }
    colresult["median"] = colresult["percentiles"][50]
    colresult["iqr"] = (colresult["percentiles"][75] -
                        colresult["percentiles"][25])

    # Compute the t-digest.
    logger.debug("column_summary - %s - creating TDigest...", col)
    digest = TDigest(delta)
    digest.batch_update(data)

    logger.debug("column_summary - %s - testing log trans...", col)
    try:
        colresult["logtrans"] = bool(_test_logtrans(digest))
    except Exception as e:
        # Hard to pinpoint problems with the logtrans TDigest.
        logger.warning("test_logtrans has failed for column `%s`: %s", col, e)
        colresult["logtrans"] = False

    if colresult["logtrans"]:
        # Summarise the log-transformed distribution by re-digesting
        # the centroids in log space.
        logdigest = TDigest()
        for c in digest.C.values():
            logdigest.update(np.log(c.mean), c.count)
        colresult["logtrans_mean"] = _tdigest_mean(logdigest)
        colresult["logtrans_std"] = _tdigest_std(logdigest)
        colresult["logtrans_IQR"] = (logdigest.percentile(75) -
                                     logdigest.percentile(25))

    logger.debug("column_summary - %s - should %sbe log-transformed",
                 col, "NOT " if not colresult["logtrans"] else "")

    # Compress and store the t-digest.
    digest.delta = delta
    digest.compress()
    colresult["tdigest"] = [(c.mean, c.count) for c in digest.C.values()]

    # Compute histogram.
    logger.debug("column_summary - %s - computing histogram...", col)
    if column_props[col]["is_categorical"]:
        # Compute frequency table and store as histogram.
        counts, edges = _compute_histogram_from_frequencies(data)
    else:
        if colresult["logtrans"]:
            # Bin in log10 space, then map the edges back so the stored
            # bin_edges are on the original scale.
            counts, log_edges = np.histogram(np.log10(data),
                                             density=False, bins="fd")
            edges = 10**log_edges
        else:
            counts, edges = np.histogram(data, density=False, bins="fd")
    colresult["histogram"] = {
        "counts": counts.tolist(),
        "bin_edges": edges.tolist(),
    }

    # Compute KDE.
    logger.debug("column_summary - %s - computing KDE...", col)
    bw = _bw_scott(colresult, colresult["n"], colresult["logtrans"], 1)
    logger.debug("column_summary - %s - KDE bw: %.4g", col, bw)
    if column_props[col]["is_categorical"]:
        # No meaningful KDE for categoricals; emit a degenerate curve.
        kde_x, kde_y = np.zeros(1), np.zeros(1)
    else:
        coord_range = colresult["min"], colresult["max"]
        kde_x, kde_y = _compute_smoothed_histogram(
            data, bw, coord_range, logtrans=colresult["logtrans"])
    colresult["kde"] = {"x": kde_x.tolist(), "y": kde_y.tolist()}

    return {col: colresult, "_columns": [col]}
def column_summary(series, column_props, delta=0.01):
    """Summarise a numeric column.

    NOTE(review): this is a duplicate of the ``column_summary``
    definition earlier in this module (differing only in quote style);
    being defined later, it shadows the earlier one — confirm which
    copy is intended and remove the other.

    Parameters
    ----------
    series : pd.Series
        Numeric column.
    column_props : dict
        Per-column properties; the entries ``numeric``, ``notnulls`` and
        ``is_categorical`` are read for ``series.name``.
    delta : float
        Passed to ``TDigest`` and re-applied before compression —
        presumably the digest's accuracy/compression parameter; confirm
        against the tdigest library's documentation.

    Returns
    -------
    dict or None
        ``{col: colresult, '_columns': [col]}`` where ``colresult``
        contains basic statistics (mean/min/max/std/sum/n), percentiles,
        median and IQR, log-transform diagnostics, the compressed
        t-digest centroids, a histogram and a KDE.  ``None`` when the
        column is not numeric or contains no non-null values.
    """
    col = series.name
    if not column_props[col]['numeric'] or column_props[col]['notnulls'] == 0:
        # Series is not numeric or is all NaNs.
        return None

    logger.debug('column_summary - ' + col)

    # select non-nulls from column
    data = series.dropna()
    colresult = {}
    for m in ['mean', 'min', 'max', 'std', 'sum']:
        val = getattr(data, m)()
        # Cast NumPy int64 scalars to plain int so the summary is
        # serialisable.  NOTE(review): `type(val) is np.int64` misses
        # other NumPy integer widths (e.g. np.int32) — verify this is
        # intentional.
        if type(val) is np.int64:
            colresult[m] = int(val)
        else:
            colresult[m] = val
    colresult['n'] = column_props[col]['notnulls']

    # Percentiles over the raw series; nanpercentile skips the NaNs
    # that dropna() removed above, so the two views agree.
    percentiles = [0.1, 1, 10, 25, 50, 75, 90, 99, 99.9]
    colresult['percentiles'] = {
        perc: np.nanpercentile(series, perc) for perc in percentiles
    }
    colresult['median'] = colresult['percentiles'][50]
    colresult['iqr'] = (colresult['percentiles'][75] -
                        colresult['percentiles'][25])

    # Compute the t-digest.
    logger.debug('column_summary - {} - creating TDigest...'.format(col))
    digest = TDigest(delta)
    digest.batch_update(data)

    logger.debug('column_summary - {} - testing log trans...'.format(col))
    try:
        colresult['logtrans'] = bool(_test_logtrans(digest))
    except Exception as e:
        # Hard to pinpoint problems with the logtrans TDigest.
        logger.warning('test_logtrans has failed for column `{}`: {}'.format(
            col, e))
        colresult['logtrans'] = False

    if colresult['logtrans']:
        # Summarise the log-transformed distribution by re-digesting
        # the centroids in log space.
        logdigest = TDigest()
        for c in digest.C.values():
            logdigest.update(np.log(c.mean), c.count)
        colresult['logtrans_mean'] = _tdigest_mean(logdigest)
        colresult['logtrans_std'] = _tdigest_std(logdigest)
        colresult['logtrans_IQR'] = (logdigest.percentile(75) -
                                     logdigest.percentile(25))

    logger.debug('column_summary - {} - should {}be log-transformed'.format(
        col, 'NOT ' if not colresult['logtrans'] else ''))

    # Compress and store the t-digest.
    digest.delta = delta
    digest.compress()
    colresult['tdigest'] = [(c.mean, c.count) for c in digest.C.values()]

    # Compute histogram
    logger.debug('column_summary - {} - computing histogram...'.format(col))
    if column_props[col]['is_categorical']:
        # Compute frequency table and store as histogram
        counts, edges = _compute_histogram_from_frequencies(data)
    else:
        if colresult['logtrans']:
            # Bin in log10 space, then map the edges back so the
            # stored bin_edges are on the original scale.
            counts, log_edges = np.histogram(np.log10(data),
                                             density=False, bins='fd')
            edges = 10**log_edges
        else:
            counts, edges = np.histogram(data, density=False, bins='fd')
    colresult['histogram'] = {
        'counts': counts.tolist(),
        'bin_edges': edges.tolist()
    }

    # Compute KDE
    logger.debug('column_summary - {} - computing KDE...'.format(col))
    bw = _bw_scott(colresult, colresult['n'], colresult['logtrans'], 1)
    logger.debug('column_summary - {} - KDE bw: {:.4g}'.format(col, bw))
    if column_props[col]['is_categorical']:
        # No meaningful KDE for categoricals; emit a degenerate curve.
        kde_x, kde_y = np.zeros(1), np.zeros(1)
    else:
        coord_range = colresult['min'], colresult['max']
        kde_x, kde_y = _compute_smoothed_histogram(
            data, bw, coord_range, logtrans=colresult['logtrans'])
    colresult['kde'] = {'x': kde_x.tolist(), 'y': kde_y.tolist()}

    return {col: colresult, '_columns': [col]}