def histogram(data, range=None, bins=50, legend=None, title=None):
    """Create a histogram.

    Parameters
    ----------
    data : :class:`.Struct` or :class:`.Float64Expression`
        Sequence of data to plot.
    range : Tuple[float]
        Range of x values in the histogram.
    bins : int
        Number of bins in the histogram.
    legend : str
        Label of data on the x-axis.
    title : str
        Title of the histogram.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure`

    Raises
    ------
    ValueError
        If `data` is an expression without a source to aggregate over, or if
        `range` is not given and `data` contains no defined, finite values.
    """
    if isinstance(data, Expression):
        if data._indices.source is None:
            # Fail loudly: handing the exception object back to the caller
            # (instead of raising it) would silently masquerade as a figure.
            raise ValueError('Invalid input')

        agg_f = data._aggregation_method()
        if range is not None:
            start, end = range[0], range[1]
        else:
            # Derive the plotted range from the finite values only, so that
            # infinities and NaNs do not stretch or poison the bin edges.
            finite_data = hail.bind(lambda x: hail.case().when(hail.is_finite(x), x).or_missing(), data)
            start, end = agg_f((aggregators.min(finite_data), aggregators.max(finite_data)))
            if start is None and end is None:
                raise ValueError("'data' contains no values that are defined and finite")
        data = agg_f(aggregators.hist(data, start, end, bins))

    # From here on `data` is read through its `bin_edges`, `bin_freq`,
    # `n_larger` and `n_smaller` attributes (a precomputed histogram result).
    p = figure(
        title=title,
        x_axis_label=legend,
        y_axis_label='Frequency',
        background_fill_color='#EEEEEE')
    p.quad(
        bottom=0, top=data.bin_freq,
        left=data.bin_edges[:-1], right=data.bin_edges[1:],
        legend=legend, line_color='black')

    # Values outside the binned range are drawn as one extra bar on the
    # corresponding side, one regular bin wide.
    if data.n_larger > 0:
        bin_width = data.bin_edges[1] - data.bin_edges[0]
        p.quad(
            bottom=0, top=data.n_larger,
            left=data.bin_edges[-1], right=(data.bin_edges[-1] + bin_width),
            line_color='black', fill_color='green', legend='Outliers Above')
    if data.n_smaller > 0:
        bin_width = data.bin_edges[1] - data.bin_edges[0]
        p.quad(
            bottom=0, top=data.n_smaller,
            left=data.bin_edges[0] - bin_width, right=data.bin_edges[0],
            line_color='black', fill_color='red', legend='Outliers Below')
    return p
def histogram(data, range=None, bins=50, legend=None, title=None, log=False, interactive=False):
    """Create a histogram.

    Notes
    -----
    `data` can be a :class:`.Float64Expression`, or the result of the
    :func:`.agg.hist` or :func:`.agg.approx_cdf` aggregators.

    Parameters
    ----------
    data : :class:`.Struct` or :class:`.Float64Expression`
        Sequence of data to plot.
    range : Tuple[float]
        Range of x values in the histogram.
    bins : int
        Number of bins in the histogram.
    legend : str
        Label of data on the x-axis.
    title : str
        Title of the histogram.
    log : bool
        Plot the log10 of the bin counts. Bins whose count is not positive
        are drawn with zero height.
    interactive : bool
        If ``True``, also return a function that attaches `bins` / `phase`
        sliders to the plot. Only supported for data from 'approx_cdf'.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure`
        When `interactive` is ``True``, a tuple of the figure and a function
        taking a notebook handle is returned instead.

    Raises
    ------
    ValueError
        If `data` is an expression without a source to aggregate over, if
        `range` is not given and `data` contains no defined, finite values,
        or if `interactive` is set for data that is not an 'approx_cdf'
        result.
    """
    cdf = None
    if isinstance(data, Expression):
        if data._indices.source is None:
            # Fail loudly: handing the exception object back to the caller
            # (instead of raising it) would silently masquerade as a figure.
            raise ValueError('Invalid input')
        if interactive:
            raise ValueError("'interactive' flag can only be used on data from 'approx_cdf'.")

        agg_f = data._aggregation_method()
        if range is not None:
            start, end = range[0], range[1]
        else:
            # Derive the plotted range from the finite values only, so that
            # infinities and NaNs do not stretch or poison the bin edges.
            finite_data = hail.bind(lambda x: hail.case().when(hail.is_finite(x), x).or_missing(), data)
            start, end = agg_f((aggregators.min(finite_data), aggregators.max(finite_data)))
            if start is None and end is None:
                raise ValueError("'data' contains no values that are defined and finite")
        data = agg_f(aggregators.hist(data, start, end, bins))
    elif 'values' in data:
        cdf = data

    if interactive and cdf is None:
        # The slider callback re-bins from `cdf`; without it the callback
        # could only fail later with an unbound-variable error.
        raise ValueError("'interactive' flag can only be used on data from 'approx_cdf'.")

    # Work on local copies so that a histogram result passed in by the caller
    # is never modified (the log transform used to overwrite its fields).
    if cdf is not None:
        bin_freq, bin_edges = np.histogram(cdf.values, bins=bins, weights=np.diff(cdf.ranks), density=True)
        n_larger = n_smaller = 0
    else:
        bin_freq, bin_edges = data.bin_freq, data.bin_edges
        n_larger, n_smaller = data.n_larger, data.n_smaller

    if log:
        def _log10_or_zero(value):
            # log10 is undefined for zero (empty bins, and the outlier counts
            # of cdf-derived data, which are always zero); draw no bar there.
            return log10(value) if value > 0 else 0.0

        bin_freq = [_log10_or_zero(x) for x in bin_freq]
        n_larger = _log10_or_zero(n_larger)
        n_smaller = _log10_or_zero(n_smaller)
        y_axis_label = 'log10 Frequency'
    else:
        y_axis_label = 'Frequency'

    # Pad the visible x range by 5% of the binned span on each side.
    x_span = bin_edges[-1] - bin_edges[0]
    x_start = bin_edges[0] - .05 * x_span
    x_end = bin_edges[-1] + .05 * x_span
    p = figure(
        title=title,
        x_axis_label=legend,
        y_axis_label=y_axis_label,
        background_fill_color='#EEEEEE',
        x_range=(x_start, x_end))
    q = p.quad(
        bottom=0, top=bin_freq,
        left=bin_edges[:-1], right=bin_edges[1:],
        legend=legend, line_color='black')

    # Values outside the binned range are drawn as one extra bar on the
    # corresponding side, one regular bin wide.
    if n_larger > 0:
        bin_width = bin_edges[1] - bin_edges[0]
        p.quad(
            bottom=0, top=n_larger,
            left=bin_edges[-1], right=(bin_edges[-1] + bin_width),
            line_color='black', fill_color='green', legend='Outliers Above')
    if n_smaller > 0:
        bin_width = bin_edges[1] - bin_edges[0]
        p.quad(
            bottom=0, top=n_smaller,
            left=bin_edges[0] - bin_width, right=bin_edges[0],
            line_color='black', fill_color='red', legend='Outliers Below')

    if not interactive:
        return p

    def mk_interact(handle):
        def update(bins=bins, phase=0):
            # NOTE(review): `np.linspace(..., bins)` yields `bins` edges, i.e.
            # one fewer bin than the slider value when phase is 0, and the
            # slider below starts at 0 -- confirm both are intended.
            if phase > 0 and phase < 1:
                bins = bins + 1
                delta = (cdf.values[-1] - cdf.values[0]) / bins
                edges = np.linspace(cdf.values[0] - (1 - phase) * delta, cdf.values[-1] + phase * delta, bins)
            else:
                edges = np.linspace(cdf.values[0], cdf.values[-1], bins)
            hist, edges = np.histogram(cdf.values, bins=edges, weights=np.diff(cdf.ranks), density=True)
            new_data = {'top': hist, 'left': edges[:-1], 'right': edges[1:], 'bottom': np.full(len(hist), 0)}
            # Swap the main glyph's data and push the change to the notebook.
            q.data_source.data = new_data
            bokeh.io.push_notebook(handle)

        from ipywidgets import interact
        interact(update, bins=(0, 5*bins), phase=(0, 1, .01))

    return p, mk_interact
def is_finite_or_missing(x):
    """Keep `x` where it is finite and replace it with missing otherwise.

    The result of ``hl.is_finite(x)`` is used as the predicate of
    ``hl.or_missing``, with `x` itself as the value.
    """
    finite = hl.is_finite(x)
    return hl.or_missing(finite, x)