def summary_ccr(ht_ccr: hl.Table,
                file_output: str,
                ccr_pct_start: int = 0,
                ccr_pct_end: int = 100,
                ccr_pct_bins: int = 10,
                cumulative_histogram: bool = False,
                ccr_pct_cutoffs=None) -> None:
    """
    Export a per-gene summary of the CCR percentile field.

    Rows of `ht_ccr` are grouped by the `gene` field and the `ccr_pct` field
    is summarized in one of two ways:

    - binned (default): a histogram of `ccr_pct` is computed per gene and
      every bin becomes its own `ccr_bin_<lower edge>_<upper edge>` field;
    - cumulative: one `ccr_above_<cutoff>` field per cut-off, counting the
      rows whose `ccr_pct` is greater than or equal to that cut-off.

    :param ht_ccr: CCR Hail table (read fields: `gene`, `ccr_pct`)
    :param file_output: File output path
    :param ccr_pct_start: Start of histogram range.
    :param ccr_pct_end: End of histogram range
    :param ccr_pct_bins: Number of bins
    :param cumulative_histogram: Generate a cumulative histogram (rather than to use bins)
    :param ccr_pct_cutoffs: Cut-offs used to generate the cumulative histogram
                            (defaults to 90, 95 and 99)
    :return: None
    """
    cutoffs = [90, 95, 99] if ccr_pct_cutoffs is None else ccr_pct_cutoffs
    per_gene = ht_ccr.group_by('gene')

    if cumulative_histogram:
        # one counter per cut-off: rows at or above that percentile
        cutoff_counts = {}
        for cutoff in cutoffs:
            field_name = 'ccr_above_' + str(cutoff)
            cutoff_counts[field_name] = agg.filter(ht_ccr.ccr_pct >= cutoff, agg.count())
        summary_tb = per_gene.aggregate(**cutoff_counts)
    else:
        binned = per_gene.aggregate(
            ccr_bins=agg.hist(ht_ccr.ccr_pct, ccr_pct_start, ccr_pct_end, ccr_pct_bins)
        )

        # the edges are taken from a single row and reused to name the
        # per-bin fields (n_bins + 1 edges expected)
        bin_edges = binned.aggregate(agg.take(binned.ccr_bins.bin_edges, 1))[0]

        # spread the frequency array over one field per bin
        bin_fields = {}
        for k, (lower, upper) in enumerate(zip(bin_edges, bin_edges[1:])):
            field_name = 'ccr_bin_' + str(lower) + '_' + str(upper)
            bin_fields[field_name] = binned.ccr_bins.bin_freq[k]
        flattened = binned.annotate(**bin_fields).flatten()

        # the array-valued fields are now redundant
        summary_tb = flattened.drop('ccr_bins.bin_edges', 'ccr_bins.bin_freq')

    # write the summarized table to disk
    summary_tb.export(output=file_output)
def histogram(data, range=None, bins=50, legend=None, title=None):
    """Create a histogram.

    Notes
    -----
    When `data` is an expression it is aggregated into a histogram here;
    otherwise it is plotted as-is and must expose the fields ``bin_freq``,
    ``bin_edges``, ``n_larger`` and ``n_smaller``.

    Parameters
    ----------
    data : :class:`.Struct` or :class:`.Float64Expression`
        Sequence of data to plot.
    range : Tuple[float]
        Range of x values in the histogram. If not given, the minimum and
        maximum of the finite values of `data` are used.
    bins : int
        Number of bins in the histogram.
    legend : str
        Label of data on the x-axis.
    title : str
        Title of the histogram.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure`

    Raises
    ------
    ValueError
        If `data` is an expression without a source to aggregate over, or
        if no `range` is given and `data` has no defined, finite values.
    """
    if isinstance(data, Expression):
        if data._indices.source is None:
            # Previously the exception object was *returned*, so callers
            # silently received a ValueError instance instead of a figure.
            raise ValueError('Invalid input')

        agg_f = data._aggregation_method()
        if range is not None:
            start = range[0]
            end = range[1]
        else:
            # Ignore NaN / infinite values when deriving the plot range.
            finite_data = hail.bind(lambda x: hail.case().when(hail.is_finite(x), x).or_missing(), data)
            start, end = agg_f((aggregators.min(finite_data),
                                aggregators.max(finite_data)))
            if start is None and end is None:
                raise ValueError("'data' contains no values that are defined and finite")
        data = agg_f(aggregators.hist(data, start, end, bins))

    p = figure(title=title, x_axis_label=legend, y_axis_label='Frequency', background_fill_color='#EEEEEE')
    p.quad(
        bottom=0, top=data.bin_freq,
        left=data.bin_edges[:-1], right=data.bin_edges[1:],
        legend=legend, line_color='black')

    # Outliers are drawn as one extra bar on either side, as wide as the
    # first regular bin.
    if data.n_larger > 0:
        p.quad(
            bottom=0, top=data.n_larger,
            left=data.bin_edges[-1], right=(data.bin_edges[-1] + (data.bin_edges[1] - data.bin_edges[0])),
            line_color='black', fill_color='green', legend='Outliers Above')
    if data.n_smaller > 0:
        p.quad(
            bottom=0, top=data.n_smaller,
            left=data.bin_edges[0] - (data.bin_edges[1] - data.bin_edges[0]), right=data.bin_edges[0],
            line_color='black', fill_color='red', legend='Outliers Below')
    return p
def cumulative_histogram(data, range=None, bins=50, legend=None, title=None, normalize=True, log=False):
    """Create a cumulative histogram.

    Notes
    -----
    When `data` is an expression it is aggregated into a histogram here;
    otherwise it is plotted as-is and must expose the fields ``bin_freq``,
    ``bin_edges``, ``n_larger`` and ``n_smaller``.

    Parameters
    ----------
    data : :class:`.Struct` or :class:`.Float64Expression`
        Sequence of data to plot.
    range : Tuple[float]
        Range of x values in the histogram. If not given, the minimum and
        maximum of `data` are used.
    bins : int
        Number of bins in the histogram.
    legend : str
        Label of data on the x-axis.
    title : str
        Title of the histogram. When given, the total number of data points
        is appended to it.
    normalize: bool
        Whether or not the cumulative data should be normalized (divided by
        the total number of data points, including values outside `range`).
    log: bool
        Whether or not the y-axis should be of type log.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure`

    Raises
    ------
    ValueError
        If `data` is an expression without a source to aggregate over.
    """
    if isinstance(data, Expression):
        if data._indices.source is None:
            # Previously the exception object was *returned*, so callers
            # silently received a ValueError instance instead of a figure.
            raise ValueError('Invalid input')

        agg_f = data._aggregation_method()
        if range is not None:
            start = range[0]
            end = range[1]
        else:
            start, end = agg_f((aggregators.min(data), aggregators.max(data)))
        data = agg_f(aggregators.hist(data, start, end, bins))

    # Values below the range are counted from the first bin onwards.
    cumulative_data = np.cumsum(data.bin_freq) + data.n_smaller

    # Values above the range belong to no plotted bin but still count towards
    # the total. The original code tried to add them with np.append, whose
    # return value was discarded, so they were silently left out of the total.
    # The plotted series keeps one point per bin to match bin_edges[:-1].
    num_data_points = cumulative_data[-1] + data.n_larger

    if normalize:
        cumulative_data = cumulative_data / num_data_points
    if title is not None:
        title = f'{title} ({num_data_points:,} data points)'

    figure_kwargs = dict(title=title, x_axis_label=legend, y_axis_label='Frequency',
                         background_fill_color='#EEEEEE')
    if log:
        figure_kwargs['y_axis_type'] = 'log'
    p = figure(**figure_kwargs)

    p.line(data.bin_edges[:-1], cumulative_data, line_color='#036564', line_width=3)
    return p
def histogram(data, range=None, bins=50, legend=None, title=None, log=False, interactive=False):
    """Create a histogram.

    Notes
    -----
    `data` can be a :class:`.Float64Expression`, or the result of the
    :func:`.agg.hist` or :func:`.agg.approx_cdf` aggregators.

    Parameters
    ----------
    data : :class:`.Struct` or :class:`.Float64Expression`
        Sequence of data to plot.
    range : Tuple[float]
        Range of x values in the histogram. Only used when `data` is an
        expression; if not given, the minimum and maximum of the finite
        values of `data` are used.
    bins : int
        Number of bins in the histogram.
    legend : str
        Label of data on the x-axis.
    title : str
        Title of the histogram.
    log : bool
        Plot the log10 of the bin counts. Bins (and outlier counts) that are
        zero are drawn with height 0, since log10(0) is undefined.
    interactive : bool
        If `True`, also return a function that attaches bin/phase sliders to
        the plot in a notebook. Only supported when `data` is the result of
        :func:`.agg.approx_cdf`.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure` if `interactive` is `False`,
    otherwise a tuple of the figure and a function ``mk_interact(handle)``
    taking the notebook handle the plot is shown under.

    Raises
    ------
    ValueError
        If `data` is an expression without a source to aggregate over, if no
        `range` is given and `data` has no defined, finite values, or if
        `interactive` is requested for data that is not an ``approx_cdf``
        result.
    """
    interactive_error = "'interactive' flag can only be used on data from 'approx_cdf'."

    if isinstance(data, Expression):
        if data._indices.source is None:
            # Previously the exception object was *returned*, so callers
            # silently received a ValueError instance instead of a figure.
            raise ValueError('Invalid input')
        if interactive:
            raise ValueError(interactive_error)

        agg_f = data._aggregation_method()
        if range is not None:
            start = range[0]
            end = range[1]
        else:
            # Ignore NaN / infinite values when deriving the plot range.
            finite_data = hail.bind(lambda x: hail.case().when(hail.is_finite(x), x).or_missing(), data)
            start, end = agg_f((aggregators.min(finite_data),
                                aggregators.max(finite_data)))
            if start is None and end is None:
                raise ValueError("'data' contains no values that are defined and finite")
        data = agg_f(aggregators.hist(data, start, end, bins))
    elif 'values' in data:
        # approx_cdf result: derive a density histogram from the ranks.
        cdf = data
        hist, edges = np.histogram(cdf.values, bins=bins, weights=np.diff(cdf.ranks), density=True)
        data = Struct(bin_freq=hist, bin_edges=edges, n_larger=0, n_smaller=0)
    elif interactive:
        # The interactive callback re-bins from the CDF; without one it would
        # only fail later with a NameError inside the widget callback.
        raise ValueError(interactive_error)

    if log:
        def log10_or_zero(value):
            # log10 is undefined at 0 (math.log10 raises, numpy yields -inf);
            # outlier counts are 0 whenever nothing falls outside the range,
            # so empty entries are mapped to a zero-height bar instead.
            return log10(value) if value > 0 else 0

        data.bin_freq = [log10_or_zero(x) for x in data.bin_freq]
        data.n_larger = log10_or_zero(data.n_larger)
        data.n_smaller = log10_or_zero(data.n_smaller)
        y_axis_label = 'log10 Frequency'
    else:
        y_axis_label = 'Frequency'

    # Pad the x-axis by 5% of the covered span on both sides.
    x_span = data.bin_edges[-1] - data.bin_edges[0]
    x_start = data.bin_edges[0] - .05 * x_span
    x_end = data.bin_edges[-1] + .05 * x_span
    p = figure(
        title=title,
        x_axis_label=legend,
        y_axis_label=y_axis_label,
        background_fill_color='#EEEEEE',
        x_range=(x_start, x_end))
    q = p.quad(
        bottom=0, top=data.bin_freq,
        left=data.bin_edges[:-1], right=data.bin_edges[1:],
        legend=legend, line_color='black')

    # Outliers are drawn as one extra bar on either side, as wide as the
    # first regular bin.
    if data.n_larger > 0:
        p.quad(
            bottom=0, top=data.n_larger,
            left=data.bin_edges[-1], right=(data.bin_edges[-1] + (data.bin_edges[1] - data.bin_edges[0])),
            line_color='black', fill_color='green', legend='Outliers Above')
    if data.n_smaller > 0:
        p.quad(
            bottom=0, top=data.n_smaller,
            left=data.bin_edges[0] - (data.bin_edges[1] - data.bin_edges[0]), right=data.bin_edges[0],
            line_color='black', fill_color='red', legend='Outliers Below')

    if not interactive:
        return p

    def mk_interact(handle):
        def update(bins=bins, phase=0):
            # Recompute the bars of the main quad glyph from the CDF for the
            # slider-selected number of edges and edge offset (phase).
            if phase > 0 and phase < 1:
                bins = bins + 1
                delta = (cdf.values[-1] - cdf.values[0]) / bins
                edges = np.linspace(cdf.values[0] - (1 - phase) * delta, cdf.values[-1] + phase * delta, bins)
            else:
                edges = np.linspace(cdf.values[0], cdf.values[-1], bins)
            hist, edges = np.histogram(cdf.values, bins=edges, weights=np.diff(cdf.ranks), density=True)
            new_data = {'top': hist, 'left': edges[:-1], 'right': edges[1:], 'bottom': np.full(len(hist), 0)}
            q.data_source.data = new_data
            bokeh.io.push_notebook(handle)

        # Imported lazily so ipywidgets is only needed for interactive use.
        from ipywidgets import interact
        interact(update, bins=(0, 5*bins), phase=(0, 1, .01))

    return p, mk_interact