def qqplot(pvals, title: str = None, figsize: tuple = (10, 10)):
    """
    Draw a matplotlib QQ plot of p-values and compute their genomic inflation factor.

    :param pvals: Hail expression of p-values whose source is a Table, or otherwise an object
        exposing ``select_rows(...).rows()`` (presumably a MatrixTable -- confirm against callers)
    :param title: Plot title; ``'QQ Plot'`` is used when this is None or empty
    :param figsize: Figure size handed to ``plt.figure``
    :return: Tuple of (matplotlib figure, ``hl.lambda_gc`` of the p-values rounded to 3 decimals)
    """
    source = pvals._indices.source
    if isinstance(source, Table):
        ht = source.select(p_value=pvals)
    else:
        ht = source.select_rows(p_value=pvals).rows()

    # Keep only the p-value and key by it; the running count used for the expected
    # quantiles below is taken over the table in this keyed form.
    ht = ht.key_by().select('p_value').key_by('p_value').persist()
    lambda_gc = hl.lambda_gc(ht['p_value'])
    n = ht.count()
    ht = ht.annotate(
        observed_p=-hl.log10(ht['p_value']),
        expected_p=-hl.log10((hl.scan.count() + 1) / n),
        p_val=ht['p_value'],
    ).persist()

    p_val_pd = ht.to_pandas()
    # Cap observed -log10(p) at 10. Use clip() rather than writing through `.values`,
    # which is a read-only array when pandas copy-on-write is active.
    p_val_pd['observed_p'] = p_val_pd['observed_p'].clip(upper=10)

    largest_expected = p_val_pd['expected_p'].max()
    largest_observed = p_val_pd['observed_p'].max()
    mini = min(largest_expected, largest_observed)
    maxi = max(largest_expected, largest_observed)

    title = f'{title}' if title else 'QQ Plot'
    fig = plt.figure(figsize=figsize)
    plt.scatter(p_val_pd['expected_p'], p_val_pd['observed_p'], c='black', s=0.5)
    # Diagonal reference line, drawn up to the smaller of the two maxima
    plt.plot((0, mini), (0, mini), 'red')
    plt.xlim([0, maxi + 0.5])
    plt.ylim([0, maxi + 0.5])
    plt.title(title, fontsize=20)
    plt.ylabel('Observed -log10(' + r'$p$' + ')', fontsize=15)
    plt.xlabel('Expected -log10(' + r'$p$' + ')', fontsize=15)
    # Close the figure we just created so it is not left open in pyplot's registry;
    # the Figure object itself is still returned to the caller.
    plt.close(fig)
    return fig, round(lambda_gc, 3)
def compute_same_hap_log_like(n, p, q, x):
    """
    Build the Hail expression for the "same haplotype" log10-likelihood (per the function name).

    The result is the sum over ``gt_counts`` of count * log10-term, or a very large
    negative value when ``q`` is not positive. ``gt_counts``, ``e`` and ``distance``
    are not parameters; they are read from the enclosing scope.

    :param n: Not used by this function
    :param p: Frequency-like expression used in the log10 terms
    :param q: Frequency-like expression; the likelihood is only computed when ``q > 0``
    :param x: Frequency-like expression used in the log10 terms
    :return: Log10-likelihood expression, plus the distance term when ``distance`` is not None
    """
    # One log10 term per entry of gt_counts; paired positionally by hl.zip below
    log_terms = [
        hl.log10(x) * 2,
        hl.log10(2 * x * e),
        hl.log10(e) * 2,
        hl.log10(2 * x * p),
        hl.log10(2 * (p * e + x * q)),
        hl.log10(2 * q * e),
        hl.log10(p) * 2,
        hl.log10(2 * p * q),
        hl.log10(q) * 2,
    ]
    weighted_sum = hl.fold(
        lambda acc, pair: acc + pair[0] * pair[1],
        0.0,
        hl.zip(gt_counts, log_terms),
    )
    res = hl.cond(
        q > 0,
        weighted_sum,
        -1e31,  # Very large negative value if no q is present
    )

    # If desired, add distance posterior based on value derived from regression
    if distance is not None:
        distance_term = hl.max(-6, hl.log10(0.97 - 0.03 * hl.log(distance + 1)))
        res = res + distance_term
    return res
def qq(pvals, collect_all=False, n_divisions=500):
    """Create a Quantile-Quantile plot. (https://en.wikipedia.org/wiki/Q-Q_plot)

    Parameters
    ----------
    pvals : List[float] or :class:`.Float64Expression`
        P-values to be plotted.
    collect_all : bool
        Whether to collect all values or downsample before plotting.
        This parameter will be ignored if pvals is a Python object.
    n_divisions : int
        Factor by which to downsample (default value = 500). A lower input results in fewer output datapoints.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure`

    Raises
    ------
    ValueError
        If `pvals` is an expression that has no source.
    """
    def _points_from_values(values):
        # Falsy entries (None, 0) and NaN are dropped before ranking; the expected
        # quantile of the i-th smallest remaining value is i / count.
        ordered = sorted(filter(lambda x: x and not isnan(x), values))
        count = len(ordered)
        expected = [-log(float(rank) / count, 10) for rank in range(1, count + 1)]
        observed = [-log(p, 10) for p in ordered]
        return expected, observed

    if not isinstance(pvals, Expression):
        exp, obs = _points_from_values(pvals)
    else:
        source = pvals._indices.source
        if source is None:
            # Raise (the exception used to be returned as if it were the figure)
            raise ValueError('Invalid input: expression has no source')

        if collect_all:
            exp, obs = _points_from_values(pvals.collect())
        else:
            if isinstance(source, Table):
                ht = source.select(pval=pvals).key_by().persist().key_by('pval')
            else:
                ht = source.select_rows(pval=pvals).rows().key_by().select('pval').persist().key_by('pval')
            n = ht.count()
            # Running row count over the table keyed by p-value gives each row's rank
            ht = ht.select(idx=hail.scan.count())
            ht = ht.annotate(expected_p=(ht.idx + 1) / n)
            points = ht.aggregate(
                aggregators.downsample(-hail.log10(ht.expected_p), -hail.log10(ht.pval), n_divisions=n_divisions))
            # Skip points whose observed value is NaN
            kept = [point for point in points if not isnan(point[1])]
            exp = [point[0] for point in kept]
            obs = [point[1] for point in kept]

    p = figure(
        title='Q-Q Plot',
        x_axis_label='Expected p-value (-log10 scale)',
        y_axis_label='Observed p-value (-log10 scale)')
    p.scatter(x=exp, y=obs, color='black')
    # Diagonal reference line extended 10% past the largest plotted value
    bound = max(max(exp), max(obs)) * 1.1
    p.line([0, bound], [0, bound], color='red')
    return p
def qq(pvals, collect_all=False, n_divisions=500):
    """Create a Quantile-Quantile plot. (https://en.wikipedia.org/wiki/Q-Q_plot)

    Parameters
    ----------
    pvals : List[float] or :class:`.Float64Expression`
        P-values to be plotted.
    collect_all : bool
        Whether to collect all values or downsample before plotting.
        This parameter will be ignored if pvals is a Python object.
    n_divisions : int
        Factor by which to downsample (default value = 500). A lower input results in fewer output datapoints.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure`

    Raises
    ------
    ValueError
        If `pvals` is an expression that has no source.
    """
    def _points_from_values(values):
        # Falsy entries (None, 0) and NaN are dropped before ranking; the expected
        # quantile of the i-th smallest remaining value is i / count.
        ordered = sorted(filter(lambda x: x and not isnan(x), values))
        count = len(ordered)
        expected = [-log(float(rank) / count, 10) for rank in range(1, count + 1)]
        observed = [-log(p, 10) for p in ordered]
        return expected, observed

    if not isinstance(pvals, Expression):
        exp, obs = _points_from_values(pvals)
    else:
        source = pvals._indices.source
        if source is None:
            # Raise (the exception used to be returned as if it were the figure)
            raise ValueError('Invalid input: expression has no source')

        if collect_all:
            exp, obs = _points_from_values(pvals.collect())
        else:
            if isinstance(source, Table):
                ht = source.select(pval=pvals).key_by().persist().key_by('pval')
            else:
                ht = source.select_rows(pval=pvals).rows().key_by().select('pval').persist().key_by('pval')
            n = ht.count()
            # Running row count over the table keyed by p-value gives each row's rank
            ht = ht.select(idx=hail.scan.count())
            ht = ht.annotate(expected_p=(ht.idx + 1) / n)
            points = ht.aggregate(
                aggregators.downsample(-hail.log10(ht.expected_p), -hail.log10(ht.pval), n_divisions=n_divisions))
            # Skip points whose observed value is NaN
            kept = [point for point in points if not isnan(point[1])]
            exp = [point[0] for point in kept]
            obs = [point[1] for point in kept]

    p = figure(
        title='Q-Q Plot',
        x_axis_label='Expected p-value (-log10 scale)',
        y_axis_label='Observed p-value (-log10 scale)')
    p.scatter(x=exp, y=obs, color='black')
    # Diagonal reference line extended 10% past the largest plotted value
    bound = max(max(exp), max(obs)) * 1.1
    p.line([0, bound], [0, bound], color='red')
    return p
def get_annotations_hists(
    ht: hl.Table,
    annotations_hists: Dict[str, Tuple],
    log10_annotations: List[str] = ["DP"],
) -> Dict[str, hl.expr.StructExpression]:
    """
    Build histogram aggregators for the variant metrics stored in ``ht.info``.

    Used when creating site quality distribution json files.

    :param ht: Table with variant metrics under ``info``
    :param annotations_hists: Dictionary mapping metric name to its histogram
        settings ``(start, end, bins)``
    :param log10_annotations: Metrics whose values are log10-scaled before binning
    :return: Dictionary of metrics and their histograms; metrics that are not
        fields of ``ht.info`` are left out
    :rtype: Dict[str, hl.expr.StructExpression]
    """
    hists = {}
    for field, (start, end, bins) in annotations_hists.items():
        # Only metrics actually present in ht.info get a histogram
        if field not in ht.row.info:
            continue

        values = ht.info[field]
        if field in log10_annotations:
            values = hl.log10(values)

        hists[field] = hl.agg.hist(values, start, end, bins)
    return hists
def test_manhattan_plot():
    """Check that a locus-on-x ggplot rendered to plotly labels its x axis with contigs 1-22, X and Y."""
    variants = hl.balding_nichols_model(3, 10, 100).rows()
    variants = variants.annotate(pval=.02)

    points = ggplot(variants, aes(x=variants.locus, y=-hl.log10(variants.pval))) + geom_point()
    fig = points + geom_hline(yintercept=-math.log10(5e-8))
    pfig = fig.to_plotly()

    # Autosomes '1'..'22' followed by the sex chromosomes
    expected_ticks = tuple(str(number) for number in range(1, 23)) + ('X', 'Y')
    assert pfig.layout.xaxis.ticktext == expected_ticks
def to_phred(linear_expr: hl.expr.NumericExpression) -> hl.expr.Float64Expression:
    """
    Convert a linear-scale value to the phred scale, i.e. ``-10 * log10(value)``.

    :param linear_expr: Linear-scale input expression
    :return: Phred-scaled value
    """
    log10_value = hl.log10(linear_expr)
    return -10 * log10_value
def compute_stats(stats_path: str):
    """
    Aggregate length statistics over entries with a defined ``END`` and pickle them.

    The length ``END - locus.position`` is summarised twice, once over all such
    entries and once over those passing ``get_adj_expr(LGT, GQ, DP, LAD)``. Each
    summary holds ``hl.agg.stats``, a linear histogram (0 to 9999, 10000 bins) and
    a histogram of ``log10(1 + length)`` (0 to 5, 100 bins).

    :param stats_path: Destination path, opened with ``hl.hadoop_open`` in binary write mode
    """
    mt = get_gnomad_v3_mt()
    mt = mt.filter_entries(hl.is_defined(mt.END))

    def _length_summary():
        # The same three aggregators are needed for both the unfiltered and the adj summary
        return hl.struct(
            stats=hl.agg.stats(mt.END - mt.locus.position),
            hist=hl.agg.hist(mt.END - mt.locus.position, 0, 9999, 10000),
            hist_log=hl.agg.hist(hl.log10(1 + mt.END - mt.locus.position), 0, 5, 100),
        )

    all_entries = _length_summary()
    adj_entries = hl.agg.filter(
        get_adj_expr(mt.LGT, mt.GQ, mt.DP, mt.LAD),
        _length_summary(),
    )
    ref_block_stats = mt.aggregate_entries(
        hl.struct(ref_block_stats=all_entries, adj_ref_block_stats=adj_entries)
    )

    with hl.hadoop_open(stats_path, 'wb') as f:
        pickle.dump(ref_block_stats, f)
def main(args):
    """
    Scatter the -log10 p-values of two summary-statistics tables against each other.

    Both inputs are loaded with ``import_key``, the second is joined onto the first
    by key, and the figure is saved under ``/tmp`` before being copied to ``args.out``.

    :param args: Parsed arguments providing ``ss1``, ``ss1_chr_pos_ref_alt_p``, ``ss2``,
        ``ss2_chr_pos_ref_alt_p``, ``ss1_name``, ``ss2_name``, ``trait`` and ``out``
    """
    # import_key is assumed to return (table, name of its p-value field) -- confirm
    first_ss, first_p = import_key(args.ss1, args.ss1_chr_pos_ref_alt_p)
    second_ss, second_p = import_key(args.ss2, args.ss2_chr_pos_ref_alt_p)
    joined = first_ss.annotate(ss2=second_ss[first_ss.key])

    x = (-hl.log10(joined[first_p])).collect()
    y = (-hl.log10(joined.ss2[second_p])).collect()

    fig, ax = plt.subplots()
    plt.xlabel(args.ss1_name)
    plt.ylabel(args.ss2_name)
    plt.title(args.trait)
    ax.scatter(x, y)

    # Identity line spanning the smallest and largest limit of either axis
    axis_limits = [ax.get_xlim(), ax.get_ylim()]
    lims = [np.min(axis_limits), np.max(axis_limits)]
    ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)

    # Save locally under the output's base name, then copy to the requested destination
    out_base = args.out.split('/')[-1]
    fig.savefig('/tmp/' + out_base)
    hl.hadoop_copy('file:///tmp/' + out_base, args.out)
def compute_chet_log_like(n, p, q, x):
    """
    Build the Hail expression for the "chet" log10-likelihood (per the function name).

    The result is the sum over ``gt_counts`` of count * log10-term, or a very large
    negative value unless both ``p`` and ``q`` are positive. ``gt_counts``, ``e`` and
    ``distance`` are not parameters; they are read from the enclosing scope.

    :param n: Not used by this function
    :param p: Frequency-like expression; the likelihood is only computed when ``p > 0``
    :param q: Frequency-like expression; the likelihood is only computed when ``q > 0``
    :param x: Frequency-like expression used in the log10 terms
    :return: Log10-likelihood expression, plus the distance term when ``distance`` is not None
    """
    # One log10 term per entry of gt_counts; paired positionally by hl.zip below
    log_terms = [
        hl.log10(x) * 2,
        hl.log10(2 * x * q),
        hl.log10(q) * 2,
        hl.log10(2 * x * p),
        hl.log10(2 * (p * q + x * e)),
        hl.log10(2 * q * e),
        hl.log10(p) * 2,
        hl.log10(2 * p * e),
        hl.log10(e) * 2,
    ]
    res = hl.cond(
        (p > 0) & (q > 0),
        hl.fold(
            lambda acc, pair: acc + pair[0] * pair[1],
            0.0,  # float seed, matching compute_same_hap_log_like
            hl.zip(gt_counts, log_terms),
        ),
        # Very large negative value if p or q is not present. This was -1e-31, which
        # is ~0 (the highest possible log-likelihood) and so marked the impossible
        # case as the most likely one; compute_same_hap_log_like uses -1e31.
        -1e31,
    )

    # If desired, add distance posterior based on value derived from regression
    # NOTE(review): hl.log(distance - 1) is not finite for distance <= 1 -- confirm
    # that callers never pass such distances.
    if distance is not None:
        res = res + hl.max(-6, hl.log10(0.03 + 0.03 * hl.log(distance - 1)))
    return res
def test(self):
    """Annotate a one-row table with a broad set of Hail functions and convert the first result row to a dict."""
    element_type = hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr)
    schema = hl.tstruct(
        a=hl.tint32,
        b=hl.tint32,
        c=hl.tint32,
        d=hl.tint32,
        e=hl.tstr,
        f=hl.tarray(hl.tint32),
        g=hl.tarray(element_type),
        h=hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tstr),
        i=hl.tbool,
        j=hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr),
    )

    only_row = {
        'a': 4,
        'b': 1,
        'c': 3,
        'd': 5,
        'e': "hello",
        'f': [1, 2, 3],
        'g': [hl.Struct(x=1, y=5, z='banana')],
        'h': hl.Struct(a=5, b=3, c='winter'),
        'i': True,
        'j': hl.Struct(x=3, y=2, z='summer'),
    }
    kt = hl.Table.parallelize([only_row], schema)

    annotated = kt.annotate(
        chisq=hl.chisq(kt.a, kt.b, kt.c, kt.d),
        ctt=hl.ctt(kt.a, kt.b, kt.c, kt.d, 5),
        dict=hl.dict(hl.zip([kt.a, kt.b], [kt.c, kt.d])),
        dpois=hl.dpois(4, kt.a),
        drop=kt.h.drop('b', 'c'),
        exp=hl.exp(kt.c),
        fet=hl.fisher_exact_test(kt.a, kt.b, kt.c, kt.d),
        hwe=hl.hardy_weinberg_p(1, 2, 1),
        index=hl.index(kt.g, 'z'),
        is_defined=hl.is_defined(kt.i),
        is_missing=hl.is_missing(kt.i),
        is_nan=hl.is_nan(hl.float64(kt.a)),
        json=hl.json(kt.g),
        log=hl.log(kt.a, kt.b),
        log10=hl.log10(kt.c),
        or_else=hl.or_else(kt.a, 5),
        or_missing=hl.or_missing(kt.i, kt.j),
        pchisqtail=hl.pchisqtail(kt.a, kt.b),
        pcoin=hl.rand_bool(0.5),
        pnorm=hl.pnorm(0.2),
        pow=2.0 ** kt.b,
        ppois=hl.ppois(kt.a, kt.b),
        qchisqtail=hl.qchisqtail(kt.a, kt.b),
        range=hl.range(0, 5, kt.b),
        rnorm=hl.rand_norm(0.0, kt.b),
        rpois=hl.rand_pois(kt.a),
        runif=hl.rand_unif(kt.b, kt.a),
        select=kt.h.select('c', 'b'),
        sqrt=hl.sqrt(kt.a),
        to_str=[hl.str(5), hl.str(kt.a), hl.str(kt.g)],
        where=hl.cond(kt.i, 5, 10),
    )

    first_row = annotated.take(1)[0]
    result = convert_struct_to_dict(first_row)
def main(args):
    """
    Compute variant-metric histograms for the release table and write them out as JSON.

    With ``args.first_pass`` set, only the minimum and maximum (capped at 1e10) of each
    metric are aggregated and printed, to help choose histogram bounds. Otherwise the
    histograms are aggregated over the configured ranges, together with
    ``log10(QUALapprox)`` histograms grouped by frequency bin, and written to
    ``qual_hists_json_path(CURRENT_RELEASE)``.

    :param args: Parsed arguments; only ``first_pass`` is read
    """
    hl.init(default_reference='GRCh38', log='/variant_histograms.log')

    ht = hl.read_table(release_ht_path())
    # NOTE: histogram aggregations are done on the entire callset (not just PASS variants), on raw data

    # Copy before overriding MQ so the shared ANNOTATIONS_HISTS constant is not mutated
    hist_dict = dict(ANNOTATIONS_HISTS)
    # Boundaries changed for v3, but could be a good idea to settle on a standard
    hist_dict['MQ'] = (20, 60, 40)
    hist_ranges_expr = get_annotations_hists(ht, hist_dict)

    # NOTE: run the following code in a first pass to determine bounds for metrics
    # Evaluate minimum and maximum values for each metric of interest
    # This doesn't need to be run unless the defaults do not result in nice-looking histograms.
    if args.first_pass:
        minmax_dict = {}
        for metric in hist_ranges_expr:
            # The metrics are fields of ht.info (that is where get_annotations_hists
            # reads them from), not top-level fields of ht.
            metric_expr = ht.info[metric]
            minmax_dict[metric] = hl.struct(
                min=hl.agg.min(metric_expr),
                max=hl.if_else(hl.agg.max(metric_expr) < 1e10, hl.agg.max(metric_expr), 1e10),
            )
        minmax = ht.aggregate(hl.struct(**minmax_dict))
        print(minmax)
    else:
        # Aggregate hists over hand-tooled ranges
        metric_hists = hl.array([
            hist_expr.annotate(metric=hist_metric)
            for hist_metric, hist_expr in hist_ranges_expr.items()
        ])
        # One log10(QUALapprox) histogram per frequency bin, labelled with the bin name
        qual_hists_by_bin = hl.array(
            hl.agg.group_by(
                create_frequency_bins_expr(AC=ht.freq[1].AC, AF=ht.freq[1].AF),
                hl.agg.hist(hl.log10(ht.info.QUALapprox), 1, 10, 36),
            )
        ).map(lambda x: x[1].annotate(metric=x[0]))

        hists = ht.aggregate(metric_hists.extend(qual_hists_by_bin), _localize=False)

        with hl.hadoop_open(qual_hists_json_path(CURRENT_RELEASE), 'w') as f:
            f.write(hl.eval(hl.json(hists)))
def manhattan(pvals, locus=None, title=None, size=4, hover_fields=None, collect_all=False, n_divisions=500,
              significance_line=5e-8):
    """Create a Manhattan plot. (https://en.wikipedia.org/wiki/Manhattan_plot)

    Parameters
    ----------
    pvals : :class:`.Float64Expression`
        P-values to be plotted.
    locus : :class:`.LocusExpression`
        Locus values to be plotted.
    title : str
        Title of the plot.
    size : int
        Size of markers in screen space units.
    hover_fields : Dict[str, :class:`.Expression`]
        Dictionary of field names and values to be shown in the HoverTool of the plot.
        The dictionary passed in is not modified.
    collect_all : bool
        Whether to collect all values or downsample before plotting.
    n_divisions : int
        Factor by which to downsample (default value = 500). A lower input results in fewer output datapoints.
    significance_line : float, optional
        p-value at which to add a horizontal, dotted red line indicating
        genome-wide significance.  If ``None``, no line is added.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure`
    """
    if locus is None:
        locus = pvals._indices.source.locus

    ref = locus.dtype.reference_genome

    # Work on a copy: adding the 'locus' entry must not leak into the caller's dict
    hover_fields = {} if hover_fields is None else dict(hover_fields)
    hover_fields['locus'] = hail.str(locus)

    pvals = -hail.log10(pvals)

    source_pd = _collect_scatter_plot_data(
        ('_global_locus', locus.global_position()),
        ('_pval', pvals),
        fields=hover_fields,
        n_divisions=None if collect_all else n_divisions
    )
    # Recover the linear-scale p-value for display alongside the -log10 value
    source_pd['p_value'] = [10 ** (-p) for p in source_pd['_pval']]
    # The contig is the part of the stringified locus before the first ':'
    source_pd['_contig'] = [locus_str.split(":")[0] for locus_str in source_pd['locus']]

    # Keep the reference genome's contig order, restricted to contigs that have points
    contigs_with_points = set(source_pd['_contig'])
    observed_contigs = [contig for contig in ref.contigs if contig in contigs_with_points]

    # One tick at the midpoint of each observed contig. The reference genome is passed
    # explicitly so the ticks are right when `locus` is not on the default reference.
    contig_ticks = hail.eval([
        hail.locus(contig, ref.lengths[contig] // 2, reference_genome=ref).global_position()
        for contig in observed_contigs
    ])

    # Alternate the first two palette colours across contigs
    color_mapper = CategoricalColorMapper(factors=ref.contigs, palette=palette[:2] * ((len(ref.contigs) + 1) // 2))

    p = figure(title=title, x_axis_label='Chromosome', y_axis_label='P-value (-log10 scale)', width=1000)
    p, _, legend, _, _, _ = _get_scatter_plot_elements(
        p, source_pd, x_col='_global_locus', y_col='_pval',
        label_cols=['_contig'], colors={'_contig': color_mapper},
        size=size
    )
    legend.visible = False
    p.xaxis.ticker = contig_ticks
    p.xaxis.major_label_overrides = dict(zip(contig_ticks, observed_contigs))

    # Hide the internal helper columns (names starting with '_') from the hover tooltips
    hover_tool = p.select_one(HoverTool)
    hover_tool.tooltips = [t for t in hover_tool.tooltips if not t[0].startswith('_')]

    if significance_line is not None:
        p.renderers.append(Span(location=-log10(significance_line),
                                dimension='width',
                                line_color='red',
                                line_dash='dashed',
                                line_width=1.5))
    return p
def manhattan(pvals, locus=None, title=None, size=4, hover_fields=None):
    """Create a Manhattan plot. (https://en.wikipedia.org/wiki/Manhattan_plot)

    Parameters
    ----------
    pvals : :class:`.Float64Expression`
        P-values to be plotted.
    locus : :class:`.LocusExpression`
        Locus values to be plotted.
    title : str
        Title of the plot.
    size : int
        Size of markers in screen space units.
    hover_fields : Dict[str, :class:`.Expression`]
        Dictionary of field names and values to be shown in the HoverTool of the plot.
        The dictionary passed in is not modified.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure`
    """
    from bisect import bisect_right

    pvals = -hail.log10(pvals)

    if locus is None:
        locus = pvals._indices.source.locus

    # Copy the caller's dict: its expressions used to be overwritten in place with the
    # collected Python lists, which broke any later reuse of the same dict.
    hover_exprs = {} if hover_fields is None else dict(hover_fields)
    hover_exprs['locus'] = hail.str(locus)

    res = hail.tuple([locus.global_position(), pvals, hail.struct(**hover_exprs)]).collect()
    source_fields = {key: [point[2][key] for point in res] for key in hover_exprs}

    x = [point[0] for point in res]
    y = [point[1] for point in res]

    ref = locus.dtype.reference_genome

    # Global start offset of every contig, followed by the end point of all contigs
    start_points = []
    total_pos = 0
    for contig in ref.contigs:
        start_points.append(total_pos)
        total_pos += ref.lengths.get(contig)
    start_points.append(total_pos)

    observed_contigs = set()
    label = []
    for element in x:
        # Index of the last contig whose start offset is <= this global position
        contig_index = bisect_right(start_points, element) - 1
        # Alternate between two labels so neighbouring contigs can be told apart
        label.append(str(contig_index % 2))
        observed_contigs.add(ref.contigs[contig_index])

    # Tick position (contig midpoint) and tick label for every contig that has points
    labels = []
    mid_points = []
    for contig, start in zip(ref.contigs, start_points):
        if contig not in observed_contigs:
            continue
        mid = start + ref.lengths.get(contig) / 2
        # NOTE(review): whole-number midpoints are nudged by 0.5; presumably so that
        # tick positions are never integral -- confirm why this is needed.
        if mid % 1 == 0:
            mid += 0.5
        labels.append(contig)
        mid_points.append(mid)

    p = scatter(x, y, label=label, title=title, xlabel='Chromosome', ylabel='P-value (-log10 scale)',
                size=size, legend=False, source_fields=source_fields)

    p.xaxis.ticker = mid_points
    p.xaxis.major_label_overrides = dict(zip(mid_points, labels))
    p.width = 1000

    tooltips = [(key, "@{}".format(key)) for key in source_fields]
    tooltips.append(('p-value', "$y"))
    p.add_tools(HoverTool(tooltips=tooltips))

    return p
def main(args):
    """
    Compute variant-metric histograms for the release table and write them out as JSON.

    Histogram settings are loaded from the JSON file at ``annotation_hists_path()``.
    With ``args.determine_bounds`` set, only the minimum and maximum (capped at 1e10)
    of each metric are aggregated and logged. Otherwise the metric histograms, the
    ``log10(QUALapprox)`` and ``log10(AS_QUALapprox)`` histograms per frequency bin and
    the ``InbreedingCoeff`` histograms per allele-frequency bin are aggregated and
    written to ``qual_hists_json_path()`` as a single JSON array.

    :param args: Parsed arguments; only ``determine_bounds`` is read
    :raises DataException: If the annotation hists JSON file does not exist
    """
    hl.init(default_reference="GRCh38", log="/variant_histograms.log")

    logger.info("Loading ANNOTATIONS_HISTS dictionary...")
    if not file_exists(annotation_hists_path()):
        raise DataException(
            "Annotation hists JSON file not found. Need to create this JSON before running script!"
        )

    with hl.hadoop_open(annotation_hists_path()) as a:
        ANNOTATIONS_HISTS = json.loads(a.read())

    # NOTE: histogram aggregations on these metrics are done on the entire callset (not just PASS variants), on raw data
    ht = hl.read_table(release_ht_path(public=False))
    ht = ht.select(freq=ht.freq, info=ht.info.select(*ANNOTATIONS_HISTS))

    # Remove InbreedingCoeff from ANNOTATIONS_HISTS. It requires different ranges by allele frequency and needs to be
    # handled differently. It is stored as a dictionary in annotation_hists_path.
    # ANNOTATIONS_HISTS is a dict, so the entry is taken out with pop(); dicts have no remove().
    inbreeding_bin_ranges = ANNOTATIONS_HISTS.pop("InbreedingCoeff")

    logger.info("Getting info annotation histograms...")
    hist_ranges_expr = get_annotations_hists(ht, ANNOTATIONS_HISTS, LOG10_ANNOTATIONS)

    # Evaluate minimum and maximum values for each metric of interest to help determine the bounds of the hists
    # NOTE: Run this first, then update values in annotation_hists_path JSON as necessary
    if args.determine_bounds:
        logger.info(
            "Evaluating minimum and maximum values for each metric of interest. Maximum values capped at 1e10."
        )
        minmax_dict = {}
        for metric in ANNOTATIONS_HISTS:
            minmax_dict[metric] = hl.struct(
                min=hl.agg.min(ht.info[metric]),
                max=hl.if_else(
                    hl.agg.max(ht.info[metric]) < 1e10,
                    hl.agg.max(ht.info[metric]),
                    1e10,
                ),
            )
        minmax = ht.aggregate(hl.struct(**minmax_dict))
        logger.info(f"Metrics bounds: {minmax}")
    else:
        logger.info(
            "Aggregating hists over ranges defined in the annotation_hists_path JSON file. --determine_bounds can "
            "be used to help define these ranges..."
        )
        hists = ht.aggregate(
            hl.array(
                [
                    hist_expr.annotate(metric=hist_metric)
                    for hist_metric, hist_expr in hist_ranges_expr.items()
                ]
            )
            .extend(
                hl.array(
                    hl.agg.group_by(
                        create_frequency_bins_expr(AC=ht.freq[1].AC, AF=ht.freq[1].AF),
                        hl.agg.hist(
                            hl.log10(ht.info.QUALapprox),
                            *ANNOTATIONS_HISTS["QUALapprox"],
                        ),
                    )
                ).map(lambda x: x[1].annotate(metric="QUALapprox-" + x[0]))
            )
            .extend(
                hl.array(
                    hl.agg.group_by(
                        create_frequency_bins_expr(AC=ht.freq[1].AC, AF=ht.freq[1].AF),
                        hl.agg.hist(
                            hl.log10(ht.info.AS_QUALapprox),
                            *ANNOTATIONS_HISTS["AS_QUALapprox"],
                        ),
                    )
                ).map(lambda x: x[1].annotate(metric="AS_QUALapprox-" + x[0]))
            ),
            _localize=False,
        )

        # Defining hist range and bins for allele frequency groups because they needed different ranges
        ht = ht.annotate(af_bin=create_frequency_bins_expr_inbreeding(AF=ht.freq[1].AF))
        inbreeding_hists = [
            ht.aggregate(
                hl.agg.filter(
                    ht.af_bin == x,
                    hl.agg.hist(ht.info.InbreedingCoeff, *inbreeding_bin_ranges[x]),
                )
            ).annotate(metric="InbreedingCoeff" + "-" + x)
            for x in inbreeding_bin_ranges
        ]

        hists = hl.eval(hl.json(hists))
        inbreeding_hists = hl.eval(hl.json(inbreeding_hists))

        # Note: both values are JSON arrays. Drop the closing bracket of `hists` and the opening
        # bracket of `inbreeding_hists`, then join them so a single JSON array is written out.
        hists = hists[:-1] + "," + inbreeding_hists[1:]

        logger.info("Writing output")
        with hl.hadoop_open(qual_hists_json_path(), "w") as f:
            f.write(hists)
def manhattan(pvals, locus=None, title=None, size=4, hover_fields=None, collect_all=False, n_divisions=500):
    """Create a Manhattan plot. (https://en.wikipedia.org/wiki/Manhattan_plot)

    Parameters
    ----------
    pvals : :class:`.Float64Expression`
        P-values to be plotted.
    locus : :class:`.LocusExpression`
        Locus values to be plotted.
    title : str
        Title of the plot.
    size : int
        Size of markers in screen space units.
    hover_fields : Dict[str, :class:`.Expression`]
        Dictionary of field names and values to be shown in the HoverTool of the plot.
        The dictionary passed in is not modified.
    collect_all : bool
        Whether to collect all values or downsample before plotting.
    n_divisions : int
        Factor by which to downsample (default value = 500). A lower input results in fewer output datapoints.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure`
    """
    from bisect import bisect_right

    if locus is None:
        locus = pvals._indices.source.locus

    # Copy the caller's dict: its expressions used to be overwritten in place with the
    # collected Python lists, which broke any later reuse of the same dict.
    hover_exprs = {} if hover_fields is None else dict(hover_fields)
    hover_exprs['locus'] = hail.str(locus)

    pvals = -hail.log10(pvals)

    if collect_all:
        res = hail.tuple([locus.global_position(), pvals, hail.struct(**hover_exprs)]).collect()
        source_fields = {key: [point[2][key] for point in res] for key in hover_exprs}
    else:
        agg_f = pvals._aggregation_method()
        # Hover values travel through the downsampler as an array of strings, in dict order
        res = agg_f(aggregators.downsample(locus.global_position(),
                                           pvals,
                                           label=hail.array([hail.str(expr) for expr in hover_exprs.values()]),
                                           n_divisions=n_divisions))
        source_fields = {key: [point[2][idx] for point in res] for idx, key in enumerate(hover_exprs)}

    x = [point[0] for point in res]
    y = [point[1] for point in res]
    # Recover the linear-scale p-value for display in the hover tool
    source_fields['p_value'] = [10 ** (-p) for p in y]

    ref = locus.dtype.reference_genome

    # Global start offset of every contig, followed by the end point of all contigs
    start_points = []
    total_pos = 0
    for contig in ref.contigs:
        start_points.append(total_pos)
        total_pos += ref.lengths.get(contig)
    start_points.append(total_pos)

    observed_contigs = set()
    label = []
    for element in x:
        # Index of the last contig whose start offset is <= this global position
        contig_index = bisect_right(start_points, element) - 1
        # Alternate between two labels so neighbouring contigs can be told apart
        label.append(str(contig_index % 2))
        observed_contigs.add(ref.contigs[contig_index])

    # Tick position (contig midpoint) and tick label for every contig that has points
    labels = []
    mid_points = []
    for contig, start in zip(ref.contigs, start_points):
        if contig not in observed_contigs:
            continue
        mid = start + ref.lengths.get(contig) / 2
        # NOTE(review): whole-number midpoints are nudged by 0.5; presumably so that
        # tick positions are never integral -- confirm why this is needed.
        if mid % 1 == 0:
            mid += 0.5
        labels.append(contig)
        mid_points.append(mid)

    p = scatter(x, y, label=label, title=title, xlabel='Chromosome', ylabel='P-value (-log10 scale)',
                size=size, legend=False, source_fields=source_fields)

    p.xaxis.ticker = mid_points
    p.xaxis.major_label_overrides = dict(zip(mid_points, labels))
    p.width = 1000

    tooltips = [(key, "@{}".format(key)) for key in source_fields]
    p.add_tools(HoverTool(
        tooltips=tooltips
    ))

    return p
#aggregation per TSS distance, eQTL p value, and MAC in GTEx ems = hl.read_table( "gs://qingbowang/ems_v1_test/ems_pcausal_gtexvg_all{0}.ht".format( tissue_name)) vg = hl.read_table( "gs://qingbowang/ems_v1_test/{0}_allpairs.ht".format(tissue_name)) vg = vg.annotate(vg=vg.variant_id + "_" + vg.gene_id) vg = vg.key_by("vg") ems = ems.join(vg, how="left") ems = ems.annotate(conf_gain_log10_bin=hl.ceil(ems.confidence_gain_log10)) #tss dist bin ems = ems.annotate( tss_dist_bin_unsigned=hl.ceil(hl.log10(hl.abs(ems.tss_distance)))) ems = ems.transmute( tss_dist_bin=hl.cond(ems.tss_distance > 0, ems.tss_dist_bin_unsigned, ems.tss_dist_bin_unsigned * -1)) agged = ems.group_by("tss_dist_bin", "conf_gain_log10_bin").aggregate(n=hl.agg.count()) agged.export("gs://qingbowang/ems_v1_test/tmp/{0}_tssdist_vs_EMS.tsv".format( tissue_name)) #p value ems = ems.annotate( pval_bin=hl.case().when(ems.pval_nominal < 5 * 10**-8, -1).when( ems.pval_nominal > 0.05, 1).default(0)) agged = ems.group_by("pval_bin", "conf_gain_log10_bin").aggregate(n=hl.agg.count()) agged.export(
def manhattan(pvals, locus=None, title=None, size=4, hover_fields=None, collect_all=False, n_divisions=500,
              significance_line=5e-8):
    """Create a Manhattan plot. (https://en.wikipedia.org/wiki/Manhattan_plot)

    Parameters
    ----------
    pvals : :class:`.Float64Expression`
        P-values to be plotted.
    locus : :class:`.LocusExpression`
        Locus values to be plotted.
    title : str
        Title of the plot.
    size : int
        Size of markers in screen space units.
    hover_fields : Dict[str, :class:`.Expression`]
        Dictionary of field names and values to be shown in the HoverTool of the plot.
        The dictionary passed in is not modified.
    collect_all : bool
        Whether to collect all values or downsample before plotting.
    n_divisions : int
        Factor by which to downsample (default value = 500). A lower input results in fewer output datapoints.
    significance_line : float, optional
        p-value at which to add a horizontal, dotted red line indicating
        genome-wide significance.  If ``None``, no line is added.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure`
    """
    from bisect import bisect_right

    if locus is None:
        locus = pvals._indices.source.locus

    # Copy the caller's dict: its expressions used to be overwritten in place with the
    # collected Python lists, which broke any later reuse of the same dict.
    hover_exprs = {} if hover_fields is None else dict(hover_fields)
    hover_exprs['locus'] = hail.str(locus)

    pvals = -hail.log10(pvals)

    if collect_all:
        res = hail.tuple([locus.global_position(), pvals, hail.struct(**hover_exprs)]).collect()
        source_fields = {key: [point[2][key] for point in res] for key in hover_exprs}
    else:
        agg_f = pvals._aggregation_method()
        # Hover values travel through the downsampler as an array of strings, in dict order
        res = agg_f(
            aggregators.downsample(
                locus.global_position(),
                pvals,
                label=hail.array([hail.str(expr) for expr in hover_exprs.values()]),
                n_divisions=n_divisions))
        source_fields = {key: [point[2][idx] for point in res] for idx, key in enumerate(hover_exprs)}

    x = [point[0] for point in res]
    y = [point[1] for point in res]
    # Recover the linear-scale p-value for display in the hover tool
    source_fields['p_value'] = [10**(-p) for p in y]

    ref = locus.dtype.reference_genome

    # Global start offset of every contig, followed by the end point of all contigs
    start_points = []
    total_pos = 0
    for contig in ref.contigs:
        start_points.append(total_pos)
        total_pos += ref.lengths.get(contig)
    start_points.append(total_pos)

    observed_contigs = set()
    label = []
    for element in x:
        # Index of the last contig whose start offset is <= this global position
        contig_index = bisect_right(start_points, element) - 1
        # Alternate between two labels so neighbouring contigs can be told apart
        label.append(str(contig_index % 2))
        observed_contigs.add(ref.contigs[contig_index])

    # Tick position (contig midpoint) and tick label for every contig that has points
    labels = []
    mid_points = []
    for contig, start in zip(ref.contigs, start_points):
        if contig not in observed_contigs:
            continue
        mid = start + ref.lengths.get(contig) / 2
        # NOTE(review): whole-number midpoints are nudged by 0.5; presumably so that
        # tick positions are never integral -- confirm why this is needed.
        if mid % 1 == 0:
            mid += 0.5
        labels.append(contig)
        mid_points.append(mid)

    p = scatter(x,
                y,
                label=label,
                title=title,
                xlabel='Chromosome',
                ylabel='P-value (-log10 scale)',
                size=size,
                legend=False,
                source_fields=source_fields)

    p.xaxis.ticker = mid_points
    p.xaxis.major_label_overrides = dict(zip(mid_points, labels))
    p.width = 1000

    tooltips = [(key, "@{}".format(key)) for key in source_fields]
    p.add_tools(HoverTool(tooltips=tooltips))

    if significance_line is not None:
        p.renderers.append(
            Span(location=-log10(significance_line),
                 dimension='width',
                 line_color='red',
                 line_dash='dashed',
                 line_width=1.5))

    return p
def manhattan(pvals, locus=None, title=None, size=4, hover_fields=None, collect_all=False, n_divisions=500,
              significance_line=5e-8):
    """Create a Manhattan plot. (https://en.wikipedia.org/wiki/Manhattan_plot)

    Parameters
    ----------
    pvals : :class:`.Float64Expression`
        P-values to be plotted.
    locus : :class:`.LocusExpression`
        Locus values to be plotted.
    title : str
        Title of the plot.
    size : int
        Size of markers in screen space units.
    hover_fields : Dict[str, :class:`.Expression`]
        Dictionary of field names and values to be shown in the HoverTool of the plot.
        The dictionary passed in is not modified.
    collect_all : bool
        Whether to collect all values or downsample before plotting.
    n_divisions : int
        Factor by which to downsample (default value = 500). A lower input results in fewer output datapoints.
    significance_line : float, optional
        p-value at which to add a horizontal, dotted red line indicating
        genome-wide significance.  If ``None``, no line is added.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure`
    """
    if locus is None:
        locus = pvals._indices.source.locus

    ref = locus.dtype.reference_genome

    # Work on a copy: adding the 'locus' entry must not leak into the caller's dict
    hover_fields = {} if hover_fields is None else dict(hover_fields)
    hover_fields['locus'] = hail.str(locus)

    pvals = -hail.log10(pvals)

    source_pd = _collect_scatter_plot_data(
        ('_global_locus', locus.global_position()),
        ('_pval', pvals),
        fields=hover_fields,
        n_divisions=None if collect_all else n_divisions
    )
    # Recover the linear-scale p-value for display alongside the -log10 value
    source_pd['p_value'] = [10 ** (-p) for p in source_pd['_pval']]
    # The contig is the part of the stringified locus before the first ':'
    source_pd['_contig'] = [locus_str.split(":")[0] for locus_str in source_pd['locus']]

    # Keep the reference genome's contig order, restricted to contigs that have points
    contigs_with_points = set(source_pd['_contig'])
    observed_contigs = [contig for contig in ref.contigs if contig in contigs_with_points]

    # One tick at the midpoint of each observed contig. The reference genome is passed
    # explicitly so the ticks are right when `locus` is not on the default reference.
    contig_ticks = hail.eval([
        hail.locus(contig, ref.lengths[contig] // 2, reference_genome=ref).global_position()
        for contig in observed_contigs
    ])

    # Alternate the first two palette colours across contigs
    color_mapper = CategoricalColorMapper(factors=ref.contigs, palette=palette[:2] * ((len(ref.contigs) + 1) // 2))

    p = figure(title=title, x_axis_label='Chromosome', y_axis_label='P-value (-log10 scale)', width=1000)
    p, _, legend, _, _, _ = _get_scatter_plot_elements(
        p, source_pd, x_col='_global_locus', y_col='_pval',
        label_cols=['_contig'], colors={'_contig': color_mapper},
        size=size
    )
    legend.visible = False
    p.xaxis.ticker = contig_ticks
    p.xaxis.major_label_overrides = dict(zip(contig_ticks, observed_contigs))

    # Hide the internal helper columns (names starting with '_') from the hover tooltips
    hover_tool = p.select_one(HoverTool)
    hover_tool.tooltips = [t for t in hover_tool.tooltips if not t[0].startswith('_')]

    if significance_line is not None:
        p.renderers.append(Span(location=-log10(significance_line),
                                dimension='width',
                                line_color='red',
                                line_dash='dashed',
                                line_width=1.5))
    return p
# Filtering to X raw chromosome data mt_x_list = [mt.filter_rows(mt.locus.contig == 'X') for mt in mt_list] # From the X data, splitting by sex female_list = [mt.filter_cols(mt.reported_sex == 'F') for mt in mt_x_list] male_list = [mt.filter_cols(mt.reported_sex == 'M') for mt in mt_x_list] # Annotating the female matrix tables with variant QC data female_list = [hl.variant_qc(mt, name='variant_qc') for mt in female_list] # Annotating the male matrix tables with variant QC data male_list = [hl.variant_qc(mt, name='variant_qc') for mt in male_list] # Annotating the female matrix tables with -log10 hwe pval data female_list = [ mt.annotate_rows(log10_hwe_pval=-hl.log10(mt.variant_qc.p_value_hwe)) for mt in female_list ] # Annotating the male matrix tables with -log10 hwe pval data male_list = [ mt.annotate_rows(log10_hwe_pval=-hl.log10(mt.variant_qc.p_value_hwe)) for mt in male_list ] ''' Plotting female hwe pval distributions to understand X chrom QC MOP = Kenya Moi AAP = Ethiopia KWP = Kenya Kemri CTP = South Africa