def summarize(mt): """Computes summary statistics Note ---- You will not be able to run :func:`.combine_gvcfs` with the output of this function. """ mt = hl.experimental.densify(mt) return mt.annotate_rows(info=hl.rbind( hl.agg.call_stats(lgt_to_gt(mt.LGT, mt.LA), mt.alleles), lambda gs: hl.struct( # here, we alphabetize the INFO fields by GATK convention AC=gs.AC[ 1: ], # The VCF spec indicates that AC and AF have Number=A, so we need AF=gs.AF[1:], # to drop the first element from each of these. AN=gs.AN, BaseQRankSum=hl.median(hl.agg.collect(mt.entry.BaseQRankSum)), ClippingRankSum=hl.median(hl.agg.collect(mt.entry.ClippingRankSum) ), DP=hl.agg.sum(mt.entry.DP), MQ=hl.median(hl.agg.collect(mt.entry.MQ)), MQRankSum=hl.median(hl.agg.collect(mt.entry.MQRankSum)), MQ_DP=mt.info.MQ_DP, QUALapprox=mt.info.QUALapprox, RAW_MQ=mt.info.RAW_MQ, ReadPosRankSum=hl.median(hl.agg.collect(mt.entry.ReadPosRankSum)), SB_TABLE=mt.info.SB_TABLE, VarDP=mt.info.VarDP, )))
def summarize(mt): """Computes summary statistics Calls :func:`.quick_summary`. Calling both this and :func:`.quick_summary`, will lead to :func:`.quick_summary` being executed twice. Note ---- You will not be able to run :func:`.combine_gvcfs` with the output of this function. """ mt = quick_summary(mt) mt = hl.experimental.densify(mt) return mt.annotate_rows(info=hl.rbind( hl.agg.call_stats(lgt_to_gt(mt.LGT, mt.LA), mt.alleles), lambda gs: hl.struct( # here, we alphabetize the INFO fields by GATK convention AC=gs.AC[1:], # The VCF spec indicates that AC and AF have Number=A, so we need AF=gs.AF[1:], # to drop the first element from each of these. AN=gs.AN, BaseQRankSum=hl.median(hl.agg.collect(mt.entry.gvcf_info.BaseQRankSum)), ClippingRankSum=hl.median(hl.agg.collect(mt.entry.gvcf_info.ClippingRankSum)), DP=hl.agg.sum(mt.entry.DP), MQ=hl.median(hl.agg.collect(mt.entry.gvcf_info.MQ)), MQRankSum=hl.median(hl.agg.collect(mt.entry.gvcf_info.MQRankSum)), MQ_DP=mt.info.MQ_DP, QUALapprox=mt.info.QUALapprox, RAW_MQ=mt.info.RAW_MQ, ReadPosRankSum=hl.median(hl.agg.collect(mt.entry.gvcf_info.ReadPosRankSum)), SB_TABLE=mt.info.SB_TABLE, VarDP=mt.info.VarDP, )))
def get_metric_expr(ht, metric):
    """Build a struct of median/MAD-based outlier bounds for one QC metric.

    Returns a struct expression with fields ``median``, ``mad``, ``upper``
    and ``lower``. The bounds are median +/- 4 * MAD, except for the
    'callrate' metric, which gets fixed cutoffs of 1 and 0.99.
    """
    values = hl.agg.collect(ht[metric])
    med = hl.median(values)
    # 1.4826 scales the MAD so it estimates the standard deviation of
    # normally distributed data.
    mad = 1.4826 * hl.median(hl.abs(values - med))
    # `metric` is a plain Python string, so this branch resolves at
    # expression-construction time, not per-row.
    if metric != 'callrate':
        upper = med + 4 * mad
        lower = med - 4 * mad
    else:
        upper = 1
        lower = 0.99
    return hl.struct(median=med, mad=mad, upper=upper, lower=lower)
def summarize(mt):
    """Recompute row-level INFO aggregates over a densified matrix table."""
    def entry_median(field):
        # Median of one entry field across all entries of the row.
        return hl.median(hl.agg.collect(field))

    dense = densify(mt)
    recomputed = dense.info.annotate(
        # some DPs may have been missing during earlier combining operations
        DP=hl.agg.sum(dense.entry.DP),
        BaseQRankSum=entry_median(dense.entry.BaseQRankSum),
        ClippingRankSum=entry_median(dense.entry.ClippingRankSum),
        MQ=entry_median(dense.entry.MQ),
        MQRankSum=entry_median(dense.entry.MQRankSum),
        ReadPosRankSum=entry_median(dense.entry.ReadPosRankSum),
    )
    return dense.annotate_rows(info=recomputed)
def get_median_and_mad_expr(
    metric_expr: hl.expr.ArrayNumericExpression, k: float = 1.4826
) -> hl.expr.StructExpression:
    """
    Computes the median and median absolute deviation (MAD) for the given expression.
    Note that the default value of k assumes normally distributed data.

    :param metric_expr: Expression to compute median and MAD for
    :param k: The scaling factor for MAD calculation. Default assumes normally distributed data.
    :return: Struct with median and MAD
    """
    def make_struct(pair):
        # pair[0] is the collected array, pair[1] its median; binding the
        # pair once avoids re-evaluating the aggregation per use.
        values = pair[0]
        med = pair[1]
        return hl.struct(median=med, mad=k * hl.median(hl.abs(values - med)))

    collected = hl.agg.collect(metric_expr)
    return hl.bind(
        make_struct,
        hl.bind(lambda v: hl.tuple([v, hl.median(v)]), collected),
    )
def gnomad_coverage_stats(mt_path):
    """Compute per-row coverage statistics (mean, median, and fraction of
    samples over fixed depth thresholds) and force evaluation of the rows."""
    mt = hl.read_matrix_table(mt_path)

    def get_coverage_expr(mt):
        # For each depth x in 5, 10, ..., 95, an indicator array over the
        # thresholds [1, 5, 10, 15, 20, 25, 30, 50, 100]: a 1 marks each
        # threshold that x meets. x >= 50 meets the first 8 thresholds,
        # x >= 30 the first 7, otherwise the first i + 2 (thresholds 1
        # through x itself).
        cov_arrays = hl.literal({
            x:
            [1, 1, 1, 1, 1, 1, 1, 1, 0] if x >= 50 else
            [1, 1, 1, 1, 1, 1, 1, 0, 0] if x >= 30 else
            ([1] * (i + 2)) + ([0] * (7 - i))
            for i, x in enumerate(range(5, 100, 5))
        })
        return hl.bind(
            # Unpack the summed indicator array into one `over_<x>` field
            # per threshold.
            lambda array_expr: hl.struct(
                **{
                    f'over_{x}': hl.int32(array_expr[i])
                    for i, x in enumerate([1, 5, 10, 15, 20, 25, 30, 50, 100])
                }),
            # Sum the per-entry indicator arrays element-wise; entries with
            # depth >= 5 are rounded down to the nearest multiple of 5 to
            # look up the precomputed indicator array.
            hl.agg.array_sum(hl.case().when(
                mt.x >= 100, [1, 1, 1, 1, 1, 1, 1, 1, 1]).when(
                mt.x >= 5, cov_arrays[mt.x - (mt.x % 5)]).when(
                mt.x >= 1, [1, 0, 0, 0, 0, 0, 0, 0, 0]).default(
                [0, 0, 0, 0, 0, 0, 0, 0, 0])))

    mt = mt.annotate_rows(mean=hl.agg.mean(mt.x),
                          median=hl.median(hl.agg.collect(mt.x)),
                          **get_coverage_expr(mt))
    # Force the computation without writing anything out.
    mt.rows()._force_count()
def f3stats(ht):
    """Aggregate the count of defined 'feature3' values and their median."""
    summary_expr = hl.struct(
        n=hl.agg.count_where(hl.is_defined(ht["feature3"])),
        med=hl.median(hl.agg.collect(ht["feature3"])),
    )
    return ht.aggregate(summary_expr)
def summarize(mt):
    """Densify the matrix table and recompute row-level INFO statistics."""
    def entry_median(field):
        # Median of one entry field across all entries of the row.
        return hl.median(hl.agg.collect(field))

    dense = hl.experimental.densify(mt)
    # Translate local alleles to global genotype calls for call_stats.
    global_gt = lgt_to_gt(dense.LGT, dense.LA)
    return dense.annotate_rows(info=hl.rbind(
        hl.agg.call_stats(global_gt, dense.alleles),
        lambda gs: hl.struct(
            # here, we alphabetize the INFO fields by GATK convention
            AC=gs.AC,
            AF=gs.AF,
            AN=gs.AN,
            BaseQRankSum=entry_median(dense.entry.BaseQRankSum),
            ClippingRankSum=entry_median(dense.entry.ClippingRankSum),
            DP=hl.agg.sum(dense.entry.DP),
            MQ=entry_median(dense.entry.MQ),
            MQRankSum=entry_median(dense.entry.MQRankSum),
            # Passed through unchanged from the existing row-level INFO.
            MQ_DP=dense.info.MQ_DP,
            QUALapprox=dense.info.QUALapprox,
            RAW_MQ=dense.info.RAW_MQ,
            ReadPosRankSum=entry_median(dense.entry.ReadPosRankSum),
            SB_TABLE=dense.info.SB_TABLE,
            VarDP=dense.info.VarDP,
        )))
def get_gtex_summary(gtex_rsem_path, gtex_tx_summary_out_path, get_medians=True):
    """
    Get GTEx RSEM table with ENSTs and ENSGs as rows and GTEx samples as
    columns (e.g. Muscle-Skeletal.12, Adipose.27 etc.) and write out a table
    with same rows, and tissues as columns (Muscle-Skeletal, Adipose etc.)
    with cells representing summary expression of transcripts across tissues
    (ie. mean or median).

    :param str gtex_rsem_path: Output of RSEM quantifications from GTEx
        Example: "gs://gnomad-berylc/reheadered.GTEx_Analysis_2016-09-07_RSEMv1.2.22_transcript_tpm.txt.bgz"
    :param str gtex_tx_summary_out_path: Path to write out.
        Example: "gs://gnomad-berylc/tx-annotation/hail2/GTEx.V7.tx_medians.030818.mt"
    :param bool get_medians: Default True. If False, returns mean transcript expression per tissue
    :return: Writes out summarized GTEx transcript expression as Table.
    :rtype: None
    """
    gtex = hl.import_matrix_table(gtex_rsem_path,
                                  row_key='transcript_id',
                                  row_fields={
                                      'transcript_id': hl.tstr,
                                      'gene_id': hl.tstr
                                  },
                                  entry_type=hl.tfloat64)

    # Column IDs look like "Muscle-Skeletal.12"; strip the sample index to
    # recover the tissue name.
    gtex = gtex.annotate_cols(tissue=gtex.col_id.split("\\.")[0])

    if get_medians:
        summary_field = 'median_tx_expr'
        gtex = gtex.group_cols_by(gtex.tissue).aggregate(
            median_tx_expr=hl.median(agg.collect(gtex.x)))
    else:
        summary_field = 'mean_tx_expr'
        gtex = gtex.group_cols_by(
            gtex.tissue).aggregate(mean_tx_expr=hl.mean(agg.collect(gtex.x)))

    # Make a new column as an array of the values across tissues (per transcript).
    # Bug fix: this previously always collected `median_tx_expr`, which does not
    # exist when get_medians=False (that branch creates `mean_tx_expr`), so the
    # mean path failed. Collect whichever summary field was actually created.
    gtex = gtex.annotate_rows(agg_expression=agg.collect(gtex[summary_field]))

    # Modify the gtex table to remove version numbers (e.g. "ENST0001.12" -> "ENST0001")
    gtex = gtex.annotate_rows(transcript_id=gtex.transcript_id.split("\\.")[0])
    gtex = gtex.annotate_rows(gene_id=gtex.gene_id.split("\\.")[0])

    gtex.write(gtex_tx_summary_out_path, overwrite=True)
def main(args):
    """Combine per-sample base-level coverage TSVs into one matrix table and
    write out coverage summaries (mean, median, fraction over 100x/1000x).

    :param args: Parsed CLI args providing input_tsv (two-column TSV of
        sample name and per-base coverage file path), output_ht (output
        Hail Table path, also used to derive .mt/.tsv/sample-level paths),
        chunk_size, and overwrite.
    """
    input_tsv = args.input_tsv
    output_ht = args.output_ht
    chunk_size = args.chunk_size
    overwrite = args.overwrite

    mt_list = []
    logger.info(
        "Reading in individual coverage files as matrix tables and adding to a list of matrix tables..."
    )
    with open(input_tsv, "r") as f:
        for line in f:
            line = line.rstrip()
            items = line.split("\t")
            # First two columns: sample name, path to its coverage file.
            sample, base_level_coverage_metrics = items[0:2]
            mt = hl.import_matrix_table(
                base_level_coverage_metrics,
                delimiter="\t",
                row_fields={
                    "chrom": hl.tstr,
                    "pos": hl.tint,
                    "target": hl.tstr
                },
                row_key=["chrom", "pos"],
            ).drop("target")
            mt = mt.rename({"x": "coverage"})
            mt = mt.key_cols_by(s=sample)
            mt_list.append(mt)

    logger.info("Joining individual coverage mts...")
    out_dir = dirname(output_ht)
    temp_out_dir = out_dir + "/temp"
    cov_mt = multi_way_union_mts(mt_list, temp_out_dir, chunk_size)
    n_samples = cov_mt.count_cols()

    logger.info("Adding coverage annotations...")
    cov_mt = cov_mt.annotate_rows(
        locus=hl.locus(cov_mt.chrom, cov_mt.pos, reference_genome="GRCh38"),
        mean=hl.float(hl.agg.mean(cov_mt.coverage)),
        median=hl.median(hl.agg.collect(cov_mt.coverage)),
        over_100=hl.float(
            (hl.agg.count_where(cov_mt.coverage > 100) / n_samples)),
        over_1000=hl.float(
            (hl.agg.count_where(cov_mt.coverage > 1000) / n_samples)),
    )
    cov_mt.show()
    cov_mt = cov_mt.key_rows_by("locus").drop("chrom", "pos")

    # Derive sibling output paths from the .ht path. Raw strings so that
    # "\." is a regex escape, not an (invalid) string escape.
    output_mt = re.sub(r"\.ht$", ".mt", output_ht)
    output_tsv = re.sub(r"\.ht$", ".tsv", output_ht)
    output_samples = re.sub(r"\.ht$", "_sample_level.txt", output_ht)

    logger.info("Writing sample level coverage...")
    sample_mt = cov_mt.key_rows_by(pos=cov_mt.locus.position)
    sample_mt.coverage.export(output_samples)

    logger.info("Writing coverage mt and ht...")
    cov_mt.write(output_mt, overwrite=overwrite)
    cov_ht = cov_mt.rows()
    cov_ht = cov_ht.checkpoint(output_ht, overwrite=overwrite)
    cov_ht.export(output_tsv)