Example No. 1
def summarize(mt):
    """Computes summary statistics

    Note
    ----
    You will not be able to run :func:`.combine_gvcfs` with the output of this
    function.
    """
    mt = hl.experimental.densify(mt)
    return mt.annotate_rows(info=hl.rbind(
        hl.agg.call_stats(lgt_to_gt(mt.LGT, mt.LA), mt.alleles),
        lambda gs: hl.struct(
            # here, we alphabetize the INFO fields by GATK convention
            AC=gs.AC[1:],  # The VCF spec indicates that AC and AF have Number=A, so we need
            AF=gs.AF[1:],  # to drop the first element from each of these.
            AN=gs.AN,
            BaseQRankSum=hl.median(hl.agg.collect(mt.entry.BaseQRankSum)),
            ClippingRankSum=hl.median(hl.agg.collect(mt.entry.ClippingRankSum)),
            DP=hl.agg.sum(mt.entry.DP),
            MQ=hl.median(hl.agg.collect(mt.entry.MQ)),
            MQRankSum=hl.median(hl.agg.collect(mt.entry.MQRankSum)),
            MQ_DP=mt.info.MQ_DP,
            QUALapprox=mt.info.QUALapprox,
            RAW_MQ=mt.info.RAW_MQ,
            ReadPosRankSum=hl.median(hl.agg.collect(mt.entry.ReadPosRankSum)),
            SB_TABLE=mt.info.SB_TABLE,
            VarDP=mt.info.VarDP,
        )))
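The snippet above expects a sparse MatrixTable from Hail's gVCF combiner, with LGT/LA entry fields and the raw gVCF INFO annotations, plus an `lgt_to_gt` helper in scope. A minimal usage sketch follows; the path and the diploid-only stand-in for `lgt_to_gt` are assumptions, not part of the original:

import hail as hl


def lgt_to_gt(lgt, la):
    # Diploid-only stand-in: translate local allele indices (LGT) through the
    # local-alleles array (LA) into a call over the global alleles.
    return hl.call(la[lgt[0]], la[lgt[1]], phased=lgt.phased)


mt = hl.read_matrix_table("gs://my-bucket/combined_sparse.mt")  # hypothetical combiner output
summarized = summarize(mt)                # densifies, then recomputes the row-level INFO struct
summarized.rows().select("info").show(5)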
Example No. 2
def summarize(mt):
    """Computes summary statistics

    Calls :func:`.quick_summary`. Calling both this function and :func:`.quick_summary` will
    lead to :func:`.quick_summary` being executed twice.

    Note
    ----
    You will not be able to run :func:`.combine_gvcfs` with the output of this
    function.
    """
    mt = quick_summary(mt)
    mt = hl.experimental.densify(mt)
    return mt.annotate_rows(info=hl.rbind(
        hl.agg.call_stats(lgt_to_gt(mt.LGT, mt.LA), mt.alleles),
        lambda gs: hl.struct(
            # here, we alphabetize the INFO fields by GATK convention
            AC=gs.AC[1:],  # The VCF spec indicates that AC and AF have Number=A, so we need
            AF=gs.AF[1:],  # to drop the first element from each of these.
            AN=gs.AN,
            BaseQRankSum=hl.median(hl.agg.collect(mt.entry.gvcf_info.BaseQRankSum)),
            ClippingRankSum=hl.median(hl.agg.collect(mt.entry.gvcf_info.ClippingRankSum)),
            DP=hl.agg.sum(mt.entry.DP),
            MQ=hl.median(hl.agg.collect(mt.entry.gvcf_info.MQ)),
            MQRankSum=hl.median(hl.agg.collect(mt.entry.gvcf_info.MQRankSum)),
            MQ_DP=mt.info.MQ_DP,
            QUALapprox=mt.info.QUALapprox,
            RAW_MQ=mt.info.RAW_MQ,
            ReadPosRankSum=hl.median(hl.agg.collect(mt.entry.gvcf_info.ReadPosRankSum)),
            SB_TABLE=mt.info.SB_TABLE,
            VarDP=mt.info.VarDP,
        )))
Example No. 3
def get_metric_expr(ht, metric):
    metric_values = hl.agg.collect(ht[metric])
    metric_median = hl.median(metric_values)
    metric_mad = 1.4826 * hl.median(hl.abs(metric_values - metric_median))
    return hl.struct(
        median=metric_median,
        mad=metric_mad,
        upper=(metric_median + 4 * metric_mad) if metric != 'callrate' else 1,
        lower=(metric_median - 4 * metric_mad) if metric != 'callrate' else 0.99,
    )
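get_metric_expr builds an aggregation struct, so it is meant to be evaluated inside Table.aggregate. A minimal sketch, where the toy table and the metric field names are assumptions:

import hail as hl

# Hypothetical sample-QC table with per-sample metrics.
ht = hl.Table.parallelize(
    [{"s": f"S{i}", "callrate": 0.97 + 0.001 * i, "r_ti_tv": 2.0 + 0.01 * i} for i in range(20)],
    schema=hl.tstruct(s=hl.tstr, callrate=hl.tfloat64, r_ti_tv=hl.tfloat64),
    key="s",
)

# Evaluate the MAD-based outlier cutoffs for each metric in a single aggregation pass.
cutoffs = ht.aggregate(
    hl.struct(**{metric: get_metric_expr(ht, metric) for metric in ["callrate", "r_ti_tv"]})
)
print(cutoffs)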
Example No. 4
def summarize(mt):
    mt = densify(mt)
    return mt.annotate_rows(info=mt.info.annotate(
        DP=hl.agg.sum(mt.entry.DP),  # some DPs may have been missing during earlier combining operations
        BaseQRankSum=hl.median(hl.agg.collect(mt.entry.BaseQRankSum)),
        ClippingRankSum=hl.median(hl.agg.collect(mt.entry.ClippingRankSum)),
        MQ=hl.median(hl.agg.collect(mt.entry.MQ)),
        MQRankSum=hl.median(hl.agg.collect(mt.entry.MQRankSum)),
        ReadPosRankSum=hl.median(hl.agg.collect(mt.entry.ReadPosRankSum)),
    ))
Example No. 5
def get_median_and_mad_expr(
    metric_expr: hl.expr.ArrayNumericExpression, k: float = 1.4826
) -> hl.expr.StructExpression:
    """
    Computes the median and median absolute deviation (MAD) for the given expression.
    Note that the default value of k assumes normally distributed data.

    :param metric_expr: Expression to compute median and MAD for
    :param k: The scaling factor for MAD calculation. Default assumes normally distributed data.
    :return: Struct with median and MAD
    """
    return hl.bind(
        lambda x: hl.struct(median=x[1], mad=k * hl.median(hl.abs(x[0] - x[1]))),
        hl.bind(lambda x: hl.tuple([x, hl.median(x)]), hl.agg.collect(metric_expr)),
    )
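As with get_metric_expr in Example No. 3, this helper returns an aggregation expression and is evaluated inside an aggregate call. A minimal sketch on a toy table; the field name n_snp is a placeholder:

import hail as hl

ht = hl.utils.range_table(100)
ht = ht.annotate(n_snp=hl.float64(ht.idx) + 1000.0)

stats = ht.aggregate(get_median_and_mad_expr(ht.n_snp))

# MAD-based outlier bounds in the median +/- 4 * MAD style used elsewhere on this page.
upper = stats.median + 4 * stats.mad
lower = stats.median - 4 * stats.mad
print(stats, upper, lower)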
Example No. 6
def gnomad_coverage_stats(mt_path):
    mt = hl.read_matrix_table(mt_path)

    def get_coverage_expr(mt):
        cov_arrays = hl.literal({
            x: [1, 1, 1, 1, 1, 1, 1, 1, 0]
            if x >= 50 else [1, 1, 1, 1, 1, 1, 1, 0, 0] if x >= 30 else
            ([1] * (i + 2)) + ([0] * (7 - i))
            for i, x in enumerate(range(5, 100, 5))
        })

        return hl.bind(
            lambda array_expr: hl.struct(
                **{
                    f'over_{x}': hl.int32(array_expr[i])
                    for i, x in enumerate([1, 5, 10, 15, 20, 25, 30, 50, 100])
                }),
            hl.agg.array_sum(hl.case().when(
                mt.x >= 100, [1, 1, 1, 1, 1, 1, 1, 1, 1]).when(
                    mt.x >= 5, cov_arrays[mt.x - (mt.x % 5)]).when(
                        mt.x >= 1, [1, 0, 0, 0, 0, 0, 0, 0, 0]).default(
                            [0, 0, 0, 0, 0, 0, 0, 0, 0])))

    mt = mt.annotate_rows(mean=hl.agg.mean(mt.x),
                          median=hl.median(hl.agg.collect(mt.x)),
                          **get_coverage_expr(mt))
    mt.rows()._force_count()
Example No. 7
def f3stats(ht):
    return ht.aggregate(
        hl.struct(
            n=hl.agg.count_where(hl.is_defined(ht["feature3"])),
            med=hl.median(hl.agg.collect(ht["feature3"])),
        )
    )
Example No. 8
def summarize(mt):
    mt = hl.experimental.densify(mt)
    return mt.annotate_rows(info=hl.rbind(
        hl.agg.call_stats(lgt_to_gt(mt.LGT, mt.LA), mt.alleles),
        lambda gs: hl.struct(
            # here, we alphabetize the INFO fields by GATK convention
            AC=gs.AC,
            AF=gs.AF,
            AN=gs.AN,
            BaseQRankSum=hl.median(hl.agg.collect(mt.entry.BaseQRankSum)),
            ClippingRankSum=hl.median(hl.agg.collect(mt.entry.ClippingRankSum)),
            DP=hl.agg.sum(mt.entry.DP),
            MQ=hl.median(hl.agg.collect(mt.entry.MQ)),
            MQRankSum=hl.median(hl.agg.collect(mt.entry.MQRankSum)),
            MQ_DP=mt.info.MQ_DP,
            QUALapprox=mt.info.QUALapprox,
            RAW_MQ=mt.info.RAW_MQ,
            ReadPosRankSum=hl.median(hl.agg.collect(mt.entry.ReadPosRankSum)),
            SB_TABLE=mt.info.SB_TABLE,
            VarDP=mt.info.VarDP,
        )))
Example No. 9
def get_gtex_summary(gtex_rsem_path,
                     gtex_tx_summary_out_path,
                     get_medians=True):
    """
    Get GTEx RSEM table with ENSTs and ENSGs as rows and GTEx samples as columns (e.g. Muscle-Skeletal.12,
    Adipose.27 etc.) and write out a table with same rows, and tissues as columns (Muscle-Skeletal, Adipose etc.)
    with cells representing summary expression of transcripts across tissues (ie. mean or median).

    :param str gtex_rsem_path: Output of RSEM quantifications from GTEx
    Example: "gs://gnomad-berylc/reheadered.GTEx_Analysis_2016-09-07_RSEMv1.2.22_transcript_tpm.txt.bgz"
    :param str gtex_tx_summary_out_path: Path to write out.
    Example: "gs://gnomad-berylc/tx-annotation/hail2/GTEx.V7.tx_medians.030818.mt"
    :param bool get_medians: Default True. If False, returns mean transcript expression per tissue
    :return: Writes out summarized GTEx transcript expression as Table.
    :rtype: None
    """

    gtex = hl.import_matrix_table(gtex_rsem_path,
                                  row_key='transcript_id',
                                  row_fields={
                                      'transcript_id': hl.tstr,
                                      'gene_id': hl.tstr
                                  },
                                  entry_type=hl.tfloat64)

    gtex = gtex.annotate_cols(tissue=gtex.col_id.split("\\.")[0])

    if get_medians:
        gtex = gtex.group_cols_by(gtex.tissue).aggregate(
            median_tx_expr=hl.median(agg.collect(gtex.x)))
    else:
        gtex = gtex.group_cols_by(
            gtex.tissue).aggregate(mean_tx_expr=hl.mean(agg.collect(gtex.x)))

    # Make a new row field holding the per-transcript array of summarized values across tissues,
    # using whichever entry field was created above (median_tx_expr or mean_tx_expr).
    summary_field = "median_tx_expr" if get_medians else "mean_tx_expr"
    gtex = gtex.annotate_rows(agg_expression=agg.collect(gtex[summary_field]))

    # Modify the gtex table to remove version numbers
    gtex = gtex.annotate_rows(transcript_id=gtex.transcript_id.split("\\.")[0])
    gtex = gtex.annotate_rows(gene_id=gtex.gene_id.split("\\.")[0])

    gtex.write(gtex_tx_summary_out_path, overwrite=True)
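A hedged invocation sketch for the function above; both paths are placeholders in the style of the docstring examples, and the bare `agg` the function body references is satisfied here by aliasing hl.agg:

import hail as hl

agg = hl.agg  # the function body uses a bare `agg`

rsem_path = "gs://my-bucket/GTEx_transcript_tpm.txt.bgz"   # hypothetical input
summary_out_path = "gs://my-bucket/GTEx.tx_medians.mt"     # hypothetical output

get_gtex_summary(rsem_path, summary_out_path, get_medians=True)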
Example No. 10
def main(args):
    input_tsv = args.input_tsv
    output_ht = args.output_ht
    chunk_size = args.chunk_size
    overwrite = args.overwrite

    mt_list = []
    logger.info(
        "Reading in individual coverage files as matrix tables and adding to a list of matrix tables..."
    )
    with open(input_tsv, "r") as f:
        #next(f)
        for line in f:
            line = line.rstrip()
            items = line.split("\t")
            sample, base_level_coverage_metrics = items[0:2]
            #print(sample)
            #print(base_level_coverage_metrics)

            mt = hl.import_matrix_table(
                base_level_coverage_metrics,
                delimiter="\t",
                row_fields={
                    "chrom": hl.tstr,
                    "pos": hl.tint,
                    "target": hl.tstr
                },
                row_key=["chrom", "pos"],
            ).drop("target")
            mt = mt.rename({"x": "coverage"})
            mt = mt.key_cols_by(s=sample)
            mt_list.append(mt)

    logger.info("Joining individual coverage mts...")
    out_dir = dirname(output_ht)
    temp_out_dir = out_dir + "/temp"

    cov_mt = multi_way_union_mts(mt_list, temp_out_dir, chunk_size)
    n_samples = cov_mt.count_cols()

    logger.info("Adding coverage annotations...")
    cov_mt = cov_mt.annotate_rows(
        locus=hl.locus(cov_mt.chrom, cov_mt.pos, reference_genome="GRCh38"),
        mean=hl.float(hl.agg.mean(cov_mt.coverage)),
        median=hl.median(hl.agg.collect(cov_mt.coverage)),
        over_100=hl.float(
            (hl.agg.count_where(cov_mt.coverage > 100) / n_samples)),
        over_1000=hl.float(
            (hl.agg.count_where(cov_mt.coverage > 1000) / n_samples)),
    )
    cov_mt.show()

    cov_mt = cov_mt.key_rows_by("locus").drop("chrom", "pos")

    output_mt = re.sub(r"\.ht$", ".mt", output_ht)
    output_tsv = re.sub(r"\.ht$", ".tsv", output_ht)
    output_samples = re.sub(r"\.ht$", "_sample_level.txt", output_ht)

    logger.info("Writing sample level coverage...")
    sample_mt = cov_mt.key_rows_by(pos=cov_mt.locus.position)
    sample_mt.coverage.export(output_samples)

    logger.info("Writing coverage mt and ht...")
    cov_mt.write(output_mt, overwrite=overwrite)
    cov_ht = cov_mt.rows()
    cov_ht = cov_ht.checkpoint(output_ht, overwrite=overwrite)
    cov_ht.export(output_tsv)
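main only reads four attributes from args (input_tsv, output_ht, chunk_size, overwrite), so a minimal driver might look like the sketch below; the flag names mirror those attributes and are otherwise assumptions:

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Combine per-sample base-level coverage files into joint coverage MT/HT/TSV outputs."
    )
    parser.add_argument("--input-tsv", dest="input_tsv", required=True,
                        help="TSV with one line per sample: sample ID, path to base-level coverage file")
    parser.add_argument("--output-ht", dest="output_ht", required=True,
                        help="Path for the output .ht; matching .mt/.tsv/_sample_level.txt paths are derived from it")
    parser.add_argument("--chunk-size", dest="chunk_size", type=int, default=100,
                        help="Number of MatrixTables to union per chunk in multi_way_union_mts (default is a placeholder)")
    parser.add_argument("--overwrite", action="store_true",
                        help="Overwrite existing output files")
    main(parser.parse_args())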