Пример #1
0
def main(args):
    """
    Lift a gnomAD release Table, or a user-supplied Table/MatrixTable, over to the other reference build.

    The input is split if it has no `was_split` row field, optionally restricted to chromosome 21 for
    testing, lifted over with `lift_data`, annotated with `annotate_snp_mismatch`, and the counts
    returned by `check_mismatch` are logged.

    :param args: Parsed arguments. Attributes read here: `gnomad`, `exomes`, `genomes`, `ht`, `mt`,
        `test` and `overwrite`.
    :raises ValueError: If `args.gnomad` is set but neither `args.exomes` nor `args.genomes` is, or if
        `args.gnomad` is unset and neither `args.ht` nor `args.mt` is given.
    """
    # Validate the input selection before starting Hail so that a bad invocation fails fast with a
    # clear message instead of an UnboundLocalError further down.
    if args.gnomad:
        if not (args.exomes or args.genomes):
            raise ValueError(
                'One of args.exomes or args.genomes must be set when args.gnomad is set')
    elif not (args.ht or args.mt):
        raise ValueError(
            'One of args.ht or args.mt must be given when args.gnomad is not set')

    hl.init(log='/liftover.log')

    if args.gnomad:
        gnomad = True
        path = None
        # genomes takes precedence when both flags are set
        data_type = 'genomes' if args.genomes else 'exomes'

        logger.info(f'Working on gnomAD {data_type} release ht')
        logger.info('Reading in release ht')
        t = public_release(data_type).ht()
        logger.info(f'Variants in release ht: {t.count()}')

    else:
        data_type = None
        gnomad = False

        # mt takes precedence when both paths are given
        if args.mt:
            path = args.mt
            t = hl.read_matrix_table(args.mt)
        else:
            path = args.ht
            t = hl.read_table(args.ht)

    logger.info('Checking if input data has been split')
    if 'was_split' not in t.row:
        t = hl.split_multi(t) if isinstance(
            t, hl.Table) else hl.split_multi_hts(t)

    logger.info('Preparing reference genomes for liftover')
    source, target = get_liftover_genome(t)

    if args.test:
        logger.info('Filtering to chr21 for testing')
        # GRCh38 contig names carry a 'chr' prefix; the other build uses the bare number
        contig = 'chr21' if source.name == 'GRCh38' else '21'
        t = hl.filter_intervals(
            t, [hl.parse_locus_interval(contig, reference_genome=source.name)])

    logger.info(f'Lifting data to {target.name}')
    t = lift_data(t, gnomad, data_type, path, target, args.overwrite)

    logger.info('Checking SNPs for reference mismatches')
    t = annotate_snp_mismatch(t, data_type, target)

    # check_mismatch is given a Table: use the rows when the data is a MatrixTable
    mismatch = check_mismatch(t) if isinstance(
        t, hl.Table) else check_mismatch(t.rows())
    logger.info(f"{mismatch['total_variants']} total SNPs")
    logger.info(f"{mismatch['negative_strand']} SNPs on minus strand")
    logger.info(f"{mismatch['total_mismatch']} reference mismatches in SNPs")
    logger.info(
        f"{mismatch['negative_strand_mismatch']} mismatches on minus strand")
def prepare_gnomad_stats(overwrite):
    """
    Write a table comparing per-population frequencies with gnomAD exomes and genomes frequencies.

    Reads the table at `get_ukb_vep_path()`, keeps only the rows that are also rows of the 'full'
    variant results MatrixTable, and annotates:

    - `freq`: one struct per population in `POPS` holding `pop`, `ac` (`af * an`), `af` (`af / 2`),
      `an` (`an * 2`) and `gnomad_<dataset>_<ac|af|an>` looked up through each gnomAD release's
      `freq_index_dict` global.
    - `pass_gnomad_<dataset>`: whether the gnomAD `filters` field is empty for that dataset.

    The result is passed through `naive_coalesce(1000)` and written to the 'gnomad_comparison'
    analysis data path.

    :param overwrite: Passed through to the table write
    """
    datasets = ('exomes', 'genomes')
    metrics = ('AC', 'AF', 'AN')

    ukb_ht = hl.read_table(get_ukb_vep_path())
    gwas_rows = hl.read_matrix_table(get_variant_results_path('full')).rows()
    ukb_ht = ukb_ht.filter(hl.is_defined(gwas_rows[ukb_ht.key]))

    # Per gnomAD dataset: the freq index lookup and the release rows joined on the table key
    freq_index_dicts = {}
    gnomad_rows = {}
    for dataset in datasets:
        release_ht = gnomad.public_release(dataset).ht()
        freq_index_dicts[dataset] = release_ht.index_globals().freq_index_dict
        gnomad_rows[dataset] = release_ht[ukb_ht.key]

    def freq_struct(pop):
        # Build the struct for one population, gnomAD fields ordered dataset-major then metric
        gnomad_pop_key = f'gnomad_{UKB_GNOMAD_POP_MAPPING[pop]}'
        gnomad_fields = {}
        for dataset in datasets:
            pop_idx = freq_index_dicts[dataset].get(gnomad_pop_key)
            for metric in metrics:
                field_name = f'gnomad_{dataset}_{metric.lower()}'
                gnomad_fields[field_name] = gnomad_rows[dataset].freq[pop_idx][metric]
        return hl.struct(pop=pop,
                         ac=ukb_ht.af[pop] * ukb_ht.an[pop],
                         af=ukb_ht.af[pop] / 2,
                         an=ukb_ht.an[pop] * 2,
                         **gnomad_fields)

    freq_structs = [freq_struct(pop) for pop in POPS]
    pass_flags = {}
    for dataset in datasets:
        pass_flags[f'pass_gnomad_{dataset}'] = hl.len(gnomad_rows[dataset].filters) == 0

    annotated_ht = ukb_ht.annotate(freq=freq_structs, **pass_flags)

    output_path = get_analysis_data_path('gnomad_comparison', 'qc', 'full', 'ht')
    annotated_ht.naive_coalesce(1000).write(output_path, overwrite=overwrite)
Пример #3
0
def get_r_within_gene(
    bm: BlockMatrix,
    ld_index: hl.Table,
    gene: str,
    vep_ht: hl.Table = None,
    reference_genome: str = None,
):
    """
    Return the LD value (`r`) for every pair of variants annotated to `gene`.

    Warning: the returned table is quadratic in the number of variants kept. Exercise caution with large genes.

    :param bm: Input Block Matrix
    :param ld_index: Index table for `bm`; its `idx` field gives each variant's row/column in `bm`
    :param gene: Gene symbol as string
    :param vep_ht: Table with a `vep` annotation (if None, the gnomAD exomes public release Table is used)
    :param reference_genome: Reference genome passed to get_gene_intervals for fast filtering to gene
        (if None, the name of Hail's default reference is used)
    :return: Table of matrix entries with `r`, `i_variant` and `j_variant` fields
    """
    vep_ht = public_release("exomes").ht() if vep_ht is None else vep_ht
    if reference_genome is None:
        reference_genome = hl.default_reference().name

    # Cheap interval filter first, then the exact per-transcript gene symbol check
    gene_intervals = hl.experimental.get_gene_intervals(
        gene_symbols=[gene], reference_genome=reference_genome)
    gene_index = hl.filter_intervals(ld_index, gene_intervals)
    gene_index = gene_index.annotate(vep=vep_ht[gene_index.key].vep)
    in_gene = gene_index.vep.transcript_consequences.any(
        lambda tc: tc.gene_symbol == gene)
    gene_index = gene_index.filter(in_gene)

    kept_indices = gene_index.idx.collect()
    entries_ht = bm.filter(kept_indices, kept_indices).entries()

    # Positions in the filtered matrix are looked up through a fresh index over the kept variants
    reindexed = gene_index.add_index("new_idx").key_by("new_idx")
    return entries_ht.transmute(r=entries_ht.entry,
                                i_variant=reindexed[entries_ht.i],
                                j_variant=reindexed[entries_ht.j])
Пример #4
0
def annotate_unphased_pairs(unphased_ht: hl.Table, n_variant_pairs: int,
                            least_consequence: str, max_af: float):
    """
    Build phase information for variant pairs from gnomAD exomes single-variant frequencies.

    Each pair is exploded into its two variants, each variant is annotated from the gnomAD exomes
    release Table (frequencies and VEP genes), and the two rows are aggregated back into one row per
    pair holding genotype counts and the result of `get_em_expr` for both the raw and adj frequencies.
    Pairs are then dropped when a variant has a gnomAD AF above `max_af`, or when the two variants
    do not share a gene according to `vep_genes_expr`.

    :param unphased_ht: Table of variant pairs; must have `locus1`, `alleles1`, `locus2` and `alleles2` fields
    :param n_variant_pairs: Number of variant pairs; only used to size the gnomAD subset
        (one partition per 10,000 pairs, at least one)
    :param least_consequence: Passed to `vep_genes_expr` to select the genes considered for each variant
    :param max_af: Maximum AF of the first gnomAD `freq` entry allowed for each variant of a pair
    :return: Table keyed by `locus1`, `alleles1`, `locus2`, `alleles2` with `pop` and `phase_info` fields
    """
    # Explode variant pairs: one row per variant, keyed by that variant
    unphased_ht = unphased_ht.annotate(las=[
        hl.tuple([unphased_ht.locus1, unphased_ht.alleles1]),
        hl.tuple([unphased_ht.locus2, unphased_ht.alleles2])
    ]).explode('las', name='la')

    unphased_ht = unphased_ht.key_by(
        locus=unphased_ht.la[0], alleles=unphased_ht.la[1]).persist()

    # Annotate single variants with gnomAD freq
    gnomad_ht = gnomad.public_release('exomes').ht()
    # max(1, ...) keeps the partition count valid when n_variant_pairs is 0
    n_partitions = max(1, ceil(n_variant_pairs / 10000))
    gnomad_ht = gnomad_ht.semi_join(unphased_ht).repartition(
        n_partitions, shuffle=True).persist()

    # Frequencies used for variants that are absent from gnomAD
    missing_freq = hl.struct(
        AC=0,
        AF=0,
        AN=125748 * 2,  # set to no missing for now
        homozygote_count=0)

    logger.info(
        f"{gnomad_ht.count()}/{unphased_ht.count()} single variants from the unphased pairs found in gnomAD."
    )

    gnomad_indexed = gnomad_ht[unphased_ht.key]
    gnomad_freq = gnomad_indexed.freq
    unphased_ht = unphased_ht.annotate(
        adj_freq=hl.or_else(gnomad_freq[0], missing_freq),
        raw_freq=hl.or_else(gnomad_freq[1], missing_freq),
        vep_genes=vep_genes_expr(gnomad_indexed.vep, least_consequence),
        max_af_filter=gnomad_indexed.freq[0].AF <= max_af)
    unphased_ht = unphased_ht.persist()

    # Per pair: the frequency structs of its two variants, ordered by locus
    loci_expr = hl.sorted(
        hl.agg.collect(
            hl.tuple([
                unphased_ht.locus,
                hl.struct(adj_freq=unphased_ht.adj_freq,
                          raw_freq=unphased_ht.raw_freq)
            ])),
        lambda x: x[0]  # sort by locus
    ).map(lambda x: x[1])  # get rid of locus

    vp_freq_expr = hl.struct(v1=loci_expr[0], v2=loci_expr[1])

    # [AABB, AABb, AAbb, AaBB, AaBb, Aabb, aaBB, aaBb, aabb]
    def get_gt_counts(freq: str):
        return hl.array([
            hl.min(vp_freq_expr.v1[freq].AN, vp_freq_expr.v2[freq].AN),  # AABB
            vp_freq_expr.v2[freq].AC -
            (2 * vp_freq_expr.v2[freq].homozygote_count),  # AABb
            vp_freq_expr.v2[freq].homozygote_count,  # AAbb
            vp_freq_expr.v1[freq].AC -
            (2 * vp_freq_expr.v1[freq].homozygote_count),  # AaBB
            0,  # AaBb
            0,  # Aabb
            vp_freq_expr.v1[freq].homozygote_count,  # aaBB
            0,  # aaBb
            0  # aabb
        ])

    gt_counts_raw_expr = get_gt_counts('raw_freq')
    gt_counts_adj_expr = get_gt_counts('adj_freq')

    unphased_ht = unphased_ht.group_by(
        unphased_ht.locus1, unphased_ht.alleles1, unphased_ht.locus2,
        unphased_ht.alleles2
    ).aggregate(
        pop='all',  # TODO Add option for multiple pops?
        phase_info=hl.struct(
            gt_counts=hl.struct(raw=gt_counts_raw_expr,
                                adj=gt_counts_adj_expr),
            # The adj estimate must come from the adj counts (it previously reused the raw counts)
            em=hl.struct(raw=get_em_expr(gt_counts_raw_expr),
                         adj=get_em_expr(gt_counts_adj_expr))),
        vep_genes=hl.agg.collect(
            unphased_ht.vep_genes).filter(lambda x: hl.len(x) > 0),
        max_af_filter=hl.agg.all(unphased_ht.max_af_filter))

    # A pair passes only if both variants have genes and the two gene sets overlap
    unphased_ht = unphased_ht.transmute(
        vep_filter=(hl.len(unphased_ht.vep_genes) > 1)
        & (hl.len(unphased_ht.vep_genes[0].intersection(
            unphased_ht.vep_genes[1])) > 0))

    max_af_filtered, vep_filtered = unphased_ht.aggregate([
        hl.agg.count_where(~unphased_ht.max_af_filter),
        hl.agg.count_where(~unphased_ht.vep_filter)
    ])
    if max_af_filtered > 0:
        logger.info(
            f"{max_af_filtered} variant-pairs excluded because the AF of at least one variant was > {max_af}"
        )
    if vep_filtered > 0:
        logger.info(
            f"{vep_filtered} variant-pairs excluded because the variants were not found within the same gene with a csq of at least {least_consequence}"
        )

    unphased_ht = unphased_ht.filter(unphased_ht.max_af_filter
                                     & unphased_ht.vep_filter)

    return unphased_ht.drop('max_af_filter', 'vep_filter')