def main(args):
    """
    Lift a gnomAD release table, or a user-supplied Table / MatrixTable, over to the
    target reference genome and log SNP reference-mismatch statistics.

    Steps: pick the input, split multi-allelics if the input has no `was_split` row
    field, determine source/target genomes, optionally restrict to chromosome 21,
    run the liftover, then annotate and summarise SNP reference mismatches.

    :param args: Parsed command-line arguments. Attributes read here: `gnomad`,
        `exomes`, `genomes`, `ht`, `mt`, `test` and `overwrite`.
    :raises ValueError: If `gnomad` is set without `exomes`/`genomes`, or if `gnomad`
        is not set and neither `ht` nor `mt` is provided.
    """
    # Validate the input selection before starting Hail: previously a bad
    # invocation only failed later with an UnboundLocalError on `data_type` / `t`.
    if args.gnomad:
        if not (args.exomes or args.genomes):
            raise ValueError(
                'gnomAD liftover requested but neither exomes nor genomes was selected')
    elif not (args.ht or args.mt):
        raise ValueError(
            'No input specified: select gnomAD data or provide a path to a Table or MatrixTable')

    hl.init(log='/liftover.log')

    if args.gnomad:
        is_gnomad = True
        path = None
        # If both flags are set, genomes takes precedence (unchanged behaviour).
        data_type = 'genomes' if args.genomes else 'exomes'

        logger.info('Working on gnomAD {} release ht'.format(data_type))
        logger.info('Reading in release ht')
        t = public_release(data_type).ht()
        logger.info('Variants in release ht: {}'.format(t.count()))
    else:
        data_type = None
        is_gnomad = False

        # If both paths are given, the MatrixTable takes precedence (unchanged behaviour).
        if args.ht:
            path = args.ht
            t = hl.read_table(args.ht)
        if args.mt:
            path = args.mt
            t = hl.read_matrix_table(args.mt)

    logger.info('Checking if input data has been split')
    if 'was_split' not in t.row:
        if isinstance(t, hl.Table):
            t = hl.split_multi(t)
        else:
            t = hl.split_multi_hts(t)

    logger.info('Preparing reference genomes for liftover')
    source, target = get_liftover_genome(t)

    if args.test:
        logger.info('Filtering to chr21 for testing')
        # GRCh38 contig names carry the 'chr' prefix; other builds use the bare number.
        contig = 'chr21' if source.name == 'GRCh38' else '21'
        t = hl.filter_intervals(
            t, [hl.parse_locus_interval(contig, reference_genome=source.name)])

    logger.info(f'Lifting data to {target.name}')
    t = lift_data(t, is_gnomad, data_type, path, target, args.overwrite)

    logger.info('Checking SNPs for reference mismatches')
    t = annotate_snp_mismatch(t, data_type, target)

    # check_mismatch is given a Table, so use the rows table of a MatrixTable.
    rows = t if isinstance(t, hl.Table) else t.rows()
    mismatch = check_mismatch(rows)
    logger.info('{} total SNPs'.format(mismatch['total_variants']))
    logger.info('{} SNPs on minus strand'.format(mismatch['negative_strand']))
    logger.info('{} reference mismatches in SNPs'.format(
        mismatch['total_mismatch']))
    logger.info('{} mismatches on minus strand'.format(
        mismatch['negative_strand_mismatch']))
def prepare_gnomad_stats(overwrite):
    """
    Build and write a table comparing per-population frequencies with gnomAD.

    Reads the table at `get_ukb_vep_path()`, keeps only rows present among the rows of
    the 'full' variant results MatrixTable, and annotates each row with:

    - `freq`: one struct per population in `POPS` holding `pop`, `ac`, `af`, `an` and
      the gnomAD exomes/genomes `AC`/`AF`/`AN` for the population mapped through
      `UKB_GNOMAD_POP_MAPPING`;
    - `pass_gnomad_exomes` / `pass_gnomad_genomes`: whether the gnomAD `filters`
      field is empty for that dataset.

    The result is coalesced to 1000 partitions and written to the
    'gnomad_comparison' analysis data path.

    :param overwrite: Passed through to the table write.
    """
    datasets = ('exomes', 'genomes')
    gnomad_metrics = ('AC', 'AF', 'AN')

    ht = hl.read_table(get_ukb_vep_path())
    gwas_rows = hl.read_matrix_table(get_variant_results_path('full')).rows()
    ht = ht.filter(hl.is_defined(gwas_rows[ht.key]))

    release_hts = {}
    freq_index_dicts = {}
    gnomad_rows = {}
    for dataset in datasets:
        release_ht = gnomad.public_release(dataset).ht()
        release_hts[dataset] = release_ht
        freq_index_dicts[dataset] = release_ht.index_globals().freq_index_dict
        gnomad_rows[dataset] = release_ht[ht.key]

    def _pop_freq(pop):
        # Frequency struct for one population, including the matching gnomAD metrics.
        gnomad_label = f'gnomad_{UKB_GNOMAD_POP_MAPPING[pop]}'
        gnomad_fields = {}
        for dataset in datasets:
            freq_idx = freq_index_dicts[dataset].get(gnomad_label)
            for metric in gnomad_metrics:
                field_name = f'gnomad_{dataset}_{metric.lower()}'
                gnomad_fields[field_name] = gnomad_rows[dataset].freq[freq_idx][metric]
        # NOTE(review): the af / 2 and an * 2 scaling is reproduced as-is; presumably
        # it converts per-sample values to per-allele values -- confirm against the input.
        return hl.struct(
            pop=pop,
            ac=ht.af[pop] * ht.an[pop],
            af=ht.af[pop] / 2,
            an=ht.an[pop] * 2,
            **gnomad_fields)

    pass_flags = {}
    for dataset in datasets:
        pass_flags[f'pass_gnomad_{dataset}'] = hl.len(gnomad_rows[dataset].filters) == 0

    ht = ht.annotate(freq=[_pop_freq(pop) for pop in POPS], **pass_flags)

    out_path = get_analysis_data_path('gnomad_comparison', 'qc', 'full', 'ht')
    ht.naive_coalesce(1000).write(out_path, overwrite=overwrite)
def get_r_within_gene(
        bm: BlockMatrix,
        ld_index: hl.Table,
        gene: str,
        vep_ht: hl.Table = None,
        reference_genome: str = None,
):
    """
    Return the LD value (`r`) for every pair of variants annotated to `gene`.

    Warning: the returned table is quadratic in the number of variants in the gene,
    so be careful with large genes.

    :param bm: Input Block Matrix
    :param ld_index: Index table corresponding to `bm` (must carry an `idx` field)
    :param gene: Gene symbol as string
    :param vep_ht: Table with VEP annotations (if None, the gnomAD exomes public release table is used)
    :param reference_genome: Reference genome name passed to get_gene_intervals for fast
        filtering to the gene (if None, Hail's default reference is used)
    :return: Table with one row per pair of variants, with fields `r`, `i_variant` and `j_variant`
    """
    if vep_ht is None:
        vep_ht = public_release("exomes").ht()
    if reference_genome is None:
        reference_genome = hl.default_reference().name

    # Cheap interval filter first, then the exact check on the VEP gene symbol.
    gene_intervals = hl.experimental.get_gene_intervals(
        gene_symbols=[gene], reference_genome=reference_genome)
    gene_index = hl.filter_intervals(ld_index, gene_intervals)
    gene_index = gene_index.annotate(vep=vep_ht[gene_index.key].vep)
    has_gene = hl.any(lambda tc: tc.gene_symbol == gene,
                      gene_index.vep.transcript_consequences)
    gene_index = gene_index.filter(has_gene)

    # Restrict the matrix to the kept variants on both axes.
    kept_indices = gene_index.idx.collect()
    pairs_ht = bm.filter(kept_indices, kept_indices).entries()

    # Rows/columns of the filtered matrix are numbered from 0, so re-index the
    # filtered index table to look variants up by their new position.
    reindexed = gene_index.add_index("new_idx").key_by("new_idx")
    return pairs_ht.transmute(
        r=pairs_ht.entry,
        i_variant=reindexed[pairs_ht.i],
        j_variant=reindexed[pairs_ht.j])
def annotate_unphased_pairs(unphased_ht: hl.Table, n_variant_pairs: int,
                            least_consequence: str, max_af: float):
    """
    Annotate variant pairs lacking phase information with genotype counts and EM
    estimates derived from single-variant gnomAD exomes frequencies.

    Each pair is exploded into its two variants, each variant is annotated with its
    gnomAD exomes adj (`freq[0]`) and raw (`freq[1]`) frequencies (a zero-count
    placeholder is used when the variant is not in gnomAD), and the pairs are then
    re-grouped to build 9-element genotype-count arrays and the corresponding EM
    estimates. Pairs are dropped when any variant has an adj AF above `max_af` or
    when the two variants do not share a gene returned by `vep_genes_expr`.

    :param unphased_ht: Table of variant pairs with fields `locus1`, `alleles1`, `locus2`, `alleles2`
    :param n_variant_pairs: Number of variant pairs; used to size the gnomAD table repartition
        (one partition per 10,000 pairs)
    :param least_consequence: Least severe consequence passed to `vep_genes_expr`
    :param max_af: Maximum gnomAD adj allele frequency allowed for each variant of a pair
    :return: Table keyed by (`locus1`, `alleles1`, `locus2`, `alleles2`) with `pop` and `phase_info`
    """
    # Explode variant pairs: one row per (pair, variant)
    unphased_ht = unphased_ht.annotate(las=[
        hl.tuple([unphased_ht.locus1, unphased_ht.alleles1]),
        hl.tuple([unphased_ht.locus2, unphased_ht.alleles2])
    ]).explode('las', name='la')

    unphased_ht = unphased_ht.key_by(
        locus=unphased_ht.la[0],
        alleles=unphased_ht.la[1]).persist()

    # Annotate single variants with gnomAD freq
    gnomad_ht = gnomad.public_release('exomes').ht()
    gnomad_ht = gnomad_ht.semi_join(unphased_ht).repartition(
        ceil(n_variant_pairs / 10000), shuffle=True).persist()

    # Placeholder frequencies for variants absent from gnomAD
    missing_freq = hl.struct(
        AC=0,
        AF=0,
        AN=125748 * 2,  # set to no missing for now
        homozygote_count=0)

    logger.info(
        f"{gnomad_ht.count()}/{unphased_ht.count()} single variants from the unphased pairs found in gnomAD."
    )

    gnomad_indexed = gnomad_ht[unphased_ht.key]
    gnomad_freq = gnomad_indexed.freq
    unphased_ht = unphased_ht.annotate(
        adj_freq=hl.or_else(gnomad_freq[0], missing_freq),
        raw_freq=hl.or_else(gnomad_freq[1], missing_freq),
        vep_genes=vep_genes_expr(gnomad_indexed.vep, least_consequence),
        max_af_filter=gnomad_indexed.freq[0].AF <= max_af
        # TODO: a pop_max_freq annotation (from popmax) is not implemented yet
    )
    unphased_ht = unphased_ht.persist()

    # Per-pair aggregation: collect both variants' frequencies, ordered by locus,
    # so that v1 is the variant at the lower locus and v2 the other one.
    # NOTE(review): if both variants share the same locus, the order is not determined
    # by this sort key -- confirm this cannot happen upstream.
    loci_expr = hl.sorted(
        hl.agg.collect(
            hl.tuple([
                unphased_ht.locus,
                hl.struct(
                    adj_freq=unphased_ht.adj_freq,
                    raw_freq=unphased_ht.raw_freq,
                )
            ])),
        lambda x: x[0]  # sort by locus
    ).map(lambda x: x[1])  # get rid of locus

    vp_freq_expr = hl.struct(v1=loci_expr[0], v2=loci_expr[1])

    # [AABB, AABb, AAbb, AaBB, AaBb, Aabb, aaBB, aaBb, aabb]
    def get_gt_counts(freq: str):
        # Genotype counts built from single-variant frequencies only; every genotype
        # carrying both alternate alleles is set to 0.
        return hl.array([
            hl.min(vp_freq_expr.v1[freq].AN, vp_freq_expr.v2[freq].AN),  # AABB
            vp_freq_expr.v2[freq].AC -
            (2 * vp_freq_expr.v2[freq].homozygote_count),  # AABb
            vp_freq_expr.v2[freq].homozygote_count,  # AAbb
            vp_freq_expr.v1[freq].AC -
            (2 * vp_freq_expr.v1[freq].homozygote_count),  # AaBB
            0,  # AaBb
            0,  # Aabb
            vp_freq_expr.v1[freq].homozygote_count,  # aaBB
            0,  # aaBb
            0  # aabb
        ])

    gt_counts_raw_expr = get_gt_counts('raw_freq')
    gt_counts_adj_expr = get_gt_counts('adj_freq')

    unphased_ht = unphased_ht.group_by(
        unphased_ht.locus1,
        unphased_ht.alleles1,
        unphased_ht.locus2,
        unphased_ht.alleles2
    ).aggregate(
        pop='all',  # TODO Add option for multiple pops?
        phase_info=hl.struct(
            gt_counts=hl.struct(
                raw=gt_counts_raw_expr,
                adj=gt_counts_adj_expr),
            em=hl.struct(
                raw=get_em_expr(gt_counts_raw_expr),
                # Fixed: the adj EM estimate was previously computed from the raw counts.
                adj=get_em_expr(gt_counts_adj_expr))),
        vep_genes=hl.agg.collect(
            unphased_ht.vep_genes).filter(lambda x: hl.len(x) > 0),
        max_af_filter=hl.agg.all(unphased_ht.max_af_filter)
    )

    # A pair passes the VEP filter when both variants have genes and share at least one.
    unphased_ht = unphased_ht.transmute(
        vep_filter=(hl.len(unphased_ht.vep_genes) > 1) &
        (hl.len(unphased_ht.vep_genes[0].intersection(
            unphased_ht.vep_genes[1])) > 0))

    max_af_filtered, vep_filtered = unphased_ht.aggregate([
        hl.agg.count_where(~unphased_ht.max_af_filter),
        hl.agg.count_where(~unphased_ht.vep_filter)
    ])
    if max_af_filtered > 0:
        logger.info(
            f"{max_af_filtered} variant-pairs excluded because the AF of at least one variant was > {max_af}"
        )
    if vep_filtered > 0:
        logger.info(
            f"{vep_filtered} variant-pairs excluded because the variants were not found within the same gene with a csq of at least {least_consequence}"
        )

    unphased_ht = unphased_ht.filter(unphased_ht.max_af_filter &
                                     unphased_ht.vep_filter)

    return unphased_ht.drop('max_af_filter', 'vep_filter')