def main(args):
    hl.init()

    data_type = 'genomes' if args.genomes else 'exomes'

    if args.write_hardcalls:
        mt = get_gnomad_data(data_type, split=False, raw=True, meta_root=None)
        ht = hl.read_table(qc_ht_path(data_type, 'hard_filters'))
        mt = annotate_adj(mt.select_cols(sex=ht[hl.literal(data_type), mt.s].sex))
        mt = mt.select_entries(
            GT=hl.case(missing_false=True)
            .when(hl.call(mt.PGT[0], mt.PGT[1]) == mt.GT, mt.PGT)
            .default(mt.GT),
            PID=mt.PID,
            adj=mt.adj
        )
        mt = adjust_sex_ploidy(mt, mt.sex)
        mt = mt.select_cols().naive_coalesce(10000)
        mt.write(get_gnomad_data_path(data_type, hardcalls=True, split=False), args.overwrite)

    if args.split_hardcalls:
        mt = get_gnomad_data(data_type, split=False, meta_root=None)
        mt = hl.split_multi_hts(mt)
        mt.write(get_gnomad_data_path(data_type, hardcalls=True, split=True), args.overwrite)

    if args.write_nonrefs:  # CPU-hours: 600 (E)
        mt = get_gnomad_data(data_type, split=False, raw=True, meta_root=None).select_cols()
        mt = mt.annotate_entries(is_missing=hl.is_missing(mt.GT))
        mt = mt.filter_entries(mt.is_missing | mt.GT.is_non_ref())
        mt = annotate_adj(mt)
        if args.exomes:
            mt = mt.naive_coalesce(10000)
        mt.write(get_gnomad_data_path(data_type, split=False, non_refs_only=True), args.overwrite)

    if args.split_nonrefs:  # CPU-hours: 300 (E)
        mt = get_gnomad_data(data_type, split=False, non_refs_only=True)
        mt = hl.split_multi_hts(mt)
        mt = mt.filter_entries(mt.is_missing | mt.GT.is_non_ref())
        mt.write(get_gnomad_data_path(data_type, split=True, non_refs_only=True), args.overwrite)
def main(args):
    hl.init(log='/platform_pca.log')

    if not args.skip_prepare_data_for_platform_pca:
        # ~1 hour on 800 cores (3/8/18)
        logger.info('Preparing data for platform PCA...')
        mt = get_gnomad_data('exomes', adj=True, raw=False, meta_root=None, fam_root=None, split=False)
        mt = filter_to_autosomes(mt)
        intervals = hl.import_locus_intervals(evaluation_intervals_path)
        mt = mt.annotate_rows(interval=intervals[mt.locus].target)
        mt = mt.filter_rows(hl.is_defined(mt.interval) & (hl.len(mt.alleles) == 2))
        mt = mt.select_entries(GT=hl.or_missing(hl.is_defined(mt.GT), hl.struct()))
        callrate_mt = mt.group_rows_by(mt.interval).aggregate(
            callrate=hl.agg.fraction(hl.is_defined(mt.GT)))
        callrate_mt.write(exome_callrate_mt_path, args.overwrite)

    if not args.skip_run_platform_pca:
        logger.info('Running platform PCA...')
        qc_ht = hl.read_table(qc_ht_path('exomes', 'hard_filters')).key_by('s')
        callrate_mt = hl.read_matrix_table(exome_callrate_mt_path)
        callrate_mt = callrate_mt.filter_cols(hl.len(qc_ht[callrate_mt.col_key].hard_filters) == 0)
        callrate_mt = callrate_mt.annotate_entries(callrate=hl.int(callrate_mt.callrate > 0.25))
        # Center until Hail's PCA does it for you
        callrate_mt = callrate_mt.annotate_rows(mean_callrate=hl.agg.mean(callrate_mt.callrate))
        callrate_mt = callrate_mt.annotate_entries(callrate=callrate_mt.callrate - callrate_mt.mean_callrate)
        eigenvalues, scores, _ = hl.pca(callrate_mt.callrate, compute_loadings=False)
        logger.info('Eigenvalues: {}'.format(eigenvalues))
        # [731282566.2824697, 78687228.90071851, 43837650.51729764, 33969298.61827205, 26308703.539534636,
        #  21102437.512725923, 16949828.555817757, 12994894.187041137, 8372332.274295175, 8128326.814388647]
        scores.write(exome_callrate_scores_ht_path)

    logger.info('Annotating with platform PCs and known platform annotations...')
    scores = hl.read_table(exome_callrate_scores_ht_path).annotate(data_type='exomes')
    if args.pc_scores_in_separate_fields:
        scores = scores.transmute(scores=[
            scores[ann] for ann in sorted(
                [ann for ann in scores.row if ann.startswith("PC")],
                key=lambda x: int(x[2:])
            )
        ])
    platform_pcs = assign_platform_pcs(scores)
    platform_pcs.write(qc_ht_path('exomes', 'platforms'), overwrite=args.overwrite)
def get_adj_missing_mt(data_type: str, pbt: bool) -> hl.MatrixTable:
    mt = get_gnomad_data(data_type).select_cols() if not pbt else hl.read_matrix_table(pbt_phased_trios_mt_path(data_type))
    mt = mt.select_rows()
    mt = mt.select_entries(
        GT=hl.or_missing(mt.GT.is_non_ref(), mt.GT),
        missing=hl.is_missing(mt.GT),
        adj=mt.adj
    ).select_cols().select_rows()

    if pbt:
        mt = mt.key_cols_by('s', trio_id=mt.source_trio.id)
        mt = extract_pbt_probands(mt, data_type)
        mt = mt.filter_rows(hl.agg.any(mt.GT.is_non_ref()))
        mt = mt.key_cols_by(s=mt.s, trio_id=mt.source_trio.id)
    else:
        meta = get_gnomad_meta('exomes')
        mt = mt.filter_cols(meta[mt.col_key].high_quality)

    return mt
def main(args): hl.init() data_type = "genomes" if args.genomes else "exomes" if not args.skip_write_qc_mt: logger.info("Importing data...") # 1h40 for exomes, 3h20 for genomes mt = get_gnomad_data( data_type, raw=True, split=False ) # NOTE: using full calls since hardcalls doesn't exist at this stage logger.info( "Filtering to bi-allelic, high-callrate, common SNPs for sample QC..." ) mt = mt.filter_rows((hl.len(mt.alleles) == 2) & hl.is_snp(mt.alleles[0], mt.alleles[1]) & (hl.agg.mean(mt.GT.n_alt_alleles()) / 2 > 0.001) & (hl.agg.fraction(hl.is_defined(mt.GT)) > 0.99)) mt.annotate_cols(callrate=hl.agg.fraction(hl.is_defined( mt.GT))).naive_coalesce(5000).write(qc_mt_path(data_type), overwrite=args.overwrite) qc_mt = hl.read_matrix_table(qc_mt_path(data_type)) logger.info("Importing metadata...") meta_ht = hl.import_table(qc_meta_path(data_type), impute=True, types={ 'age': hl.tfloat64 }).key_by('s') qc_mt = qc_mt.annotate_cols(**meta_ht[qc_mt.s]) logger.info("Inferring sex...") qc_ht = annotate_sex(qc_mt, qc_temp_data_prefix(data_type), male_threshold=0.8 if args.genomes else 0.6).cols() # Flag Klinefelter's individuals and samples with sex aneuploidies if args.exomes: qc_ht = qc_ht.annotate( ambiguous_sex=((qc_ht.f_stat >= 0.5) & (hl.is_defined(qc_ht.normalized_y_coverage) & (qc_ht.normalized_y_coverage <= 0.1))) | (hl.is_missing(qc_ht.f_stat)) | ((qc_ht.f_stat >= 0.4) & (qc_ht.f_stat <= 0.6) & (hl.is_defined(qc_ht.normalized_y_coverage) & (qc_ht.normalized_y_coverage > 0.1))), sex_aneuploidy=(qc_ht.f_stat < 0.4) & hl.is_defined(qc_ht.normalized_y_coverage) & (qc_ht.normalized_y_coverage > 0.1)) else: qc_ht = qc_ht.annotate(ambiguous_sex=hl.is_missing(qc_ht.is_female)) logger.info("Annotating samples failing hard filters...") if args.exomes: sex_expr = (hl.case().when(qc_ht.ambiguous_sex, "ambiguous_sex").when( qc_ht.sex_aneuploidy, "sex_aneuploidy").when(qc_ht.is_female, "female").default("male")) else: sex_expr = (hl.case().when(qc_ht.ambiguous_sex, "ambiguous_sex").when( qc_ht.is_female, "female").default("male")) qc_ht = qc_ht.annotate( hard_filters=make_hard_filters_expr(qc_ht, data_type), perm_filters=make_perm_filters_expr(qc_ht, data_type), sex=sex_expr, data_type=data_type).key_by('data_type', 's') qc_ht.write(qc_ht_path(data_type, 'hard_filters'), overwrite=args.overwrite) # Export annotations to make rank list for relatedness (in final sample QC) if args.exomes: colnames = ['internal', 'project_id', 'pct_bases_20x', 'perm_filters'] else: colnames = ['pcr_free', 'mean_dp', 'perm_filters'] rank_ht = qc_ht.filter(hl.len(qc_ht.hard_filters) == 0, keep=True).select(*colnames) (rank_ht.annotate(releasable=( hl.len(rank_ht.perm_filters) == 0)).drop('perm_filters').export( rank_annotations_path(data_type))) # Check numbers: qc_ht = hl.read_table(qc_ht_path(data_type, 'hard_filters')) sample_count = qc_ht.count() checkpoint1a = qc_ht.aggregate( hl.agg.count_where(hl.len(qc_ht['hard_filters']) == 0)) checkpoint1b = qc_ht.aggregate( hl.agg.count_where((hl.len(qc_ht['hard_filters']) == 0) & (hl.len(qc_ht.perm_filters) == 0))) logger.info('{} samples found before filtering'.format(sample_count)) logger.info('{} samples found after checkpoint 1a (hard filters)'.format( checkpoint1a)) logger.info( '{} samples found after checkpoint 1b (hard filters + permissions)'. format(checkpoint1b))
def create_meta(related_data: GnomADRelatedData, fake_fam_prop: float, old_version: str, overwrite: bool) -> None:
    """
    Creates and writes a dataframe with metadata to evaluate gnomAD trios from the raw ped file.

    In order to compare against the raw ped, metadata is also generated for:
    1) A generated set of fake families
    2) The previous iteration of the ped file (`old_version`)

    :param GnomADRelatedData related_data: Input data
    :param float fake_fam_prop: Number of fake trios to generate, as a proportion of the number of real families in the data
    :param str old_version: Version of the previous ped iteration to load
    :param bool overwrite: Whether to overwrite previous data
    :return: Nothing
    :rtype: None
    """
    raw_ped = hl.Pedigree.read(raw_fam_path(related_data.data_type), delimiter="\\t")

    n_fake_trios = int(fake_fam_prop * len(raw_ped.complete_trios()))
    logger.info(
        f"Generating fake pedigree with {n_fake_trios} trios for {related_data.data_type}"
    )
    fake_fams = create_fake_pedigree(n_fake_trios, list(related_data.meta_pd.s), raw_ped)
    fake_fams.write(fake_fam_path(related_data.data_type))

    logger.info(f"Running mendel_errors on {related_data.data_type}")

    # Run mendel errors on families made of random samples to establish expectation in non-trios:
    pedigrees = [
        ('new', raw_ped),
        ('old', hl.Pedigree.read(fam_path(related_data.data_type, version=old_version), delimiter="\\t")),
        ('fake', hl.Pedigree.read(fake_fam_path(related_data.data_type), delimiter="\\t"))
    ]

    ped_pd = merge_pedigree_pandas([(name, ped_to_pandas(ped)) for name, ped in pedigrees],
                                   related_data.sample_to_dups, True)

    # Run mendel_errors
    all_ped = pandas_to_ped(ped_pd)
    gnomad = get_gnomad_data(related_data.data_type)
    fam_samples = hl.literal({
        s for trio in all_ped.trios for s in [trio.s, trio.mat_id, trio.pat_id]
    })
    gnomad = gnomad.filter_cols(fam_samples.contains(gnomad.s))
    all_errors, per_fam, per_sample, _ = hl.mendel_errors(gnomad['GT'], all_ped)

    all_errors.write(sample_qc_mendel_ht_path(related_data.data_type, "all_errors"), overwrite=overwrite)
    per_fam.write(sample_qc_mendel_ht_path(related_data.data_type, "per_fam"), overwrite=overwrite)
    per_sample.write(sample_qc_mendel_ht_path(related_data.data_type, "per_sample"), overwrite=overwrite)

    # Merge all metadata
    ped_pd = add_pedigree_meta(ped_pd=ped_pd,
                               meta_pd=related_data.meta_pd,
                               kin_ht=related_data.kin_ht,
                               mendel_per_sample_ht=per_sample)

    # Write merged pedigrees as HT
    sql_context = SQLContext(hl.spark_context())
    hl.Table.from_spark(sql_context.createDataFrame(ped_pd)).write(
        merged_pedigrees_ht_path(related_data.data_type), overwrite=overwrite)
def main(args):
    data_type = 'exomes' if args.exomes else 'genomes'

    if args.pbt_tm:
        mt = get_gnomad_data(data_type, split=False)
        meta = mt.cols()
        hq_samples = meta.aggregate(hl.agg.filter(meta.meta.high_quality, hl.agg.collect(meta.s)))
        ped = hl.Pedigree.read(fam_path(data_type), delimiter='\\t').filter_to(hq_samples)
        ped_samples = hl.literal(set([
            s for trio in ped.complete_trios() for s in [trio.s, trio.pat_id, trio.mat_id]
        ]))

        mt = mt.filter_cols(ped_samples.contains(mt.s))
        mt = mt.select_cols().select_rows()
        mt = mt.filter_rows(hl.agg.any(mt.GT.is_non_ref()))

        tm = hl.trio_matrix(mt, ped, complete_trios=True)
        tm = hl.experimental.phase_trio_matrix_by_transmission(tm)
        tm.write(pbt_phased_trios_mt_path(data_type, split=False, trio_matrix=True), overwrite=args.overwrite)

    if args.pbt_explode:
        tm = hl.read_matrix_table(pbt_phased_trios_mt_path(data_type, split=False, trio_matrix=True))

        tm = tm.annotate_entries(trio_adj=tm.proband_entry.adj & tm.father_entry.adj & tm.mother_entry.adj)
        pmt = explode_trio_matrix(tm, keep_trio_entries=True)
        pmt = pmt.transmute_entries(trio_adj=pmt.source_trio_entry.trio_adj)
        pmt.write(pbt_phased_trios_mt_path(data_type, split=False), overwrite=args.overwrite)

        pmt = hl.read_matrix_table(pbt_phased_trios_mt_path(data_type, split=False))
        pmt = pmt.rename({'PBT_GT': 'PGT'})  # ugly but supported by hl.split_multi_hts
        pmt = hl.split_multi_hts(pmt)
        pmt = pmt.rename({'PGT': 'PBT_GT'})
        pmt.write(pbt_phased_trios_mt_path(data_type), overwrite=args.overwrite)

    if args.phase_multi_families:
        pbt = hl.read_matrix_table(pbt_phased_trios_mt_path(data_type))
        # Keep samples that:
        # 1. have more than one entry in the MatrixTable (i.e., are part of multiple trios)
        # 2. have the same parents in all their entries (there are only two exceptions to this,
        #    so it's best to ignore these and focus on parents / multi-offspring families)
        nt_samples = pbt.cols()
        nt_samples = nt_samples.group_by('s').aggregate(trios=hl.agg.collect(nt_samples.source_trio))
        nt_samples = nt_samples.filter(
            (hl.len(nt_samples.trios) > 1)
            & nt_samples.trios[1:].any(
                lambda x: (x.mother.s != nt_samples.trios[0].mother.s)
                | (x.father.s != nt_samples.trios[0].father.s)
            ),
            keep=False
        )
        pbt = pbt.filter_cols(hl.is_defined(nt_samples[pbt.col_key]))

        # Group cols for these samples, keeping all GTs in an array
        # Compute the consensus GT (incl. phase) + QC metrics based on
        # (a) phased genotypes have priority, (b) genotypes with most votes
        pbt = pbt.group_cols_by('s').aggregate(
            PBT_GTs=hl.agg.filter(hl.is_defined(pbt.GT), hl.agg.collect(pbt.GT)))
        gt_counter = hl.sorted(
            hl.array(pbt.PBT_GTs.group_by(lambda x: x).map_values(lambda x: hl.len(x))),
            key=lambda x: x[0].phased * 100 + x[1],
            reverse=True
        )
        phased_gt_counts = gt_counter.filter(lambda x: x[0].phased).map(lambda x: x[1])
        pbt = pbt.annotate_entries(
            consensus_gt=gt_counter.map(lambda x: x[0]).find(lambda x: True),
            phase_concordance=phased_gt_counts.find(lambda x: True) / hl.sum(phased_gt_counts),
            discordant_gts=hl.len(
                hl.set(pbt.PBT_GTs.map(lambda x: hl.cond(x.phased, hl.call(x[0], x[1]), x)))
            ) > 1
        )

        pbt.write('gs://gnomad/projects/compound_hets/pbt_multi_families.mt')
def main(args): hl.init(log="/tmp/hail_comp_vp.log") data_type = 'exomes' if args.exomes else 'genomes' if args.create_full_vp: logger.info( f"Generating gnomAD VP MT for PBT VPs, excluding PBT samples.") # Load PBT VP MT pbt_vp_mt = hl.read_matrix_table( full_mt_path(data_type, True, args.least_consequence, args.max_freq, args.chrom)) # Get all PBT trio ids pbt_samples = get_pbt_trio_ht(data_type).key_by('s') mt = get_gnomad_data(data_type) mt = mt.select_entries(GT=hl.or_missing(mt.GT.is_non_ref(), mt.GT), PID=mt.PID, missing=hl.is_missing(mt.GT), adj=mt.adj).select_cols().select_rows() meta = get_gnomad_meta('exomes') mt = mt.filter_cols(meta[mt.col_key].release & hl.is_missing(pbt_samples[mt.col_key])) vp_mt = create_full_vp(mt, vp_list_ht=pbt_vp_mt.rows(), data_type=data_type) vp_mt = vp_mt.checkpoint(pbt_comparison_full_mt_path( data_type=data_type, least_consequence=args.least_consequence, max_freq=args.max_freq, chrom=args.chrom), overwrite=args.overwrite) logger.info("Total sample count after PBT filtering: %d", vp_mt.count_cols()) if args.create_vp_summary: logger.info("Creating VP summary") mt = hl.read_matrix_table( pbt_comparison_full_mt_path( data_type=data_type, least_consequence=args.least_consequence, max_freq=args.max_freq, chrom=args.chrom)) meta = get_gnomad_meta(data_type).select('pop', 'release') mt = mt.annotate_cols(**meta[mt.col_key]) ht = create_vp_summary(mt) ht = ht.checkpoint(pbt_comparison_vp_count_ht_path( data_type=data_type, least_consequence=args.least_consequence, max_freq=args.max_freq, chrom=args.chrom), overwrite=args.overwrite, _read_if_exists=not args.overwrite) logger.info("Phasing VP summary") ht = get_phased_gnomad_ht(ht) ht.write(pbt_comparison_phased_vp_count_ht_path( data_type=data_type, least_consequence=args.least_consequence, max_freq=args.max_freq, chrom=args.chrom), overwrite=args.overwrite) if args.export: data_type = 'exomes' pbt = hl.read_table(pbt_phase_count_ht_path(data_type, pbt=True)) pbt_ann = hl.read_table(vp_ann_ht_path(data_type, pbt=True)) pbt = pbt.annotate(**pbt_ann[pbt.locus1, pbt.alleles1, pbt.locus2, pbt.alleles2], distance=pbt.locus2.position - pbt.locus1.position) discordant_within_pop_expr = hl.array(pbt.phase_by_pop).any(lambda x: ( x[0] != 'all') & (x[1].adj.n_same_hap > 0) & (x[1].adj.n_chet > 0)) pbt = pbt.annotate(phase_by_pop=hl.array(pbt.phase_by_pop), discordant_within_pop=discordant_within_pop_expr, discordant_between_pops=~discordant_within_pop_expr & (pbt.phase_by_pop['all'].adj.n_same_hap > 0) & (pbt.phase_by_pop['all'].adj.n_chet > 0), **pbt_ann[pbt.key], distance=pbt.locus2.position - pbt.locus1.position) pbt = pbt.explode('phase_by_pop') pbt = pbt.transmute(pop=pbt.phase_by_pop[0], **pbt.phase_by_pop[1]) # Drop RF-filtered sites pbt = pbt.filter((hl.len(pbt.filters1) == 0) & (hl.len(pbt.filters2) == 0)) # Filter sites that are in LCR / Decoy pbt = pbt.filter(pbt.lcr1 | pbt.lcr2 | pbt.decoy1 | pbt.decoy2 | pbt.segdup1 | pbt.segdup2, keep=False) # Drop sites with inconsistent trio phasing? 
pbt = pbt.filter(~pbt.discordant_within_pop) # pbt = pbt.filter(pbt.adj.n_same_hap + pbt.adj.n_chet > 0) # Drop sites that are too frequent in a given pop pbt = pbt.filter((pbt.freq1[pbt.pop].AF <= 0.05) & (pbt.freq2[pbt.pop].AF <= 0.05)) # Drop sites that really are het non-ref pbt = pbt.filter(pbt.distance > 0) # filter to autosomes autosomes = hl.parse_locus_interval('1-22') pbt = pbt.filter(autosomes.contains(pbt.locus1)) phase_ht = hl.read_table( pbt_comparison_phased_vp_count_ht_path( data_type=data_type, least_consequence=args.least_consequence, max_freq=args.max_freq, chrom=args.chrom)) pbt = pbt.annotate(trio_chet=hl.struct( raw=hl.case().when( (pbt.raw.n_same_hap > 0) & (pbt.raw.n_chet == 0), False).when( (pbt.raw.n_same_hap == 0) & (pbt.raw.n_chet > 0), True).or_missing(), adj=hl.case().when( (pbt.adj.n_same_hap > 0) & (pbt.adj.n_chet == 0), False).when( (pbt.adj.n_same_hap == 0) & (pbt.adj.n_chet > 0), True).or_missing()), **phase_ht[pbt.locus1, pbt.alleles1, pbt.locus2, pbt.alleles2].phase_info[pbt.pop]) pbt = pbt.filter(~hl.is_nan(pbt.gt_counts.adj[0]) & (pbt.pop != 'oth')).key_by() rf_features = { 'snv1': pbt.snv1, 'snv2': pbt.snv2, 'cpg1': hl.or_else(pbt.cpg1, False), 'cpg2': hl.or_else(pbt.cpg2, False), 'distance': pbt.distance } rf_features.update({ f'n{i}{j}': pbt.gt_counts.adj[(3 * i) + j] for i in [0, 1, 2] for j in [0, 1, 2] }) ac1 = get_ac_from_gt_counts(pbt.gt_counts.adj, True) ac2 = get_ac_from_gt_counts(pbt.gt_counts.adj, False) an = 2 * hl.sum(pbt.gt_counts.adj) pbt_df = pbt.select( locus1=pbt.locus1, ref1=pbt.alleles1[0], alt1=pbt.alleles1[1], locus2=pbt.locus2, ref2=pbt.alleles2[0], alt2=pbt.alleles2[1], pop=pbt.pop, trio_chet=pbt.trio_chet.adj, em=pbt.em.adj.p_chet, singlet_het_ratio=pbt.singlet_het_ratio.adj, lr=pbt.likelihood_model.adj, AC1=ac1, AC2=ac2, AF1=ac1 / an, AF2=ac2 / an, n_var_gnomad=(ac1 > 0) + (ac2 > 0), discordant_between_pops=pbt.discordant_between_pops, discordant_within_pop=pbt.discordant_within_pop, **rf_features ).flatten().to_pandas( ) # NOTE: The serialization to pandas happens because this code comes from a notebook initially with hl.utils.hadoop_open( 'gs://gnomad/projects/compound_hets/pbt_annotated.csv', 'w') as f: pbt_df.to_csv(f)
def main(args):
    hl.init(log='/sample_qc.log', tmp_dir='hdfs:///pc_relate.tmp/')

    if not args.load_joint_pruned_qc_mt:
        logger.info('Joining exomes and genomes...')
        exome_qc_mt = read_and_pre_process_data(qc_mt_path('exomes'), qc_ht_path('exomes', 'hard_filters'))
        genome_qc_mt = read_and_pre_process_data(qc_mt_path('genomes'), qc_ht_path('genomes', 'hard_filters'))

        joint_qc_mt = exome_qc_mt.union_cols(genome_qc_mt)  # NOTE: this is an inner join on rows
        joint_qc_mt = joint_qc_mt.filter_rows(
            (hl.agg.mean(joint_qc_mt.GT.n_alt_alleles()) / 2 > 0.001)
            & (hl.agg.fraction(hl.is_defined(joint_qc_mt.GT)) > 0.99))
        joint_qc_mt.write(qc_mt_path('joint'), args.overwrite)

        logger.info('LD-pruning joint mt of exomes and genomes...')
        joint_qc_mt = hl.read_matrix_table(qc_mt_path('joint'))
        variants, samples = joint_qc_mt.count()
        logger.info('Pruning {0} variants in {1} samples'.format(variants, samples))
        joint_qc_pruned_ht = hl.ld_prune(joint_qc_mt.GT, r2=0.1)
        # Note: writing the LD-pruned MT is probably overkill
        # vs using `filter_rows` to filter sites based on the LD-pruned HT.
        joint_qc_pruned_mt = joint_qc_mt.filter_rows(hl.is_defined(joint_qc_pruned_ht[joint_qc_mt.row_key]))
        joint_qc_pruned_mt.write(qc_mt_path('joint', ld_pruned=True), args.overwrite)

    pruned_mt = hl.read_matrix_table(qc_mt_path('joint', ld_pruned=True))
    variants, samples = pruned_mt.count()
    logger.info('{0} samples, {1} variants found in LD-pruned joint MT'.format(samples, variants))

    if not args.skip_pc_relate:
        logger.info('Running PCA for PC-Relate...')
        eig, scores, _ = hl.hwe_normalized_pca(pruned_mt.GT, k=10, compute_loadings=False)
        scores.write(qc_temp_data_prefix('joint') + '.pruned.pca_scores.ht', args.overwrite)

        logger.info('Running PC-Relate...')
        scores = hl.read_table(qc_temp_data_prefix('joint') + '.pruned.pca_scores.ht')
        # NOTE: This needs SSDs on your workers (for the temp files) and no pre-emptibles while the BlockMatrix writes
        relatedness_ht = hl.pc_relate(
            pruned_mt.GT,
            min_individual_maf=0.05,
            scores_expr=scores[pruned_mt.col_key].scores,
            block_size=4096,
            min_kinship=0.05,
            statistics='kin2')
        relatedness_ht.write(relatedness_ht_path, args.overwrite)

    relatedness_ht = hl.read_table(relatedness_ht_path)

    if not args.skip_relatedness:
        infer_ped(GnomADRelatedData('exomes'))
        infer_ped(GnomADRelatedData('genomes'))

        logger.info('Making rank file...')
        rank_table = make_rank_file(rank_annotations_path('joint'))
        logger.info('Finished making rank file...')

        related_samples_to_drop_ranked = get_related_samples_to_drop(rank_table, relatedness_ht)
        related_samples_to_drop_ranked.write(
            qc_temp_data_prefix('joint') + '.related_samples_to_drop.ht', args.overwrite)

    pca_mt, related_mt = split_mt_by_relatedness(pruned_mt)

    if not args.skip_pop_pca:
        variants, samples = pca_mt.count()
        logger.info('{} samples after removing relateds'.format(samples))
        # TODO: Check that there are no longer any 2nd-degree relateds in the callset by running KING on the output file below
        plink_mt = pca_mt.annotate_cols(
            uid=pca_mt.data_type + '_' + pca_mt.s.replace(" ", "_").replace("/", "_")
        ).key_cols_by('uid')
        hl.export_plink(plink_mt,
                        qc_temp_data_prefix('joint') + '.unrelated.plink',
                        fam_id=plink_mt.uid,
                        ind_id=plink_mt.uid)
        logger.info('Computing population PCs and annotating with known population labels...')
        pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(pca_mt.GT, k=20, compute_loadings=True)
        pca_af_ht = pca_mt.annotate_rows(pca_af=hl.agg.mean(pca_mt.GT.n_alt_alleles()) / 2).rows()
        pca_loadings = pca_loadings.annotate(pca_af=pca_af_ht[pca_loadings.key].pca_af)
        pca_scores.write(ancestry_pca_scores_ht_path(), args.overwrite)
        pca_loadings.write(ancestry_pca_loadings_ht_path(), args.overwrite)

    pca_scores = hl.read_table(ancestry_pca_scores_ht_path())
    pca_loadings = hl.read_table(ancestry_pca_loadings_ht_path())
    pca_mt = pca_mt.annotate_cols(scores=pca_scores[pca_mt.col_key].scores)

    variants, samples = related_mt.count()
    logger.info('Projecting population PCs for {} related samples...'.format(samples))
    related_scores = pc_project(related_mt, pca_loadings)
    relateds = related_mt.cols()
    relateds = relateds.annotate(scores=related_scores[relateds.key].scores)

    logger.info('Assigning population annotations...')
    pop_colnames = ['related', 'known_pop', 'scores']
    pop_annots_ht = hl.import_table(known_population_annotations, impute=True).key_by('combined_sample')

    joint_ht = pca_mt.cols().union(relateds)
    joint_ht = joint_ht.annotate(
        known_pop=pop_annots_ht[joint_ht.data_type.replace('s', '') + '_'
                                + joint_ht.s.replace(' ', '_')].known_pop
    )  # FIXME: temporarily doing the underscore thing until known_population_annotations is fixed
    joint_pca_ht = joint_ht.select(*pop_colnames)
    joint_pca_ht, joint_pca_fit = run_assign_population_pcs(
        joint_pca_ht,
        qc_temp_data_prefix('joint') + '.RF_pop_assignments.txt.bgz',
        qc_temp_data_prefix('joint') + '.RF_fit.pkl',
        pcs=list(range(1, 7)))
    joint_ht = joint_ht.annotate(pop=joint_pca_ht[joint_ht.key].pop).select('pop', *pop_colnames)

    # Add special Estonian pop category for genomes
    estonian_ht = (hl.import_table(estonian_batches, impute=True)
                   .annotate(data_type='genomes')
                   .key_by('data_type', 'sample'))
    joint_ht = joint_ht.annotate(batch=estonian_ht[joint_ht.key].batch)
    joint_ht = joint_ht.annotate(
        qc_pop=hl.case(missing_false=True)
        .when(hl.is_defined(joint_ht.pop) & (joint_ht.batch == 1), 'est_b1')
        .when(hl.is_defined(joint_ht.pop) & (joint_ht.batch == 2), 'est_b2')
        .default(joint_ht.pop)
    ).persist()

    # These are keyed by only `s`
    genome_mt = get_gnomad_data('genomes', adj=False, split=False, meta_root=None).select_cols()
    exome_mt = get_gnomad_data('exomes', adj=False, split=False, meta_root=None).select_cols()

    # Population-specific filtering
    if not args.skip_calculate_sample_metrics:
        logger.info('Running mini sample QC for platform- and population-specific filtering...')
        gnomad_sample_qc(exome_mt).cols().select('sample_qc').write(
            qc_temp_data_prefix('exomes') + '.sample_qc.ht', args.overwrite)
        gnomad_sample_qc(genome_mt).cols().select('sample_qc').write(
            qc_temp_data_prefix('genomes') + '.sample_qc.ht', args.overwrite)
        # TODO: check that the pcr_free annotations are complete once samples are updated from Jessica's spreadsheet

    logger.info('Annotating population and platform assignments...')
    platform_ht = hl.read_table(qc_ht_path('exomes', 'platforms'))
    exome_ht = exome_mt.cols()
    exome_ht = exome_ht.annotate(
        qc_platform=platform_ht.key_by('s')[exome_ht.s].qc_platform,
        **joint_ht.filter(joint_ht.data_type == 'exomes').key_by('s')[exome_ht.s])

    genome_meta_ht = hl.read_table(qc_ht_path('genomes', 'hard_filters'))
    genome_ht = genome_mt.cols()
    genome_ht = genome_ht.annotate(
        qc_platform=genome_meta_ht.key_by('s')[genome_ht.s].qc_platform,
        **joint_ht.filter(joint_ht.data_type == 'genomes').key_by('s')[genome_ht.s])

    exome_sample_qc_ht = hl.read_table(qc_temp_data_prefix('exomes') + '.sample_qc.ht')
    genome_sample_qc_ht = hl.read_table(qc_temp_data_prefix('genomes') + '.sample_qc.ht')

    exome_ht = exome_ht.annotate(**exome_sample_qc_ht[exome_ht.s])
    genome_ht = genome_ht.annotate(**genome_sample_qc_ht[genome_ht.s])

    # For each population, aggregate sample QC metrics and calculate the MAD/mean/stdev
    logger.info('Calculating platform- and population-specific sample QC thresholds...')
    exome_qc_metrics = ['n_snp', 'r_ti_tv', 'r_insertion_deletion', 'n_insertion', 'n_deletion', 'r_het_hom_var']
    exome_pop_platform_filter_ht = compute_stratified_metrics_filter(
        exome_ht, exome_qc_metrics, ['qc_pop', 'qc_platform'])
    exome_ht = exome_ht.annotate_globals(**hl.eval(exome_pop_platform_filter_ht.globals))
    exome_ht = exome_ht.annotate(**exome_pop_platform_filter_ht[exome_ht.key]).persist()

    genome_qc_metrics = ['n_snp', 'r_ti_tv', 'r_insertion_deletion', 'n_insertion', 'n_deletion', 'r_het_hom_var']
    genome_pop_platform_filter_ht = compute_stratified_metrics_filter(
        genome_ht, genome_qc_metrics, ['qc_pop', 'qc_platform'])
    genome_ht = genome_ht.annotate_globals(**hl.eval(genome_pop_platform_filter_ht.globals))
    genome_ht = genome_ht.annotate(**genome_pop_platform_filter_ht[genome_ht.key]).persist()

    # Annotate samples that fail their respective filters
    checkpoint = exome_ht.aggregate(hl.agg.count_where(hl.len(exome_ht.pop_platform_filters) == 0))
    logger.info(f'{checkpoint} exome samples found passing pop/platform-specific filtering')
    exome_ht.key_by(data_type='exomes', s=exome_ht.s).write(qc_ht_path('exomes', 'pop_platform'), args.overwrite)

    checkpoint = genome_ht.aggregate(hl.agg.count_where(hl.len(genome_ht.pop_platform_filters) == 0))
    logger.info(f'{checkpoint} genome samples found passing pop/platform-specific filtering')
    genome_ht.key_by(data_type='genomes', s=genome_ht.s).write(qc_ht_path('genomes', 'pop_platform'), args.overwrite)
def compute_from_full_mt(chr20: bool, overwrite: bool):
    mt = get_gnomad_data('exomes', adj=True, release_samples=True)
    freq_ht = hl.read_table(annotations_ht_path('exomes', 'frequencies'))
    vep_ht = hl.read_table(annotations_ht_path('exomes', 'vep'))
    rf_ht = hl.read_table(annotations_ht_path('exomes', 'rf'))

    if chr20:
        mt, freq_ht, vep_ht, rf_ht = filter_to_chr20([mt, freq_ht, vep_ht, rf_ht])

    vep_ht = vep_ht.annotate(
        vep=get_worst_gene_csq_code_expr(vep_ht.vep).values()
    )

    freq_ht = freq_ht.select(
        freq=freq_ht.freq[:10],
        popmax=freq_ht.popmax
    )
    freq_meta = hl.eval(freq_ht.globals.freq_meta)
    freq_dict = {f['pop']: i for i, f in enumerate(freq_meta[:10]) if 'pop' in f}
    freq_dict['all'] = 0
    freq_dict = hl.literal(freq_dict)

    mt = mt.annotate_rows(
        **freq_ht[mt.row_key],
        vep=vep_ht[mt.row_key].vep,
        filters=rf_ht[mt.row_key].filters
    )
    mt = mt.filter_rows(
        (mt.freq[0].AF <= MAX_FREQ) &
        (hl.len(mt.vep) > 0) &
        (hl.len(mt.filters) == 0)
    )
    mt = mt.filter_entries(mt.GT.is_non_ref())
    mt = mt.select_entries(
        is_het=mt.GT.is_het()
    )

    mt = mt.explode_rows(mt.vep)
    mt = mt.transmute_rows(**mt.vep)

    mt = mt.annotate_cols(
        pop=['all', mt.meta.pop]
    )
    mt = mt.explode_cols(mt.pop)

    mt = mt.group_rows_by(
        'gene_id'
    ).aggregate_rows(
        gene_symbol=hl.agg.take(mt.gene_symbol, 1)[0]
    ).aggregate(
        counts=hl.agg.filter(
            hl.if_else(
                mt.pop == 'all',
                hl.is_defined(mt.popmax) & (mt.popmax.AF <= MAX_FREQ),
                mt.freq[freq_dict[mt.pop]].AF <= MAX_FREQ
            ),
            hl.agg.group_by(
                hl.if_else(
                    mt.pop == 'all',
                    mt.popmax.AF > 0.001,
                    mt.freq[freq_dict[mt.pop]].AF > 0.001
                ),
                hl.struct(
                    hom_csq=hl.agg.filter(~mt.is_het, hl.agg.min(mt.csq)),
                    het_csq=hl.agg.filter(mt.is_het, hl.agg.min(mt.csq)),
                    het_het_csq=hl.sorted(
                        hl.array(
                            hl.agg.filter(mt.is_het, hl.agg.counter(mt.csq))
                        ),
                        key=lambda x: x[0]
                    ).scan(
                        lambda i, j: (j[0], i[1] + j[1]),
                        (0, 0)
                    ).find(
                        lambda x: x[1] > 1
                    )[0]
                )
            )
        )
    )

    mt = mt.annotate_entries(
        counts=hl.struct(
            all=hl.struct(
                hom_csq=hl.min(mt.counts.get(True).hom_csq, mt.counts.get(False).hom_csq),
                het_csq=hl.min(mt.counts.get(True).het_csq, mt.counts.get(False).het_csq),
                het_het_csq=hl.min(
                    mt.counts.get(True).het_het_csq,
                    mt.counts.get(False).het_het_csq,
                    hl.or_missing(
                        hl.is_defined(mt.counts.get(True).het_csq) &
                        hl.is_defined(mt.counts.get(False).het_csq),
                        hl.max(mt.counts.get(True).het_csq, mt.counts.get(False).het_csq)
                    )
                ),
            ),
            af_le_0_001=mt.counts.get(False)
        )
    )

    mt = mt.checkpoint('gs://gnomad-tmp/compound_hets/het_and_hom_per_gene{}.1.mt'.format(
        '.chr20' if chr20 else ''
    ), overwrite=True)

    gene_ht = mt.annotate_rows(
        row_counts=hl.flatten([
            hl.array(
                hl.agg.group_by(
                    mt.pop,
                    hl.struct(
                        csq=csq,
                        af=af,
                        n_hom=hl.agg.count_where(mt.counts[af].hom_csq == csq_i),
                        n_het=hl.agg.count_where(mt.counts[af].het_csq == csq_i),
                        n_het_het=hl.agg.count_where(mt.counts[af].het_het_csq == csq_i)
                    )
                )
            ).filter(
                lambda x: (x[1].n_het > 0) | (x[1].n_hom > 0) | (x[1].n_het_het > 0)
            ).map(
                lambda x: x[1].annotate(
                    pop=x[0]
                )
            )
            for csq_i, csq in enumerate(CSQ_CODES)
            for af in ['all', 'af_le_0_001']
        ])
    ).rows()

    gene_ht = gene_ht.explode('row_counts')
    gene_ht = gene_ht.select(
        'gene_symbol',
        **gene_ht.row_counts
    )

    gene_ht.describe()

    gene_ht = gene_ht.checkpoint(
        'gs://gnomad-lfran/compound_hets/het_and_hom_per_gene{}.ht'.format(
            '.chr20' if chr20 else ''
        ),
        overwrite=overwrite
    )

    gene_ht.flatten().export('gs://gnomad-lfran/compound_hets/het_and_hom_per_gene{}.tsv.gz'.format(
        '.chr20' if chr20 else ''
    ))