def get_pbt_trio_ht(data_type: str):
    """
    Builds a per-sample table of trios usable for phase-by-transmission.

    Only trios whose proband, father and mother are all flagged `high_quality` in the gnomAD
    metadata and do not belong to project `BAD_THAI_TRIOS_PROJECT_ID` are retained. The trio
    rows are then de-duplicated on father and on mother, and finally exploded so that there is
    one row per trio member.

    :param str data_type: Data type passed to `get_gnomad_meta` and `fam_path`
    :return: Table keyed by (`s`, `id`), where `s` is a trio member and `id` is the `id` column of the fam file
    """
    sample_meta = get_gnomad_meta(data_type)
    is_usable = sample_meta.high_quality & (sample_meta.project_id != BAD_THAI_TRIOS_PROJECT_ID)
    usable_ids = hl.literal(
        sample_meta.aggregate(hl.agg.filter(is_usable, hl.agg.collect(sample_meta.s)))
    )

    trios = hl.import_fam(fam_path(data_type), delimiter='\\t')
    all_members_usable = (
        usable_ids.contains(trios.id)
        & usable_ids.contains(trios.pat_id)
        & usable_ids.contains(trios.mat_id)
    )
    trios = trios.filter(all_members_usable)

    # Keep a single proband from each family with > 1 proband:
    # one row per distinct father first, then one row per distinct mother.
    for parent_field in ('pat_id', 'mat_id'):
        trios = trios.key_by(parent_field).distinct()

    # One output row per member (proband, father, mother) of each remaining trio.
    trios = trios.annotate(s=[trios.id, trios.pat_id, trios.mat_id])
    trios = trios.explode('s')
    return trios.key_by('s', 'id')
def create_ped(related_data: GnomADRelatedData, new_version: str, max_mv_z: int = 3):
    """
    Produces the final gnomAD ped file from the raw one by dropping trios with too many Mendelian violations.

    The cutoff is computed over the rows of the merged-pedigrees table whose `ped_name`
    contains 'new': a trio is dropped when its `errors` value exceeds
    mean(errors) + `max_mv_z` * std(errors).

    :param GnomADRelatedData related_data: Input data
    :param str new_version: Version name under which the final ped file is written
    :param int max_mv_z: Max number of std devs above the mean number of errors for a trio to be kept
    :return: Nothing
    :rtype: None
    """
    raw_ped = hl.Pedigree.read(raw_fam_path(related_data.data_type), delimiter="\\t")
    logger.info(
        f"Loaded raw {related_data.data_type} pedigree containing {len(raw_ped.trios)} trios"
    )

    # Restrict the merged pedigree metadata to the rows coming from the 'new' pedigree
    merged_meta = hl.read_table(merged_pedigrees_ht_path(related_data.data_type)).to_pandas()
    is_new = merged_meta.ped_name.str.contains('new')
    new_meta = merged_meta[is_new]

    # Mendelian-violation cutoff: mean + max_mv_z standard deviations
    error_cutoff = np.mean(new_meta.errors) + max_mv_z * np.std(new_meta.errors)
    too_many_errors = new_meta.errors > error_cutoff
    excluded_probands = set(new_meta[too_many_errors].s)

    # Write final fam file, keeping only trios whose proband was not excluded
    kept_trios = [trio for trio in raw_ped.trios if trio.s not in excluded_probands]
    final_ped = hl.Pedigree(kept_trios)
    final_ped.write(fam_path(related_data.data_type, version=new_version))

    logger.info(
        f"Wrote final {related_data.data_type} pedigree with {len(final_ped.trios)} trios."
    )
def create_meta(related_data: GnomADRelatedData, fake_fam_prop: float, old_version: str, overwrite: bool) -> None:
    """
    Creates and writes a table of metadata used to evaluate the trios of the raw gnomAD ped file.

    For comparison, the same metadata is generated for two additional pedigrees:

    1) A pedigree of fake trios (written to `fake_fam_path`)
    2) The previous iteration of the ped file (`old_version`)

    Mendel errors are computed on the merged pedigrees and the three resulting tables
    ("all_errors", "per_fam", "per_sample") are written before the merged metadata table.

    :param GnomADRelatedData related_data: Input data
    :param float fake_fam_prop: Number of fake trios to generate, as a proportion of the number of complete trios in the raw ped
    :param str old_version: Version of the previous ped iteration to load
    :param bool overwrite: Whether to overwrite previously written tables
    :return: Nothing
    :rtype: None
    """
    raw_ped = hl.Pedigree.read(raw_fam_path(related_data.data_type), delimiter="\\t")

    # Generate and persist the fake pedigree
    n_fake_trios = int(fake_fam_prop * len(raw_ped.complete_trios()))
    logger.info(
        f"Generating fake pedigree with {n_fake_trios} trios for {related_data.data_type}"
    )
    candidate_samples = list(related_data.meta_pd.s)
    fake_ped = create_fake_pedigree(n_fake_trios, candidate_samples, raw_ped)
    fake_ped.write(fake_fam_path(related_data.data_type))

    logger.info(f"Running mendel_errors on {related_data.data_type}")

    # Run mendel errors on families made of random samples to establish expectation in non-trios:
    # the fake pedigree is read back from disk alongside the raw ('new') and previous ('old') ones.
    old_ped = hl.Pedigree.read(
        fam_path(related_data.data_type, version=old_version), delimiter="\\t"
    )
    fake_ped_on_disk = hl.Pedigree.read(fake_fam_path(related_data.data_type), delimiter="\\t")
    named_ped_frames = [
        ('new', ped_to_pandas(raw_ped)),
        ('old', ped_to_pandas(old_ped)),
        ('fake', ped_to_pandas(fake_ped_on_disk)),
    ]
    ped_pd = merge_pedigree_pandas(named_ped_frames, related_data.sample_to_dups, True)

    # Run mendel_errors on the samples that belong to any trio of the merged pedigree
    all_ped = pandas_to_ped(ped_pd)
    gnomad = get_gnomad_data(related_data.data_type)
    trio_members = set()
    for trio in all_ped.trios:
        trio_members.update((trio.s, trio.mat_id, trio.pat_id))
    fam_samples = hl.literal(trio_members)
    gnomad = gnomad.filter_cols(fam_samples.contains(gnomad.s))

    all_errors, per_fam, per_sample, _ = hl.mendel_errors(gnomad['GT'], all_ped)
    mendel_outputs = (
        ("all_errors", all_errors),
        ("per_fam", per_fam),
        ("per_sample", per_sample),
    )
    for label, mendel_ht in mendel_outputs:
        mendel_ht.write(
            sample_qc_mendel_ht_path(related_data.data_type, label),
            overwrite=overwrite
        )

    # Merge all metadata
    ped_pd = add_pedigree_meta(
        ped_pd=ped_pd,
        meta_pd=related_data.meta_pd,
        kin_ht=related_data.kin_ht,
        mendel_per_sample_ht=per_sample
    )

    # Write merged pedigrees as HT (pandas -> Spark DataFrame -> Hail Table)
    sql_context = SQLContext(hl.spark_context())
    merged_ht = hl.Table.from_spark(sql_context.createDataFrame(ped_pd))
    merged_ht.write(merged_pedigrees_ht_path(related_data.data_type), overwrite=overwrite)
def main(args):
    """
    Runs the phase-by-transmission (PBT) steps selected through `args`.

    The following attributes of `args` are read:

    - `exomes`: If truthy, work on 'exomes', otherwise on 'genomes'
    - `pbt_tm`: Build the trio matrix for high-quality complete trios, phase it by transmission and write it
    - `pbt_explode`: Explode the phased trio matrix into one column per trio member, write it,
      then split multi-allelic sites and write the split version
    - `phase_multi_families`: Compute a consensus genotype per sample across all the trios it appears in and write it
    - `overwrite`: Passed as `overwrite` to every MatrixTable written by this function

    :param args: Parsed command-line arguments
    :return: Nothing
    :rtype: None
    """
    data_type = 'exomes' if args.exomes else 'genomes'

    if args.pbt_tm:
        mt = get_gnomad_data(data_type, split=False)
        meta = mt.cols()
        hq_samples = meta.aggregate(
            hl.agg.filter(meta.meta.high_quality, hl.agg.collect(meta.s)))
        ped = hl.Pedigree.read(fam_path(data_type), delimiter='\\t').filter_to(hq_samples)
        ped_samples = hl.literal({
            s
            for trio in ped.complete_trios()
            for s in [trio.s, trio.pat_id, trio.mat_id]
        })

        # Keep only samples in complete trios, drop all col/row annotations and
        # keep only rows where at least one remaining sample is non-ref
        mt = mt.filter_cols(ped_samples.contains(mt.s))
        mt = mt.select_cols().select_rows()
        mt = mt.filter_rows(hl.agg.any(mt.GT.is_non_ref()))

        tm = hl.trio_matrix(mt, ped, complete_trios=True)
        tm = hl.experimental.phase_trio_matrix_by_transmission(tm)
        tm.write(pbt_phased_trios_mt_path(data_type, split=False, trio_matrix=True),
                 overwrite=args.overwrite)

    if args.pbt_explode:
        tm = hl.read_matrix_table(
            pbt_phased_trios_mt_path(data_type, split=False, trio_matrix=True))

        # An entry is trio-adj only if proband, father and mother entries are all adj
        tm = tm.annotate_entries(trio_adj=tm.proband_entry.adj
                                 & tm.father_entry.adj
                                 & tm.mother_entry.adj)
        pmt = explode_trio_matrix(tm, keep_trio_entries=True)
        pmt = pmt.transmute_entries(trio_adj=pmt.source_trio_entry.trio_adj)
        pmt.write(pbt_phased_trios_mt_path(data_type, split=False),
                  overwrite=args.overwrite)

        # Re-read the exploded MT from disk before splitting multi-allelic sites
        pmt = hl.read_matrix_table(
            pbt_phased_trios_mt_path(data_type, split=False))
        pmt = pmt.rename({'PBT_GT': 'PGT'})  # ugly but supported by hl.split_multi_hts
        pmt = hl.split_multi_hts(pmt)
        pmt = pmt.rename({'PGT': 'PBT_GT'})
        pmt.write(pbt_phased_trios_mt_path(data_type), overwrite=args.overwrite)

    if args.phase_multi_families:
        pbt = hl.read_matrix_table(pbt_phased_trios_mt_path(data_type))

        # Drop samples that have more than one entry in the Matrix (i.e. they are part of multiple trios)
        # and for which the parents are not the same in all their entries
        # (there are only two exceptions to this, so best to ignore these and focus on
        # parents/multi-offspring families).
        # NOTE(review): as written, the filter also retains samples with a single entry -- confirm this is intended.
        nt_samples = pbt.cols()
        nt_samples = nt_samples.group_by('s').aggregate(
            trios=hl.agg.collect(nt_samples.source_trio))
        nt_samples = nt_samples.filter(
            (hl.len(nt_samples.trios) > 1)
            & nt_samples.trios[1:].any(
                lambda x: (x.mother.s != nt_samples.trios[0].mother.s)
                | (x.father.s != nt_samples.trios[0].father.s)),
            keep=False)
        pbt = pbt.filter_cols(hl.is_defined(nt_samples[pbt.col_key]))

        # Group cols for these samples, keeping all GTs in an array
        # Compute the consensus GT (incl. phase) + QC metrics based on (a) phased genotypes have priority, (b) genotypes with most votes
        pbt = pbt.group_cols_by('s').aggregate(
            PBT_GTs=hl.agg.filter(hl.is_defined(pbt.GT), hl.agg.collect(pbt.GT)))

        # (genotype, number of votes) pairs, sorted so that phased genotypes come first
        # and, within the same phasing status, the most frequent genotype comes first
        gt_counter = hl.sorted(
            hl.array(pbt.PBT_GTs.group_by(lambda x: x).map_values(lambda x: hl.len(x))),
            key=lambda x: x[0].phased * 100 + x[1],
            reverse=True)
        phased_gt_counts = gt_counter.filter(lambda x: x[0].phased).map(lambda x: x[1])

        pbt = pbt.annotate_entries(
            # First genotype in sort order
            consensus_gt=gt_counter.map(lambda x: x[0]).find(lambda x: True),
            # Share of phased votes that went to the top phased genotype
            phase_concordance=phased_gt_counts.find(lambda x: True) / hl.sum(phased_gt_counts),
            # True when more than one distinct genotype remains after dropping phase
            discordant_gts=hl.len(
                hl.set(
                    pbt.PBT_GTs.map(lambda x: hl.cond(
                        x.phased, hl.call(x[0], x[1]), x)))) > 1)

        # Honour args.overwrite here as well, consistently with every other write in this function
        pbt.write('gs://gnomad/projects/compound_hets/pbt_multi_families.mt',
                  overwrite=args.overwrite)