Example #1
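# NOTE: These examples are excerpts from a larger codebase, not standalone
# scripts; they assume `import hail as hl`, a module-level `logger`, and the
# gnomAD QC pipeline helpers (e.g. get_gnomad_data, qc_ht_path, annotate_adj)
# are already in scope.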
def main(args):
    hl.init()

    data_type = 'genomes' if args.genomes else 'exomes'

    if args.write_hardcalls:
        mt = get_gnomad_data(data_type, split=False, raw=True, meta_root=None)
        ht = hl.read_table(qc_ht_path(data_type, 'hard_filters'))
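        # Annotate samples with inferred sex from the hard-filters table and
        # flag genotypes that pass the adj quality thresholds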
        mt = annotate_adj(
            mt.select_cols(sex=ht[hl.literal(data_type), mt.s].sex))
        mt = mt.select_entries(
            GT=hl.case(missing_false=True)
            .when(hl.call(mt.PGT[0], mt.PGT[1]) == mt.GT, mt.PGT)
            .default(mt.GT),
            PID=mt.PID,
            adj=mt.adj)
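        # Convert sex-chromosome genotypes to the expected ploidy (e.g.
        # haploid X/Y calls for males) based on the inferred sex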
        mt = adjust_sex_ploidy(mt, mt.sex)
        mt = mt.select_cols().naive_coalesce(10000)
        mt.write(get_gnomad_data_path(data_type, hardcalls=True, split=False),
                 args.overwrite)

    if args.split_hardcalls:
        mt = get_gnomad_data(data_type, split=False, meta_root=None)
        mt = hl.split_multi_hts(mt)
        mt.write(get_gnomad_data_path(data_type, hardcalls=True, split=True),
                 args.overwrite)

    if args.write_nonrefs:  # CPU-hours: 600 (E)
        mt = get_gnomad_data(data_type, split=False, raw=True,
                             meta_root=None).select_cols()
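        # Keep only non-reference and missing entries, yielding a much
        # smaller "non-refs only" representation of the callset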
        mt = mt.annotate_entries(is_missing=hl.is_missing(mt.GT))
        mt = mt.filter_entries(mt.is_missing | mt.GT.is_non_ref())
        mt = annotate_adj(mt)
        if args.exomes:
            mt = mt.naive_coalesce(10000)
        mt.write(
            get_gnomad_data_path(data_type, split=False, non_refs_only=True),
            args.overwrite)

    if args.split_nonrefs:  # CPU-hours: 300 (E)
        mt = get_gnomad_data(data_type, split=False, non_refs_only=True)
        mt = hl.split_multi_hts(mt)
        mt = mt.filter_entries(mt.is_missing | mt.GT.is_non_ref())
        mt.write(
            get_gnomad_data_path(data_type, split=True, non_refs_only=True),
            args.overwrite)
Example #2
def main(args):
    hl.init(log='/platform_pca.log')

    if not args.skip_prepare_data_for_platform_pca:
        # ~1 hour on 800 cores (3/8/18)
        logger.info('Preparing data for platform PCA...')
        mt = get_gnomad_data('exomes',
                             adj=True,
                             raw=False,
                             meta_root=None,
                             fam_root=None,
                             split=False)
        mt = filter_to_autosomes(mt)
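        # Reduce entries to defined/undefined GT and aggregate call rate per
        # evaluation interval; these per-interval call rates are the features
        # used for platform PCA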
        intervals = hl.import_locus_intervals(evaluation_intervals_path)
        mt = mt.annotate_rows(interval=intervals[mt.locus].target)
        mt = mt.filter_rows(
            hl.is_defined(mt.interval) & (hl.len(mt.alleles) == 2))
        mt = mt.select_entries(
            GT=hl.or_missing(hl.is_defined(mt.GT), hl.struct()))
        callrate_mt = mt.group_rows_by(mt.interval).aggregate(
            callrate=hl.agg.fraction(hl.is_defined(mt.GT)))
        callrate_mt.write(exome_callrate_mt_path, args.overwrite)

    if not args.skip_run_platform_pca:
        logger.info('Running platform PCA...')
        qc_ht = hl.read_table(qc_ht_path('exomes', 'hard_filters')).key_by('s')
        callrate_mt = hl.read_matrix_table(exome_callrate_mt_path)
        callrate_mt = callrate_mt.filter_cols(
            hl.len(qc_ht[callrate_mt.col_key].hard_filters) == 0)
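        # Binarize call rate at 0.25 so the PCA reflects whether an interval
        # is covered at all rather than how well it is covered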
        callrate_mt = callrate_mt.annotate_entries(callrate=hl.int(
            callrate_mt.callrate > 0.25))
        # Center until Hail's PCA does it for you
        callrate_mt = callrate_mt.annotate_rows(
            mean_callrate=hl.agg.mean(callrate_mt.callrate))
        callrate_mt = callrate_mt.annotate_entries(
            callrate=callrate_mt.callrate - callrate_mt.mean_callrate)
        eigenvalues, scores, _ = hl.pca(callrate_mt.callrate,
                                        compute_loadings=False)
        logger.info('Eigenvalues: {}'.format(eigenvalues))
        # [731282566.2824697, 78687228.90071851, 43837650.51729764, 33969298.61827205, 26308703.539534636, 21102437.512725923, 16949828.555817757, 12994894.187041137, 8372332.274295175, 8128326.814388647]
        scores.write(exome_callrate_scores_ht_path)

    logger.info(
        'Annotating with platform PCs and known platform annotations...')
    scores = hl.read_table(exome_callrate_scores_ht_path).annotate(
        data_type='exomes')
    if args.pc_scores_in_separate_fields:
        scores = scores.transmute(scores=[
            scores[ann] for ann in sorted(
                [ann for ann in scores.row if ann.startswith("PC")],
                key=lambda x: int(x[2:]))
        ])
    platform_pcs = assign_platform_pcs(scores)
    platform_pcs.write(qc_ht_path('exomes', 'platforms'),
                       overwrite=args.overwrite)
Example #3
def get_adj_missing_mt(data_type: str, pbt: bool) -> hl.MatrixTable:
    mt = (get_gnomad_data(data_type).select_cols() if not pbt
          else hl.read_matrix_table(pbt_phased_trios_mt_path(data_type)))
    mt = mt.select_entries(
        GT=hl.or_missing(mt.GT.is_non_ref(), mt.GT),
        missing=hl.is_missing(mt.GT),
        adj=mt.adj
    ).select_cols().select_rows()

    if pbt:
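        # A sample can appear in multiple trios, so key columns by both the
        # sample id and the trio id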
        mt = mt.key_cols_by('s', trio_id=mt.source_trio.id)
        mt = extract_pbt_probands(mt, data_type)
        mt = mt.filter_rows(hl.agg.any(mt.GT.is_non_ref()))
        mt = mt.key_cols_by(s=mt.s, trio_id=mt.source_trio.id)
    else:
        meta = get_gnomad_meta('exomes')
        mt = mt.filter_cols(meta[mt.col_key].high_quality)

    return mt
Example #4
def main(args):
    hl.init()
    data_type = "genomes" if args.genomes else "exomes"

    if not args.skip_write_qc_mt:
        logger.info("Importing data...")
        # 1h40 for exomes, 3h20 for genomes
        mt = get_gnomad_data(
            data_type, raw=True, split=False
        )  # NOTE: using full calls since hardcalls doesn't exist at this stage
        logger.info(
            "Filtering to bi-allelic, high-callrate, common SNPs for sample QC..."
        )
        mt = mt.filter_rows((hl.len(mt.alleles) == 2)
                            & hl.is_snp(mt.alleles[0], mt.alleles[1])
                            & (hl.agg.mean(mt.GT.n_alt_alleles()) / 2 > 0.001)
                            & (hl.agg.fraction(hl.is_defined(mt.GT)) > 0.99))
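        # Store the per-sample call rate alongside the QC MT and repartition
        # before writing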
        mt = mt.annotate_cols(callrate=hl.agg.fraction(hl.is_defined(mt.GT)))
        mt.naive_coalesce(5000).write(qc_mt_path(data_type),
                                      overwrite=args.overwrite)
    qc_mt = hl.read_matrix_table(qc_mt_path(data_type))

    logger.info("Importing metadata...")
    meta_ht = hl.import_table(qc_meta_path(data_type),
                              impute=True,
                              types={
                                  'age': hl.tfloat64
                              }).key_by('s')
    qc_mt = qc_mt.annotate_cols(**meta_ht[qc_mt.s])

    logger.info("Inferring sex...")
    qc_ht = annotate_sex(qc_mt,
                         qc_temp_data_prefix(data_type),
                         male_threshold=0.8 if args.genomes else 0.6).cols()
    # Flag individuals with Klinefelter syndrome and samples with sex aneuploidies
    if args.exomes:
        qc_ht = qc_ht.annotate(
            ambiguous_sex=(((qc_ht.f_stat >= 0.5) &
                            (hl.is_defined(qc_ht.normalized_y_coverage) &
                             (qc_ht.normalized_y_coverage <= 0.1))) |
                           hl.is_missing(qc_ht.f_stat) |
                           ((qc_ht.f_stat >= 0.4) & (qc_ht.f_stat <= 0.6) &
                            (hl.is_defined(qc_ht.normalized_y_coverage) &
                             (qc_ht.normalized_y_coverage > 0.1)))),
            sex_aneuploidy=((qc_ht.f_stat < 0.4) &
                            hl.is_defined(qc_ht.normalized_y_coverage) &
                            (qc_ht.normalized_y_coverage > 0.1)))
    else:
        qc_ht = qc_ht.annotate(ambiguous_sex=hl.is_missing(qc_ht.is_female))

    logger.info("Annotating samples failing hard filters...")
    if args.exomes:
        sex_expr = (hl.case()
                    .when(qc_ht.ambiguous_sex, "ambiguous_sex")
                    .when(qc_ht.sex_aneuploidy, "sex_aneuploidy")
                    .when(qc_ht.is_female, "female")
                    .default("male"))
    else:
        sex_expr = (hl.case()
                    .when(qc_ht.ambiguous_sex, "ambiguous_sex")
                    .when(qc_ht.is_female, "female")
                    .default("male"))
    qc_ht = qc_ht.annotate(
        hard_filters=make_hard_filters_expr(qc_ht, data_type),
        perm_filters=make_perm_filters_expr(qc_ht, data_type),
        sex=sex_expr,
        data_type=data_type).key_by('data_type', 's')
    qc_ht.write(qc_ht_path(data_type, 'hard_filters'),
                overwrite=args.overwrite)

    # Export annotations to make rank list for relatedness (in final sample QC)
    if args.exomes:
        colnames = ['internal', 'project_id', 'pct_bases_20x', 'perm_filters']
    else:
        colnames = ['pcr_free', 'mean_dp', 'perm_filters']
    rank_ht = qc_ht.filter(hl.len(qc_ht.hard_filters) == 0,
                           keep=True).select(*colnames)
    rank_ht = rank_ht.annotate(releasable=hl.len(rank_ht.perm_filters) == 0)
    rank_ht.drop('perm_filters').export(rank_annotations_path(data_type))

    # Check numbers:
    qc_ht = hl.read_table(qc_ht_path(data_type, 'hard_filters'))
    sample_count = qc_ht.count()
    checkpoint1a = qc_ht.aggregate(
        hl.agg.count_where(hl.len(qc_ht['hard_filters']) == 0))
    checkpoint1b = qc_ht.aggregate(
        hl.agg.count_where((hl.len(qc_ht['hard_filters']) == 0)
                           & (hl.len(qc_ht.perm_filters) == 0)))
    logger.info('{} samples found before filtering'.format(sample_count))
    logger.info('{} samples found after checkpoint 1a (hard filters)'.format(
        checkpoint1a))
    logger.info(
        '{} samples found after checkpoint 1b (hard filters + permissions)'
        .format(checkpoint1b))
Example #5
def create_meta(related_data: GnomADRelatedData, fake_fam_prop: float,
                old_version: str, overwrite: bool) -> None:
    """
    Creates and writes a dataframe with metadata to evaluate gnomAD trios from the raw ped file.
    In order to compare the raw ped, metadata is also generated for:
    1) A number of fake families are generated
    2) The previous iteration of the ped file (old_version)

    :param GnomADRelatedData related_data: Input data
    :param float fake_fam_prop: Number of fake trios to generate as a proportion of the number of real families in the data
    :param str old_version: Version of previous iteration to load
    :param bool overwrite: Whether to overwrite previous data
    :return: Nothing
    :rtype: None
    """

    raw_ped = hl.Pedigree.read(raw_fam_path(related_data.data_type),
                               delimiter="\\t")

    n_fake_trios = int(fake_fam_prop * len(raw_ped.complete_trios()))
    logger.info(
        f"Generating fake pedigree with {n_fake_trios} trios for {related_data.data_type}"
    )
    fake_fams = create_fake_pedigree(n_fake_trios,
                                     list(related_data.meta_pd.s), raw_ped)

    fake_fams.write(fake_fam_path(related_data.data_type))

    logger.info(f"Running mendel_errors on {related_data.data_type}")

    # Run mendel errors on families made of random samples to establish expectation in non-trios:
    pedigrees = [('new', raw_ped),
                 ('old',
                  hl.Pedigree.read(fam_path(related_data.data_type,
                                            version=old_version),
                                   delimiter="\\t")),
                 ('fake',
                  hl.Pedigree.read(fake_fam_path(related_data.data_type),
                                   delimiter="\\t"))]

    ped_pd = merge_pedigree_pandas([(name, ped_to_pandas(ped))
                                    for name, ped in pedigrees],
                                   related_data.sample_to_dups, True)

    # Run mendel_errors
    all_ped = pandas_to_ped(ped_pd)
    gnomad = get_gnomad_data(related_data.data_type)
    fam_samples = hl.literal({
        s
        for trio in all_ped.trios for s in [trio.s, trio.mat_id, trio.pat_id]
    })
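    # Restrict the callset to pedigree members before computing Mendel errors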
    gnomad = gnomad.filter_cols(fam_samples.contains(gnomad.s))
    all_errors, per_fam, per_sample, _ = hl.mendel_errors(
        gnomad['GT'], all_ped)

    all_errors.write(sample_qc_mendel_ht_path(related_data.data_type,
                                              "all_errors"),
                     overwrite=overwrite)
    per_fam.write(sample_qc_mendel_ht_path(related_data.data_type, "per_fam"),
                  overwrite=overwrite)
    per_sample.write(sample_qc_mendel_ht_path(related_data.data_type,
                                              "per_sample"),
                     overwrite=overwrite)

    # Merge all metadata
    ped_pd = add_pedigree_meta(ped_pd=ped_pd,
                               meta_pd=related_data.meta_pd,
                               kin_ht=related_data.kin_ht,
                               mendel_per_sample_ht=per_sample)

    # Write merged pedigrees as HT
    sql_context = SQLContext(hl.spark_context())
    hl.Table.from_spark(sql_context.createDataFrame(ped_pd)).write(
        merged_pedigrees_ht_path(related_data.data_type), overwrite=overwrite)
Example #6
def main(args):
    data_type = 'exomes' if args.exomes else 'genomes'

    if args.pbt_tm:
        mt = get_gnomad_data(data_type, split=False)
        meta = mt.cols()
        hq_samples = meta.aggregate(
            hl.agg.filter(meta.meta.high_quality, hl.agg.collect(meta.s)))
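        # Restrict the pedigree to high-quality samples, then collect all
        # members of the remaining complete trios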
        ped = hl.Pedigree.read(fam_path(data_type),
                               delimiter='\\t').filter_to(hq_samples)
        ped_samples = hl.literal(
            set([
                s for trio in ped.complete_trios()
                for s in [trio.s, trio.pat_id, trio.mat_id]
            ]))

        mt = mt.filter_cols(ped_samples.contains(mt.s))
        mt = mt.select_cols().select_rows()
        mt = mt.filter_rows(hl.agg.any(mt.GT.is_non_ref()))

        tm = hl.trio_matrix(mt, ped, complete_trios=True)
        tm = hl.experimental.phase_trio_matrix_by_transmission(tm)
        tm.write(pbt_phased_trios_mt_path(data_type,
                                          split=False,
                                          trio_matrix=True),
                 overwrite=args.overwrite)

    if args.pbt_explode:
        tm = hl.read_matrix_table(
            pbt_phased_trios_mt_path(data_type, split=False, trio_matrix=True))

        tm = tm.annotate_entries(trio_adj=tm.proband_entry.adj
                                 & tm.father_entry.adj & tm.mother_entry.adj)
        pmt = explode_trio_matrix(tm, keep_trio_entries=True)
        pmt = pmt.transmute_entries(trio_adj=pmt.source_trio_entry.trio_adj)
        pmt.write(pbt_phased_trios_mt_path(data_type, split=False),
                  overwrite=args.overwrite)

        pmt = hl.read_matrix_table(
            pbt_phased_trios_mt_path(data_type, split=False))
        pmt = pmt.rename(
            {'PBT_GT': 'PGT'})  # ugly but supported by hl.split_multi_hts
        pmt = hl.split_multi_hts(pmt)
        pmt = pmt.rename({'PGT': 'PBT_GT'})
        pmt.write(pbt_phased_trios_mt_path(data_type),
                  overwrite=args.overwrite)

    if args.phase_multi_families:
        pbt = hl.read_matrix_table(pbt_phased_trios_mt_path(data_type))
        # Keep samples that:
        # 1. Have more than one entry in the MatrixTable (i.e., are part of multiple trios)
        # 2. Have the same parents in all their entries (there are only two
        #    exceptions to this, so it is best to ignore those and focus on
        #    parents / multi-offspring families)
        nt_samples = pbt.cols()
        nt_samples = nt_samples.group_by('s').aggregate(
            trios=hl.agg.collect(nt_samples.source_trio))
        nt_samples = nt_samples.filter(
            (hl.len(nt_samples.trios) > 1) &
            nt_samples.trios[1:].any(
                lambda x: (x.mother.s != nt_samples.trios[0].mother.s) |
                          (x.father.s != nt_samples.trios[0].father.s)),
            keep=False)
        pbt = pbt.filter_cols(hl.is_defined(nt_samples[pbt.col_key]))

        # Group cols for these samples, keeping all GTs in an array
        # Compute the consensus GT (incl. phase) + QC metrics based on (a) phased genotypes have priority, (b) genotypes with most votes
        pbt = pbt.group_cols_by('s').aggregate(PBT_GTs=hl.agg.filter(
            hl.is_defined(pbt.GT), hl.agg.collect(pbt.GT)))
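        # Sort genotype counts so phased genotypes rank first, with ties
        # broken by the number of votes (descending)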
        gt_counter = hl.sorted(
            hl.array(pbt.PBT_GTs.group_by(lambda x: x)
                     .map_values(lambda x: hl.len(x))),
            key=lambda x: x[0].phased * 100 + x[1],
            reverse=True)
        phased_gt_counts = gt_counter.filter(lambda x: x[0].phased).map(
            lambda x: x[1])
        pbt = pbt.annotate_entries(
            consensus_gt=gt_counter.map(lambda x: x[0]).find(lambda x: True),
            phase_concordance=phased_gt_counts.find(lambda x: True) /
            hl.sum(phased_gt_counts),
            discordant_gts=hl.len(hl.set(pbt.PBT_GTs.map(
                lambda x: hl.if_else(x.phased, hl.call(x[0], x[1]), x)))) > 1)
        pbt.write('gs://gnomad/projects/compound_hets/pbt_multi_families.mt')
Example #7
def main(args):

    hl.init(log="/tmp/hail_comp_vp.log")
    data_type = 'exomes' if args.exomes else 'genomes'

    if args.create_full_vp:
        logger.info(
            "Generating gnomAD VP MT for PBT VPs, excluding PBT samples.")
        # Load PBT VP MT
        pbt_vp_mt = hl.read_matrix_table(
            full_mt_path(data_type, True, args.least_consequence,
                         args.max_freq, args.chrom))

        # Get all PBT trio ids
        pbt_samples = get_pbt_trio_ht(data_type).key_by('s')

        mt = get_gnomad_data(data_type)
        mt = mt.select_entries(GT=hl.or_missing(mt.GT.is_non_ref(), mt.GT),
                               PID=mt.PID,
                               missing=hl.is_missing(mt.GT),
                               adj=mt.adj).select_cols().select_rows()
        meta = get_gnomad_meta('exomes')
        mt = mt.filter_cols(meta[mt.col_key].release
                            & hl.is_missing(pbt_samples[mt.col_key]))
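        # Build the full variant-pair MT restricted to the variant pairs
        # observed in the PBT data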
        vp_mt = create_full_vp(mt,
                               vp_list_ht=pbt_vp_mt.rows(),
                               data_type=data_type)
        vp_mt = vp_mt.checkpoint(pbt_comparison_full_mt_path(
            data_type=data_type,
            least_consequence=args.least_consequence,
            max_freq=args.max_freq,
            chrom=args.chrom),
                                 overwrite=args.overwrite)

        logger.info("Total sample count after PBT filtering: %d",
                    vp_mt.count_cols())

    if args.create_vp_summary:
        logger.info("Creating VP summary")
        mt = hl.read_matrix_table(
            pbt_comparison_full_mt_path(
                data_type=data_type,
                least_consequence=args.least_consequence,
                max_freq=args.max_freq,
                chrom=args.chrom))
        meta = get_gnomad_meta(data_type).select('pop', 'release')
        mt = mt.annotate_cols(**meta[mt.col_key])
        ht = create_vp_summary(mt)
        ht = ht.checkpoint(pbt_comparison_vp_count_ht_path(
            data_type=data_type,
            least_consequence=args.least_consequence,
            max_freq=args.max_freq,
            chrom=args.chrom),
                           overwrite=args.overwrite,
                           _read_if_exists=not args.overwrite)

        logger.info("Phasing VP summary")
        ht = get_phased_gnomad_ht(ht)
        ht.write(pbt_comparison_phased_vp_count_ht_path(
            data_type=data_type,
            least_consequence=args.least_consequence,
            max_freq=args.max_freq,
            chrom=args.chrom),
                 overwrite=args.overwrite)

    if args.export:
        data_type = 'exomes'
        pbt = hl.read_table(pbt_phase_count_ht_path(data_type, pbt=True))
        pbt_ann = hl.read_table(vp_ann_ht_path(data_type, pbt=True))
        pbt = pbt.annotate(**pbt_ann[pbt.locus1, pbt.alleles1, pbt.locus2,
                                     pbt.alleles2],
                           distance=pbt.locus2.position - pbt.locus1.position)

        # A variant pair is discordant within a population if any single
        # (non-'all') population shows both same-haplotype and compound-het
        # support
        discordant_within_pop_expr = hl.array(pbt.phase_by_pop).any(
            lambda x: (x[0] != 'all') & (x[1].adj.n_same_hap > 0) &
                      (x[1].adj.n_chet > 0))
        pbt = pbt.annotate(phase_by_pop=hl.array(pbt.phase_by_pop),
                           discordant_within_pop=discordant_within_pop_expr,
                           discordant_between_pops=~discordant_within_pop_expr
                           & (pbt.phase_by_pop['all'].adj.n_same_hap > 0) &
                           (pbt.phase_by_pop['all'].adj.n_chet > 0))
        pbt = pbt.explode('phase_by_pop')
        pbt = pbt.transmute(pop=pbt.phase_by_pop[0], **pbt.phase_by_pop[1])
        # Drop RF-filtered sites
        pbt = pbt.filter((hl.len(pbt.filters1) == 0)
                         & (hl.len(pbt.filters2) == 0))
        # Filter sites that are in LCR / Decoy
        pbt = pbt.filter(pbt.lcr1 | pbt.lcr2 | pbt.decoy1 | pbt.decoy2
                         | pbt.segdup1 | pbt.segdup2,
                         keep=False)

        # Drop sites with inconsistent trio phasing?
        pbt = pbt.filter(~pbt.discordant_within_pop)
        # pbt = pbt.filter(pbt.adj.n_same_hap + pbt.adj.n_chet > 0)
        # Drop sites that are too frequent in a given pop
        pbt = pbt.filter((pbt.freq1[pbt.pop].AF <= 0.05)
                         & (pbt.freq2[pbt.pop].AF <= 0.05))
        # Drop sites that really are het non-ref
        pbt = pbt.filter(pbt.distance > 0)
        # filter to autosomes
        autosomes = hl.parse_locus_interval('1-22')
        pbt = pbt.filter(autosomes.contains(pbt.locus1))
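        # Attach the population-level statistical phasing estimates (e.g. the
        # EM-based p_chet) for comparison with the trio-derived phase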
        phase_ht = hl.read_table(
            pbt_comparison_phased_vp_count_ht_path(
                data_type=data_type,
                least_consequence=args.least_consequence,
                max_freq=args.max_freq,
                chrom=args.chrom))
        pbt = pbt.annotate(
            trio_chet=hl.struct(
                raw=hl.case()
                .when((pbt.raw.n_same_hap > 0) & (pbt.raw.n_chet == 0), False)
                .when((pbt.raw.n_same_hap == 0) & (pbt.raw.n_chet > 0), True)
                .or_missing(),
                adj=hl.case()
                .when((pbt.adj.n_same_hap > 0) & (pbt.adj.n_chet == 0), False)
                .when((pbt.adj.n_same_hap == 0) & (pbt.adj.n_chet > 0), True)
                .or_missing()),
            **phase_ht[pbt.locus1, pbt.alleles1, pbt.locus2,
                       pbt.alleles2].phase_info[pbt.pop])

        pbt = pbt.filter(~hl.is_nan(pbt.gt_counts.adj[0])
                         & (pbt.pop != 'oth')).key_by()
        rf_features = {
            'snv1': pbt.snv1,
            'snv2': pbt.snv2,
            'cpg1': hl.or_else(pbt.cpg1, False),
            'cpg2': hl.or_else(pbt.cpg2, False),
            'distance': pbt.distance
        }
        rf_features.update({
            f'n{i}{j}': pbt.gt_counts.adj[(3 * i) + j]
            for i in [0, 1, 2] for j in [0, 1, 2]
        })

        ac1 = get_ac_from_gt_counts(pbt.gt_counts.adj, True)
        ac2 = get_ac_from_gt_counts(pbt.gt_counts.adj, False)
        an = 2 * hl.sum(pbt.gt_counts.adj)
        pbt_df = pbt.select(
            locus1=pbt.locus1,
            ref1=pbt.alleles1[0],
            alt1=pbt.alleles1[1],
            locus2=pbt.locus2,
            ref2=pbt.alleles2[0],
            alt2=pbt.alleles2[1],
            pop=pbt.pop,
            trio_chet=pbt.trio_chet.adj,
            em=pbt.em.adj.p_chet,
            singlet_het_ratio=pbt.singlet_het_ratio.adj,
            lr=pbt.likelihood_model.adj,
            AC1=ac1,
            AC2=ac2,
            AF1=ac1 / an,
            AF2=ac2 / an,
            n_var_gnomad=(ac1 > 0) + (ac2 > 0),
            discordant_between_pops=pbt.discordant_between_pops,
            discordant_within_pop=pbt.discordant_within_pop,
            **rf_features
        ).flatten().to_pandas(
        )  # NOTE: The serialization to pandas happens because this code comes from a notebook initially

        with hl.utils.hadoop_open(
                'gs://gnomad/projects/compound_hets/pbt_annotated.csv',
                'w') as f:
            pbt_df.to_csv(f)
Example #8
def main(args):
    hl.init(log='/sample_qc.log', tmp_dir='hdfs:///pc_relate.tmp/')

    if not args.load_joint_pruned_qc_mt:
        logger.info('Joining exomes and genomes...')
        exome_qc_mt = read_and_pre_process_data(
            qc_mt_path('exomes'), qc_ht_path('exomes', 'hard_filters'))
        genome_qc_mt = read_and_pre_process_data(
            qc_mt_path('genomes'), qc_ht_path('genomes', 'hard_filters'))

        joint_qc_mt = exome_qc_mt.union_cols(
            genome_qc_mt)  # NOTE: this is an inner join on rows
        joint_qc_mt = joint_qc_mt.filter_rows(
            (hl.agg.mean(joint_qc_mt.GT.n_alt_alleles()) / 2 > 0.001)
            & (hl.agg.fraction(hl.is_defined(joint_qc_mt.GT)) > 0.99))
        joint_qc_mt.write(qc_mt_path('joint'), args.overwrite)

        logger.info('LD-pruning joint mt of exomes and genomes...')
        joint_qc_mt = hl.read_matrix_table(qc_mt_path('joint'))
        variants, samples = joint_qc_mt.count()
        logger.info('Pruning {0} variants in {1} samples'.format(
            variants, samples))
        joint_qc_pruned_ht = hl.ld_prune(joint_qc_mt.GT, r2=0.1)
        # Note writing the LD-pruned MT is probably overkill
        # vs using `filter_rows` to filter sites based on the LD-pruned HT.
        joint_qc_pruned_mt = joint_qc_mt.filter_rows(
            hl.is_defined(joint_qc_pruned_ht[joint_qc_mt.row_key]))
        joint_qc_pruned_mt.write(qc_mt_path('joint', ld_pruned=True),
                                 args.overwrite)

    pruned_mt = hl.read_matrix_table(qc_mt_path('joint', ld_pruned=True))
    variants, samples = pruned_mt.count()
    logger.info('{0} samples, {1} variants found in LD-pruned joint MT'.format(
        samples, variants))

    if not args.skip_pc_relate:
        logger.info('Running PCA for PC-Relate...')
        eig, scores, _ = hl.hwe_normalized_pca(pruned_mt.GT,
                                               k=10,
                                               compute_loadings=False)
        scores.write(
            qc_temp_data_prefix('joint') + '.pruned.pca_scores.ht',
            args.overwrite)

        logger.info('Running PC-Relate...')
        scores = hl.read_table(
            qc_temp_data_prefix('joint') + '.pruned.pca_scores.ht')
        # NOTE: This needs SSDs on your workers (for the temp files) and no pre-emptibles while the BlockMatrix writes
        relatedness_ht = hl.pc_relate(
            pruned_mt.GT,
            min_individual_maf=0.05,
            scores_expr=scores[pruned_mt.col_key].scores,
            block_size=4096,
            min_kinship=0.05,
            statistics='kin2')
        relatedness_ht.write(relatedness_ht_path, args.overwrite)

    relatedness_ht = hl.read_table(relatedness_ht_path)

    if not args.skip_relatedness:
        infer_ped(GnomADRelatedData('exomes'))
        infer_ped(GnomADRelatedData('genomes'))

        logger.info('Making rank file...')
        rank_table = make_rank_file(rank_annotations_path('joint'))
        logger.info('Finished making rank file...')

        related_samples_to_drop_ranked = get_related_samples_to_drop(
            rank_table, relatedness_ht)
        related_samples_to_drop_ranked.write(
            qc_temp_data_prefix('joint') + '.related_samples_to_drop.ht',
            args.overwrite)

    pca_mt, related_mt = split_mt_by_relatedness(pruned_mt)
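    # pca_mt holds unrelated samples used to compute the population PCs;
    # related_mt samples are projected onto those PCs further below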

    if not args.skip_pop_pca:
        variants, samples = pca_mt.count()
        logger.info('{} samples after removing relateds'.format(samples))
        # TODO: Check that there are no longer any 2nd-degree relateds in the callset by running KING on the output file below
        plink_mt = pca_mt.annotate_cols(
            uid=pca_mt.data_type + '_' +
            pca_mt.s.replace(" ", "_").replace("/", "_")).key_cols_by('uid')
        hl.export_plink(plink_mt,
                        qc_temp_data_prefix('joint') + '.unrelated.plink',
                        fam_id=plink_mt.uid,
                        ind_id=plink_mt.uid)

        logger.info(
            'Computing population PCs and annotating with known population labels...'
        )
        pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
            pca_mt.GT, k=20, compute_loadings=True)
        pca_af_ht = pca_mt.annotate_rows(
            pca_af=hl.agg.mean(pca_mt.GT.n_alt_alleles()) / 2).rows()
        pca_loadings = pca_loadings.annotate(
            pca_af=pca_af_ht[pca_loadings.key].pca_af)
        pca_scores.write(ancestry_pca_scores_ht_path(), args.overwrite)
        pca_loadings.write(ancestry_pca_loadings_ht_path(), args.overwrite)

    pca_scores = hl.read_table(ancestry_pca_scores_ht_path())
    pca_loadings = hl.read_table(ancestry_pca_loadings_ht_path())
    pca_mt = pca_mt.annotate_cols(scores=pca_scores[pca_mt.col_key].scores)

    variants, samples = related_mt.count()
    logger.info(
        'Projecting population PCs for {} related samples...'.format(samples))
    related_scores = pc_project(related_mt, pca_loadings)
    relateds = related_mt.cols()
    relateds = relateds.annotate(scores=related_scores[relateds.key].scores)

    logger.info('Assigning population annotations...')
    pop_colnames = ['related', 'known_pop', 'scores']
    pop_annots_ht = hl.import_table(known_population_annotations,
                                    impute=True).key_by('combined_sample')

    joint_ht = pca_mt.cols().union(relateds)
    joint_ht = joint_ht.annotate(
        known_pop=pop_annots_ht[joint_ht.data_type.replace('s', '') + '_' +
                                joint_ht.s.replace(' ', '_')].known_pop
    )  # FIXME: temporarily doing the underscore thing until known_population_annotations is fixed
    joint_pca_ht = joint_ht.select(*pop_colnames)
    joint_pca_ht, joint_pca_fit = run_assign_population_pcs(
        joint_pca_ht,
        qc_temp_data_prefix('joint') + '.RF_pop_assignments.txt.bgz',
        qc_temp_data_prefix('joint') + '.RF_fit.pkl',
        pcs=list(range(1, 7)))
    joint_ht = joint_ht.annotate(pop=joint_pca_ht[joint_ht.key].pop).select(
        'pop', *pop_colnames)

    # Add special Estonian pop category for genomes
    estonian_ht = (hl.import_table(estonian_batches, impute=True).annotate(
        data_type='genomes').key_by('data_type', 'sample'))
    joint_ht = joint_ht.annotate(batch=estonian_ht[joint_ht.key].batch)
    joint_ht = joint_ht.annotate(
        qc_pop=hl.case(missing_false=True)
        .when(hl.is_defined(joint_ht.pop) & (joint_ht.batch == 1), 'est_b1')
        .when(hl.is_defined(joint_ht.pop) & (joint_ht.batch == 2), 'est_b2')
        .default(joint_ht.pop)).persist()

    # These are keyed by only `s`
    genome_mt = get_gnomad_data('genomes',
                                adj=False,
                                split=False,
                                meta_root=None).select_cols()
    exome_mt = get_gnomad_data('exomes',
                               adj=False,
                               split=False,
                               meta_root=None).select_cols()

    # Population-specific filtering
    if not args.skip_calculate_sample_metrics:
        logger.info(
            'Running mini sample QC for platform- and population-specific filtering...'
        )
        gnomad_sample_qc(exome_mt).cols().select('sample_qc').write(
            qc_temp_data_prefix('exomes') + '.sample_qc.ht', args.overwrite)
        gnomad_sample_qc(genome_mt).cols().select('sample_qc').write(
            qc_temp_data_prefix('genomes') + '.sample_qc.ht', args.overwrite)
        # TODO: check that the pcr_free annotations are complete once samples are updated from Jessica's spreadsheet

    logger.info('Annotating population and platform assignments...')
    platform_ht = hl.read_table(qc_ht_path('exomes', 'platforms'))
    exome_ht = exome_mt.cols()
    exome_ht = exome_ht.annotate(
        qc_platform=platform_ht.key_by('s')[exome_ht.s].qc_platform,
        **joint_ht.filter(
            joint_ht.data_type == 'exomes').key_by('s')[exome_ht.s])

    genome_meta_ht = hl.read_table(qc_ht_path('genomes', 'hard_filters'))
    genome_ht = genome_mt.cols()
    genome_ht = genome_ht.annotate(
        qc_platform=genome_meta_ht.key_by('s')[genome_ht.s].qc_platform,
        **joint_ht.filter(
            joint_ht.data_type == 'genomes').key_by('s')[genome_ht.s])

    exome_sample_qc_ht = hl.read_table(
        qc_temp_data_prefix('exomes') + '.sample_qc.ht')
    genome_sample_qc_ht = hl.read_table(
        qc_temp_data_prefix('genomes') + '.sample_qc.ht')

    exome_ht = exome_ht.annotate(**exome_sample_qc_ht[exome_ht.s])
    genome_ht = genome_ht.annotate(**genome_sample_qc_ht[genome_ht.s])

    # For each population, aggregate sample QC metrics and calculate the MAD/mean/stdev
    logger.info(
        'Calculating platform- and population-specific sample QC thresholds...'
    )
    exome_qc_metrics = [
        'n_snp', 'r_ti_tv', 'r_insertion_deletion', 'n_insertion',
        'n_deletion', 'r_het_hom_var'
    ]
    exome_pop_platform_filter_ht = compute_stratified_metrics_filter(
        exome_ht, exome_qc_metrics, ['qc_pop', 'qc_platform'])
    exome_ht = exome_ht.annotate_globals(
        **hl.eval(exome_pop_platform_filter_ht.globals))
    exome_ht = exome_ht.annotate(
        **exome_pop_platform_filter_ht[exome_ht.key]).persist()

    genome_qc_metrics = [
        'n_snp', 'r_ti_tv', 'r_insertion_deletion', 'n_insertion',
        'n_deletion', 'r_het_hom_var'
    ]
    genome_pop_platform_filter_ht = compute_stratified_metrics_filter(
        genome_ht, genome_qc_metrics, ['qc_pop', 'qc_platform'])
    genome_ht = genome_ht.annotate_globals(
        **hl.eval(genome_pop_platform_filter_ht.globals))
    genome_ht = genome_ht.annotate(
        **genome_pop_platform_filter_ht[genome_ht.key]).persist()

    # Annotate samples that fail their respective filters
    checkpoint = exome_ht.aggregate(
        hl.agg.count_where(hl.len(exome_ht.pop_platform_filters) == 0))
    logger.info(
        f'{checkpoint} exome samples found passing pop/platform-specific filtering'
    )
    exome_ht.key_by(data_type='exomes',
                    s=exome_ht.s).write(qc_ht_path('exomes', 'pop_platform'),
                                        args.overwrite)

    checkpoint = genome_ht.aggregate(
        hl.agg.count_where(hl.len(genome_ht.pop_platform_filters) == 0))
    logger.info(
        f'{checkpoint} genome samples found passing pop/platform-specific filtering'
    )
    genome_ht.key_by(data_type='genomes', s=genome_ht.s).write(
        qc_ht_path('genomes', 'pop_platform'), args.overwrite)
Example #9
def compute_from_full_mt(chr20: bool, overwrite: bool):
    mt = get_gnomad_data('exomes', adj=True, release_samples=True)
    freq_ht = hl.read_table(annotations_ht_path('exomes', 'frequencies'))
    vep_ht = hl.read_table(annotations_ht_path('exomes', 'vep'))
    rf_ht = hl.read_table(annotations_ht_path('exomes', 'rf'))

    if chr20:
        mt, freq_ht, vep_ht, rf_ht = filter_to_chr20([mt, freq_ht, vep_ht, rf_ht])

    vep_ht = vep_ht.annotate(
        vep=get_worst_gene_csq_code_expr(vep_ht.vep).values()
    )

    freq_ht = freq_ht.select(
        freq=freq_ht.freq[:10],
        popmax=freq_ht.popmax
    )
    freq_meta = hl.eval(freq_ht.globals.freq_meta)
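    # Map each population label to its index in the trimmed freq array;
    # index 0 holds the overall ('all') frequencies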
    freq_dict = {f['pop']: i for i, f in enumerate(freq_meta[:10]) if 'pop' in f}
    freq_dict['all'] = 0
    freq_dict = hl.literal(freq_dict)
    mt = mt.annotate_rows(
        **freq_ht[mt.row_key],
        vep=vep_ht[mt.row_key].vep,
        filters=rf_ht[mt.row_key].filters
    )
    mt = mt.filter_rows(
        (mt.freq[0].AF <= MAX_FREQ) &
        (hl.len(mt.vep) > 0) &
        (hl.len(mt.filters) == 0)
    )

    mt = mt.filter_entries(mt.GT.is_non_ref())
    mt = mt.select_entries(
        is_het=mt.GT.is_het()
    )

    mt = mt.explode_rows(mt.vep)
    mt = mt.transmute_rows(**mt.vep)

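    # Duplicate each sample into an 'all' stratum and its own population
    # stratum, so every aggregation below is computed per population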
    mt = mt.annotate_cols(
        pop=['all', mt.meta.pop]
    )
    mt = mt.explode_cols(mt.pop)

    mt = mt.group_rows_by(
        'gene_id'
    ).aggregate_rows(
        gene_symbol=hl.agg.take(mt.gene_symbol, 1)[0]
    ).aggregate(
        counts=hl.agg.filter(
            hl.if_else(
                mt.pop == 'all',
                hl.is_defined(mt.popmax) & (mt.popmax.AF <= MAX_FREQ),
                mt.freq[freq_dict[mt.pop]].AF <= MAX_FREQ
            ),
            hl.agg.group_by(
                hl.if_else(
                    mt.pop == 'all',
                    mt.popmax.AF > 0.001,
                    mt.freq[freq_dict[mt.pop]].AF > 0.001
                ),
                hl.struct(
                    hom_csq=hl.agg.filter(~mt.is_het, hl.agg.min(mt.csq)),
                    het_csq=hl.agg.filter(mt.is_het, hl.agg.min(mt.csq)),
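                    # het_het_csq: worst (lowest) csq code by which at least
                    # two het variants have been seen, i.e. a potential
                    # compound het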
                    het_het_csq=hl.sorted(
                        hl.array(
                            hl.agg.filter(mt.is_het, hl.agg.counter(mt.csq))
                        ),
                        key=lambda x: x[0]
                    ).scan(
                        lambda i, j: (j[0], i[1] + j[1]),
                        (0, 0)
                    ).find(
                        lambda x: x[1] > 1
                    )[0]
                )
            )
        )
    )

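    # Combine the AF<=0.1% and AF>0.1% strata into an 'all' struct; two hets
    # can also come one from each stratum, hence the hl.max(...) term guarded
    # by both het_csq values being defined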
    mt = mt.annotate_entries(
        counts=hl.struct(
            all=hl.struct(
                hom_csq=hl.min(mt.counts.get(True).hom_csq, mt.counts.get(False).hom_csq),
                het_csq=hl.min(mt.counts.get(True).het_csq, mt.counts.get(False).het_csq),
                het_het_csq=hl.min(
                    mt.counts.get(True).het_het_csq,
                    mt.counts.get(False).het_het_csq,
                    hl.or_missing(
                        hl.is_defined(mt.counts.get(True).het_csq) & hl.is_defined(mt.counts.get(False).het_csq),
                        hl.max(mt.counts.get(True).het_csq, mt.counts.get(False).het_csq)
                    )
                ),
            ),
            af_le_0_001=mt.counts.get(False)
        )
    )

    mt = mt.checkpoint('gs://gnomad-tmp/compound_hets/het_and_hom_per_gene{}.1.mt'.format(
        '.chr20' if chr20 else ''
    ), overwrite=True)

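    # Per gene, population and AF stratum, count samples whose best (lowest)
    # consequence code matches each csq category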
    gene_ht = mt.annotate_rows(
        row_counts=hl.flatten([
            hl.array(
                hl.agg.group_by(
                    mt.pop,
                    hl.struct(
                        csq=csq,
                        af=af,
                        n_hom=hl.agg.count_where(mt.counts[af].hom_csq == csq_i),
                        n_het=hl.agg.count_where(mt.counts[af].het_csq == csq_i),
                        n_het_het=hl.agg.count_where(mt.counts[af].het_het_csq == csq_i)
                    )
                )
            ).filter(
                lambda x: (x[1].n_het > 0) | (x[1].n_hom > 0) | (x[1].n_het_het > 0)
            ).map(
                lambda x: x[1].annotate(
                    pop=x[0]
                )
            )
            for csq_i, csq in enumerate(CSQ_CODES)
            for af in ['all', 'af_le_0_001']
        ])
    ).rows()

    gene_ht = gene_ht.explode('row_counts')
    gene_ht = gene_ht.select(
        'gene_symbol',
        **gene_ht.row_counts
    )

    gene_ht.describe()

    gene_ht = gene_ht.checkpoint(
        'gs://gnomad-lfran/compound_hets/het_and_hom_per_gene{}.ht'.format(
            '.chr20' if chr20 else ''
        ),
        overwrite=overwrite
    )

    gene_ht.flatten().export('gs://gnomad-lfran/compound_hets/het_and_hom_per_gene{}.tsv.gz'.format(
        '.chr20' if chr20 else ''
    ))