def write_duplicates(version: str, overwrite: bool) -> None:
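    """
    Pair up duplicate exome/genome samples from the joint relatedness table,
    rank each pair using the joint sample ranks, and write the pairs to a TSV.

    :param str version: version used to construct the output TSV path
    :param bool overwrite: if True, overwrite the output file if it already exists
    :return: None
    :rtype: None
    """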

    kin_ht = hl.read_table(sample_qc.relatedness_ht_path)
    kin_ht = kin_ht.filter((kin_ht.i.data_type != kin_ht.j.data_type))
    kin_ht = kin_ht.key_by(
        i=kin_ht.i.data_type + "_" + kin_ht.i.s,
        j=kin_ht.j.data_type + "_" + kin_ht.j.s
    )

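    # get_duplicated_samples groups samples connected by duplicate-level kinship into sets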
    dups = get_duplicated_samples(kin_ht)
    dup_combinations = []
    dup_set_id = 0
    for dup_set in dups:
        dup_set_id += 1
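        # Enumerate every exome/genome pairing within the set, slicing off
        # the "exomes_" / "genomes_" key prefixes (7 and 8 characters)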
        for exome in [x for x in dup_set if x.startswith("exome")]:
            for genome in [x for x in dup_set if x.startswith("genome")]:
                dup_combinations.append((dup_set_id, exome[7:], genome[8:]))

    dup_pd = pd.DataFrame(dup_combinations, columns=['dup_set', 'exomes_s', 'genomes_s'])

    rank_ht = hl.import_table(sample_qc.rank_annotations_path('joint'), impute=True)
    rank_pd = rank_ht.select(rank_ht.data_type, rank_ht.s, rank_ht.rank).to_pandas()
    dup_pd = pd.merge(dup_pd, rank_pd[rank_pd.data_type == 'exomes'], left_on=['exomes_s'], right_on=['s']).rename(columns={'rank': 'exomes_rank'})
    dup_pd = pd.merge(dup_pd, rank_pd[rank_pd.data_type == 'genomes'], left_on=['genomes_s'], right_on=['s']).rename(columns={'rank': 'genomes_rank'})

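    # Within each duplicate set, order pairs by exome rank then genome rank;
    # the within-set row index becomes the pair's dup_pair_rank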
    dup_pd = dup_pd.groupby('dup_set').apply(lambda df: df.sort_values(['exomes_rank', 'genomes_rank']).reset_index(drop=True))
    dup_pd = dup_pd.reset_index(level=1).rename(columns={'level_1': 'dup_pair_rank'})
    dup_pd = dup_pd.reset_index(level=0, drop=True)

    with hl.hadoop_open(genomes_exomes_duplicate_ids_tsv_path(version), 'w' if overwrite else 'x') as out:
        dup_pd.to_csv(out, sep="\t", index=False)
Example #2
def split_mt_by_relatedness(
        pruned_mt: hl.MatrixTable) -> Tuple[hl.MatrixTable, hl.MatrixTable]:
    """
    Split the gnomAD MT into unrelated and related MTs (based on hard-coded data paths)

    :param MatrixTable pruned_mt: LD-pruned joint MT of exomes and genomes
    :return: Tuple of (unrelated MT, related MT)
    :rtype: Tuple[MatrixTable, MatrixTable]
    """
    related_samples_to_drop_ranked = hl.read_table(
        qc_temp_data_prefix('joint') + '.related_samples_to_drop.ht')
    rank_table = hl.import_table(rank_annotations_path('joint'),
                                 impute=True).key_by('data_type', 's')
    pruned_mt = pruned_mt.annotate_cols(
        drop_unrelated_ranked=hl.is_defined(
            related_samples_to_drop_ranked[pruned_mt.col_key]),
        **rank_table[pruned_mt.col_key])
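    # Samples flagged for dropping are the related ones; the remainder form
    # the unrelated MT used for PCA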
    pca_mt = pruned_mt.filter_cols(pruned_mt.drop_unrelated_ranked,
                                   keep=False).annotate_cols(related=False)
    related_mt = pruned_mt.filter_cols(pruned_mt.drop_unrelated_ranked,
                                       keep=True).annotate_cols(related=True)
    return pca_mt, related_mt
Example #3
def make_rank_file(outfile: str) -> hl.Table:
    """
    Assign each exome and genome sample a rank expressing its retention preference during de-duplication and familial pruning
    # NOTE: use the order_by() method on Tables only for big datasets -- order_by() is distributed across nodes, whereas the pandas sort used here is all local
    # NOTE: missing annotations (e.g., in `pct_bases_20x`) are prioritized last

    :param str outfile: path of the output TSV containing ranks assigned jointly across genome and exome samples
    :return: Table keyed by data_type and s, with a global rank for each sample
    :rtype: Table
    """

    # Load data
    exome = hl.import_table(rank_annotations_path('exomes'),
                            impute=True).to_pandas()
    genome = hl.import_table(rank_annotations_path('genomes'),
                             impute=True).to_pandas()
    exome_trio = hl.import_table(dup_pedigree_tsv_path('exomes'),
                                 impute=True,
                                 no_header=True).to_pandas()
    genome_trio = hl.import_table(dup_pedigree_tsv_path('genomes'),
                                  impute=True,
                                  no_header=True).to_pandas()

    # Select complete trios
    exome_trio = exome_trio[(exome_trio.pat_id != '0')
                            & (exome_trio.mat_id != '0')]
    genome_trio = genome_trio[(genome_trio.pat_id != '0')
                              & (genome_trio.mat_id != '0')]

    # Sort genomes
    genome['data_type'] = 'genomes'
    # 1 is "parent", 2 is "NA", 3 is "child"
    genome['parent_child'] = [
        1 if x else 2 for x in pd.Series(list(genome['s'])).isin(
            list(genome_trio['pat_id']) + list(genome_trio['mat_id']))
    ]
    genome.loc[pd.Series(list(genome['s'])).isin(list(genome_trio['s'])),
               'parent_child'] = 3
    sorted_genome = genome.sort_values(['pcr_free', 'parent_child', 'mean_dp'],
                                       ascending=[False, True, False])

    # Harmonize column names with exomes
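    # These exome-only annotations are absent for genomes; fill them with
    # placeholder strings so both frames share the same columns before concat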
    sorted_genome['internal'] = 'NaN'
    sorted_genome['pct_bases_20x'] = 'NaN'
    sorted_genome['project_id'] = 'NaN'

    # Sort exomes by internal vs external status
    exome['data_type'] = 'exomes'
    exome['parent_child'] = [
        1 if x else 2 for x in pd.Series(list(exome['s'])).isin(
            list(exome_trio['pat_id']) + list(exome_trio['mat_id']))
    ]
    exome.loc[pd.Series(list(exome['s'])).isin(list(exome_trio['s'])),
              'parent_child'] = 3

    exome_internal = exome.loc[exome.internal].copy()
    exome_internal['project_id'].replace(to_replace="^RP-",
                                         value="",
                                         regex=True,
                                         inplace=True)
    exome_internal['project_id'].replace(
        to_replace="^C", value="1000", regex=True, inplace=True
    )  # NOTE: C-projects in exomes are more recent/desirable than RP-projects, so boosting these
    exome_internal['project_id'] = pd.to_numeric(exome_internal['project_id'])
    sorted_exome_internal = exome_internal.sort_values(
        ['project_id', 'parent_child', 'pct_bases_20x'],
        ascending=[False, True, False])

    exome_external = exome.loc[~exome.internal].copy()
    sorted_exome_external = exome_external.sort_values(
        ['parent_child', 'pct_bases_20x'], ascending=[True, False])
    sorted_exome = pd.concat([sorted_exome_internal, sorted_exome_external])

    # Harmonize column names with genomes
    sorted_exome['pcr_free'] = 'NaN'
    sorted_exome['mean_dp'] = 'NaN'
    sorted_exome = sorted_exome[sorted_genome.columns]

    # Combine and rearrange by permissions
    sorted_data = pd.concat([sorted_genome, sorted_exome])

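    # Releasable samples are placed first so they receive better (lower)
    # ranks than non-releasable samples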
    releasable = sorted_data.loc[sorted_data.releasable].copy()
    nonreleasable = sorted_data.loc[~sorted_data.releasable].copy()
    sorted_data = pd.concat([releasable, nonreleasable])
    sorted_data = sorted_data.rename(index=str,
                                     columns={'project_id': 'rank_project_id'})
    sorted_data['rank'] = range(1, len(sorted_data) + 1)

    with hl.hadoop_open(outfile, 'w') as out:
        sorted_data.to_csv(out, sep="\t", index=False)
    new_data = hl.import_table(outfile, impute=True).key_by('data_type', 's')
    return new_data
Example #4
def main(args):
    hl.init(log='/sample_qc.log', tmp_dir='hdfs:///pc_relate.tmp/')

    if not args.load_joint_pruned_qc_mt:
        logger.info('Joining exomes and genomes...')
        exome_qc_mt = read_and_pre_process_data(
            qc_mt_path('exomes'), qc_ht_path('exomes', 'hard_filters'))
        genome_qc_mt = read_and_pre_process_data(
            qc_mt_path('genomes'), qc_ht_path('genomes', 'hard_filters'))

        joint_qc_mt = exome_qc_mt.union_cols(
            genome_qc_mt)  # NOTE: this is an inner join on rows
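        # Restrict to common, well-genotyped sites (AF > 0.1%, call rate > 99%)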
        joint_qc_mt = joint_qc_mt.filter_rows(
            (hl.agg.mean(joint_qc_mt.GT.n_alt_alleles()) / 2 > 0.001)
            & (hl.agg.fraction(hl.is_defined(joint_qc_mt.GT)) > 0.99))
        joint_qc_mt.write(qc_mt_path('joint'), args.overwrite)

        logger.info('LD-pruning joint mt of exomes and genomes...')
        joint_qc_mt = hl.read_matrix_table(qc_mt_path('joint'))
        variants, samples = joint_qc_mt.count()
        logger.info('Pruning {0} variants in {1} samples'.format(
            variants, samples))
        joint_qc_pruned_ht = hl.ld_prune(joint_qc_mt.GT, r2=0.1)
        # Note writing the LD-pruned MT is probably overkill
        # vs using `filter_rows` to filter sites based on the LD-pruned HT.
        joint_qc_pruned_mt = joint_qc_mt.filter_rows(
            hl.is_defined(joint_qc_pruned_ht[joint_qc_mt.row_key]))
        joint_qc_pruned_mt.write(qc_mt_path('joint', ld_pruned=True),
                                 args.overwrite)

    pruned_mt = hl.read_matrix_table(qc_mt_path('joint', ld_pruned=True))
    variants, samples = pruned_mt.count()
    logger.info('{0} samples, {1} variants found in LD-pruned joint MT'.format(
        samples, variants))

    if not args.skip_pc_relate:
        logger.info('Running PCA for PC-Relate...')
        eig, scores, _ = hl.hwe_normalized_pca(pruned_mt.GT,
                                               k=10,
                                               compute_loadings=False)
        scores.write(
            qc_temp_data_prefix('joint') + '.pruned.pca_scores.ht',
            args.overwrite)

        logger.info('Running PC-Relate...')
        scores = hl.read_table(
            qc_temp_data_prefix('joint') + '.pruned.pca_scores.ht')
        # NOTE: This needs SSDs on your workers (for the temp files) and no pre-emptibles while the BlockMatrix writes
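        # statistics='kin2' computes kinship and IBD2; pairs with kinship
        # below min_kinship are excluded from the output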
        relatedness_ht = hl.pc_relate(
            pruned_mt.GT,
            min_individual_maf=0.05,
            scores_expr=scores[pruned_mt.col_key].scores,
            block_size=4096,
            min_kinship=0.05,
            statistics='kin2')
        relatedness_ht.write(relatedness_ht_path, args.overwrite)

    relatedness_ht = hl.read_table(relatedness_ht_path)

    if not args.skip_relatedness:
        infer_ped(GnomADRelatedData('exomes'))
        infer_ped(GnomADRelatedData('genomes'))

        logger.info('Making rank file...')
        rank_table = make_rank_file(rank_annotations_path('joint'))
        logger.info('Finished making rank file...')

        related_samples_to_drop_ranked = get_related_samples_to_drop(
            rank_table, relatedness_ht)
        related_samples_to_drop_ranked.write(
            qc_temp_data_prefix('joint') + '.related_samples_to_drop.ht',
            args.overwrite)

    pca_mt, related_mt = split_mt_by_relatedness(pruned_mt)

    if not args.skip_pop_pca:
        variants, samples = pca_mt.count()
        logger.info('{} samples after removing relateds'.format(samples))
        # TODO: Check that there are no longer any 2nd-degree relateds in the callset by running KING on the output file below
        plink_mt = pca_mt.annotate_cols(
            uid=pca_mt.data_type + '_' +
            pca_mt.s.replace(" ", "_").replace("/", "_")).key_cols_by('uid')
        hl.export_plink(plink_mt,
                        qc_temp_data_prefix('joint') + '.unrelated.plink',
                        fam_id=plink_mt.uid,
                        ind_id=plink_mt.uid)

        logger.info(
            'Computing population PCs and annotating with known population labels...'
        )
        pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
            pca_mt.GT, k=20, compute_loadings=True)
        pca_af_ht = pca_mt.annotate_rows(
            pca_af=hl.agg.mean(pca_mt.GT.n_alt_alleles()) / 2).rows()
        pca_loadings = pca_loadings.annotate(
            pca_af=pca_af_ht[pca_loadings.key].pca_af)
        pca_scores.write(ancestry_pca_scores_ht_path(), args.overwrite)
        pca_loadings.write(ancestry_pca_loadings_ht_path(), args.overwrite)

    pca_scores = hl.read_table(ancestry_pca_scores_ht_path())
    pca_loadings = hl.read_table(ancestry_pca_loadings_ht_path())
    pca_mt = pca_mt.annotate_cols(scores=pca_scores[pca_mt.col_key].scores)

    variants, samples = related_mt.count()
    logger.info(
        'Projecting population PCs for {} related samples...'.format(samples))
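    # Project the related samples onto the population PCs using the loadings
    # (and allele frequencies) computed from the unrelated samples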
    related_scores = pc_project(related_mt, pca_loadings)
    relateds = related_mt.cols()
    relateds = relateds.annotate(scores=related_scores[relateds.key].scores)

    logger.info('Assigning population annotations...')
    pop_colnames = ['related', 'known_pop', 'scores']
    pop_annots_ht = hl.import_table(known_population_annotations,
                                    impute=True).key_by('combined_sample')

    joint_ht = pca_mt.cols().union(relateds)
    joint_ht = joint_ht.annotate(
        known_pop=pop_annots_ht[joint_ht.data_type.replace('s', '') + '_' +
                                joint_ht.s.replace(' ', '_')].known_pop
    )  # FIXME: temporarily doing the underscore thing until known_population_annotations is fixed
    joint_pca_ht = joint_ht.select(*pop_colnames)
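    # Assign population labels with a random forest on the first 6 PCs;
    # per-sample assignments and the fitted model are written to the temp paths below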
    joint_pca_ht, joint_pca_fit = run_assign_population_pcs(
        joint_pca_ht,
        qc_temp_data_prefix('joint') + '.RF_pop_assignments.txt.bgz',
        qc_temp_data_prefix('joint') + '.RF_fit.pkl',
        pcs=list(range(1, 7)))
    joint_ht = joint_ht.annotate(pop=joint_pca_ht[joint_ht.key].pop).select(
        'pop', *pop_colnames)

    # Add special Estonian pop category for genomes
    estonian_ht = (hl.import_table(estonian_batches, impute=True).annotate(
        data_type='genomes').key_by('data_type', 'sample'))
    joint_ht = joint_ht.annotate(batch=estonian_ht[joint_ht.key].batch)
    joint_ht = joint_ht.annotate(qc_pop=hl.case(missing_false=True)
                                 .when(hl.is_defined(joint_ht.pop) &
                                       (joint_ht.batch == 1), 'est_b1')
                                 .when(hl.is_defined(joint_ht.pop) &
                                       (joint_ht.batch == 2), 'est_b2')
                                 .default(joint_ht.pop)).persist()

    # These are keyed by only `s`
    genome_mt = get_gnomad_data('genomes',
                                adj=False,
                                split=False,
                                meta_root=None).select_cols()
    exome_mt = get_gnomad_data('exomes',
                               adj=False,
                               split=False,
                               meta_root=None).select_cols()

    # Population-specific filtering
    if not args.skip_calculate_sample_metrics:
        logger.info(
            'Running mini sample QC for platform- and population-specific filtering...'
        )
        gnomad_sample_qc(exome_mt).cols().select('sample_qc').write(
            qc_temp_data_prefix('exomes') + '.sample_qc.ht', args.overwrite)
        gnomad_sample_qc(genome_mt).cols().select('sample_qc').write(
            qc_temp_data_prefix('genomes') + '.sample_qc.ht', args.overwrite)
        # TODO: check that the pcr_free annotations are complete once samples are updated from Jessica's spreadsheet

    logger.info('Annotating population and platform assignments...')
    platform_ht = hl.read_table(qc_ht_path('exomes', 'platforms'))
    exome_ht = exome_mt.cols()
    exome_ht = exome_ht.annotate(
        qc_platform=platform_ht.key_by('s')[exome_ht.s].qc_platform,
        **joint_ht.filter(
            joint_ht.data_type == 'exomes').key_by('s')[exome_ht.s])

    genome_meta_ht = hl.read_table(qc_ht_path('genomes', 'hard_filters'))
    genome_ht = genome_mt.cols()
    genome_ht = genome_ht.annotate(
        qc_platform=genome_meta_ht.key_by('s')[genome_ht.s].qc_platform,
        **joint_ht.filter(
            joint_ht.data_type == 'genomes').key_by('s')[genome_ht.s])

    exome_sample_qc_ht = hl.read_table(
        qc_temp_data_prefix('exomes') + '.sample_qc.ht')
    genome_sample_qc_ht = hl.read_table(
        qc_temp_data_prefix('genomes') + '.sample_qc.ht')

    exome_ht = exome_ht.annotate(**exome_sample_qc_ht[exome_ht.s])
    genome_ht = genome_ht.annotate(**genome_sample_qc_ht[genome_ht.s])

    # For each population, aggregate sample QC metrics and calculate the MAD/mean/stdev
    logger.info(
        'Calculating platform- and population-specific sample QC thresholds...'
    )
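    # compute_stratified_metrics_filter flags outlier samples per
    # (qc_pop, qc_platform) stratum using MAD-based cutoffs on each metric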
    exome_qc_metrics = [
        'n_snp', 'r_ti_tv', 'r_insertion_deletion', 'n_insertion',
        'n_deletion', 'r_het_hom_var'
    ]
    exome_pop_platform_filter_ht = compute_stratified_metrics_filter(
        exome_ht, exome_qc_metrics, ['qc_pop', 'qc_platform'])
    exome_ht = exome_ht.annotate_globals(
        **hl.eval(exome_pop_platform_filter_ht.globals))
    exome_ht = exome_ht.annotate(
        **exome_pop_platform_filter_ht[exome_ht.key]).persist()

    genome_qc_metrics = [
        'n_snp', 'r_ti_tv', 'r_insertion_deletion', 'n_insertion',
        'n_deletion', 'r_het_hom_var'
    ]
    genome_pop_platform_filter_ht = compute_stratified_metrics_filter(
        genome_ht, genome_qc_metrics, ['qc_pop', 'qc_platform'])
    genome_ht = genome_ht.annotate_globals(
        **hl.eval(genome_pop_platform_filter_ht.globals))
    genome_ht = genome_ht.annotate(
        **genome_pop_platform_filter_ht[genome_ht.key]).persist()

    # Report the number of samples passing the pop/platform-specific filters and write out the annotated tables
    checkpoint = exome_ht.aggregate(
        hl.agg.count_where(hl.len(exome_ht.pop_platform_filters) == 0))
    logger.info(
        f'{checkpoint} exome samples found passing pop/platform-specific filtering'
    )
    exome_ht.key_by(data_type='exomes',
                    s=exome_ht.s).write(qc_ht_path('exomes', 'pop_platform'),
                                        args.overwrite)

    checkpoint = genome_ht.aggregate(
        hl.agg.count_where(hl.len(genome_ht.pop_platform_filters) == 0))
    logger.info(
        f'{checkpoint} genome samples found passing pop/platform-specific filtering'
    )
    genome_ht.key_by(data_type='genomes', s=genome_ht.s).write(
        qc_ht_path('genomes', 'pop_platform'), args.overwrite)