Example #1
def test_pcrelate_paths():
    mt = hl.balding_nichols_model(3, 50, 100)
    _, scores3, _ = hl.hwe_normalized_pca(mt.GT, k=3, compute_loadings=False)

    kin1 = hl.pc_relate(mt.GT, 0.10, k=2, statistics='kin', block_size=64)
    kin2 = hl.pc_relate(mt.GT,
                        0.05,
                        k=2,
                        min_kinship=0.01,
                        statistics='kin2',
                        block_size=128).cache()
    kin3 = hl.pc_relate(mt.GT,
                        0.02,
                        k=3,
                        min_kinship=0.1,
                        statistics='kin20',
                        block_size=64).cache()
    kin_s1 = hl.pc_relate(mt.GT,
                          0.10,
                          scores_expr=scores3[mt.col_key].scores[:2],
                          statistics='kin',
                          block_size=32)

    assert kin1._same(kin_s1, tolerance=1e-4)

    assert kin1.count() == 50 * 49 / 2

    assert kin2.count() > 0
    assert kin2.filter(kin2.kin < 0.01).count() == 0

    assert kin3.count() > 0
    assert kin3.filter(kin3.kin < 0.1).count() == 0
Example #2
def test_pcrelate_issue_5263():
    mt = hl.balding_nichols_model(3, 50, 100)
    expected = hl.pc_relate(mt.GT, 0.10, k=2, statistics='all')
    mt = mt.select_entries(GT2=mt.GT,
                           GT=hl.call(hl.rand_bool(0.5), hl.rand_bool(0.5)))
    actual = hl.pc_relate(mt.GT2, 0.10, k=2, statistics='all')
    assert expected._same(actual, tolerance=1e-4)
Example #3
def test_self_kinship():
    mt = hl.balding_nichols_model(3, 10, 50)
    with_self = hl.pc_relate(mt.GT,
                             0.10,
                             k=2,
                             statistics='kin20',
                             block_size=16,
                             include_self_kinship=True)
    without_self = hl.pc_relate(mt.GT,
                                0.10,
                                k=2,
                                statistics='kin20',
                                block_size=16)

    assert with_self.count() == 55
    assert without_self.count() == 45

    with_self_self_kin_only = with_self.filter(
        with_self.i.sample_idx == with_self.j.sample_idx)
    assert with_self_self_kin_only.count() == 10, with_self_self_kin_only.collect()

    with_self_no_self_kin = with_self.filter(
        with_self.i.sample_idx != with_self.j.sample_idx)
    assert with_self_no_self_kin.count() == 45, with_self_no_self_kin.collect()
    assert with_self_no_self_kin._same(without_self)

    without_self_self_kin_only = without_self.filter(
        without_self.i.sample_idx == without_self.j.sample_idx)
    assert without_self_self_kin_only.count() == 0, without_self_self_kin_only.collect()
Example #4
def query():
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)

    # Perform kinship test with pc_relate
    pc_rel_path = output_path('pc_relate_kinship_estimate.ht')
    pc_rel = hl.pc_relate(mt.GT, 0.01, k=10, statistics='kin')
    pc_rel.write(pc_rel_path, overwrite=True)
    pairs = pc_rel.filter(pc_rel['kin'] >= 0.125)
    related_samples_to_remove = hl.maximal_independent_set(
        pairs.i, pairs.j, False)
    n_related_samples = related_samples_to_remove.count()
    print(f'related_samples_to_remove.count() = {n_related_samples}')

    # save the removed samples as an HTML table
    html = pd.DataFrame({
        'removed_individual': related_samples_to_remove.node.s.collect()
    }).to_html()
    plot_filename_html = output_path('removed_samples.html', 'web')
    with hl.hadoop_open(plot_filename_html, 'w') as f:
        f.write(html)
Example #5
def test_pc_relate_simple_example():
    gs = hl.literal(
        [[0, 0, 0, 0, 1, 1, 1, 1],
         [0, 0, 1, 1, 0, 0, 1, 1],
         [0, 1, 0, 1, 0, 1, 0, 1],
         [0, 0, 1, 1, 0, 0, 1, 1]])
    scores = hl.literal([[0, 1], [1, 1], [1, 0], [0, 0]])
    mt = hl.utils.range_matrix_table(n_rows=8, n_cols=4)
    mt = mt.annotate_entries(GT=hl.unphased_diploid_gt_index_call(gs[mt.col_idx][mt.row_idx]))
    mt = mt.annotate_cols(scores=scores[mt.col_idx])
    pcr = hl.pc_relate(mt.GT, min_individual_maf=0, scores_expr=mt.scores)

    expected = [
        hl.Struct(i=0, j=1, kin=-0.14570713364640647,
                  ibd0=1.4823511628401964, ibd1=-0.38187379109476693, ibd2=-0.10047737174542953),
        hl.Struct(i=0, j=2, kin=0.16530591922102378,
                  ibd0=0.5234783206257841, ibd1=0.2918196818643366, ibd2=0.18470199750987923),
        hl.Struct(i=0, j=3, kin=-0.14570713364640647,
                  ibd0=1.4823511628401964, ibd1=-0.38187379109476693, ibd2=-0.10047737174542953),
        hl.Struct(i=1, j=2, kin=-0.14570713364640647,
                  ibd0=1.4823511628401964, ibd1=-0.38187379109476693, ibd2=-0.10047737174542953),
        hl.Struct(i=1, j=3, kin=0.14285714285714285,
                  ibd0=0.7027734170591313, ibd1=0.02302459445316596, ibd2=0.2742019884877027),
        hl.Struct(i=2, j=3, kin=-0.14570713364640647,
                  ibd0=1.4823511628401964, ibd1=-0.38187379109476693, ibd2=-0.10047737174542953),
    ]
    ht_expected = hl.Table.parallelize(expected)
    ht_expected = ht_expected.key_by(i=hl.struct(col_idx=ht_expected.i),
                                     j=hl.struct(col_idx=ht_expected.j))
    assert ht_expected._same(pcr)
Example #6
def test_pc_relate_against_R_truth():
    mt = hl.import_vcf(resource('pc_relate_bn_input.vcf.bgz'))
    hail_kin = hl.pc_relate(mt.GT, 0.00, k=2).checkpoint(
        utils.new_temp_file(extension='ht'))

    r_kin = hl.import_table(resource('pc_relate_r_truth.tsv.bgz'),
                            types={
                                'i': 'struct{s:str}',
                                'j': 'struct{s:str}',
                                'kin': 'float',
                                'ibd0': 'float',
                                'ibd1': 'float',
                                'ibd2': 'float'
                            },
                            key=['i', 'j'])
    assert r_kin.select("kin")._same(hail_kin.select("kin"),
                                     tolerance=1e-3,
                                     absolute=True)
    assert r_kin.select("ibd0")._same(hail_kin.select("ibd0"),
                                      tolerance=1.3e-2,
                                      absolute=True)
    assert r_kin.select("ibd1")._same(hail_kin.select("ibd1"),
                                      tolerance=2.6e-2,
                                      absolute=True)
    assert r_kin.select("ibd2")._same(hail_kin.select("ibd2"),
                                      tolerance=1.3e-2,
                                      absolute=True)
Example #7
    def test_pcrelate(self):
        dataset = hl.balding_nichols_model(3, 100, 100)
        # sample_idx is the column key, so re-key rather than annotate
        dataset = dataset.key_cols_by(sample_idx=hl.str(dataset.sample_idx))
        # older pc_relate API: hl.pc_relate(dataset, k, maf, statistics='phi');
        # the current API takes the call expression and min_individual_maf,
        # and names the kinship statistic 'kin'
        t = hl.pc_relate(dataset.GT, 0.05, k=2, block_size=64, statistics='kin')

        self.assertTrue(isinstance(t, hl.Table))
        t.count()
Example #8
def run_pc_relate(mt: hl.MatrixTable,
                  pca_prefix: str,
                  overwrite: bool = False):
    """
    Runs PC-relate to identify relatives in a matrix table
    :param mt: Matrix table to run PC-relate on
    :param pca_prefix: Prefix to path to output relatedness information
    :param overwrite: if True, overwrites existing data
    :return:
    """
    relatedness_ht = hl.pc_relate(mt.GT,
                                  min_individual_maf=0.05,
                                  min_kinship=0.05,
                                  statistics='kin',
                                  k=20).key_by()
    relatedness_ht.write(pca_prefix + 'relatedness.ht', overwrite=overwrite)
    relatedness_ht = hl.read_table(pca_prefix + 'relatedness.ht')

    # identify individuals in pairs to remove
    related_samples_to_remove = hl.maximal_independent_set(
        relatedness_ht.i, relatedness_ht.j, False)
    mt_unrel = mt.filter_cols(hl.is_defined(
        related_samples_to_remove[mt.col_key]),
                              keep=False)
    mt_rel = mt.filter_cols(hl.is_defined(
        related_samples_to_remove[mt.col_key]),
                            keep=True)

    mt_unrel.write(pca_prefix + 'unrel.mt', overwrite=overwrite)
    mt_rel.write(pca_prefix + 'rel.mt', overwrite=overwrite)
Example #9
def compute_relatedness(
    data_type: str = "genomes",
    overwrite: bool = False,
) -> hl.Table:
    """
    Perform sample QC on the split VDS table using `compute_stratified_sample_qc`.
    :param data_type: Whether data is from genomes or exomes, default is genomes
    :param overwrite: Whether to overwrite the file
    :return: Table table after running pc_relate
    :rtype: hl.Table
    """
    logger.info("Computing relatedness table on CCDG %s VDS", data_type)
    pca_var_ht = hl.read_table(get_pca_variants_path())
    mt = hl.vds.to_dense_mt(get_qc_vds(data_type))
    mt = mt.filter_rows(hl.is_defined(pca_var_ht[mt.row_key]))
    eig, scores, _ = hl.hwe_normalized_pca(mt.GT, k=10, compute_loadings=False)
    scores = scores.checkpoint(
        get_ccdg_results_path(data_type=data_type, result="pc_scores"),
        overwrite=overwrite,
        _read_if_exists=not overwrite,
    )
    relatedness_ht = hl.pc_relate(
        mt.GT,
        min_individual_maf=0.01,
        scores_expr=scores[mt.col_key].scores,
        block_size=4096,
        min_kinship=0.05,
        statistics="all",
    )
    return relatedness_ht.checkpoint(
        get_ccdg_results_path(data_type=data_type, result="relatedness"),
        overwrite=overwrite,
        _read_if_exists=(not overwrite),
    )
Example #10
def pc_relate_5k_5k(mt_path):
    mt = hl.read_matrix_table(mt_path)
    mt = mt.annotate_cols(scores=hl.range(2).map(lambda x: hl.rand_unif(0, 1)))
    rel = hl.pc_relate(mt.GT,
                       0.05,
                       scores_expr=mt.scores,
                       statistics='kin',
                       min_kinship=0.05)
    rel._force_count()
Example #11
def pc_relate_big():
    mt = hl.balding_nichols_model(3, 2 * 4096, 2 * 4096).checkpoint(
        hl.utils.new_temp_file(extension='mt'))
    mt = mt.annotate_cols(scores=hl.range(2).map(lambda x: hl.rand_unif(0, 1)))
    rel = hl.pc_relate(mt.GT,
                       0.05,
                       scores_expr=mt.scores,
                       statistics='kin',
                       min_kinship=0.05)
    rel._force_count()
Example #12
def main(args):
    if args.join_qc_mt:
        v2_qc_mt_liftover = get_liftover_v2_qc_mt('exomes', ld_pruned=True, release_only=True)
        v2_qc_mt_liftover = v2_qc_mt_liftover.key_cols_by(s=v2_qc_mt_liftover.s, data_type="v2_exomes")
        v3_qc_mt = qc.mt()
        v3_qc_mt = v3_qc_mt.filter_cols(meta.ht()[v3_qc_mt.col_key].release)
        v3_qc_mt = v3_qc_mt.select_rows().select_cols()
        v3_qc_mt = v3_qc_mt.key_cols_by(s=v3_qc_mt.s, data_type="v3_genomes")
        joint_qc_mt = v2_qc_mt_liftover.union_cols(v3_qc_mt)
        joint_qc_mt.write("gs://gnomad-tmp/v2_exomes_v3_joint_qc.mt", overwrite=args.overwrite)

    if args.run_pc_relate:
        logger.info('Running PC-Relate')
        logger.warning("PC-relate requires SSDs and doesn't work with preemptible workers!")
        joint_qc_mt = hl.read_matrix_table("gs://gnomad-tmp/v2_exomes_v3_joint_qc.mt")
        joint_qc_mt = joint_qc_mt.sample_rows(0.1)
        eig, scores, _ = hl.hwe_normalized_pca(joint_qc_mt.GT, k=10, compute_loadings=False)
        scores = scores.checkpoint(v2_v3_pc_relate_pca_scores.path, overwrite=args.overwrite, _read_if_exists=not args.overwrite)
        relatedness_ht = hl.pc_relate(joint_qc_mt.GT, min_individual_maf=0.01, scores_expr=scores[joint_qc_mt.col_key].scores,
                                      block_size=4096, min_kinship=0.1, statistics='all')
        relatedness_ht.write(v2_v3_relatedness.path, args.overwrite)
Example #13
def main(args):
    hl.init(log='/sample_qc.log', tmp_dir='hdfs:///pc_relate.tmp/')

    if not args.load_joint_pruned_qc_mt:
        logger.info('Joining exomes and genomes...')
        exome_qc_mt = read_and_pre_process_data(
            qc_mt_path('exomes'), qc_ht_path('exomes', 'hard_filters'))
        genome_qc_mt = read_and_pre_process_data(
            qc_mt_path('genomes'), qc_ht_path('genomes', 'hard_filters'))

        joint_qc_mt = exome_qc_mt.union_cols(
            genome_qc_mt)  # NOTE: this is an inner join on rows
        joint_qc_mt = joint_qc_mt.filter_rows(
            (hl.agg.mean(joint_qc_mt.GT.n_alt_alleles()) / 2 > 0.001)
            & (hl.agg.fraction(hl.is_defined(joint_qc_mt.GT)) > 0.99))
        joint_qc_mt.write(qc_mt_path('joint'), args.overwrite)

        logger.info('LD-pruning joint mt of exomes and genomes...')
        joint_qc_mt = hl.read_matrix_table(qc_mt_path('joint'))
        variants, samples = joint_qc_mt.count()
        logger.info('Pruning {0} variants in {1} samples'.format(
            variants, samples))
        joint_qc_pruned_ht = hl.ld_prune(joint_qc_mt.GT, r2=0.1)
        # Note writing the LD-pruned MT is probably overkill
        # vs using `filter_rows` to filter sites based on the LD-pruned HT.
        joint_qc_pruned_mt = joint_qc_mt.filter_rows(
            hl.is_defined(joint_qc_pruned_ht[joint_qc_mt.row_key]))
        joint_qc_pruned_mt.write(qc_mt_path('joint', ld_pruned=True),
                                 args.overwrite)

    pruned_mt = hl.read_matrix_table(qc_mt_path('joint', ld_pruned=True))
    variants, samples = pruned_mt.count()
    logger.info('{0} samples, {1} variants found in LD-pruned joint MT'.format(
        samples, variants))

    if not args.skip_pc_relate:
        logger.info('Running PCA for PC-Relate...')
        eig, scores, _ = hl.hwe_normalized_pca(pruned_mt.GT,
                                               k=10,
                                               compute_loadings=False)
        scores.write(
            qc_temp_data_prefix('joint') + '.pruned.pca_scores.ht',
            args.overwrite)

        logger.info('Running PC-Relate...')
        scores = hl.read_table(
            qc_temp_data_prefix('joint') + '.pruned.pca_scores.ht')
        # NOTE: This needs SSDs on your workers (for the temp files) and no pre-emptibles while the BlockMatrix writes
        relatedness_ht = hl.pc_relate(
            pruned_mt.GT,
            min_individual_maf=0.05,
            scores_expr=scores[pruned_mt.col_key].scores,
            block_size=4096,
            min_kinship=0.05,
            statistics='kin2')
        relatedness_ht.write(relatedness_ht_path, args.overwrite)

    relatedness_ht = hl.read_table(relatedness_ht_path)

    if not args.skip_relatedness:
        infer_ped(GnomADRelatedData('exomes'))
        infer_ped(GnomADRelatedData('genomes'))

        logger.info('Making rank file...')
        rank_table = make_rank_file(rank_annotations_path('joint'))
        logger.info('Finished making rank file...')

        related_samples_to_drop_ranked = get_related_samples_to_drop(
            rank_table, relatedness_ht)
        related_samples_to_drop_ranked.write(
            qc_temp_data_prefix('joint') + '.related_samples_to_drop.ht',
            args.overwrite)

    pca_mt, related_mt = split_mt_by_relatedness(pruned_mt)

    if not args.skip_pop_pca:
        variants, samples = pca_mt.count()
        logger.info('{} samples after removing relateds'.format(samples))
        # TODO: Check that there are no longer any 2nd-degree relateds in the callset by running KING on the output file below
        plink_mt = pca_mt.annotate_cols(
            uid=pca_mt.data_type + '_' +
            pca_mt.s.replace(" ", "_").replace("/", "_")).key_cols_by('uid')
        hl.export_plink(plink_mt,
                        qc_temp_data_prefix('joint') + '.unrelated.plink',
                        fam_id=plink_mt.uid,
                        ind_id=plink_mt.uid)

        logger.info(
            'Computing population PCs and annotating with known population labels...'
        )
        pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
            pca_mt.GT, k=20, compute_loadings=True)
        pca_af_ht = pca_mt.annotate_rows(
            pca_af=hl.agg.mean(pca_mt.GT.n_alt_alleles()) / 2).rows()
        pca_loadings = pca_loadings.annotate(
            pca_af=pca_af_ht[pca_loadings.key].pca_af)
        pca_scores.write(ancestry_pca_scores_ht_path(), args.overwrite)
        pca_loadings.write(ancestry_pca_loadings_ht_path(), args.overwrite)

    pca_scores = hl.read_table(ancestry_pca_scores_ht_path())
    pca_loadings = hl.read_table(ancestry_pca_loadings_ht_path())
    pca_mt = pca_mt.annotate_cols(scores=pca_scores[pca_mt.col_key].scores)

    variants, samples = related_mt.count()
    logger.info(
        'Projecting population PCs for {} related samples...'.format(samples))
    related_scores = pc_project(related_mt, pca_loadings)
    relateds = related_mt.cols()
    relateds = relateds.annotate(scores=related_scores[relateds.key].scores)

    logger.info('Assigning population annotations...')
    pop_colnames = ['related', 'known_pop', 'scores']
    pop_annots_ht = hl.import_table(known_population_annotations,
                                    impute=True).key_by('combined_sample')

    joint_ht = pca_mt.cols().union(relateds)
    joint_ht = joint_ht.annotate(
        known_pop=pop_annots_ht[joint_ht.data_type.replace('s', '') + '_' +
                                joint_ht.s.replace(' ', '_')].known_pop
    )  # FIXME: temporarily doing the underscore thing until known_population_annotations is fixed
    joint_pca_ht = joint_ht.select(*pop_colnames)
    joint_pca_ht, joint_pca_fit = run_assign_population_pcs(
        joint_pca_ht,
        qc_temp_data_prefix('joint') + '.RF_pop_assignments.txt.bgz',
        qc_temp_data_prefix('joint') + '.RF_fit.pkl',
        pcs=list(range(1, 7)))
    joint_ht = joint_ht.annotate(pop=joint_pca_ht[joint_ht.key].pop).select(
        'pop', *pop_colnames)

    # Add special Estonian pop category for genomes
    estonian_ht = (hl.import_table(estonian_batches, impute=True).annotate(
        data_type='genomes').key_by('data_type', 'sample'))
    joint_ht = joint_ht.annotate(batch=estonian_ht[joint_ht.key].batch)
    joint_ht = joint_ht.annotate(qc_pop=hl.case(missing_false=True).when(
        hl.is_defined(joint_ht.pop) & (joint_ht.batch == 1), 'est_b1'
    ).when(hl.is_defined(joint_ht.pop)
           & (joint_ht.batch == 2), 'est_b2').default(joint_ht.pop)).persist()

    # These are keyed by only `s`
    genome_mt = get_gnomad_data('genomes',
                                adj=False,
                                split=False,
                                meta_root=None).select_cols()
    exome_mt = get_gnomad_data('exomes',
                               adj=False,
                               split=False,
                               meta_root=None).select_cols()

    # Population-specific filtering
    if not args.skip_calculate_sample_metrics:
        logger.info(
            'Running mini sample QC for platform- and population-specific filtering...'
        )
        gnomad_sample_qc(exome_mt).cols().select('sample_qc').write(
            qc_temp_data_prefix('exomes') + '.sample_qc.ht', args.overwrite)
        gnomad_sample_qc(genome_mt).cols().select('sample_qc').write(
            qc_temp_data_prefix('genomes') + '.sample_qc.ht', args.overwrite)
        # TODO: check that the pcr_free annotations are complete once samples are updated from Jessica's spreadsheet

    logger.info('Annotating population and platform assignments...')
    platform_ht = hl.read_table(qc_ht_path('exomes', 'platforms'))
    exome_ht = exome_mt.cols()
    exome_ht = exome_ht.annotate(
        qc_platform=platform_ht.key_by('s')[exome_ht.s].qc_platform,
        **joint_ht.filter(
            joint_ht.data_type == 'exomes').key_by('s')[exome_ht.s])

    genome_meta_ht = hl.read_table(qc_ht_path('genomes', 'hard_filters'))
    genome_ht = genome_mt.cols()
    genome_ht = genome_ht.annotate(
        qc_platform=genome_meta_ht.key_by('s')[genome_ht.s].qc_platform,
        **joint_ht.filter(
            joint_ht.data_type == 'genomes').key_by('s')[genome_ht.s])

    exome_sample_qc_ht = hl.read_table(
        qc_temp_data_prefix('exomes') + '.sample_qc.ht')
    genome_sample_qc_ht = hl.read_table(
        qc_temp_data_prefix('genomes') + '.sample_qc.ht')

    exome_ht = exome_ht.annotate(**exome_sample_qc_ht[exome_ht.s])
    genome_ht = genome_ht.annotate(**genome_sample_qc_ht[genome_ht.s])

    # For each population, aggregate sample QC metrics and calculate the MAD/mean/stdev
    logger.info(
        'Calculating platform- and population-specific sample QC thresholds...'
    )
    exome_qc_metrics = [
        'n_snp', 'r_ti_tv', 'r_insertion_deletion', 'n_insertion',
        'n_deletion', 'r_het_hom_var'
    ]
    exome_pop_platform_filter_ht = compute_stratified_metrics_filter(
        exome_ht, exome_qc_metrics, ['qc_pop', 'qc_platform'])
    exome_ht = exome_ht.annotate_globals(
        **hl.eval(exome_pop_platform_filter_ht.globals))
    exome_ht = exome_ht.annotate(
        **exome_pop_platform_filter_ht[exome_ht.key]).persist()

    genome_qc_metrics = [
        'n_snp', 'r_ti_tv', 'r_insertion_deletion', 'n_insertion',
        'n_deletion', 'r_het_hom_var'
    ]
    genome_pop_platform_filter_ht = compute_stratified_metrics_filter(
        genome_ht, genome_qc_metrics, ['qc_pop', 'qc_platform'])
    genome_ht = genome_ht.annotate_globals(
        **hl.eval(genome_pop_platform_filter_ht.globals))
    genome_ht = genome_ht.annotate(
        **genome_pop_platform_filter_ht[genome_ht.key]).persist()

    # Annotate samples that fail their respective filters
    checkpoint = exome_ht.aggregate(
        hl.agg.count_where(hl.len(exome_ht.pop_platform_filters) == 0))
    logger.info(
        f'{checkpoint} exome samples found passing pop/platform-specific filtering'
    )
    exome_ht.key_by(data_type='exomes',
                    s=exome_ht.s).write(qc_ht_path('exomes', 'pop_platform'),
                                        args.overwrite)

    checkpoint = genome_ht.aggregate(
        hl.agg.count_where(hl.len(genome_ht.pop_platform_filters) == 0))
    logger.info(
        f'{checkpoint} genome samples found passing pop/platform-specific filtering'
    )
    genome_ht.key_by(data_type='genomes', s=genome_ht.s).write(
        qc_ht_path('genomes', 'pop_platform'), args.overwrite)
Example #14
mt_rows = mt_rows.annotate(var_cr_flag=mt_rows.var_cr_flag_1)
mt_rows = mt_rows.drop(mt_rows.var_cr_flag_1)
mt_rows = mt_rows.drop(mt_rows.var_cr_flag_2)

var_cr_counts = mt_rows.aggregate(
    hl.agg.array_agg(lambda x: hl.agg.counter(x), mt_rows.var_cr_flag))

maf_counts = mt_rows.aggregate(
    hl.agg.array_agg(lambda x: hl.agg.counter(x), mt_rows.maf_flag))

hwe_counts = mt_rows.aggregate(
    hl.agg.array_agg(lambda x: hl.agg.counter(x), mt_rows.hwe_pval_flag))

# Calculates relatedness using pc_relate for all samples in a matrix table
# Annotates a column which flags those who failed the relatedness filter as True
pc_rel = hl.pc_relate(mt_auto.GT, 0.001, k=10, statistics='kin')
pairs = pc_rel.filter(pc_rel['kin'] > 0.125)
related_samples_to_remove = hl.maximal_independent_set(pairs.i,
                                                       pairs.j,
                                                       keep=False)
mt_auto = mt_auto.annotate_cols(
    related_filter=hl.is_defined(related_samples_to_remove[mt_auto.col_key]))
'''
--- Conducting QC ---

QC Steps:
    SNP call rate
    sample call rate
    sex violations
    MAF
    HWE
'''
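
The QC steps listed above are only named here, not shown. Below is a minimal sketch of how they might look in Hail; the input mt, the reported_is_female column field, and all thresholds are illustrative assumptions, not values from the original script.

import hail as hl

mt = hl.variant_qc(mt)
mt = hl.sample_qc(mt)

# SNP call rate: drop poorly genotyped variants
mt = mt.filter_rows(mt.variant_qc.call_rate > 0.97)
# sample call rate: drop poorly genotyped samples
mt = mt.filter_cols(mt.sample_qc.call_rate > 0.97)
# sex violations: drop samples whose imputed sex conflicts with the reported
# sex (in practice impute_sex is run on non-PAR X-chromosome variants)
imputed_sex_ht = hl.impute_sex(mt.GT)
mt = mt.filter_cols(
    imputed_sex_ht[mt.col_key].is_female == mt.reported_is_female)
# MAF: keep common variants only
mt = mt.filter_rows(hl.min(mt.variant_qc.AF) > 0.01)
# HWE: drop variants strongly out of Hardy-Weinberg equilibrium
mt = mt.filter_rows(mt.variant_qc.p_value_hwe > 1e-6)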
Example #15
def relatedness_check(in_mt: hl.MatrixTable = None,
                      method: str = 'pc_relate',
                      outdir: str = None,
                      kin_estimate: float = 0.98):

    global mt, samples_to_remove

    in_mt = hl.variant_qc(in_mt)
    in_mt = hl.sample_qc(in_mt)

    # _localize=False means don't put this in Python, keep it as a Hail expr
    call_rate_dict = in_mt.aggregate_cols(hl.dict(
        hl.agg.collect((in_mt.s, in_mt.sample_qc.call_rate))),
                                          _localize=False)

    if method == 'pc_relate':
        print("\nUsing PC-Relate for relatedness checks")
        relatedness_ht = hl.pc_relate(in_mt.GT,
                                      0.01,
                                      k=10,
                                      min_kinship=0.1,
                                      statistics='kin')
        samples_to_remove_ht = relatedness_ht.filter(
            relatedness_ht.kin > kin_estimate)

        # get call rates for both samples so we remove the one with lower call rate between the two
        samples_to_remove = samples_to_remove_ht.annotate(
            cr_s1=call_rate_dict[samples_to_remove_ht.i.s],
            cr_s2=call_rate_dict[samples_to_remove_ht.j.s])

        samples_list = samples_to_remove.annotate(sample_to_remove=hl.cond(
            samples_to_remove.cr_s1 <= samples_to_remove.cr_s2,
            samples_to_remove.i, samples_to_remove.j))

    elif method == 'ibd':
        print("\nUsing PLINK-style identity by descent for relatedness checks")
        in_mt = in_mt.annotate_rows(maf=hl.min(in_mt.variant_qc.AF))
        relatedness_ht = hl.identity_by_descent(
            in_mt, maf=in_mt['maf']
        )  # this returns a Hail Table with the sample pairs
        samples_to_remove_ht = relatedness_ht.filter(
            relatedness_ht.ibd.PI_HAT > kin_estimate)

        # get call rates for both samples so we remove the one with lower call rate between the two
        samples_to_remove = samples_to_remove_ht.annotate(
            cr_s1=call_rate_dict[samples_to_remove_ht.i],
            cr_s2=call_rate_dict[samples_to_remove_ht.j])

        samples_list = samples_to_remove.annotate(sample_to_remove=hl.cond(
            samples_to_remove.cr_s1 <= samples_to_remove.cr_s2,
            samples_to_remove.i, samples_to_remove.j))

    else:
        print("\nUsing KING for relatedness checks")
        if kin_estimate > 0.5:
            raise Exception(
                "\nThe maximum kinship coefficient is for KING 0.5")
        relatedness_mt = hl.king(in_mt.GT)
        filtered_relatedness_mt = relatedness_mt.filter_entries(
            (relatedness_mt.s_1 != relatedness_mt.s) &
            (relatedness_mt.phi >= kin_estimate),
            keep=True)
        samples_to_remove_ht = filtered_relatedness_mt.entries()
        samples_to_remove = samples_to_remove_ht.annotate(
            cr_s1=call_rate_dict[samples_to_remove_ht.s_1],
            cr_s2=call_rate_dict[samples_to_remove_ht.s])

        samples_list = samples_to_remove.annotate(sample_to_remove=hl.cond(
            samples_to_remove.cr_s1 <= samples_to_remove.cr_s2,
            samples_to_remove.s_1, samples_to_remove.s))

    samples = samples_list.sample_to_remove.collect()

    if len(samples) > 0:
        in_mt = in_mt.filter_cols(hl.literal(samples).contains(in_mt['s']),
                                  keep=False)
        print("\nNumber of samples that fail relatedness checks: {}".format(
            len(samples)))
        with open(outdir + 'relatedness_removed_samples.tsv', 'w') as f:
            for sample in samples:
                f.write(sample + "\n")

    else:
        print("\nNo samples failed the relatedness check")

    return in_mt
Example #16
def main(args):
    mt = hl.read_matrix_table(args.matrixtable)
    # ld pruning
    pruned_ht = hl.ld_prune(mt.GT, r2=0.1)
    pruned_mt = mt.filter_rows(hl.is_defined(pruned_ht[mt.row_key]))
    pruned_mt.write(f"{args.output_dir}/mt_ldpruned.mt", overwrite=True)

    # PC relate
    pruned_mt = pruned_mt.select_entries(
        GT=hl.unphased_diploid_gt_index_call(pruned_mt.GT.n_alt_alleles()))

    eig, scores, _ = hl.hwe_normalized_pca(pruned_mt.GT,
                                           k=10,
                                           compute_loadings=False)
    scores.write(f"{args.output_dir}/mt_pruned.pca_scores.ht", overwrite=True)

    relatedness_ht = hl.pc_relate(pruned_mt.GT,
                                  min_individual_maf=0.05,
                                  scores_expr=scores[pruned_mt.col_key].scores,
                                  block_size=4096,
                                  min_kinship=0.05,
                                  statistics='kin2')
    relatedness_ht.write(f"{args.output_dir}/mt_relatedness.ht",
                         overwrite=True)
    pairs = relatedness_ht.filter(relatedness_ht['kin'] > 0.125)
    related_samples_to_remove = hl.maximal_independent_set(pairs.i,
                                                           pairs.j,
                                                           keep=False)
    related_samples_to_remove.write(
        f"{args.output_dir}/mt_related_samples_to_remove.ht", overwrite=True)

    pca_mt = pruned_mt.filter_cols(hl.is_defined(
        related_samples_to_remove[pruned_mt.col_key]),
                                   keep=False)
    related_mt = pruned_mt.filter_cols(hl.is_defined(
        related_samples_to_remove[pruned_mt.col_key]),
                                       keep=True)

    variants, samples = pca_mt.count()

    print(f"{samples} samples after relatedness step.")

    # Population pca

    plink_mt = pca_mt.annotate_cols(uid=pca_mt.s).key_cols_by('uid')
    hl.export_plink(plink_mt,
                    f"{args.output_dir}/mt_unrelated.plink",
                    fam_id=plink_mt.uid,
                    ind_id=plink_mt.uid)
    pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
        pca_mt.GT, k=20, compute_loadings=True)
    pca_af_ht = pca_mt.annotate_rows(
        pca_af=hl.agg.mean(pca_mt.GT.n_alt_alleles()) / 2).rows()
    pca_loadings = pca_loadings.annotate(
        pca_af=pca_af_ht[pca_loadings.key].pca_af)
    pca_scores.write(f"{args.output_dir}/mt_pca_scores.ht", overwrite=True)
    pca_loadings.write(f"{args.output_dir}/mt_pca_loadings.ht", overwrite=True)

    pca_mt = pca_mt.annotate_cols(scores=pca_scores[pca_mt.col_key].scores)

    variants, samples = related_mt.count()
    print(
        'Projecting population PCs for {} related samples...'.format(samples))
    #related_scores = pc_project(related_mt, pca_loadings)
    #relateds = related_mt.cols()
    #relateds = relateds.annotate(scores=related_scores[relateds.key].scores)

    pca_mt.write(f"{args.output_dir}/mt_pca.mt", overwrite=True)
    p = hl.plot.scatter(pca_mt.scores[0],
                        pca_mt.scores[1],
                        title='PCA',
                        xlabel='PC1',
                        ylabel='PC2')
    output_file(f"{args.plot_dir}/pca.html")
    save(p)
Example #17
def main(args):
    if args.load_ref:
        load_ref(args.dirname, args.basename)

    if args.load_ukbb:
        samples = hl.read_table(
            'gs://ukb-diverse-pops/pigmentation_phenos_covs_pops.ht')
        ukbb = hl.read_matrix_table('gs://ukb31063/ukb31063.genotype.mt')
        ukbb = ukbb.annotate_cols(**samples[ukbb.s])

    if args.intersect_ref:
        intersect_ref(args.dirname, args.basename, ukbb)

    if args.pca_project:
        """
        Compute PCA in global reference panel, project UKBB individuals into PCA space
        """
        ref_in_ukbb = hl.read_matrix_table(args.dirname + 'intersect_' +
                                           args.basename + 'ukbb.mt')
        print('Computing reference PCs')
        run_pca(ref_in_ukbb, args.out_prefix + args.basename + '_ukbb_')

        # project ukbb
        pca_loadings = hl.read_table(
            f'{args.out_prefix}{args.basename}_ukbb_loadings.ht')
        project_mt = hl.read_matrix_table(args.dirname + 'intersect_ukbb_' +
                                          args.basename + '.mt')
        ht = project_individuals(pca_loadings, project_mt)
        ht.export(args.out_prefix + 'ukbb_' + args.basename +
                  '_scores.txt.bgz')

    # if args.continental_pca:
    #     """
    #     Compute PCA within reference panel super pops, project UKBB individuals into PCA space
    #     1. Filter UKBB to individuals in continental population
    #     2. Run PCA on continental ref
    #     3. Project UKBB inds
    #     """
    #     pass
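    # A hedged sketch of those three steps (SUPERPOPS, ref_mt, and the path
    # pattern are assumptions; run_pca and project_individuals are the
    # helpers this script already uses):
    # for superpop in SUPERPOPS:
    #     ref_pop_mt = ref_mt.filter_cols(ref_mt.super_pop == superpop)
    #     run_pca(ref_pop_mt, f'{args.out_prefix}{superpop}_')
    #     loadings = hl.read_table(f'{args.out_prefix}{superpop}_loadings.ht')
    #     ukbb_pop_mt = ukbb.filter_cols(ukbb.pop == superpop)
    #     ht = project_individuals(loadings, ukbb_pop_mt)
    #     ht.export(f'{args.out_prefix}ukbb_{superpop}_scores.txt.bgz')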

    if args.ukbb_pop_pca:
        """
        Compute PCA in each UKBB population (unrelateds), project reference individuals and relateds into PCA space
        1. Filter UKBB to individuals in continental population
        2. Run PC-relate on these individuals
        # New
        2.5 Filter to pruned set of individuals
        #
        3. Filter UKBB population to unrelated individuals
        4. Run PCA on UKBB unrelateds within population
        5. Project relateds
        """

        for pop in POPS:
            mt = hl.read_matrix_table(get_ukb_grm_mt_path(pop))
            pruned_ht = hl.read_table(get_ukb_grm_pruned_ht_path(pop))
            mt = mt.filter_rows(hl.is_defined(pruned_ht[mt.row_key]))

            # run PC-relate
            if args.overwrite or not hl.hadoop_exists(
                    get_relatedness_path(pop,
                                         extension='all_scores.ht/_SUCCESS')):
                _, scores, _ = hl.hwe_normalized_pca(mt.GT,
                                                     k=10,
                                                     compute_loadings=False)
                scores.write(
                    get_relatedness_path(pop, extension='all_scores.ht'),
                    args.overwrite)
            scores = hl.read_table(
                get_relatedness_path(pop, extension='all_scores.ht'))
            mt = mt.annotate_cols(scores=scores[mt.col_key].scores)
            # For EUR, required highmem machines with SSDs (Needed ~6T of hdfs space, so 20 workers + 100 pre-emptibles ran in ~7 hours)
            relatedness_ht = hl.pc_relate(
                mt.GT,
                min_individual_maf=0.05,
                scores_expr=mt.scores,
                min_kinship=0.05,
                statistics='kin',
                block_size=4096 if pop == 'EUR' else 512).key_by()
            relatedness_ht.write(get_relatedness_path(pop, extension='ht'),
                                 args.overwrite)
            relatedness_ht = hl.read_table(
                get_relatedness_path(pop, extension='ht'))

            # identify individuals in pairs to remove
            related_samples_to_remove = hl.maximal_independent_set(
                relatedness_ht.i, relatedness_ht.j, False)
            mt_unrel = mt.filter_cols(hl.is_defined(
                related_samples_to_remove[mt.col_key]),
                                      keep=False)
            mt_rel = mt.filter_cols(hl.is_defined(
                related_samples_to_remove[mt.col_key]),
                                    keep=True)

            mt_unrel.write(get_relatedness_path(pop, True, 'mt'),
                           args.overwrite)
            mt_rel.write(get_relatedness_path(pop, extension='mt'),
                         args.overwrite)

    if args.ukb_prune_pca_project:
        for pop in POPS:
            mt_unrel = hl.read_matrix_table(
                get_relatedness_path(pop, True, 'mt'))
            mt_rel = hl.read_matrix_table(
                get_relatedness_path(pop, extension='mt'))

            # Removing individuals
            pruned_inds = hl.import_table(get_pruned_tsv_path(), key='s')
            mt_rel = mt_rel.filter_cols(
                hl.is_defined(pruned_inds[mt_rel.col_key]))
            mt_unrel = mt_unrel.filter_cols(
                hl.is_defined(pruned_inds[mt_unrel.col_key]))

            # Removing sites
            window = '1e6' if pop != 'EUR' else '1e7'
            pruned_ht = hl.read_table(get_ukb_grm_pruned_ht_path(pop, window))
            mt_unrel = mt_unrel.filter_rows(
                hl.is_defined(pruned_ht[mt_unrel.row_key]))

            mt_unrel = mt_unrel.repartition(500).checkpoint(
                hl.utils.new_temp_file())

            pop = pop if window == '1e6' else f'{pop}_{window}'
            run_pca(
                mt_unrel,
                get_relatedness_path(pop, unrelated=True, extension='') + '.',
                args.overwrite)
            pca_loadings = hl.read_table(
                get_relatedness_path(pop,
                                     unrelated=True,
                                     extension='loadings.ht'))
            ht = project_individuals(pca_loadings, mt_rel)
            ht.write(
                get_relatedness_path(pop, extension='scores_projected.ht'),
                args.overwrite)
            hl.read_table(
                get_relatedness_path(
                    pop, extension='scores_projected.ht')).export(
                        get_relatedness_path(
                            pop, extension='scores_projected.txt.bgz'))

    if args.generate_covariates:
        hts = []
        for pop in POPS:
            pop_path = pop if pop != 'EUR' else 'EUR_1e7'
            ht = hl.read_table(
                get_relatedness_path(pop_path,
                                     extension='scores_projected.ht'))
            hts.append(ht.annotate(pop=pop, related=True))
            ht = hl.read_table(
                get_relatedness_path(pop_path, True, extension='scores.ht'))
            ht = ht.transmute(
                **{f'PC{i}': ht.scores[i - 1]
                   for i in range(1, 21)})
            hts.append(ht.annotate(pop=pop, related=False))

        ht = hts[0].union(*hts[1:])
        cov_ht = hl.import_table(get_age_sex_tsv_path(),
                                 impute=True,
                                 force=True,
                                 quote='"',
                                 key='userId').select('age', 'sex')
        cov_ht = cov_ht.annotate(age_sex=cov_ht.age * cov_ht.sex,
                                 age2=hl.int32(cov_ht.age**2),
                                 age2_sex=hl.int32(cov_ht.age**2) * cov_ht.sex)
        ht = ht.annotate(**cov_ht.key_by(userId=hl.str(cov_ht.userId))[ht.key])
        ht.write(get_covariates_ht_path(), args.overwrite)

    get_filtered_mt(imputed=False).cols().export(get_final_sample_set())
Example #18
# Preparing for PCA
for_pca = filter_to_autosomes(mt)
for_pca = for_pca.filter_rows(for_pca.n_alleles == 2)

# Performing the PCA
sample_num = for_pca.cols().count()

_, scores, _ = hl.hwe_normalized_pca(
    for_pca.GT, k=max(1, min(sample_num // 3, 10)), compute_loadings=False
)

relatedness_ht = hl.pc_relate(
    for_pca.GT,
    min_individual_maf=0.01,
    scores_expr=scores[for_pca.col_key].scores,
    block_size=4096,
    min_kinship=0.05,
    statistics="kin",
)

pairs = relatedness_ht.filter(relatedness_ht["kin"] > RELATEDNESS)

related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, False)

mt = mt.filter_cols(hl.is_defined(related_samples_to_remove[mt.col_key]), keep=False)

# Wrapping up: saving the relatedness Table and dataset MatrixTable to disk
relatedness_ht.write("relatedness.ht", overwrite=True)

mt.write("sampleqc_pass.mt", overwrite=True)
Example #19
def main(args):

    # Init Hail
    hl.init(default_reference=args.default_reference)

    if not args.skip_compute_pc_relate:

        if not args.skip_filter_data:
            # Read MatrixTable
            mt = hl.read_matrix_table(args.mt_input_path)

            # filter variants (bi-allelic, high-callrate, common SNPs)
            logger.info(
                f"Filtering to bi-allelic, high-callrate, common SNPs ({args.maf_threshold}) for pc_relate..."
            )

            mt = (mt.filter_rows(
                (hl.len(mt.alleles) == 2)
                & hl.is_snp(mt.alleles[0], mt.alleles[1])
                & (hl.agg.mean(mt.GT.n_alt_alleles()) / 2 > args.maf_threshold)
                & (hl.agg.fraction(hl.is_defined(mt.GT)) > 0.99)
                & ~mt.was_split).repartition(500, shuffle=False))

            # keep only GT entry field and force to evaluate expression
            (mt.select_entries(mt.GT).write(
                f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.filtered_high_confidence_variants.mt',
                overwrite=args.overwrite))

        mt = hl.read_matrix_table(
            f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.filtered_high_confidence_variants.mt'
        )

        if not args.skip_prune_ld:
            # LD pruning
            # Avoid filtering entries (genotypes) for missingness before running LD pruning
            # Zulip Hail support issue -> "BlockMatrix trouble when running pc_relate"
            # mt = mt.unfilter_entries()

            # Prune variants in linkage disequilibrium.
            # Return a table with nearly uncorrelated variants

            logger.info(
                f'Pruning variants in LD from MT with {mt.count_rows()} variants...'
            )

            pruned_variant_table = hl.ld_prune(mt.GT, r2=args.r2)

            # Keep LD-pruned variants
            pruned_mt = (mt.filter_rows(hl.is_defined(
                pruned_variant_table[mt.row_key]),
                                        keep=True))
            pruned_mt.write(
                f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.ld_pruned.mt',
                overwrite=args.overwrite)

        pruned_mt = hl.read_matrix_table(
            f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.ld_pruned.mt')
        v, s = pruned_mt.count()
        logger.info(f'{s} samples, {v} variants found in LD-pruned MT')

        pruned_mt = pruned_mt.select_entries(
            GT=hl.unphased_diploid_gt_index_call(pruned_mt.GT.n_alt_alleles()))

        # run pc_relate method...compute all stats
        logger.info('Running PCA for PC-Relate...')
        eig, scores, _ = hl.hwe_normalized_pca(pruned_mt.GT,
                                               k=10,
                                               compute_loadings=False)
        scores.write(
            f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.pruned.pca_scores_for_pc_relate.ht',
            overwrite=args.overwrite)

        logger.info('Running PC-Relate...')
        scores = hl.read_table(
            f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.pruned.pca_scores_for_pc_relate.ht'
        )
        relatedness_ht = hl.pc_relate(
            call_expr=pruned_mt.GT,
            min_individual_maf=args.min_individual_maf,
            scores_expr=scores[pruned_mt.col_key].scores,
            block_size=4096,
            min_kinship=args.min_kinship,
            statistics='all')

        logger.info('Writing relatedness table...')
        # Write/export table to file
        relatedness_ht.write(
            f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.relatedness_kinship.ht',
            overwrite=args.overwrite)

        # Write PCs table to file (if specified)
        # if args.write_to_file:
        #    # Export table to file
        #    relatedness_ht.export(output=f'{args.ht_output_path}.tsv.bgz')

    # retrieve maximal independent set of related samples
    logger.info('Getting optimal set of related samples to prune...')

    relatedness_ht = hl.read_table(
        f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.relatedness_kinship.ht')

    relatedness_ht = (relatedness_ht.flatten().rename({
        'i.s': 'i',
        'j.s': 'j'
    }).repartition(100))

    # import trios info
    fam = import_fam_ht()
    mat_ids = hl.set(fam.mat_id.collect())
    fat_ids = hl.set(fam.pat_id.collect())

    # rank samples by retention priority (e.g. cases over controls)
    tb_rank = make_sample_rank_table(get_sample_meta_data())
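    # make_sample_rank_table is defined elsewhere; a hypothetical sketch of
    # the shape it likely returns (a table keyed by sample with an integer
    # `rank`, cases ranked ahead of controls, used to break ties below):
    # def make_sample_rank_table(meta_ht):
    #     ht = meta_ht.select(is_case=meta_ht.is_case)
    #     ht = ht.order_by(hl.desc(ht.is_case)).add_index(name='rank')
    #     return ht.key_by('s')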

    # apply min kinship to consider related pairs
    relatedness_ht = (relatedness_ht.filter(relatedness_ht.kin > MIN_KINSHIP))

    # run maximal_independent_set stratified by groups
    # Note: this method fails when considering all pairs together (e.g. it
    # removes most of the index cases in trios; we want to keep them, since
    # they are mostly affected individuals rather than parents).

    # defining pairs group
    # TODO: check groups with updated fam file
    relatedness_ht = (relatedness_ht.annotate(pairs_group=hl.case().when(
        relatedness_ht.kin > 0.40, 'twins_or_dups').when(
            mat_ids.contains(relatedness_ht.i)
            | mat_ids.contains(relatedness_ht.j), 'pairs_child_mat').when(
                fat_ids.contains(relatedness_ht.i)
                | fat_ids.contains(relatedness_ht.j),
                'pairs_child_fat').default('pairs_others')))

    groups = (relatedness_ht.aggregate(
        hl.agg.collect_as_set(relatedness_ht['pairs_group'])))
    tbs = []
    for pair_group in groups:
        pair_ht = relatedness_ht.filter(
            relatedness_ht.pairs_group == pair_group)
        tb = get_related_samples_to_drop(rank_table=tb_rank,
                                         relatedness_ht=pair_ht)
        tbs.append(tb)

    related_samples_to_remove = hl.Table.union(*tbs)

    related_samples_to_remove.describe()

    related_samples_to_remove = related_samples_to_remove.checkpoint(
        f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.related_samples_to_remove.ht',
        overwrite=args.overwrite)

    if args.write_to_file:
        (related_samples_to_remove.flatten().export(
            f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.related_samples_to_remove.tsv'
        ))

    hl.stop()
Example #20
def main(args):
    global output_prefix
    output_prefix = args.output_dir.rstrip("/") + "/" + splitext(
        basename(args.input_mt))[0]

    if args.compute_qc_mt:
        qc_mt = get_qc_mt(hl.read_matrix_table(args.input_mt))
        qc_mt = qc_mt.repartition(n_partitions=200)
        qc_mt.write(path('qc.mt'), overwrite=args.overwrite)

    if args.compute_qc_metrics:
        logger.info("Computing sample QC")
        mt = filter_to_autosomes(hl.read_matrix_table(args.input_mt))
        strats = {
            'bi_allelic': bi_allelic_expr(mt),
            'multi_allelic': ~bi_allelic_expr(mt)
        }
        for strat, filter_expr in strats.items():
            strat_sample_qc_ht = hl.sample_qc(
                mt.filter_rows(filter_expr)).cols()
            strat_sample_qc_ht.write(path(f'{strat}_sample_qc.ht'),
                                     overwrite=args.overwrite)
        strat_hts = [
            hl.read_table(path(f'{strat}_sample_qc.ht')) for strat in strats
        ]
        sample_qc_ht = strat_hts.pop()
        sample_qc_ht = sample_qc_ht.select(
            sample_qc=merge_sample_qc_expr([sample_qc_ht.sample_qc] + [
                strat_hts[i][sample_qc_ht.key].sample_qc
                for i in range(0, len(strat_hts))
            ]))
        sample_qc_ht.write(path('sample_qc.ht'), overwrite=args.overwrite)

    if args.compute_callrate_mt:
        callrate_mt = compute_callrate_mt(
            hl.read_matrix_table(args.input_mt),
            hl.import_locus_intervals(exome_calling_intervals_path))
        callrate_mt.write(path('callrate.mt'), args.overwrite)

    if args.run_platform_pca:
        eigenvalues, scores_ht, loadings_ht = run_platform_pca(
            hl.read_matrix_table(path('callrate.mt')))
        scores_ht.write(path('platform_pca_scores.ht'),
                        overwrite=args.overwrite)
        loadings_ht.write(path('platform_pca_loadings.ht'),
                          overwrite=args.overwrite)

    if args.assign_platforms:
        platform_ht = assign_platform_from_pcs(
            hl.read_table(path('platform_pca_scores.ht')),
            hdbscan_min_cluster_size=args.hdbscan_min_cluster_size,
            hdbscan_min_samples=args.hdbscan_min_samples)
        platform_ht.write(path('platform_pca_results.ht'),
                          overwrite=args.overwrite)

    if args.impute_sex:
        sex_ht = infer_sex(hl.read_matrix_table(path('qc.mt')),
                           hl.read_matrix_table(args.input_mt),
                           hl.read_table(path('platform_pca_results.ht')),
                           args.male_threshold, args.female_threshold,
                           args.min_male_y_sites_called,
                           args.max_y_female_call_rate,
                           args.min_y_male_call_rate)
        sex_ht.write(path('sex.ht'), overwrite=args.overwrite)

    if args.run_pc_relate:
        logger.info('Running PCA for PC-Relate')
        qc_mt = hl.read_matrix_table(path('qc.mt')).unfilter_entries()
        eig, scores, _ = hl.hwe_normalized_pca(qc_mt.GT,
                                               k=10,
                                               compute_loadings=False)
        scores.write(path('pruned.pca_scores.ht'), args.overwrite)

        logger.info('Running PC-Relate')
        logger.warning(
            "PC-relate requires SSDs and doesn't work with preemptible workers!"
        )
        scores = hl.read_table(path('pruned.pca_scores.ht'))
        relatedness_ht = hl.pc_relate(qc_mt.GT,
                                      min_individual_maf=0.05,
                                      scores_expr=scores[qc_mt.col_key].scores,
                                      block_size=4096,
                                      min_kinship=args.min_emission_kinship,
                                      statistics='all')
        relatedness_ht.write(path('relatedness.ht'), args.overwrite)

    if args.filter_dups:
        logger.info("Filtering duplicate samples")
        sample_qc_ht = hl.read_table(path('sample_qc.ht'))
        samples_rankings_ht = sample_qc_ht.select(
            rank=-1 * sample_qc_ht.sample_qc.dp_stats.mean)
        dups_ht = filter_duplicate_samples(
            hl.read_table(path('relatedness.ht')), samples_rankings_ht)
        dups_ht.write(path('duplicates.ht'), overwrite=args.overwrite)

    if args.infer_families:
        logger.info("Inferring families")
        duplicates_ht = hl.read_table(path('duplicates.ht'))
        dups_to_remove = duplicates_ht.aggregate(
            hl.agg.explode(lambda x: hl.agg.collect_as_set(x.s),
                           duplicates_ht.filtered))
        ped = infer_families(hl.read_table(path('relatedness.ht')),
                             hl.read_table(path('sex.ht')), dups_to_remove)
        ped.write(path('pedigree.ped'))

    if args.filter_related_samples:
        logger.info("Filtering related samples")
        related_pairs_ht, related_pairs_tie_breaker = rank_related_samples(
            hl.read_table(path('relatedness.ht')), hl.read_table(args.meta),
            hl.read_table(path('sample_qc.ht')),
            hl.import_fam(path('pedigree.ped'), delimiter="\t"))

        related_samples_to_drop_ht = hl.maximal_independent_set(
            related_pairs_ht.i,
            related_pairs_ht.j,
            keep=False,
            tie_breaker=related_pairs_tie_breaker)
        related_samples_to_drop_ht = related_samples_to_drop_ht.key_by()
        related_samples_to_drop_ht = related_samples_to_drop_ht.select(
            **related_samples_to_drop_ht.node)
        related_samples_to_drop_ht = related_samples_to_drop_ht.key_by('s')
        related_samples_to_drop_ht.write(path('related_samples_to_drop.ht'),
                                         overwrite=args.overwrite)

    if args.run_pca:
        logger.info("Running population PCA")
        pca_evals, pop_pca_scores_ht, pop_pca_loadings_ht = run_pca_with_relateds(
            hl.read_matrix_table(path('qc.mt')),
            hl.read_table(path('related_samples_to_drop.ht')), args.n_pcs)
        pop_pca_loadings_ht.write(path('pop_pca_loadings.ht'), args.overwrite)
        pop_pca_scores_ht.write(path('pop_pca_scores.ht'), args.overwrite)

    if args.assign_pops:
        logger.info("Assigning global population labels")
        pop_pca_scores_ht = hl.read_table(path("pop_pca_scores.ht"))
        gnomad_meta_ht = get_gnomad_meta('exomes').select("pop")[
            pop_pca_scores_ht.key]
        pop_pca_scores_ht = pop_pca_scores_ht.annotate(known_pop=hl.or_missing(
            gnomad_meta_ht.pop != "oth", gnomad_meta_ht.pop))
        pop_ht, pops_rf_model = assign_population_pcs(
            pop_pca_scores_ht,
            pc_cols=pop_pca_scores_ht.scores[:args.n_pcs],
            known_col='known_pop',
            min_prob=args.min_pop_prob)

        pop_ht.write(path('pop.ht'), args.overwrite)
        with hl.hadoop_open(path('pop_rf_model.pkl'), 'wb') as out:
            pickle.dump(pops_rf_model, out)

    if args.assign_subpops:
        qc_mt = hl.read_matrix_table(path('qc.mt'))
        pop_ht = hl.read_table(path('pop.ht'))
        meta_ht = hl.read_table(args.meta)[qc_mt.col_key]
        qc_mt = qc_mt.annotate_cols(pop=pop_ht[qc_mt.col_key].pop,
                                    is_case=meta_ht.is_case,
                                    country=meta_ht.country)

        platform_specific_intervals = get_platform_specific_intervals(
            hl.read_table(path('platform_pca_loadings.ht')), threshold=0.01)
        logger.info(
            f'Excluding {len(platform_specific_intervals)} platform-specific intervals for subpop PCA.'
        )
        qc_mt = hl.filter_intervals(qc_mt,
                                    platform_specific_intervals,
                                    keep=False)

        assign_and_write_subpops(
            qc_mt,
            hl.read_table(path('related_samples_to_drop.ht')),
            min_samples_for_subpop=args.min_samples_for_subpop,
            n_pcs=args.n_pcs,
            min_pop_prob=args.min_pop_prob,
            overwrite=args.overwrite,
            pop_ann='pop',
            subpop_ann='country',
            include_in_pop_count=qc_mt.is_case)

    if args.run_kgp_pca:
        logger.info("Joining data with 1000 Genomes")
        qc_mt = hl.read_matrix_table(
            path('qc.mt')).select_rows().select_entries("GT")
        qc_mt = qc_mt.select_cols(known_pop=hl.null(hl.tstr),
                                  known_subpop=hl.null(hl.tstr))
        qc_mt = qc_mt.key_cols_by(_kgp=False, *qc_mt.col_key)

        kgp_mt = hl.read_matrix_table(
            kgp_phase3_genotypes_mt_path()).select_rows()
        kgp_mt = kgp_mt.select_cols(known_pop=kgp_mt.super_pops.get(
            kgp_mt.population, "oth").lower(),
                                    known_subpop=kgp_mt.population.lower())
        kgp_mt = kgp_mt.filter_rows(hl.is_defined(
            qc_mt.rows()[kgp_mt.row_key]))
        kgp_mt = filter_rows_for_qc(kgp_mt)
        kgp_mt = kgp_mt.key_cols_by(_kgp=True, *kgp_mt.col_key)

        union_kgp_qc_mt = qc_mt.union_cols(kgp_mt)
        union_kgp_qc_mt.write(path('union_kgp_qc.mt'),
                              overwrite=args.overwrite)

        logger.info("Computing PCA on data with 1000 Genomes")
        union_kgp_qc_mt = hl.read_matrix_table(path('union_kgp_qc.mt'))
        related_samples_to_drop_ht = hl.read_table(
            path('related_samples_to_drop.ht'))
        related_samples_to_drop_ht = related_samples_to_drop_ht.key_by(
            _kgp=False, *related_samples_to_drop_ht.key)
        pca_evals, union_kgp_pca_scores_ht, union_kgp_pca_loadings_ht = run_pca_with_relateds(
            union_kgp_qc_mt, related_samples_to_drop_ht, args.n_kgp_pcs)
        union_kgp_pca_loadings_ht.write(path('union_kgp_pca_loadings.ht'),
                                        args.overwrite)
        union_kgp_pca_scores_ht.write(path('union_kgp_pca_scores.ht'),
                                      args.overwrite)

    if args.assign_pops_kgp:
        logger.info("Assigning populations based on 1000 Genomes labels")
        union_kgp_qc_mt = hl.read_matrix_table(path('union_kgp_qc.mt'))
        union_kgp_pca_scores_ht = hl.read_table(
            path('union_kgp_pca_scores.ht'))
        union_kgp_pca_scores_ht = union_kgp_pca_scores_ht.annotate(
            known_pop=union_kgp_qc_mt[union_kgp_pca_scores_ht.key].known_pop)
        union_kgp_pop_ht, union_kgp_pop_rf_model = assign_population_pcs(
            union_kgp_pca_scores_ht,
            pc_cols=union_kgp_pca_scores_ht.scores[:args.n_kgp_pcs],
            known_col='known_pop',
            min_prob=args.min_kgp_pop_prob)

        union_kgp_pop_ht.write(path('union_kgp_pop.ht'), args.overwrite)

        with hl.hadoop_open(path('union_kgp_pop_rf_model.pkl'), 'wb') as out:
            pickle.dump(union_kgp_pop_rf_model, out)

    if args.assign_subpops_kgp:
        union_kgp_qc_mt = hl.read_matrix_table(path('union_kgp_qc.mt'))
        meta_ht = hl.read_table(args.meta)
        union_kgp_pop_ht = hl.read_table(path('union_kgp_pop.ht'))
        union_kgp_qc_mt = union_kgp_qc_mt.annotate_cols(
            is_case=meta_ht[union_kgp_qc_mt.col_key].is_case,
            pop=union_kgp_pop_ht[union_kgp_qc_mt.col_key].pop)

        platform_specific_intervals = get_platform_specific_intervals(
            hl.read_table(path('platform_pca_loadings.ht')))
        logger.info(
            f'Excluding {len(platform_specific_intervals)} platform-specific intervals for subpop PCA.'
        )
        union_kgp_qc_mt = hl.filter_intervals(union_kgp_qc_mt,
                                              platform_specific_intervals,
                                              keep=False)

        related_samples_to_drop_ht = hl.read_table(
            path('related_samples_to_drop.ht'))
        related_samples_to_drop_ht = related_samples_to_drop_ht.key_by(
            _kgp=False, *related_samples_to_drop_ht.key)

        assign_and_write_subpops(
            union_kgp_qc_mt,
            related_samples_to_drop_ht,
            min_samples_for_subpop=args.min_samples_for_subpop,
            n_pcs=args.n_kgp_pcs,
            min_pop_prob=args.min_kgp_pop_prob,
            overwrite=args.overwrite,
            pop_ann='pop',
            subpop_ann='known_subpop',
            include_in_pop_count=union_kgp_qc_mt.is_case,
            files_prefix='union_kgp_')

    if args.apply_stratified_filters:
        logger.info("Computing stratified QC")
        for variant_class_prefix in ['', 'bi_allelic_', 'multi_allelic_']:
            sample_qc_ht = hl.read_table(
                path(f'{variant_class_prefix}sample_qc.ht'))
            pop_ht = hl.read_table(path('pops.ht'))
            platform_ht = hl.read_table(path('platform_pca_results.ht'))
            sample_qc_ht = sample_qc_ht.annotate(
                qc_pop=pop_ht[sample_qc_ht.key].pop,
                qc_platform=platform_ht[sample_qc_ht.key].qc_platform)
            stratified_metrics_ht = compute_stratified_metrics_filter(
                sample_qc_ht, args.filtering_qc_metrics.split(","),
                ['qc_pop', 'qc_platform'])
            stratified_metrics_ht.write(
                path(f'{variant_class_prefix}stratified_metrics_filters.ht'),
                overwrite=args.overwrite)
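        # Hedged sketch of the outlier logic compute_stratified_metrics_filter
        # implements in gnomAD-style pipelines (the 4-MAD cutoff here is an
        # assumption): within each (qc_pop, qc_platform) stratum, a sample
        # fails a metric when it falls outside median +/- 4 * MAD, i.e.
        # conceptually per metric:
        #     fails_metric = hl.abs(metric - stratum_median) > 4 * stratum_mad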

    if args.write_full_meta:
        logger.info("Writing metadata table")

        # List all tables to join with the base meta
        meta_annotation_hts = [
            hl.read_table(path('platform_pca_results.ht')).rename(
                {'scores': 'platform_pc_scores'}),
            hl.read_table(path('sex.ht')),
            flatten_duplicate_samples_ht(hl.read_table(path('duplicates.ht'))),
            hl.read_table(path('related_samples_to_drop.ht')).select(
                related_filtered=True),
            hl.read_table(path('pca_scores.ht')).rename(
                {'scores': 'pop_pc_scores'}),
            hl.read_table(path('pops.ht')).select('pop'),
            hl.read_table(path('nfe.pca_scores.ht')).rename(
                {'scores': 'nfe_pc_scores'}),
            hl.read_table(path('subpops.nfe.ht')).select('subpop')
        ]

        # union_kgp_pops_ht = hl.read_table(path('union_kgp_pops.ht'))
        # union_kgp_pops_ht = union_kgp_pops_ht.filter(~union_kgp_pops_ht._kgp).key_by('s')
        # union_kgp_pops_ht = union_kgp_pops_ht.select(kgp_pop=union_kgp_pops_ht.pop)
        # meta_annotation_hts.append(union_kgp_pops_ht)
        #
        # union_kgp_pca_scores_ht = hl.read_table(path('union_kgp_pca_scores.ht')).rename({'scores': 'kgp_pop_pc_scores'})
        # union_kgp_pca_scores_ht = union_kgp_pca_scores_ht.filter(~union_kgp_pca_scores_ht._kgp).key_by('s')
        # meta_annotation_hts.append(union_kgp_pca_scores_ht)

        gnomad_meta_ht = get_gnomad_meta('exomes')
        gnomad_meta_ht = gnomad_meta_ht.select(
            gnomad_pop=gnomad_meta_ht.pop, gnomad_subpop=gnomad_meta_ht.subpop)
        meta_annotation_hts.append(gnomad_meta_ht)

        for variant_class_prefix in ['', 'bi_allelic_', 'multi_allelic_']:
            sample_qc_ht = hl.read_table(
                path(f'{variant_class_prefix}sample_qc.ht'))
            stratified_metrics_filters_ht = hl.read_table(
                path(f'{variant_class_prefix}stratified_metrics_filters.ht'))
            if variant_class_prefix:
                sample_qc_ht = sample_qc_ht.rename(
                    {'sample_qc': f'{variant_class_prefix}sample_qc'})
                stratified_metrics_filters_ht = stratified_metrics_filters_ht.rename(
                    {
                        f: f'{variant_class_prefix}{f}'
                        for f in list(stratified_metrics_filters_ht.globals) +
                        list(stratified_metrics_filters_ht.row_value)
                    })
            meta_annotation_hts.extend(
                [sample_qc_ht, stratified_metrics_filters_ht])

        meta_ht = hl.read_table(args.meta)
        meta_ht = meta_ht.annotate_globals(
            **{
                name: expr
                for ann_ht in meta_annotation_hts
                for name, expr in ann_ht.index_globals().items()
            })

        meta_ht = meta_ht.annotate(
            **{
                name: expr
                for ann_ht in meta_annotation_hts
                for name, expr in ann_ht[meta_ht.key].items()
            })

        filtering_col_prefix = '' if args.filtering_variant_class == 'all' else args.filtering_variant_class + "_"
        meta_ht = meta_ht.annotate_globals(
            filtering_variant_class=args.filtering_variant_class)
        meta_ht = meta_ht.annotate(sample_filters=add_filters_expr(
            filters={
                "ambiguous sex": hl.is_missing(meta_ht.is_female),
                'call_rate': meta_ht.sample_qc.call_rate < args.min_call_rate,
                'duplicate': hl.is_defined(meta_ht.dup_filtered)
                & meta_ht.dup_filtered,
                'related': meta_ht.related_filtered
            },
            current_filters=meta_ht[
                f'{filtering_col_prefix}pop_platform_filters']))

        meta_ht.write(path('full_meta.ht'), overwrite=args.overwrite)
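        # Hedged sketch (an assumption, not the library's definition) of what a
        # helper like add_filters_expr can do: fold each boolean condition into
        # a set of failed-filter names, starting from any pre-existing filters:
        #     def add_filters_expr(filters, current_filters=None):
        #         acc = (current_filters if current_filters is not None
        #                else hl.empty_set(hl.tstr))
        #         return hl.fold(
        #             lambda s, names: s.union(names), acc,
        #             [hl.if_else(cond, hl.set([name]), hl.empty_set(hl.tstr))
        #              for name, cond in filters.items()])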
    # pruned_mt.write(
    #    f"{tmp_dir}/ddd-elgh-ukbb/chr1_chr20_XY_ldpruned.mt", overwrite=True)
    pruned_mt = hl.read_matrix_table(
        f"{temp_dir}/ddd-elgh-ukbb/chr1_chr20_XY_ldpruned.mt")
    # PC-Relate: first strip phasing so all genotypes are unphased diploid calls
    pruned_mt = pruned_mt.select_entries(
        GT=hl.unphased_diploid_gt_index_call(pruned_mt.GT.n_alt_alleles()))

    eig, scores, _ = hl.hwe_normalized_pca(pruned_mt.GT,
                                           k=10,
                                           compute_loadings=False)
    #  scores.write(
    #      f"{tmp_dir}/ddd-elgh-ukbb/chr1_chr20_XY_pruned.pca_scores.ht", overwrite=True)

    relatedness_ht = hl.pc_relate(pruned_mt.GT,
                                  min_individual_maf=0.05,
                                  scores_expr=scores[pruned_mt.col_key].scores,
                                  block_size=4096,
                                  min_kinship=0.05,
                                  statistics='kin2')
    # relatedness_ht.write(
    #     f"{tmp_dir}/ddd-elgh-ukbb/chr1_chr20_XY_relatedness.ht", overwrite=True)
    pairs = relatedness_ht.filter(relatedness_ht['kin'] > 0.125)
    related_samples_to_remove = hl.maximal_independent_set(pairs.i,
                                                           pairs.j,
                                                           keep=False)
    # related_samples_to_remove.write(
    #     f"{tmp_dir}/ddd-elgh-ukbb/chr1_chr20_XY_related_samples_to_remove.ht", overwrite=True)

    pca_mt = pruned_mt.filter_cols(hl.is_defined(
        related_samples_to_remove[pruned_mt.col_key]),
                                   keep=False)
    # Complement of pca_mt: keep only the related samples
    related_mt = pruned_mt.filter_cols(hl.is_defined(
        related_samples_to_remove[pruned_mt.col_key]),
                                       keep=True)
Exemplo n.º 22
0
# Writing out the matrix table with annotated filter information
# Writing out the joint matrix makes the following QC steps run faster
mt.write('path/for/joint/matrix')

# In[9]:

# Reading in the matrix table with all of the site mts combined
joint_data = 'path/for/joint/matrix'
mt_joint = hl.read_matrix_table(joint_data)
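# Note: the write-then-read pattern above is what MatrixTable.checkpoint does
# in a single call; an equivalent sketch (same hypothetical path):
#     mt_joint = mt.checkpoint('path/for/joint/matrix')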

# In[12]:

# Calculates relatedness using pc_relate for all samples in the matrix table
# Annotates a column flagging samples that fail the relatedness filter as True

pc_rel = hl.pc_relate(mt_joint.GT, 0.001, k=10, statistics='kin')
pairs = pc_rel.filter(pc_rel['kin'] > 0.125)
related_samples_to_remove = hl.maximal_independent_set(pairs.i,
                                                       pairs.j,
                                                       keep=False)
mt_joint = mt_joint.annotate_cols(
    related_filter=hl.is_defined(related_samples_to_remove[mt_joint.col_key]))
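# A hypothetical downstream use of the flag, e.g. to keep unrelated samples:
#     mt_unrelated = mt_joint.filter_cols(~mt_joint.related_filter)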

# In[11]:
'''
--- Conducting QC ---

QC Steps:
    snp call rate
    sample call rate
    sex violations
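# Hedged sketch of the QC steps listed in the (truncated) docstring above;
# the thresholds and the reported-sex field are illustrative assumptions:
#     mt = hl.variant_qc(mt)
#     mt = mt.filter_rows(mt.variant_qc.call_rate >= 0.95)  # SNP call rate
#     mt = hl.sample_qc(mt)
#     mt = mt.filter_cols(mt.sample_qc.call_rate >= 0.97)   # sample call rate
#     imputed = hl.impute_sex(mt.GT)                        # sex violations
#     mt = mt.filter_cols(
#         imputed[mt.col_key].is_female == mt.reported_is_female)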
Exemplo n.º 23
0
@author: nbaya
"""

import hail as hl
hl.init(log='/tmp/foo.log')

wd = 'gs://qc-nbaya/spark/array_May2019/preimputation/spark_preimp7/'

bfile = wd + 'SPARK.27K.genotype.20190501.hg19_preimp7.founders'
#print(f'Using bfile: {bfile}')
#mt = hl.import_plink(bed=bfile+'.bed',
#                     bim=bfile+'.bim',
#                     fam=bfile+'.fam')
#
#mt = mt.checkpoint(bfile+'.mt')

mt = hl.read_matrix_table(bfile + '.mt')

min_kinship = 0.09375 / 2
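# 0.09375 is the midpoint between the expected kinship of 2nd-degree (0.125)
# and 3rd-degree (0.0625) relative pairs; halving it (0.046875) plausibly
# retains pairs down to roughly the 3rd-degree boundary (interpretation
# assumed, not stated in the original).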

pcrelate = hl.pc_relate(call_expr=mt.GT,
                        min_individual_maf=0.01,
                        k=20,
                        min_kinship=min_kinship,
                        statistics='kin')

ct = pcrelate.count()

print(f'\n############\ncount: {ct}\n############')

pcrelate.export(bfile + '.pc_relate.v2.tsv.bgz')