예제 #1
0
def test_king_filtered_entries_no_error():
    """Check that KING runs to completion on a matrix with filtered entries.

    The test has no assertions: it passes as long as nothing raises while
    the kinship result is evaluated.
    """
    prefix = resource('balding-nichols-1024-variants-4-samples-3-populations')
    dataset = hl.import_plink(
        bed=f'{prefix}.bed',
        bim=f'{prefix}.bim',
        fam=f'{prefix}.fam',
    )
    # Keep each entry according to an independent random boolean drawn with
    # parameter 0.5, so the genotype matrix has filtered entries.
    sparse = dataset.filter_entries(hl.rand_bool(0.5))
    kinship = hl.king(sparse.GT)
    # Presumably forces the lazy pipeline to execute (judging by the name).
    kinship._force_count_rows()
예제 #2
0
def query():
    """Query script entry point.

    Steps:
      1. Read the matrix table at ``HGDP1KG_TOBWGS`` and keep the columns
         whose inferred population is ``'nfe'`` or whose sample ID contains
         ``'TOB'``.
      2. Compute KING kinship estimates and persist them to
         ``output_path('king_kinship_estimate_NFE.ht')``.
      3. Treat every pair of distinct samples with ``phi > 0.125`` as
         related and ask ``hl.maximal_independent_set`` (with ``keep=False``)
         for the nodes to remove.
      4. Print the number of samples to remove and write them as an HTML
         table to ``output_path('related_samples.html', 'web')``.
    """

    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    mt = mt.filter_cols(
        (mt.hgdp_1kg_metadata.population_inference.pop == 'nfe')
        | (mt.s.contains('TOB'))
    )
    # Remove related samples (at the 2nd degree or closer)
    king = hl.king(mt.GT)
    king_path = output_path('king_kinship_estimate_NFE.ht')
    # checkpoint() writes to the path and returns the result read back from
    # disk, so the downstream steps reuse the saved estimates instead of
    # recomputing KING from the lazy expression.
    king = king.checkpoint(king_path)
    ht = king.entries()
    related_samples = ht.filter((ht.s_1 != ht.s) & (ht.phi > 0.125))
    # Third positional argument is keep=False: return the nodes that are NOT
    # part of the maximal independent set, i.e. the samples to drop.
    related_samples_to_remove = hl.maximal_independent_set(
        related_samples.s_1, related_samples.s, False
    )
    # Collect once; the length of the collected list is the row count, so a
    # separate count() (a second evaluation of the table) is unnecessary.
    removed_nodes = related_samples_to_remove.node.collect()
    n_related_samples = len(removed_nodes)
    print(f'related_samples_to_remove.count() = {n_related_samples}')
    # save as html
    html = pd.DataFrame({'related_individual': removed_nodes}).to_html()
    plot_filename_html = output_path('related_samples.html', 'web')
    with hl.hadoop_open(plot_filename_html, 'w') as f:
        f.write(html)
예제 #3
0
def test_king_large():
    """Compare Hail's KING estimates for ``fastlmmTest`` with a stored ``.kin0`` file.

    The PLINK fileset is imported with ``reference_genome=None``; the
    comparison itself is delegated to ``assert_c_king_same_as_hail_king``.
    """
    prefix = resource('fastlmmTest')
    plink_files = {ext: f'{prefix}.{ext}' for ext in ('bed', 'bim', 'fam')}
    dataset = hl.import_plink(reference_genome=None, **plink_files)
    hail_kinship = hl.king(dataset.GT)
    expected_kin0 = resource('fastlmmTest.kin0.bgz')
    assert_c_king_same_as_hail_king(expected_kin0, hail_kinship)
예제 #4
0
def test_king_small():
    """Compare Hail's KING estimates for the small Balding-Nichols fileset
    with the stored ``.kin0`` file of the same name.

    The expected values are presumably produced by the reference C
    implementation of KING (judging by the helper's name).
    """
    dataset_name = 'balding-nichols-1024-variants-4-samples-3-populations'
    stem = resource(dataset_name)
    genotypes = hl.import_plink(
        bed=f'{stem}.bed',
        bim=f'{stem}.bim',
        fam=f'{stem}.fam',
    )
    hail_kinship = hl.king(genotypes.GT)
    reference_kin0 = resource(f'{dataset_name}.kin0')
    assert_c_king_same_as_hail_king(reference_kin0, hail_kinship)
예제 #5
0
def relatedness_check(in_mt: hl.MatrixTable = None,
                      method: str = 'pc_relate',
                      outdir: str = None,
                      kin_estimate: float = 0.98):
    """Remove one sample from every pair flagged as related.

    Related pairs are found with the chosen method; from each pair the
    sample with the lower (or equal) call rate is removed, so ties remove
    the first sample of the pair.

    Args:
        in_mt: Matrix table with a ``GT`` entry field and a column field
            ``s`` holding the sample ID.
        method: ``'pc_relate'`` or ``'ibd'``; any other value selects KING.
        outdir: Prefix for the output file. It is concatenated directly
            with ``'relatedness_removed_samples.tsv'`` (no separator is
            inserted), so it should end with a path separator. When
            ``None``, the list of removed samples is not written.
        kin_estimate: Relatedness threshold. Pairs are flagged when
            ``kin > kin_estimate`` (PC-Relate), ``PI_HAT > kin_estimate``
            (IBD) or ``phi >= kin_estimate`` (KING).
            NOTE(review): the 0.98 default looks intended for PI_HAT; it
            is rejected outright by the KING branch -- confirm the intended
            value for the other methods.

    Returns:
        ``in_mt`` annotated by ``hl.variant_qc`` and ``hl.sample_qc`` (plus
        a ``maf`` row field for ``'ibd'``), with the flagged samples
        removed from the columns.

    Raises:
        Exception: If KING is selected and ``kin_estimate`` exceeds 0.5.
    """
    # NOTE(review): the global declaration is kept only so that any code
    # reading the module-level ``samples_to_remove`` keeps working; ``mt`` is
    # never assigned here.
    global mt, samples_to_remove

    in_mt = hl.variant_qc(in_mt)
    in_mt = hl.sample_qc(in_mt)

    # _localize=False means don't put this in Python, keep it as a Hail expr
    call_rate_dict = in_mt.aggregate_cols(
        hl.dict(hl.agg.collect((in_mt.s, in_mt.sample_qc.call_rate))),
        _localize=False)

    # Each branch produces the table of flagged pairs plus two accessors that
    # return the sample-ID expression of either member of a pair.
    if method == 'pc_relate':
        print("\nUsing PC-Relate for relatedness checks")
        relatedness_ht = hl.pc_relate(in_mt.GT,
                                      0.01,
                                      k=10,
                                      min_kinship=0.1,
                                      statistics='kin')
        pairs_ht = relatedness_ht.filter(relatedness_ht.kin > kin_estimate)
        # pc_relate reports i/j as column-key structs, so the sample ID is
        # the nested ``s`` field. Returning the struct itself would break the
        # column filter and the TSV writer below.
        first_id, second_id = (lambda ht: ht.i.s), (lambda ht: ht.j.s)

    elif method == 'ibd':
        print("\nUsing PLINK-style identity by descent for relatedness checks")
        in_mt = in_mt.annotate_rows(maf=hl.min(in_mt.variant_qc.AF))
        relatedness_ht = hl.identity_by_descent(
            in_mt, maf=in_mt['maf']
        )  # this returns a Hail Table with the sample pairs
        pairs_ht = relatedness_ht.filter(
            relatedness_ht.ibd.PI_HAT > kin_estimate)
        first_id, second_id = (lambda ht: ht.i), (lambda ht: ht.j)

    else:
        print("\nUsing KING for relatedness checks")
        if kin_estimate > 0.5:
            raise Exception(
                "\nThe maximum kinship coefficient is for KING 0.5")
        relatedness_mt = hl.king(in_mt.GT)
        filtered_relatedness_mt = relatedness_mt.filter_entries(
            (relatedness_mt.s_1 != relatedness_mt.s) &
            (relatedness_mt.phi >= kin_estimate))
        pairs_ht = filtered_relatedness_mt.entries()
        # NOTE(review): if the KING matrix holds both orderings of a pair,
        # a call-rate tie would flag both samples -- confirm.
        first_id, second_id = (lambda ht: ht.s_1), (lambda ht: ht.s)

    # get call rates for both samples so we remove the one with lower call
    # rate between the two
    samples_to_remove = pairs_ht.annotate(
        cr_s1=call_rate_dict[first_id(pairs_ht)],
        cr_s2=call_rate_dict[second_id(pairs_ht)])

    samples_list = samples_to_remove.annotate(sample_to_remove=hl.if_else(
        samples_to_remove.cr_s1 <= samples_to_remove.cr_s2,
        first_id(samples_to_remove), second_id(samples_to_remove)))

    # A sample can appear in several flagged pairs; de-duplicate (keeping
    # first-seen order) so the reported count and the TSV list each sample
    # once.
    samples = list(dict.fromkeys(samples_list.sample_to_remove.collect()))

    if samples:
        in_mt = in_mt.filter_cols(hl.literal(samples).contains(in_mt['s']),
                                  keep=False)
        print("\nNumber of samples that fail relatedness checks: {}".format(
            len(samples)))
        if outdir is None:
            # Previously this case crashed with a TypeError on concatenation.
            print("\nNo outdir given; not writing the removed-samples list")
        else:
            with open(outdir + 'relatedness_removed_samples.tsv', 'w') as f:
                f.writelines(sample + "\n" for sample in samples)

    else:
        print("\nNo samples failed the relatedness check")

    return in_mt
예제 #6
0
def king():
    """Run KING on a simulated dataset and write the result to a temp file.

    The dataset comes from ``hl.balding_nichols_model`` called with ``6`` as
    its first positional argument, 10000 variants and 4096 samples. The
    kinship result is written to a fresh temporary path with the ``mt``
    extension, overwriting anything already there.
    """
    simulated = hl.balding_nichols_model(6, n_variants=10000, n_samples=4096)
    out_path = hl.utils.new_temp_file(extension='mt')
    kinship = hl.king(simulated.GT)
    kinship.write(out_path, overwrite=True)