예제 #1
0
    def test_ibd(self):
        """Compare ``hl.identity_by_descent`` with PLINK's ``--genome full`` output.

        The dataset is exported to VCF and run through the ``plink``
        executable. For every pair returned by Hail, the Z0/Z1/Z2/PI_HAT
        values must match the PLINK row to four decimal places and the
        IBS0/IBS1/IBS2 counts must match exactly. The comparison is run once
        without thresholds and once with ``min=0.0, max=1.0``. Finally
        ``identity_by_descent`` is called with a ``maf`` row field and with the
        same field converted to ``float32``.
        """
        dataset = self.get_dataset()

        def plinkify(ds, lower=None, upper=None):
            """Run PLINK on ``ds`` and parse its ``.genome`` file.

            ``lower`` / ``upper`` are forwarded as PLINK's ``--min`` / ``--max``
            options when they are not ``None``.

            Returns a dict mapping ``(iid1, iid2)`` to
            ``([z0, z1, z2, pi_hat], [ibs0, ibs1, ibs2])``.
            """
            vcf = utils.new_temp_file(prefix="plink", suffix="vcf")
            plinkpath = utils.new_temp_file(prefix="plink")
            hl.export_vcf(ds, vcf)

            # Test against None instead of truthiness: 0.0 is a valid
            # threshold and must still be forwarded to PLINK.
            thresholds = []
            if lower is not None:
                thresholds.append("--min {}".format(lower))
            if upper is not None:
                thresholds.append("--max {}".format(upper))

            plink_command = "plink --double-id --allow-extra-chr --vcf {} --genome full --out {} {}" \
                .format(utils.uri_path(vcf),
                        utils.uri_path(plinkpath),
                        " ".join(thresholds))
            result_file = utils.uri_path(plinkpath + ".genome")

            syscall(plink_command, shell=True, stdout=DEVNULL, stderr=DEVNULL)

            # After whitespace-splitting, a .genome row is indexed as:
            #   0 fid1, 1 iid1, 2 fid2, 3 iid2, 4 rt, 5 ez, 6 z0, 7 z1, 8 z2,
            #   9 pihat, 10 phe, 11 dst, 12 ppc, 13 ratio, 14 ibs0, 15 ibs1,
            #   16 ibs2, 17 homhom, 18 hethet
            #
            # A Hail IBD row has the fields:
            #   i (iid1), j (iid2), ibd: {Z0, Z1, Z2, PI_HAT}, ibs0, ibs1, ibs2
            results = {}
            with open(result_file) as f:
                f.readline()  # skip the header line
                for line in f:
                    row = line.strip().split()
                    results[(row[1], row[3])] = (list(map(float, row[6:10])),
                                                 list(map(int, row[14:17])))
            return results

        def compare(ds, lower=None, upper=None):
            """Assert that every Hail IBD row for ``ds`` matches the PLINK row."""
            plink_results = plinkify(ds, lower, upper)
            hail_results = hl.identity_by_descent(ds, min=lower, max=upper).collect()

            for row in hail_results:
                plink_ibd, plink_ibs = plink_results[(row.i, row.j)]
                self.assertAlmostEqual(plink_ibd[0], row.ibd.Z0, places=4)
                self.assertAlmostEqual(plink_ibd[1], row.ibd.Z1, places=4)
                self.assertAlmostEqual(plink_ibd[2], row.ibd.Z2, places=4)
                self.assertAlmostEqual(plink_ibd[3], row.ibd.PI_HAT, places=4)
                self.assertEqual(plink_ibs[0], row.ibs0)
                self.assertEqual(plink_ibs[1], row.ibs1)
                self.assertEqual(plink_ibs[2], row.ibs2)

        compare(dataset)
        compare(dataset, lower=0.0, upper=1.0)

        # Only checks that a user-supplied maf expression is accepted; the
        # returned tables are not inspected.
        dataset = dataset.annotate_rows(dummy_maf=0.01)
        hl.identity_by_descent(dataset, dataset['dummy_maf'], min=0.0, max=1.0)
        hl.identity_by_descent(dataset, hl.float32(dataset['dummy_maf']), min=0.0, max=1.0)
예제 #2
0
        def compare(ds, min=None, max=None):
            """Assert that Hail's IBD output for ``ds`` agrees with PLINK's.

            ``min`` and ``max`` are passed unchanged to both ``plinkify`` and
            ``hl.identity_by_descent``. For each collected Hail row, the
            Z0/Z1/Z2/PI_HAT values are compared to four decimal places and the
            ibs0/ibs1/ibs2 values are compared for equality.
            """
            expected = plinkify(ds, min, max)
            observed = hl.identity_by_descent(ds, min=min, max=max).collect()

            for hail_row in observed:
                plink_row = expected[(hail_row.i, hail_row.j)]
                plink_ibd = plink_row[0]
                plink_ibs = plink_row[1]
                ibd = hail_row.ibd

                # Same order as the PLINK list: Z0, Z1, Z2, PI_HAT.
                hail_ibd = (ibd.Z0, ibd.Z1, ibd.Z2, ibd.PI_HAT)
                for idx, estimate in enumerate(hail_ibd):
                    self.assertAlmostEqual(plink_ibd[idx], estimate, places=4)

                hail_ibs = (hail_row.ibs0, hail_row.ibs1, hail_row.ibs2)
                for idx, count in enumerate(hail_ibs):
                    self.assertEqual(plink_ibs[idx], count)
예제 #3
0
def compute_kinship_ht(mt,
                       genome_version="GRCh38",
                       first_degree_pi_hat=0.40,
                       grandparent_pi_hat=0.20,
                       grandparent_ibd1=0.25,
                       grandparent_ibd2=0.15):
    """Build a table of related sample pairs from identity-by-descent estimates.

    The MatrixTable is passed through ``filter_to_biallelics`` and
    ``filter_to_autosomes``, restricted to SNPs with a call rate above 0.99,
    and LD-pruned with ``ld_prune``. ``hl.identity_by_descent`` is then run
    with ``mt.info.AF`` as the minor allele frequency and ``min=0.10``,
    ``max=1.0``.

    A pair is kept when either

    * ``pi_hat > first_degree_pi_hat``, or
    * ``pi_hat > grandparent_pi_hat`` and ``ibd1 > grandparent_ibd1`` and
      ``ibd2 < grandparent_ibd2``.

    Args:
        mt: Input MatrixTable; must carry an ``info.AF`` row field.
        genome_version: Passed through to ``ld_prune``.
        first_degree_pi_hat: PI_HAT cutoff for the first condition.
        grandparent_pi_hat: PI_HAT cutoff for the second condition.
        grandparent_ibd1: Lower bound on Z1 for the second condition.
        grandparent_ibd2: Upper bound on Z2 for the second condition.

    Returns:
        A Table keyed by ``(i, j)`` with fields ``ibd0``, ``ibd1``, ``ibd2``,
        ``pi_hat`` and ``relation`` (the sorted ``[i, j]`` array).
    """
    mt = filter_to_biallelics(mt)
    mt = filter_to_autosomes(mt)
    mt = mt.filter_rows(hl.is_snp(mt.alleles[0], mt.alleles[1]))

    mt = hl.variant_qc(mt)
    mt = mt.filter_rows(mt.variant_qc.call_rate > 0.99)
    # NOTE: an additional `mt.info.AF > 0.001` filter was tried here and kept
    # 100% of variants, so it is not applied.

    mt = ld_prune(mt, genome_version=genome_version)

    ibd_results_ht = hl.identity_by_descent(mt,
                                            maf=mt.info.AF,
                                            min=0.10,
                                            max=1.0)

    # Flatten the `ibd` struct into top-level fields and drop the IBS counts.
    kin_ht = ibd_results_ht.annotate(
        ibd0=ibd_results_ht.ibd.Z0,
        ibd1=ibd_results_ht.ibd.Z1,
        ibd2=ibd_results_ht.ibd.Z2,
        pi_hat=ibd_results_ht.ibd.PI_HAT).drop("ibs0", "ibs1", "ibs2", "ibd")

    kin_ht = kin_ht.key_by("i", "j")

    # Keep anything above the relationship of a grandparent.
    is_first_degree = kin_ht.pi_hat > first_degree_pi_hat
    is_grandparent_like = ((kin_ht.pi_hat > grandparent_pi_hat)
                           & (kin_ht.ibd1 > grandparent_ibd1)
                           & (kin_ht.ibd2 < grandparent_ibd2))
    kin_ht = kin_ht.filter(is_first_degree | is_grandparent_like)

    # TODO: `relation` holds the sorted pair of sample IDs; pick a better name.
    kin_ht = kin_ht.annotate(relation=hl.sorted([kin_ht.i, kin_ht.j]))

    return kin_ht
예제 #4
0
def _flag_lower_call_rate(pairs_ht, call_rate_dict, first_id, second_id):
    """Annotate related pairs with call rates and the sample ID to remove.

    ``first_id`` and ``second_id`` are callables that take a table and return
    the sample-ID expression of each pair member. Callables are used because
    the ID expression has to be rebuilt against the annotated table.

    Returns ``(with_rates, flagged)``: the pairs annotated with ``cr_s1`` /
    ``cr_s2``, and that table further annotated with ``sample_to_remove``
    (the first sample when ``cr_s1 <= cr_s2``, otherwise the second).
    """
    with_rates = pairs_ht.annotate(
        cr_s1=call_rate_dict[first_id(pairs_ht)],
        cr_s2=call_rate_dict[second_id(pairs_ht)])
    flagged = with_rates.annotate(sample_to_remove=hl.cond(
        with_rates.cr_s1 <= with_rates.cr_s2,
        first_id(with_rates), second_id(with_rates)))
    return with_rates, flagged


def relatedness_check(in_mt: hl.MatrixTable = None,
                      method: str = 'pc_relate',
                      outdir: str = None,
                      kin_estimate: float = 0.98):
    """Remove one sample from every pair whose relatedness exceeds a cutoff.

    For each flagged pair, the sample with the lower (or equal) call rate is
    removed. When any samples are removed their IDs are written, one per line,
    to ``outdir + 'relatedness_removed_samples.tsv'``.

    Args:
        in_mt: MatrixTable with a ``GT`` entry field and an ``s`` column field.
        method: ``'pc_relate'`` (filter on ``kin > kin_estimate``), ``'ibd'``
            (filter on ``ibd.PI_HAT > kin_estimate``); any other value runs
            KING (filter on ``phi >= kin_estimate``).
        outdir: Prefix that the output file name is appended to verbatim, so
            it needs a trailing separator.
        kin_estimate: Relatedness cutoff; must not exceed 0.5 for KING.

    Returns:
        ``in_mt`` annotated by ``hl.variant_qc`` and ``hl.sample_qc``, with the
        flagged samples removed.

    Raises:
        Exception: If KING is selected and ``kin_estimate > 0.5``.
    """
    # Kept for backward compatibility: the annotated pairs table is published
    # as the module-level name `samples_to_remove`.
    global mt, samples_to_remove

    in_mt = hl.variant_qc(in_mt)
    in_mt = hl.sample_qc(in_mt)

    # _localize=False means don't put this in Python, keep it as a Hail expr
    call_rate_dict = in_mt.aggregate_cols(hl.dict(
        hl.agg.collect((in_mt.s, in_mt.sample_qc.call_rate))),
                                          _localize=False)

    if method == 'pc_relate':
        print("\nUsing PC-Relate for relatedness checks")
        relatedness_ht = hl.pc_relate(in_mt.GT,
                                      0.01,
                                      k=10,
                                      min_kinship=0.1,
                                      statistics='kin')
        samples_to_remove_ht = relatedness_ht.filter(
            relatedness_ht.kin > kin_estimate)

        # Here `i` and `j` are structs wrapping the column key, so take the
        # `s` field. Otherwise the collected values are structs, which can be
        # neither matched against `in_mt.s` nor written out as text.
        samples_to_remove, samples_list = _flag_lower_call_rate(
            samples_to_remove_ht, call_rate_dict,
            lambda ht: ht.i.s, lambda ht: ht.j.s)

    elif method == 'ibd':
        print("\nUsing PLINK-style identity by descent for relatedness checks")
        in_mt = in_mt.annotate_rows(maf=hl.min(in_mt.variant_qc.AF))
        # identity_by_descent returns a Hail Table with the sample pairs
        relatedness_ht = hl.identity_by_descent(in_mt, maf=in_mt['maf'])
        samples_to_remove_ht = relatedness_ht.filter(
            relatedness_ht.ibd.PI_HAT > kin_estimate)

        samples_to_remove, samples_list = _flag_lower_call_rate(
            samples_to_remove_ht, call_rate_dict,
            lambda ht: ht.i, lambda ht: ht.j)

    else:
        print("\nUsing KING for relatedness checks")
        if kin_estimate > 0.5:
            raise Exception(
                "\nThe maximum kinship coefficient is for KING 0.5")
        relatedness_mt = hl.king(in_mt.GT)
        # Drop the self-comparisons and keep pairs at or above the cutoff.
        filtered_relatedness_mt = relatedness_mt.filter_entries(
            (relatedness_mt.s_1 != relatedness_mt.s) &
            (relatedness_mt.phi >= kin_estimate),
            keep=True)
        samples_to_remove_ht = filtered_relatedness_mt.entries()

        samples_to_remove, samples_list = _flag_lower_call_rate(
            samples_to_remove_ht, call_rate_dict,
            lambda ht: ht.s_1, lambda ht: ht.s)

    # A sample can be flagged in several pairs; keep each ID once (in first-
    # seen order) so the reported count and the output file are not inflated.
    samples = list(dict.fromkeys(samples_list.sample_to_remove.collect()))

    if samples:
        in_mt = in_mt.filter_cols(hl.literal(samples).contains(in_mt['s']),
                                  keep=False)
        print("\nNumber of samples that fail relatedness checks: {}".format(
            len(samples)))
        with open(outdir + 'relatedness_removed_samples.tsv', 'w') as f:
            f.writelines(sample + "\n" for sample in samples)

    else:
        print("\nNo samples failed the relatedness check")

    return in_mt
예제 #5
0
mt = hl.sample_qc(mt)
# Summary statistics of the per-sample QC metrics. They are computed here,
# before any sample filtering below, and used for the outlier filters later.
stats_singleton = mt.aggregate_cols(hl.agg.stats(mt.sample_qc.n_singleton))
stats_ti_tv = mt.aggregate_cols(hl.agg.stats(mt.sample_qc.r_ti_tv))
stats_het_hom_var = mt.aggregate_cols(hl.agg.stats(mt.sample_qc.r_het_hom_var))
stats_het = mt.aggregate_cols(hl.agg.stats(mt.sample_qc.n_het))

######## 3.2 Sex check on chromosome X (inbreeding coefficient)
# Impute sex from the GT calls
t = hl.impute_sex(mt.GT)
# Only keep samples whose imputed sex matches the `is_female` column field
mt = mt.filter_cols(t[mt.s].is_female == mt.is_female)

######## 3.3 Check for genetic relationship / "duplicates"
# Calculate identity-by-descent for all sample pairs (no `maf` is supplied, so
# Hail estimates allele frequencies from the dataset itself).
mt_relatedness = hl.identity_by_descent(mt)
# Keep pairs of samples with PI_HAT above 0.2.
t_ibd = mt_relatedness.filter(mt_relatedness.ibd.PI_HAT > 0.2)
# Collect the IDs of the related samples in t_ibd.
# NOTE(review): only the `i` member of each pair is collected; confirm whether
# the `j` member should be included as well.
ibd_idx = t_ibd.aggregate(hl.agg.collect_as_set(t_ibd.i))
# Keep only the columns whose sample ID is in the collected set. The element
# type is given explicitly so that an empty set still yields a valid literal.
mt_ibd = mt.filter_cols(hl.literal(ibd_idx, hl.tset(mt.s.dtype)).contains(mt.s))

######### 3.3 Filter samples for outliers more than (6 * SD) from mean (Part 2)
# Number of singletons
mt = mt.filter_cols(mt.sample_qc.n_singleton < (stats_singleton.mean +
                                                (6 * stats_singleton.stdev)))
mt = mt.filter_cols(mt.sample_qc.n_singleton > (stats_singleton.mean -
                                                (6 * stats_singleton.stdev)))
#Ti/Tv ratio
예제 #6
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 11 09:46:37 2018

@author: nbaya
"""

import hail as hl

# Run parameters; together they select the input MatrixTable and name the
# output table.
phen = '50'
variant_set = 'hm3'
n_chunks = 300
batch = '1'

mt = hl.read_matrix_table(
    f'gs://nbaya/split/ukb31063.{variant_set}_variants.gwas_samples_{phen}'
    f'_grouped{n_chunks}_batch_{batch}.mt')

# Convert the dosage to an integer via two scale-and-truncate steps.
# NOTE(review): assumes dosage lies in [0, 2], in which case `gt` is 0, 1 or 2
# and is presumably meant as the alternate-allele count -- confirm.
mt1 = mt.annotate_entries(gt=hl.int(hl.int(mt.dosage * 3 / 2) * 2 / 3))

# Build a diploid call from `gt` (0 -> 0/0, 1 -> 0/1, 2 -> 1/1).
# `hl.call(gt)` with a single allele index would create a *haploid* call with
# allele `gt`, which is not a genotype carrying `gt` alternate alleles.
mt1 = mt1.annotate_entries(GT=hl.unphased_diploid_gt_index_call(mt1.gt))

hl.identity_by_descent(mt1).write(
    f'gs://nbaya/split/ibd.{variant_set}_variants.{phen}'
    f'_grouped{n_chunks}_batch_{batch}.ht')

예제 #7
0
def main(args):
    """Run the pedigree check: kinship inference, summary and per-project export.

    Steps, in order:

    1. Read the MatrixTable at ``args.mt_path`` and the pedigree table.
    2. Filter rows with ``filter_rows_for_qc``, LD-prune, remap sample names
       and checkpoint the result.
    3. Either run ``hl.identity_by_descent`` and export the flattened result
       as a TSV (``args.run_ibd``), or reuse a TSV from a previous run.
    4. Re-import the kinship TSV, restrict it to pairs involving pedigree
       samples and optionally filter it with ``filter_kin_ht``.
    5. Write a short summary file, the functional pedigree and the sex check.
    6. Export the annotated kinship table once per seqr project.

    Args:
        args: Parsed command-line namespace. Reads ``output_dir``,
            ``output_name``, ``inferred_sex``, ``mt_path``, ``input_pedigree``,
            ``gnomad_ld``, ``run_ibd``, ``first_degree_pi_hat``,
            ``grandparent_pi_hat``, ``grandparent_ibd1``, ``grandparent_ibd2``
            and ``filter_kinship_ht``.
    """
    output_dir = args.output_dir
    output_name = args.output_name
    inferred_sex = args.inferred_sex
    mt_path = args.mt_path
    input_pedigree = args.input_pedigree

    gnomad_ld = args.gnomad_ld
    run_ibd = args.run_ibd
    first_degree_pi_hat = args.first_degree_pi_hat
    grandparent_pi_hat = args.grandparent_pi_hat
    grandparent_ibd1 = args.grandparent_ibd1
    grandparent_ibd2 = args.grandparent_ibd2
    filter_kinship_ht = args.filter_kinship_ht

    # Single source of truth for the kinship TSV: it is written when IBD is
    # run and read back in either case.
    kinship_tsv = f"{output_dir}/{output_name}_ibd_kinship.tsv"

    logger.info("Reading in inputs...")
    mt = hl.read_matrix_table(mt_path)
    pedigree = hl.import_table(input_pedigree, impute=True)

    # Infer build of the MatrixTable
    build = get_reference_genome(mt.locus).name

    logger.info(
        "Filtering to biallelic SNVs on autosomes and performing LD pruning..."
    )
    mt = filter_rows_for_qc(mt,
                            min_af=0.001,
                            min_callrate=0.99,
                            apply_hard_filters=False)
    mt = ld_prune(mt, build, gnomad_ld)
    out_mt = f"{output_dir}/{output_name}_processed_mt.mt"

    logger.info("Remapping sample names...")
    mt, sex_ht = remap_samples(mt_path, mt, pedigree, inferred_sex)

    mt = mt.checkpoint(out_mt, overwrite=True)

    if run_ibd:
        logger.info("Running identity by descent...")
        ibd_results_ht = hl.identity_by_descent(mt,
                                                maf=mt.AF,
                                                min=0.10,
                                                max=1.0)
        # Flatten the `ibd` struct and drop the IBS counts before exporting.
        ibd_results_ht = ibd_results_ht.annotate(
            ibd0=ibd_results_ht.ibd.Z0,
            ibd1=ibd_results_ht.ibd.Z1,
            ibd2=ibd_results_ht.ibd.Z2,
            pi_hat=ibd_results_ht.ibd.PI_HAT,
        ).drop("ibs0", "ibs1", "ibs2", "ibd")
        out_ht = kinship_tsv
        ibd_results_ht.export(out_ht)

    else:
        logger.warning("Skipping IBD - using previous calculations...")
        if not file_exists(kinship_tsv):
            # The message has no placeholders, so no extra argument is passed.
            logger.warning(
                "IBD calculation was skipped but no file with previous calculations was found..."
            )

    logger.info("Reading in kinship ht...")
    kin_ht = hl.import_table(kinship_tsv, impute=True)

    # Subset MatrixTable and sex ht to the samples in the pedigree
    mt_subset, sex_ht, expected_samples, vcf_samples = subset_samples(
        mt, pedigree, sex_ht, output_dir, output_name)

    # Subset Table to the samples in the pedigree
    subset = hl.set(expected_samples)
    kin_ht = kin_ht.filter(
        subset.contains(kin_ht.i) | subset.contains(kin_ht.j))

    # Key the Table
    kin_ht = kin_ht.key_by("i", "j")

    # Write the summary inside a `with` block so the handle is closed even if
    # filtering or counting raises.
    with hl.hadoop_open(f"{output_dir}/{output_name}_ped_check_summary.txt",
                        "w") as out_summary:
        if filter_kinship_ht:
            logger.info(
                "Filtering kinship table to remove unrelated individuals from analysis..."
            )
            kin_ht = filter_kin_ht(kin_ht, out_summary)

        # Output basic stats
        out_summary.write("Number individuals in pedigree: " +
                          str(len(expected_samples)) + "\n")
        out_summary.write("Number individuals in subset from the VCF: " +
                          str(len(vcf_samples)) + "\n")
        out_summary.write("Number of relationships in the kinship table: " +
                          str(kin_ht.count()) + "\n\n")

    seqr_projects, family_ids, given_sex = write_functional_pedigree(
        input_pedigree, vcf_samples, output_dir, output_name)

    # Compare inferred and given sex
    check_sex(sex_ht, output_dir, output_name)

    kin_ht = add_project_and_family_annotations(kin_ht, seqr_projects,
                                                family_ids)

    logger.info("Writing kinship ht per project...")
    # Output original ht per project: a pair is exported for a project when
    # either member of the pair belongs to it.
    for project in set(seqr_projects.values()):
        full_ht = kin_ht.filter((kin_ht.seqr_proj_i == project)
                                | (kin_ht.seqr_proj_j == project))
        full_ht.drop("seqr_proj_i", "seqr_proj_j").export(
            f"{output_dir}/{project}/{output_name}_{project}_annotated_kin.txt"
        )