Пример #1
0
 def test_filter_alleles(self):
     """Exercise ``hl.filter_alleles`` with constant predicates on several VCFs.

     For every imported dataset, a predicate that is always ``False`` must
     leave zero rows, and a predicate that is always ``True`` must leave the
     same number of rows as the input.
     """
     # A small fixed list of inputs stands in for a real generator.
     vcf_names = ('sample.vcf', 'multipleChromosomes.vcf', 'sample2.vcf')
     for vcf_path in [resource(name) for name in vcf_names]:
         ds = hl.import_vcf(vcf_path)

         nothing_kept = hl.filter_alleles(ds, lambda a, i: False)
         self.assertEqual(nothing_kept.count_rows(), 0)

         everything_kept = hl.filter_alleles(ds, lambda a, i: True)
         self.assertEqual(everything_kept.count_rows(), ds.count_rows())
Пример #2
0
 def test_filter_alleles(self):
     """Exercise ``hl.filter_alleles`` with constant predicates on several VCFs.

     For every imported dataset, a predicate that is always ``False`` must
     leave zero rows, and a predicate that is always ``True`` must leave the
     same number of rows as the input.
     """
     # A small fixed list of inputs stands in for a real generator.
     vcf_names = ('sample.vcf', 'multipleChromosomes.vcf', 'sample2.vcf')
     for vcf_path in [resource(name) for name in vcf_names]:
         ds = hl.import_vcf(vcf_path)

         nothing_kept = hl.filter_alleles(ds, lambda a, i: False)
         self.assertEqual(nothing_kept.count_rows(), 0)

         everything_kept = hl.filter_alleles(ds, lambda a, i: True)
         self.assertEqual(everything_kept.count_rows(), ds.count_rows())
def split_mt_to_indels(mt: hl.MatrixTable) -> hl.MatrixTable:
    """Restrict a matrix table to its indel alleles.

    :param mt: hail matrixtable of all samples with both indels and SNVs
    :return: the result of ``hl.filter_alleles`` keeping only the alleles for
        which ``hl.is_indel`` holds when compared against ``mt.alleles[0]``
    """

    def _is_indel_vs_first_allele(alt, _index):
        # The allele index is not needed; only the allele string is compared.
        return hl.is_indel(mt.alleles[0], alt)

    return hl.filter_alleles(mt, _is_indel_vs_first_allele)
def split_mt_to_snps(mt: hl.MatrixTable) -> hl.MatrixTable:
    """Restrict a matrix table to its SNP alleles.

    :param mt: hail matrixtable of all samples with both indels and SNVs
    :return: the result of ``hl.filter_alleles`` keeping only the alleles for
        which ``hl.is_snp`` holds when compared against ``mt.alleles[0]``
    """

    def _is_snp_vs_first_allele(alt, _index):
        # The allele index is not needed; only the allele string is compared.
        return hl.is_snp(mt.alleles[0], alt)

    return hl.filter_alleles(mt, _is_snp_vs_first_allele)
Пример #5
0
# Split the columns of `exomes` by trio role: keep the columns whose sample id
# `s` equals the proband's / father's / mother's id stored in `source_trio`.
exomes_proband = exomes.filter_cols(exomes.s == exomes.source_trio.proband.s)
exomes_fath = exomes.filter_cols(exomes.s == exomes.source_trio.father.s)
exomes_moth = exomes.filter_cols(exomes.s == exomes.source_trio.mother.s)

# Re-key the columns of each subset by the trio's `fam_id`, so the parents'
# tables can be indexed with the proband's column key below.
exomes_proband = exomes_proband.key_cols_by(
    exomes_proband['source_trio'].fam_id)
exomes_fath = exomes_fath.key_cols_by(exomes_fath['source_trio'].fam_id)
exomes_moth = exomes_moth.key_cols_by(exomes_moth['source_trio'].fam_id)

# Attach each parent's PBT_GT to the proband entry found at the same row key
# and column key.
exomes_proband = exomes_proband.annotate_entries(
    mother_PBT_GT=exomes_moth[exomes_proband.row_key,
                              exomes_proband.col_key]["PBT_GT"],
    father_PBT_GT=exomes_fath[exomes_proband.row_key,
                              exomes_proband.col_key]["PBT_GT"])
# hethet: both parents' PBT_GT calls are het-ref.
# NOTE(review): `hethet` is annotated here but is not referenced by the
# aggregation further down in this snippet.
exomes_proband = exomes_proband.annotate_entries(
    hethet=((exomes_proband.mother_PBT_GT.is_het_ref())
            & (exomes_proband.father_PBT_GT.is_het_ref())))

# Keep only alleles that are SNPs relative to alleles[0], then only the
# entries whose GT is heterozygous.
exomes_proband = hl.filter_alleles(exomes_proband, lambda allele, i: hl.is_snp(
    exomes_proband.alleles[0], allele))  # currently take only SNP
exomes_proband = exomes_proband.filter_entries(
    exomes_proband.GT.is_het())  # throw away unwanted entries (non alt)

# Work on the entries table from here on, restricted to entries with `adj` set.
exomes_proband_et = exomes_proband.entries()
exomes_proband_et = exomes_proband_et.filter(exomes_proband_et.adj)
#exomes_proband_et = exomes_proband_et.filter(exomes_proband_et.GT.is_het_ref() & exomes_proband_et.GT.is_diploid())
# Count entries for every combination of the proband's GT / PBT_GT and the
# two parental PBT_GT values, then write the resulting table out.
aggstats = exomes_proband_et.group_by(
    "GT", 'PBT_GT', 'mother_PBT_GT',
    "father_PBT_GT").aggregate(n=hl.agg.count())
aggstats.write("gs://gnomad_qingbowang/MNV/hethet_aggstats_exome_wGT_re.ht")
Пример #6
0
class DataException(Exception):
    """Script-specific exception type; adds nothing to the built-in ``Exception``."""


# Load the dataset. NOTE: the variable is called `exomes`, but the call
# requests "genomes".
exomes = get_gnomad_data("genomes",
                         release_samples=True,
                         adj=True,
                         release_annotations=True)  # actually the genomes data, despite the name
#ex20 = hl.filter_intervals(exomes.select_rows("allele_info").select_cols(), [hl.parse_locus_interval("20:start-2M")]) #first 2Mb
# Select only the `info_DP` row field, call select_cols() with no fields, and
# restrict the rows to the locus interval "20".
ex20 = hl.filter_intervals(
    exomes.select_rows("info_DP").select_cols(),
    [hl.parse_locus_interval("20")])
#ex20 = ex20.filter_cols(hl.rand_bool(0.1)) #~=15000 samples for small test, exome
#ex20 = ex20.filter_cols(hl.rand_bool(0.1)) #~=1500 samples for small test, genome
# Keep only alleles that are SNPs relative to alleles[0], then only the
# entries whose GT is heterozygous.
ex20 = hl.filter_alleles(ex20, lambda allele, i: hl.is_snp(
    ex20.alleles[0], allele))  # currently take only SNP
ex20 = ex20.filter_entries(
    ex20.GT.is_het())  # throw away unwanted entries (non alt)
# Window by locus with size 2; the result exposes `prev_rows` / `prev_entries`,
# which are used below.
ex20_pair = hl.window_by_locus(ex20, 2)  #just look at nearby pairs for now
# Keep entries that have a defined GT and at least one previous entry ...
ex20_pair = ex20_pair.filter_entries(
    (hl.is_defined(ex20_pair.GT) & (ex20_pair.prev_entries.length() > 0)))
# ... and among those, at least one previous entry that is itself het.
ex20_pair = ex20_pair.filter_entries(
    ex20_pair.prev_entries.filter(lambda x: x.GT.is_het()).length() > 0)
# Turn each (entry, previous entry) combination into its own table row:
# enumerate the indices of prev_rows, explode on them, and pick the matching
# previous row and previous entry.
et = ex20_pair.entries()
et = et.annotate(indices=hl.range(0, hl.len(et.prev_rows)))
et = et.explode('indices')
et = et.transmute(prev_row=et.prev_rows[et.indices],
                  prev_entry=et.prev_entries[et.indices])
et = et.filter(hl.is_defined(
    et.prev_entry.GT))  # and remove non-corresponding entries
et = et.annotate(agrees_PID=((et.GT.phased) & (et.prev_entry.GT.phased)
def phase_sensitivity_fast(mt,
                           windowsize=1,
                           adj=True):
    """Count nearby het-SNP pairs per distance, split by phasing evidence.

    The matrix table is reduced to heterozygous SNP entries, windowed by
    locus, and every (entry, previous entry in the window) pair of the same
    sample becomes one row. Each pair is flagged as:

    * ``pair_phased`` - both GT calls are phased;
    * ``has_PBT``     - both entries have a defined ``PBT_GT``;
    * ``agrees_PBT``  - both GTs equal their ``PBT_GT``, or both do after
      swapping the two alleles of each GT;
    * ``agrees_PID``  - the pair is phased and shares a defined ``PID``;
    * ``is_mnv``      - the pair is phased, shares ``PID`` and has equal GT.

    Pair counts are summed over all samples and keyed by the distance between
    the two loci, so one call covers every distance up to ``windowsize``.

    All categories are computed in a single aggregation pass over the entries
    table instead of one pass per category.

    :param mt: matrix table whose entries carry ``GT``, ``PID`` and
        ``PBT_GT`` (and ``adj`` when ``adj`` is true).
    :param windowsize: window size handed to ``hl.window_by_locus``.
    :param adj: when true, keep only pairs where both entries have ``adj`` set.
    :return: pandas DataFrame with one row per category (``n_all``,
        ``n_has_PBT``, ...) and one column per observed distance.
    """
    import pandas as pd

    mt = hl.filter_alleles(mt, lambda allele, i: hl.is_snp(
        mt.alleles[0], allele))  # currently take only SNP
    mt = mt.select_rows()  # drop the row fields that are not needed
    mt = mt.filter_entries(
        mt.GT.is_het())  # throw away unwanted entries (non alt)
    mt = hl.window_by_locus(mt, windowsize)
    mt = mt.filter_entries(
        (hl.is_defined(mt.GT) & (mt.prev_entries.length() > 0)))
    mt = mt.filter_entries(
        mt.prev_entries.filter(lambda x: x.GT.is_het()).length() > 0)

    # One row per (entry, previous entry) pair.
    et = mt.entries()
    et = et.annotate(indices=hl.range(0, hl.len(et.prev_rows)))
    et = et.explode('indices')
    et = et.transmute(prev_row=et.prev_rows[et.indices],
                      prev_entry=et.prev_entries[et.indices])
    et = et.filter(hl.is_defined(
        et.prev_entry.GT))  # and remove non-corresponding entries

    if adj:  # restrict to adj pass
        et = et.filter(et.adj & et.prev_entry.adj)
    print("\n et created and filtered. \n Starting to look at phase info \n" +
          tm.ctime())

    # Per-pair flags.
    et = et.annotate(dist=et.locus.position - et.prev_row.locus.position,
                     pair_phased=(et.GT.phased) & (et.prev_entry.GT.phased),
                     has_PBT=(hl.is_defined(et.PBT_GT)) &
                     (hl.is_defined(et.prev_entry.PBT_GT)))
    et = et.annotate(is_mnv=(et.pair_phased & (et.PID == et.prev_entry.PID)
                             & (et.GT == et.prev_entry.GT)))
    # GT with its two alleles swapped, so a pair can match PBT_GT in either
    # orientation as long as both sites are flipped together.
    et = et.annotate(flipped_GT=hl.call(et.GT[1],
                                        et.GT[0],
                                        phased=et.GT.phased),
                     prev_entry_flipped_GT=hl.call(
                         et.prev_entry.GT[1],
                         et.prev_entry.GT[0],
                         phased=et.prev_entry.GT.phased))
    et = et.annotate(agrees_PBT=(
        ((et.GT == et.PBT_GT) & (et.prev_entry.GT == et.prev_entry.PBT_GT))
        | ((et.flipped_GT == et.PBT_GT)
           & (et.prev_entry_flipped_GT == et.prev_entry.PBT_GT))))
    # agrees PID only if they are phased at all
    et = et.annotate(agrees_PID=((et.pair_phased)
                                 & (et.PID == et.prev_entry.PID)
                                 & hl.is_defined(et.PID)))

    # (row label, flags that must all hold), listed in output row order.
    categories = [
        ("n_all", ()),
        ("n_has_PBT", ("has_PBT",)),
        ("n_agrees_PBT", ("agrees_PBT",)),
        ("n_phased", ("pair_phased",)),
        ("n_phased_and_has_PBT", ("pair_phased", "has_PBT")),
        ("n_phased_and_agrees_PBT", ("pair_phased", "agrees_PBT")),
        ("n_mnv", ("is_mnv",)),
        ("n_mnv_and_has_PBT", ("is_mnv", "has_PBT")),
        ("n_mnv_and_agrees_PBT", ("is_mnv", "agrees_PBT")),
        ("n_same_PID", ("agrees_PID",)),
        ("n_same_PID_and_has_PBT", ("agrees_PID", "has_PBT")),
        ("n_same_PID_and_agrees_PBT", ("agrees_PID", "agrees_PBT")),
    ]

    def _dist_counter(flags):
        # Counter of `dist` over the pairs for which every flag is true.
        counter = hl.agg.counter(et.dist)
        if not flags:
            return counter
        keep = et[flags[0]]
        for flag in flags[1:]:
            keep = keep & et[flag]
        # Table.filter drops rows whose predicate is missing; coalescing to
        # False makes the aggregation-level filter behave the same way.
        return hl.agg.filter(hl.coalesce(keep, False), counter)

    print("Starting to aggregate \n" + tm.ctime())
    # A single pass computes every category; separate aggregate() calls would
    # each re-run the whole pipeline above.
    counts = et.aggregate(
        hl.struct(**{label: _dist_counter(flags)
                     for label, flags in categories}))
    print("Done aggregate \n" + tm.ctime())

    per_category = {label: dict(counts[label]) for label, _ in categories}
    print(per_category["n_all"])
    return pd.concat([
        pd.DataFrame(per_category[label], index=[label])
        for label, _ in categories
    ])