Example #1
    def test_window_by_locus(self):
        mt = hl.utils.range_matrix_table(100, 2, n_partitions=10)
        mt = mt.annotate_rows(locus=hl.locus('1', mt.row_idx + 1))
        mt = mt.key_rows_by('locus')
        mt = mt.annotate_entries(e_row_idx=mt.row_idx, e_col_idx=mt.col_idx)
        mt = hl.window_by_locus(mt, 5).cache()

        self.assertEqual(mt.count_rows(), 100)

        rows = mt.rows()
        self.assertTrue(
            rows.all((rows.row_idx < 5) | (rows.prev_rows.length() == 5)))
        self.assertTrue(
            rows.all(
                hl.all(lambda x: (rows.row_idx - 1 - x[0]) == x[1].row_idx,
                       hl.zip_with_index(rows.prev_rows))))

        entries = mt.entries()
        self.assertTrue(
            entries.all(
                hl.all(lambda x: x.e_col_idx == entries.col_idx,
                       entries.prev_entries)))
        self.assertTrue(
            entries.all(
                hl.all(lambda x: entries.row_idx - 1 - x[0] == x[1].e_row_idx,
                       hl.zip_with_index(entries.prev_entries))))
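For reference, a minimal sketch (not part of the test above) of what hl.window_by_locus produces on a locus-keyed matrix table; the toy dimensions and the 3 bp window size are arbitrary:

import hail as hl

# build a tiny matrix table keyed by locus, mirroring the test setup
mt = hl.utils.range_matrix_table(10, 2)
mt = mt.annotate_rows(locus=hl.locus('1', mt.row_idx + 1))
mt = mt.key_rows_by('locus')

# window_by_locus adds a `prev_rows` row field and a `prev_entries` entry
# field: arrays of the row/entry values for loci up to 3 bp upstream
windowed = hl.window_by_locus(mt, 3)
windowed.describe()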
Example #2
exomes = get_gnomad_data("genomes",
                         release_samples=True,
                         adj=True,
                         release_annotations=True)  # named "exomes", but this is actually genome data
#ex20 = hl.filter_intervals(exomes.select_rows("allele_info").select_cols(), [hl.parse_locus_interval("20:start-2M")]) #first 2Mb
ex20 = hl.filter_intervals(
    exomes.select_rows("info_DP").select_cols(),
    [hl.parse_locus_interval("20")])
#ex20 = ex20.filter_cols(hl.rand_bool(0.1)) #~=15000 samples for small test, exome
#ex20 = ex20.filter_cols(hl.rand_bool(0.1)) #~=1500 samples for small test, genome
ex20 = hl.filter_alleles(ex20, lambda allele, i: hl.is_snp(
    ex20.alleles[0], allele))  # currently take only SNPs
ex20 = ex20.filter_entries(
    ex20.GT.is_het())  # throw away unwanted entries (non alt)
ex20_pair = hl.window_by_locus(ex20, 2)  #just look at nearby pairs for now
ex20_pair = ex20_pair.filter_entries(
    (hl.is_defined(ex20_pair.GT) & (ex20_pair.prev_entries.length() > 0)))
ex20_pair = ex20_pair.filter_entries(
    ex20_pair.prev_entries.filter(lambda x: x.GT.is_het()).length() > 0)
et = ex20_pair.entries()
et = et.annotate(indices=hl.range(0, hl.len(et.prev_rows)))
et = et.explode('indices')
et = et.transmute(prev_row=et.prev_rows[et.indices],
                  prev_entry=et.prev_entries[et.indices])
et = et.filter(hl.is_defined(
    et.prev_entry.GT))  # and remove non-corresponding entries
et = et.annotate(agrees_PID=((et.GT.phased) & (et.prev_entry.GT.phased)
                             & (et.PID == et.prev_entry.PID)
                             & hl.is_defined(et.PID)))
et = et.annotate(dist=et.locus.position -
                 et.prev_row.locus.position)  # bp distance between the pair
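A hypothetical continuation of the snippet above (the et table and its agrees_PID and dist fields are defined there; this summary is not in the original) that tallies how often phasing IDs agree at each pair distance:

# hypothetical follow-up: per-distance counts of pairs with agreeing PID
summary = et.group_by(et.dist).aggregate(
    n_pairs=hl.agg.count(),
    n_agree=hl.agg.count_where(et.agrees_PID))
summary.show()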
Example #3
    # repartition - actually not needed; there are 10000 partitions from the beginning
    mt = hl.filter_intervals(mt_all, [hl.parse_locus_interval(chr)])
    rf = hl.filter_intervals(rf_all, [hl.parse_locus_interval(chr)])
    mt = mt.repartition(1000)
    rf = rf.repartition(1000)

    # coverage filtering (>15x) is deferred to downstream steps, not done here

    #keep also AF etc info
    mt = mt.select_cols()
    mt = mt.select_rows(mt.info.AF, mt.info.AC, mt.a_index)
    mt = mt.annotate_rows(filters = rf[mt.row_key].filters) #rf as a new "filters" row
    mt = mt.annotate_rows(AC = mt.AC[mt.a_index-1], AF = mt.AF[mt.a_index-1]) #re-annotating the AF/AC
    mt = mt.filter_entries(mt.GT.is_non_ref() & hl.is_defined(mt.PID)) #throwing away unneeded things
    mt = hl.window_by_locus(mt, 10) # collect neighboring variants within a 10 bp window
    mt = mt.filter_entries((hl.is_defined(mt.GT) & (mt.prev_entries.length() > 0))) # throw away SNPs with no MNV partner
    mt = mt.filter_entries(mt.prev_entries.filter(lambda x: x.GT.is_non_ref()).length() > 0) # same
    et = mt.key_cols_by().entries() # e.g. 1000 rows (variants) x 1000 cols (samples) => 1 million entries
    et = et.annotate(indices = hl.range(0, hl.len(et.prev_rows)))
    et = et.explode('indices')
    et = et.transmute(prev_row = et.prev_rows[et.indices],
                      prev_entry = et.prev_entries[et.indices])
    et = et.annotate(dist=et.locus.position - et.prev_row.locus.position) #annotating the distance
    #et.cache() #should make everything faster -> no, it actually seems to make it slower

    #het x het
    et_het = et.filter((et.GT.phased) & (et.prev_entry.GT.phased) & (et.PID == et.prev_entry.PID) & (et.GT == et.prev_entry.GT) & (et.GT.is_het_ref()) & (et.prev_entry.GT.is_het_ref())) #only het het MNVs  (= same phase)

    et_het = et_het.repartition(1000)
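A hypothetical next step, not in the original snippet: tallying the phased het-het pairs per inter-SNP distance with hl.agg.counter, the same aggregation Example #5 uses:

# hypothetical follow-up: count candidate het-het MNV pairs by distance
n_het_by_dist = et_het.aggregate(hl.agg.counter(et_het.dist))
print(n_het_by_dist)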
Example #4
vcf = hl.import_vcf(..., call_fields=[
    ...
])  #change the call_fields according to the hail documentation
vcf = hl.split_multi_hts(vcf)
vcf.write(sys.argv[0] + ".mt", overwrite=True)
mt = hl.read_matrix_table(sys.argv[0] + ".mt")

#calling
mt = mt.select_cols()  #dropping unneeded columns makes things faster
mt = mt.annotate_rows(AC=mt.info.AC[mt.a_index - 1],
                      AF=mt.info.AF[mt.a_index - 1])  #for case of multiallelic
mt = mt.select_rows(
    mt.filters, mt.AC,
    mt.AF)  #or any rows that you want to store for future investigation
mt = mt.filter_entries(hl.is_defined(mt.GT)
                       & mt.GT.is_non_ref())  #interested in non-ref only.
mt = hl.window_by_locus(
    mt, 2
)  #collect pairs within a 2 bp window -- we only care about variants in the same codon reading frame
mt = mt.filter_entries(
    (hl.is_defined(mt.GT) &
     (mt.prev_entries.length() > 0)))  #throw away SNPs with no MNV partner
mt = mt.filter_entries(
    mt.prev_entries.filter(lambda x: x.GT.is_non_ref()).length() > 0)  #same

et = mt.key_cols_by().entries(
)  # e.g. 1000 rows (variants) x 1000 cols (samples) => 1 million entries
et = et.annotate(indices=hl.range(0, hl.len(et.prev_rows)))
et = et.explode(
    'indices'
)  #for the case where a variant has more than one prev_row
et = et.transmute(prev_row=et.prev_rows[et.indices],
                  prev_entry=et.prev_entries[et.indices])
Example #5
def phase_sensitivity_fast(mt,
                           windowsize=1,
                           adj=True):  # trying to make the above faster.
    # takes a matrix table that has PID, GT, and PBT_GT fields and calculates
    # phase sensitivity, summed over all individuals
    # for window size k, get the result for window sizes 1, 2, ..., k
    import pandas as pd
    import time as tm  # used by the tm.ctime() progress messages below
    mt = hl.filter_alleles(mt, lambda allele, i: hl.is_snp(
        mt.alleles[0], allele))  # currently take only SNPs
    mt = mt.select_rows()  # throw away unwanted rows
    mt = mt.filter_entries(
        mt.GT.is_het())  # throw away unwanted entries (non alt)
    mt = hl.window_by_locus(mt, windowsize)
    mt = mt.filter_entries(
        (hl.is_defined(mt.GT) & (mt.prev_entries.length() > 0)))
    mt = mt.filter_entries(
        mt.prev_entries.filter(lambda x: x.GT.is_het()).length() > 0)
    et = mt.entries()
    et = et.annotate(indices=hl.range(0, hl.len(et.prev_rows)))
    et = et.explode('indices')
    et = et.transmute(prev_row=et.prev_rows[et.indices],
                      prev_entry=et.prev_entries[et.indices])
    et = et.filter(hl.is_defined(
        et.prev_entry.GT))  # and remove non-corresponding entries

    if adj:  # restrict to adj pass
        et = et.filter(et.adj & et.prev_entry.adj)
    print("\n et created and filtered. \n Starting to look at phase info \n" +
          tm.ctime())
    # annotate columns
    et = et.annotate(dist=et.locus.position - et.prev_row.locus.position,
                     pair_phased=(et.GT.phased) & (et.prev_entry.GT.phased),
                     has_PBT=(hl.is_defined(et.PBT_GT)) &
                     (hl.is_defined(et.prev_entry.PBT_GT)))
    et = et.annotate(is_mnv=(et.pair_phased & (et.PID == et.prev_entry.PID)
                             & (et.GT == et.prev_entry.GT)))
    et = et.annotate(flipped_GT=hl.call(et.GT[1],
                                        et.GT[0],
                                        phased=et.GT.phased),
                     prev_entry_flipped_GT=hl.call(
                         et.prev_entry.GT[1],
                         et.prev_entry.GT[0],
                         phased=et.prev_entry.GT.phased))
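    # a pair "agrees with PBT" below when both calls match their trio-phased
    # (PBT_GT) counterparts either as written or with both allele orders
    # flipped: the same haplotype pair can be encoded in either orientation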
    et = et.annotate(agrees_PBT=(
        ((et.GT == et.PBT_GT) & (et.prev_entry.GT == et.prev_entry.PBT_GT))
        | ((et.flipped_GT == et.PBT_GT)
           & (et.prev_entry_flipped_GT == et.prev_entry.PBT_GT))))
    et = et.annotate(agrees_PID=((et.pair_phased)
                                 & (et.PID == et.prev_entry.PID)
                                 & hl.is_defined(et.PID)))
    # agrees_PID is set only if the pair is phased at all

    # define each categ
    et_has_PBT = et.filter(et.has_PBT)
    et_agrees_PBT = et.filter(et.agrees_PBT)
    et_phased = et.filter(et.pair_phased)
    et_phased_and_has_PBT = et_phased.filter(et_phased.has_PBT)
    et_phased_and_agrees_PBT = et_phased.filter(et_phased.agrees_PBT)
    et_same_PID = et.filter(et.agrees_PID)
    et_same_PID_and_has_PBT = et_same_PID.filter(et_same_PID.has_PBT)
    et_same_PID_and_agrees_PBT = et_same_PID.filter(et_same_PID.agrees_PBT)
    et_mnv = et.filter(et.is_mnv)
    et_mnv_and_has_PBT = et_mnv.filter(et_mnv.has_PBT)
    et_mnv_and_agrees_PBT = et_mnv.filter(et_mnv.agrees_PBT)
    print("Starting to aggregate \n" + tm.ctime())
    n_all = et.aggregate(hl.agg.counter(et.dist))
    n_has_PBT = et_has_PBT.aggregate(hl.agg.counter(et_has_PBT.dist))
    n_agrees_PBT = et_agrees_PBT.aggregate(hl.agg.counter(et_agrees_PBT.dist))
    n_phased = et_phased.aggregate(hl.agg.counter(et_phased.dist))
    n_phased_and_has_PBT = et_phased_and_has_PBT.aggregate(
        hl.agg.counter(et_phased_and_has_PBT.dist))
    n_phased_and_agrees_PBT = et_phased_and_agrees_PBT.aggregate(
        hl.agg.counter(et_phased_and_agrees_PBT.dist))
    n_same_PID = et_same_PID.aggregate(hl.agg.counter(et_same_PID.dist))
    n_same_PID_and_has_PBT = et_same_PID_and_has_PBT.aggregate(
        hl.agg.counter(et_same_PID_and_has_PBT.dist))
    n_same_PID_and_agrees_PBT = et_same_PID_and_agrees_PBT.aggregate(
        hl.agg.counter(et_same_PID_and_agrees_PBT.dist))
    n_mnv = et_mnv.aggregate(hl.agg.counter(et_mnv.dist))
    n_mnv_and_has_PBT = et_mnv_and_has_PBT.aggregate(
        hl.agg.counter(et_mnv_and_has_PBT.dist))
    n_mnv_and_agrees_PBT = et_mnv_and_agrees_PBT.aggregate(
        hl.agg.counter(et_mnv_and_agrees_PBT.dist))

    # same-PID / has-PBT counts without restricting to MNVs are already covered above

    print("Done aggregate \n" + tm.ctime())
    # and if we return these we are done
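    # each counter is a plain dict of {distance: count}; pd.DataFrame(d, index=[name])
    # turns it into a one-row frame with one column per observed distance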
    df = pd.DataFrame(n_all, index=["n_all"])
    df2 = pd.DataFrame(n_has_PBT, index=["n_has_PBT"])
    df3 = pd.DataFrame(n_agrees_PBT, index=["n_agrees_PBT"])
    df4 = pd.DataFrame(n_phased, index=["n_phased"])
    df5 = pd.DataFrame(n_phased_and_has_PBT, index=["n_phased_and_has_PBT"])
    df6 = pd.DataFrame(n_phased_and_agrees_PBT,
                       index=["n_phased_and_agrees_PBT"])
    df7 = pd.DataFrame(n_mnv, index=["n_mnv"])
    df8 = pd.DataFrame(n_mnv_and_has_PBT, index=["n_mnv_and_has_PBT"])
    df9 = pd.DataFrame(n_mnv_and_agrees_PBT, index=["n_mnv_and_agrees_PBT"])
    df10 = pd.DataFrame(n_same_PID, index=["n_same_PID"])
    df11 = pd.DataFrame(n_same_PID_and_has_PBT,
                        index=["n_same_PID_and_has_PBT"])
    df12 = pd.DataFrame(n_same_PID_and_agrees_PBT,
                        index=["n_same_PID_and_agrees_PBT"])
    print(n_all)
    return (pd.concat(
        [df, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12]))
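A hypothetical invocation of the function above; the path is illustrative, and the input matrix table is assumed to carry GT, PID, and PBT_GT entry fields plus an adj flag:

import hail as hl

mt = hl.read_matrix_table('trios.mt')  # illustrative path, not from the original
df = phase_sensitivity_fast(mt, windowsize=2, adj=True)
print(df)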