Exemplo n.º 1
0
    def create_merged_seq(self):
        """Write the merged, target-ordered query sequences to ``merged_seq.fa``.

        Reads ``longest_segments.txt`` from ``self.out_dir``, sorts its rows by
        ``target_name`` and then ``target_final_start``, and writes one FASTA
        record per target name into ``merged_seq.fa`` in the same directory.
        Each record is built from the query sequences of its rows, in sorted
        order; a query is reverse-complemented when its ``query_strand`` is
        ``"-"``, and every query sequence is followed by a run of 1000 ``N``.
        """
        align = pd.read_table(os.path.join(self.out_dir, "longest_segments.txt"))

        # DataFrame.sort() no longer exists in current pandas; prefer
        # sort_values() and fall back to sort() only on old installations.
        sort_cols = ["target_name", "target_final_start"]
        if hasattr(align, "sort_values"):
            align.sort_values(by=sort_cols, inplace=True)
        else:
            align.sort(columns=sort_cols, inplace=True)

        spacer = "N" * 1000
        curr_target_name = ""
        wrote_header = False
        start = 0

        # The context manager guarantees the output is flushed and closed,
        # even if reading or formatting a sequence raises part-way through.
        with open(os.path.join(self.out_dir, "merged_seq.fa"), "w") as out:
            for _, row in align.iterrows():
                query = row['query_name']
                target = row['target_name']
                fa = FastaHack(self.query_fas[query])

                if curr_target_name != target:
                    if wrote_header:
                        # Terminate the previous record's last sequence line
                        # so the next header starts on a line of its own.
                        out.write("\n")
                    out.write(">{0}\n".format(target))
                    curr_target_name = target
                    wrote_header = True
                    # NOTE(review): `start` is carried over from the previous
                    # record; if format_fasta treats it as the current column
                    # offset it probably should be reset to 0 here - confirm.

                fasub = fa.get_sequence(query)
                if row['query_strand'] == "-":
                    fasub = futil.reverse_complement(fasub)
                pieces, start = futil.format_fasta(fasub, start)
                for piece in pieces:
                    out.write(piece)

                # Spacer of Ns written after every query sequence.
                pieces, start = futil.format_fasta(spacer, start)
                for piece in pieces:
                    out.write(piece)

            out.write("\n")
Exemplo n.º 2
0
class FastaTestMore(FastaHackTest):
    """Extra query checks run against the ``FA`` fixture file."""

    def setUp(self):
        """Build a fresh ``FastaHack`` reader on ``FA`` before each test."""
        self.fa = FastaHack(FA)

    def test_query(self):
        """Fetch the same bases through each lookup form and compare them."""
        reader = self.fa
        first_five = "TAACC"

        # Explicit-argument lookup.
        by_args = reader.get_sub_sequence("1", 0, 4)
        self.assertEqual(by_args, first_five)

        # Region-string lookup through the method call.
        by_region = reader.get_sequence("1:1-5")
        self.assertEqual(by_region, first_five)

        # Region-string lookup through item access.
        self.assertEqual(reader["1:1-5"], first_five)
        self.assertEqual(reader["1:4-5"], "CC")
    # NOTE(review): this is a fragment of a larger routine whose header is not
    # visible here; regions_chrms, regions_coords, regions_wnds, fa, o and
    # stderr are all defined outside this view.
    print >>stderr,"done "

    # Chromosome whose sequence (and GC cumulative sum) is currently loaded;
    # None forces a load on the first region.
    curr_chr=None
    # Reset to 0 on every chromosome change; not read in the visible lines.
    curr_wnd_bin=0

    # The three regions_* sequences are indexed in parallel by xi.
    for xi in xrange(len(regions_chrms)):
        chr=regions_chrms[xi]
        start,end= regions_coords[xi]
        wnds = regions_wnds[xi]
        
        #if chr!="chr20": continue

        # Reload the per-chromosome data only when the chromosome changes, so
        # consecutive regions on the same chromosome reuse csum_GC.
        if curr_chr!=chr:
            curr_chr=chr
            curr_wnd_bin=0
            # Upper-case the sequence so the G/C/N comparisons below also
            # match lower-case bases.
            seq_str = (fa.get_sequence(curr_chr)[:]).upper()
            # dtype 'c' yields one array element per character.
            char_seq_str = np.array(seq_str,'c')
            GC = (char_seq_str=="G")|(char_seq_str=="C")
            notN = (char_seq_str!="N")
            # GC fraction over non-N positions of the whole chromosome.
            print "chr %s total GC %f"%(curr_chr, np.sum(GC)/float(np.sum(notN)))
            # csum_GC[i] is the number of G/C bases at positions 0..i inclusive.
            csum_GC = np.cumsum(GC).astype(np.float) 

        print "REGION",chr,start,end,wnds.shape, o.wnd_width
        # Window columns 0 and 1 are offsets relative to the region start;
        # adding `start` turns them into chromosome positions.
        bp_starts, bp_ends = wnds[:,0]+start, wnds[:,1]+start
        real_lens = bp_ends-bp_starts
        # Per-window GC fraction from the cumulative sum.
        # NOTE(review): with an inclusive cumsum this difference counts
        # positions bp_starts+1..bp_ends, and bp_ends equal to the sequence
        # length would index out of range - confirm the window convention.
        # NOTE(review): the denominator is the full window length, N bases
        # included, unlike the per-chromosome figure printed above - confirm.
        curr_GC = (csum_GC[bp_ends] - csum_GC[bp_starts])/real_lens

        #@regressions=g_data.get_regression(chr,start,end,wnds,mask,int(o.wnd_width))
        ##l_regressions=regressions.shape[0]
        # Number of windows in this region.
        l=wnds.shape[0]
        print wnds.shape