def create_merged_seq(self):
    """Write the merged, target-ordered query sequences to ``merged_seq.fa``.

    Reads ``longest_segments.txt`` from ``self.out_dir``, orders its rows by
    ``target_name`` and then ``target_final_start``, and writes one FASTA
    record per distinct target name into ``merged_seq.fa`` in the same
    directory.  For every row the query sequence is fetched from the FASTA
    file registered in ``self.query_fas`` for that query, reverse-complemented
    when ``query_strand`` is ``"-"``, and followed by a spacer of 1000 ``N``
    characters.

    The output file is opened in a ``with`` block so it is flushed and closed
    even if reading a query sequence raises part-way through.
    """
    segments_path = os.path.join(self.out_dir, "longest_segments.txt")
    merged_path = os.path.join(self.out_dir, "merged_seq.fa")
    sort_keys = ["target_name", "target_final_start"]

    align = pd.read_table(segments_path)
    # ``DataFrame.sort(columns=...)`` only exists in old pandas releases;
    # prefer ``sort_values`` when available and fall back otherwise.
    if hasattr(align, "sort_values"):
        align.sort_values(by=sort_keys, inplace=True)
    else:
        align.sort(columns=sort_keys, inplace=True)

    spacer = "N" * 1000
    curr_target_name = ""
    wrote_record = False
    # ``start`` is threaded through every ``futil.format_fasta`` call, which
    # returns the formatted pieces together with an updated value.
    # NOTE(review): it is never reset when a new ``>`` record begins --
    # confirm that is what ``format_fasta`` expects.
    start = 0

    with open(merged_path, "w") as out:
        for _, row in align.iterrows():
            query = row['query_name']
            target = row['target_name']
            fa = FastaHack(self.query_fas[query])

            if curr_target_name != target:
                # Terminate the previous record's last sequence line before
                # emitting the next header.
                if wrote_record:
                    out.write("\n")
                wrote_record = True
                out.write(">{0}\n".format(target))
                curr_target_name = target

            fasub = fa.get_sequence(query)
            if row['query_strand'] == "-":
                fasub = futil.reverse_complement(fasub)

            pieces, start = futil.format_fasta(fasub, start)
            out.writelines(pieces)

            # Separate consecutive query sequences with a run of Ns.
            spacer_pieces, start = futil.format_fasta(spacer, start)
            out.writelines(spacer_pieces)

        out.write("\n")
class FastaTestMore(FastaHackTest):
    """Extra query checks for FastaHack, run in addition to FastaHackTest."""

    def setUp(self):
        """Create a FastaHack over the FA fixture for each test."""
        self.fa = FastaHack(FA)

    def test_query(self):
        """Check coordinate, region-string and index access on sequence "1".

        The first three lookups are expected to return the same five bases;
        the last one checks a shorter region addressed by indexing.
        """
        fasta = self.fa
        first_five = "TAACC"

        # Explicit coordinates (0, 4) are expected to match region "1:1-5".
        self.assertEqual(fasta.get_sub_sequence("1", 0, 4), first_five)
        self.assertEqual(fasta.get_sequence("1:1-5"), first_five)

        # Indexing with a region string is expected to behave like
        # get_sequence.
        self.assertEqual(fasta["1:1-5"], first_five)
        self.assertEqual(fasta["1:4-5"], "CC")
# Script fragment (Python 2 syntax): for every region, compute the GC
# fraction of each window from a per-chromosome cumulative GC count.
# NOTE(review): the loop body may continue beyond what is visible here.
print >>stderr,"done "
# Name of the chromosome whose sequence-derived arrays (GC, notN, csum_GC)
# are currently loaded; None until the first region is processed.
curr_chr=None
# Reset to 0 on every chromosome change; not otherwise used in the visible code.
curr_wnd_bin=0
for xi in xrange(len(regions_chrms)):
    # regions_chrms, regions_coords and regions_wnds are indexed in parallel.
    chr=regions_chrms[xi]
    start,end= regions_coords[xi]
    wnds = regions_wnds[xi]
    #if chr!="chr20": continue
    if curr_chr!=chr:
        # New chromosome: reload its sequence and rebuild the GC arrays.
        curr_chr=chr
        curr_wnd_bin=0
        seq_str = (fa.get_sequence(curr_chr)[:]).upper()
        # One array element per base.
        # NOTE(review): the 'c' dtype compared against str literals relies
        # on Python 2 string semantics -- confirm before porting.
        char_seq_str = np.array(seq_str,'c')
        # Boolean masks: base is G or C / base is not N.
        GC = (char_seq_str=="G")|(char_seq_str=="C")
        notN = (char_seq_str!="N")
        # GC fraction over the non-N bases of the whole chromosome.
        print "chr %s total GC %f"%(curr_chr, np.sum(GC)/float(np.sum(notN)))
        # Running GC count; csum_GC[i] includes the base at index i.
        # NOTE(review): the np.float alias was removed in NumPy 1.24.
        csum_GC = np.cumsum(GC).astype(np.float)
    print "REGION",chr,start,end,wnds.shape, o.wnd_width
    # Columns 0 and 1 of wnds are offset by the region start to index
    # into the chromosome-wide arrays.
    bp_starts, bp_ends = wnds[:,0]+start, wnds[:,1]+start
    real_lens = bp_ends-bp_starts
    # Per-window GC count from the cumulative sum, divided by window length.
    # NOTE(review): because np.cumsum is inclusive, this difference counts
    # bases bp_starts+1 .. bp_ends, and bp_ends must be a valid index into
    # csum_GC -- confirm this matches the intended window convention.
    curr_GC = (csum_GC[bp_ends] - csum_GC[bp_starts])/real_lens
    #@regressions=g_data.get_regression(chr,start,end,wnds,mask,int(o.wnd_width))
    ##l_regressions=regressions.shape[0]
    # Number of windows in this region; unused in the visible code.
    l=wnds.shape[0]
    print wnds.shape