def test_count(self):
    """Check the value returned by ``_update_conversions``.

    With 10 as the seventh argument the call is expected to return 0;
    with 0 it is expected to return 1.  The last argument is the empty
    string in this variant of the test.
    """
    # NOTE(review): the seventh argument is presumably a mismatch limit and
    # the return value a count of rejected reads -- confirm against
    # _update_conversions.
    count = _update_conversions(self.ref, self.aln, 0, "CT",
                                self.cc, self.tt,
                                10, len(self.ref), "")
    # The observed count doubles as the failure message.
    self.assertEqual(count, 0, count)

    count = _update_conversions(self.ref, self.aln, 0, "CT",
                                self.cc, self.tt,
                                0, len(self.ref), "")
    self.assertEqual(count, 1, count)
def test_gas(self):
    """Check that "GA" mode only counts positions whose reference is G.

    After one ``_update_conversions`` call, every position with a non-zero
    entry in ``self.cc`` must be G in both reference and alignment, and
    every position with a non-zero entry in ``self.tt`` must be G in the
    reference and A in the alignment.
    """
    _update_conversions(self.ref, self.aln, 0, "GA",
                        self.cc, self.tt,
                        10, len(self.ref), False)

    for n_c, n_t, ref_base, aln_base in zip(self.cc, self.tt,
                                            self.ref, self.aln):
        if n_c != 0:
            # Counted as unconverted: G stayed G.
            self.assertEqual(ref_base, "G")
            self.assertEqual(aln_base, "G")
        if n_t != 0:
            # Counted as converted: G read as A.
            self.assertEqual(ref_base, "G")
            self.assertEqual(aln_base, "A")
def test_cts(self):
    """Check the per-position counts produced in "CT" mode.

    The same call is made twice; the expected counts double on the second
    pass, which shows that ``self.cc`` and ``self.tt`` are updated in
    place and accumulate across calls.
    """
    _update_conversions(self.ref, self.aln, 0, "CT",
                        self.cc, self.tt,
                        10, len(self.ref), False)
    # Plain ints compare equal to the long values tolist() may return.
    self.assertEqual(self.cc.tolist(), [0, 0, 0, 1, 0, 0, 0, 0])
    self.assertEqual(self.tt.tolist(), [0, 1, 1, 0, 0, 0, 0, 0])

    _update_conversions(self.ref, self.aln, 0, "CT",
                        self.cc, self.tt,
                        10, len(self.ref), False)
    self.assertEqual(self.cc.tolist(), [0, 0, 0, 2, 0, 0, 0, 0])
    self.assertEqual(self.tt.tolist(), [0, 2, 2, 0, 0, 0, 0, 0])
def test_gas(self):
    """Check that "GA" mode only counts positions whose reference is G.

    After one ``_update_conversions`` call (last argument is the empty
    string in this variant), every position with a non-zero entry in
    ``self.cc`` must be G in both reference and alignment, and every
    position with a non-zero entry in ``self.tt`` must be G in the
    reference and A in the alignment.
    """
    _update_conversions(self.ref, self.aln, 0, "GA",
                        self.cc, self.tt,
                        10, len(self.ref), "")

    for n_c, n_t, ref_base, aln_base in zip(self.cc, self.tt,
                                            self.ref, self.aln):
        if n_c != 0:
            # Counted as unconverted: G stayed G.
            self.assertEqual(ref_base, "G")
            self.assertEqual(aln_base, "G")
        if n_t != 0:
            # Counted as converted: G read as A.
            self.assertEqual(ref_base, "G")
            self.assertEqual(aln_base, "A")
def test_cts(self):
    """Check the per-position counts produced in "CT" mode.

    The same call (last argument is the empty string in this variant) is
    made twice; the expected counts double on the second pass, which shows
    that ``self.cc`` and ``self.tt`` are updated in place and accumulate
    across calls.
    """
    _update_conversions(self.ref, self.aln, 0, "CT",
                        self.cc, self.tt,
                        10, len(self.ref), "")
    # Plain ints compare equal to the long values tolist() may return.
    self.assertEqual(self.cc.tolist(), [0, 0, 0, 1, 0, 0, 0, 0])
    self.assertEqual(self.tt.tolist(), [0, 1, 1, 0, 0, 0, 0, 0])

    _update_conversions(self.ref, self.aln, 0, "CT",
                        self.cc, self.tt,
                        10, len(self.ref), "")
    self.assertEqual(self.cc.tolist(), [0, 0, 0, 2, 0, 0, 0, 0])
    self.assertEqual(self.tt.tolist(), [0, 2, 2, 0, 0, 0, 0, 0])
def test_count(self):
    """Check the value returned by ``_update_conversions``.

    With 10 as the seventh argument the call is expected to return 0;
    with 0 it is expected to return 1.  The last argument is ``False`` in
    this variant of the test.
    """
    # NOTE(review): the seventh argument is presumably a mismatch limit and
    # the return value a count of rejected reads -- confirm against
    # _update_conversions.
    count = _update_conversions(self.ref, self.aln, 0, "CT",
                                self.cc, self.tt,
                                10, len(self.ref), False)
    # The observed count doubles as the failure message.
    self.assertEqual(count, 0, count)

    count = _update_conversions(self.ref, self.aln, 0, "CT",
                                self.cc, self.tt,
                                0, len(self.ref), False)
    self.assertEqual(count, 1, count)
def parse_gsnap_sam(gsnap_f, ref_path, out_dir, paired_end):
    """Tabulate conversions from a GSNAP SAM file and write the results.

    Reads ``gsnap_f`` line by line, copies the ``@SQ`` header lines and
    every alignment that passes the pairing filter to a sibling file
    (``.gsnap.sam`` replaced by ``.sam`` in the path), and feeds each
    remaining alignment to ``_update_conversions`` together with the
    matching slice of the reference.  Afterwards the counts are written
    with ``write_files``, the command line is recorded in
    ``<out_dir>/cmd.ran`` and ``write_sam_commands`` is called.

    Parameters:
        gsnap_f: path of the tab-separated SAM file to read.
        ref_path: path handed to ``Fasta`` to load the reference.
        out_dir: directory that receives the count files and ``cmd.ran``.
        paired_end: when true, keep only lines whose 7th column is "=";
            otherwise drop lines whose flag equals 4.

    Returns:
        None.

    Raises:
        AssertionError: when the reference slice for an alignment is empty.
    """
    import datetime

    fa = Fasta(ref_path)
    fc, ft, _fmethyltype = bin_paths_from_fasta(fa.fasta_name, out_dir)
    counts = get_counts(fc, ft, fa)

    sys.stderr.write("tabulating methylation\n")

    # NOTE(review): if gsnap_f does not contain ".gsnap.sam" the output
    # path equals the input path and opening it for writing truncates the
    # input -- confirm callers always pass a ".gsnap.sam" file.
    subset_path = gsnap_f.replace(".gsnap.sam", ".sam")

    # Both handles are closed (and the subset flushed) before the counts
    # and SAM commands are written below.
    with open(subset_path, "w") as gsnap_subset:
        with open(gsnap_f) as sam_in:
            for sline in sam_in:
                if sline.startswith("@SQ"):
                    gsnap_subset.write(sline.strip() + "\n")
                    continue

                line = sline.split("\t")
                sam_flag = int(line[1])
                if paired_end:
                    # the ends didn't map to same spot.
                    if line[6] != "=":
                        continue
                elif sam_flag == 4:
                    # no reported alignments.
                    continue

                gsnap_subset.write(sline.rstrip("\n") + "\n")

                seqid = line[2]
                aln_seq = line[9]
                read_length = len(aln_seq)
                # Column 4 is 1-based; convert to a 0-based start.
                bp0 = int(line[3]) - 1
                # Use G->A when exactly one of flag bits 16 and 128 is set
                # (reverse strand / second mate in the SAM format).
                bit16_set = (sam_flag & 16) != 0
                bit128_set = (sam_flag & 128) != 0
                ga = bit16_set != bit128_set
                insert_length = int(line[8])

                # both ends start at exactly the same place.
                if insert_length == 0:
                    continue
                # handle overlapping reads. one side has + insert,
                # the other is -
                if -read_length < insert_length < 0:
                    insert_length = abs(insert_length)
                    # Trim the read so it is only insert_length long.
                    aln_seq = aln_seq[:-(read_length - insert_length)]
                    read_length = len(aln_seq)
                if line[7] == '0':
                    continue

                bp1 = bp0 + read_length
                ref_seq = (fa[seqid][bp0:bp1]).upper()

                letters = 'GA' if ga else 'CT'
                # The reference slice may be shorter than the read, so the
                # length passed on is that of the slice.
                read_length = len(ref_seq)
                assert read_length > 0, (bp0, bp1)
                # NOTE(review): 50 is presumably a mismatch limit and
                # line[5] the CIGAR string -- confirm against
                # _update_conversions.
                _update_conversions(ref_seq, aln_seq, bp0, letters,
                                    counts[seqid]['c'], counts[seqid]['t'],
                                    50, read_length, line[5])

    write_files(fa.fasta_name, out_dir, counts)

    # Record when, where and how this run was started.
    with open(out_dir + "/cmd.ran", "w") as cmd:
        cmd.write("#date: %s\n" % str(datetime.date.today()))
        cmd.write("#path: %s\n" % op.abspath("."))
        cmd.write(" ".join(sys.argv) + "\n")
    write_sam_commands(out_dir, fa, "methylcoder.gsnap")