def convert_colorspace(color_seq, char_a, char_b):
    """
    Apply a base substitution to a colorspace read.

    The read is stripped of trailing whitespace and converted to a base
    sequence with cs2seq; every char_a in that sequence is replaced by
    char_b (e.g. C to T); the result is converted back to colorspace
    with seq2cs and returned.
    """
    decoded = cs2seq(color_seq.rstrip())
    substituted = decoded.replace(char_a, char_b)
    return seq2cs(substituted)
def parse_sam(sam_iter, chr_lengths, get_records, unmapped_name,
              is_colorspace, out_sam):
    """
    Generate one alignment record for each usable line of a SAM stream.

    NOTE(review): an identical parse_sam is defined again later in this
    file; the later definition is the one bound at import time.

    Parameters:
        sam_iter: iterable of SAM text lines. Lines starting with "@" are
            handed to copy_header; every other line is split on tabs.
        chr_lengths: indexed as chr_lengths[seqid] to obtain a sequence
            length; also passed to write_new_header and copy_header.
        get_records: callable taking (read_id, idx) and returning a
            (raw_fastq, converted_fastq) pair; the .seq attribute of each
            is read and str(raw_fastq) is written for unmapped reads.
        unmapped_name: path of the file that receives reads that are not
            reported as alignments. It is opened for writing (truncated).
        is_colorspace: coerced with int(); when non-zero, positions and
            read length are adjusted and the raw read goes through cs2seq.
        out_sam: passed through to write_new_header and copy_header.

    Yields:
        A 4-tuple (info, line, read_len, direction) where info is a dict
        with keys read_id, seqid, pos0, mapq, nmiss, read_sequence and
        raw_read; line is the list of SAM fields, modified in place;
        read_len is the computed read length; direction is 'f' or 'r'.

    Raises:
        AssertionError: the reference name does not begin with 'f' or 'r'.
        IndexError: an aligned line carries no NM optional field.
    """
    is_colorspace = int(is_colorspace)
    unmapped = open(unmapped_name, "w")
    # try/finally so the handle is flushed and closed when the generator is
    # exhausted, closed early, or fails part way through.
    try:
        # file-object write() emits the same bytes as the print-chevron
        # statement and is valid on both Python 2 and Python 3.
        sys.stderr.write("writing unmapped reads to %s\n" % (unmapped.name, ))
        write_new_header(out_sam, chr_lengths)
        for sline in sam_iter:
            # header line.
            if sline.startswith("@"):
                copy_header(out_sam, sline, chr_lengths)
                continue

            line = sline.split("\t")
            read_id = line[0]
            sam_flag = int(line[1])

            # no reported alignments.
            # extra via -m
            if sam_flag == 4:
                if "XM:i:0" not in sline:
                    # write stuff that was excluded because of too many
                    # mappings.
                    raw_fastq, converted_fastq = get_records(read_id, 0)
                    unmapped.write(str(raw_fastq) + "\n")
                continue

            # extra found via -M
            if line[4] == '0' and sam_flag == 0:
                raw_fastq, converted_fastq = get_records(read_id, 0)
                unmapped.write(str(raw_fastq) + "\n")
                continue

            # reset for every record so a flag-0 line never inherits the
            # mate index chosen for a previous paired line.
            idx = 0
            if sam_flag != 0:
                # if the pair doesn't map to same place, skip.
                if line[6] != "=":
                    continue
                # flags are (1 | 2 | 32 | 64) or (1 | 2 | 16 | 128)
                idx = 1 if (sam_flag & 128) else 0
                # bowtie prints the alignment without the pair end info.
                # add back /1 or /2 here.
                read_id = read_id + "/" + str(idx + 1)

            # the reference name carries a one-letter direction prefix.
            seqid = line[2]
            direction = seqid[0]
            assert direction in 'fr'
            seqid = seqid[1:]
            line[2] = seqid

            pos0 = int(line[3]) - 1
            if is_colorspace:
                pos0 -= 2
            converted_seq = line[9]

            # we want to include the original, non converted reads
            # in the output file to view the alignment.
            raw_fastq, converted_fastq = get_records(read_id, idx)
            read_len = len(converted_seq) + 3 * is_colorspace
            raw_seq = raw_fastq.seq
            if is_colorspace:
                raw_seq = cs2seq(raw_seq)

            if direction == 'f':
                line[9] = raw_seq
            else:
                pos0 = chr_lengths[seqid] - pos0 - read_len
                # adjust mate position as well.
                mpos = int(line[7])
                mpos = chr_lengths[seqid] - mpos - read_len + 2
                line[8] = str(pos0 - mpos + 1)  # insert size
                line[7] = str(mpos)
                line[3] = str(pos0 + 1)
                # since the read matched the flipped genome. we flip it here.
                line[9] = raw_seq = revcomp(raw_seq)
                # flip the quality as well.
                line[10] = line[10][::-1]
                # alignment on reverse strand.
                # NOTE(review): when bit 16 is already set (e.g. 147) the
                # addition carries into bit 32 -- confirm this is intended.
                line[1] = str(sam_flag + 16)
                converted_seq = revcomp(converted_fastq.seq)

            if (sam_flag & 128) != 0:
                # the other end of the pair.
                line[9] = raw_seq = revcomp(raw_seq)
                converted_seq = revcomp(converted_seq)

            # NM:i:2 -- take everything after the last colon so counts of
            # ten or more are not truncated to their final digit.
            nm_tag = [x for x in line[11:] if x.startswith("NM")][0]
            nmiss = int(nm_tag.rstrip().split(":")[-1])

            line[-1] = line[-1].rstrip()
            yield dict(
                read_id=read_id,
                seqid=line[2],
                pos0=pos0,
                mapq=line[4],
                nmiss=nmiss,
                read_sequence=converted_seq,
                raw_read=raw_seq,
            ), line, read_len, direction
    finally:
        unmapped.close()
def parse_sam(sam_iter, chr_lengths, get_records, unmapped_name,
              is_colorspace, out_sam):
    """
    Generate one alignment record for each usable line of a SAM stream.

    NOTE(review): this repeats an earlier definition of parse_sam in this
    file; being the later one, this is the definition bound at import time.

    Parameters:
        sam_iter: iterable of SAM text lines. Lines starting with "@" are
            handed to copy_header; every other line is split on tabs.
        chr_lengths: indexed as chr_lengths[seqid] to obtain a sequence
            length; also passed to write_new_header and copy_header.
        get_records: callable taking (read_id, idx) and returning a
            (raw_fastq, converted_fastq) pair; the .seq attribute of each
            is read and str(raw_fastq) is written for unmapped reads.
        unmapped_name: path of the file that receives reads that are not
            reported as alignments. It is opened for writing (truncated).
        is_colorspace: coerced with int(); when non-zero, positions and
            read length are adjusted and the raw read goes through cs2seq.
        out_sam: passed through to write_new_header and copy_header.

    Yields:
        A 4-tuple (info, line, read_len, direction) where info is a dict
        with keys read_id, seqid, pos0, mapq, nmiss, read_sequence and
        raw_read; line is the list of SAM fields, modified in place;
        read_len is the computed read length; direction is 'f' or 'r'.

    Raises:
        AssertionError: the reference name does not begin with 'f' or 'r'.
        IndexError: an aligned line carries no NM optional field.
    """
    is_colorspace = int(is_colorspace)
    unmapped = open(unmapped_name, "w")
    # try/finally so the handle is flushed and closed when the generator is
    # exhausted, closed early, or fails part way through.
    try:
        # file-object write() emits the same bytes as the print-chevron
        # statement and is valid on both Python 2 and Python 3.
        sys.stderr.write("writing unmapped reads to %s\n" % (unmapped.name, ))
        write_new_header(out_sam, chr_lengths)
        for sline in sam_iter:
            # header line.
            if sline.startswith("@"):
                copy_header(out_sam, sline, chr_lengths)
                continue

            line = sline.split("\t")
            read_id = line[0]
            sam_flag = int(line[1])

            # no reported alignments.
            # extra via -m
            if sam_flag == 4:
                if "XM:i:0" not in sline:
                    # write stuff that was excluded because of too many
                    # mappings.
                    raw_fastq, converted_fastq = get_records(read_id, 0)
                    unmapped.write(str(raw_fastq) + "\n")
                continue

            # extra found via -M
            if line[4] == '0' and sam_flag == 0:
                raw_fastq, converted_fastq = get_records(read_id, 0)
                unmapped.write(str(raw_fastq) + "\n")
                continue

            # reset for every record so a flag-0 line never inherits the
            # mate index chosen for a previous paired line.
            idx = 0
            if sam_flag != 0:
                # if the pair doesn't map to same place, skip.
                if line[6] != "=":
                    continue
                # flags are (1 | 2 | 32 | 64) or (1 | 2 | 16 | 128)
                idx = 1 if (sam_flag & 128) else 0
                # bowtie prints the alignment without the pair end info.
                # add back /1 or /2 here.
                read_id = read_id + "/" + str(idx + 1)

            # the reference name carries a one-letter direction prefix.
            seqid = line[2]
            direction = seqid[0]
            assert direction in 'fr'
            seqid = seqid[1:]
            line[2] = seqid

            pos0 = int(line[3]) - 1
            if is_colorspace:
                pos0 -= 2
            converted_seq = line[9]

            # we want to include the original, non converted reads
            # in the output file to view the alignment.
            raw_fastq, converted_fastq = get_records(read_id, idx)
            read_len = len(converted_seq) + 3 * is_colorspace
            raw_seq = raw_fastq.seq
            if is_colorspace:
                raw_seq = cs2seq(raw_seq)

            if direction == 'f':
                line[9] = raw_seq
            else:
                pos0 = chr_lengths[seqid] - pos0 - read_len
                # adjust mate position as well.
                mpos = int(line[7])
                mpos = chr_lengths[seqid] - mpos - read_len + 2
                line[8] = str(pos0 - mpos + 1)  # insert size
                line[7] = str(mpos)
                line[3] = str(pos0 + 1)
                # since the read matched the flipped genome. we flip it here.
                line[9] = raw_seq = revcomp(raw_seq)
                # flip the quality as well.
                line[10] = line[10][::-1]
                # alignment on reverse strand.
                # NOTE(review): when bit 16 is already set (e.g. 147) the
                # addition carries into bit 32 -- confirm this is intended.
                line[1] = str(sam_flag + 16)
                converted_seq = revcomp(converted_fastq.seq)

            if (sam_flag & 128) != 0:
                # the other end of the pair.
                line[9] = raw_seq = revcomp(raw_seq)
                converted_seq = revcomp(converted_seq)

            # NM:i:2 -- take everything after the last colon so counts of
            # ten or more are not truncated to their final digit.
            nm_tag = [x for x in line[11:] if x.startswith("NM")][0]
            nmiss = int(nm_tag.rstrip().split(":")[-1])

            line[-1] = line[-1].rstrip()
            yield dict(
                read_id=read_id,
                seqid=line[2],
                pos0=pos0,
                mapq=line[4],
                nmiss=nmiss,
                read_sequence=converted_seq,
                raw_read=raw_seq,
            ), line, read_len, direction
    finally:
        unmapped.close()