def genrefblks(readseq, chrom, start, stop, strand, cigar, nreads):
    """Print alignment-derived records for one read to standard output.

    The CIGAR string is walked operation by operation and one
    space-separated record is printed per line:

      P nreads chrom strand tleftlim trightlim
          only when the module-level ``transcriptome`` flag is true
      M nreads chrom strand position seq
          per match operation, restricted to the read bases that remain
          after excluding ENDTRIM positions at each end of the read
      D nreads chrom strand refpos length
      I nreads chrom strand position length
          deletions / insertions whose position lies inside
          [start + ENDTRIM, stop - ENDTRIM)
      5 nreads chrom strand position
      3 nreads chrom strand position
          the read ends, widened by leading / trailing soft clips
      E nreads num cmd readseq
          printed immediately before ValueError is raised for an
          operation this function does not handle

    Args:
        readseq: read bases; reverse-complemented here when strand is '-'.
        chrom: reference name, echoed into every record.
        start: reference coordinate at which the first CIGAR operation
            begins.
        stop: reference end coordinate; the last reported position is
            ``stop - 1``.
        strand: '+' or '-'.
        cigar: CIGAR string, tokenised by the module-level
            ``cigar_pattern`` into (count, operation) pairs.
        nreads: value echoed as the second field of every record.

    Raises:
        ValueError: if no CIGAR operation can be parsed from ``cigar`` or
            an operation other than M, S, N, D, I or H is met.
    """
    def emit(*fields):
        # A single pre-joined string prints the same text whether `print`
        # is the Python 2 statement or the Python 3 function.
        print(' '.join('%s' % (field,) for field in fields))

    cigarcommands = cigar_pattern.findall(cigar)
    if not cigarcommands:
        # Fail with a clear message before any record has been printed,
        # instead of an IndexError from cigarcommands[0] further down.
        raise ValueError('no CIGAR operations found in %r' % (cigar,))

    refpos = start
    readpos = 0

    if strand == '-':
        readseq = reverse_complement(readseq)

    # Trim limits are taken from the unadjusted start/stop, i.e. before the
    # soft-clip widening applied below.
    tleftlim, trightlim = start + ENDTRIM, stop - ENDTRIM
    qleftlim, qrightlim = ENDTRIM, len(readseq) - ENDTRIM

    if transcriptome:
        emit('P', nreads, chrom, strand, tleftlim, trightlim)

    if cigarcommands[0][1] == 'S':
        # shift start site for the first soft clipping
        start -= int(cigarcommands[0][0])
    if len(cigarcommands) > 1 and cigarcommands[-1][1] == 'S':
        # last soft clipping
        stop += int(cigarcommands[-1][0])

    for num, cmd in cigarcommands:
        num = int(num)
        if cmd == 'M':  # match: consumes read and reference
            mleft = max(qleftlim, readpos)
            mright = min(qrightlim, readpos + num)
            if mleft < mright:
                emit('M', nreads, chrom, strand, max(refpos, tleftlim),
                     readseq[mleft:mright])
            refpos += num
            readpos += num
        elif cmd == 'S':  # soft clip: consumes read only
            readpos += num
        elif cmd == 'N':  # skip: consumes reference only
            refpos += num
        elif cmd == 'D':  # deletion: consumes reference only
            if tleftlim <= refpos < trightlim:
                emit('D', nreads, chrom, strand, refpos, num)
            refpos += num
        elif cmd == 'I':  # insertion: consumes read only
            # Reported one position to the left on the minus strand.
            ppos = refpos if strand == '+' else refpos - 1
            if tleftlim <= ppos < trightlim:
                emit('I', nreads, chrom, strand, ppos, num)
            readpos += num
        elif cmd == 'H':  # hard clipping: consumes nothing
            pass
        else:
            emit('E', nreads, num, cmd, readseq)
            raise ValueError('unsupported CIGAR operation %r in %r'
                             % (cmd, cigar))

    if strand == '+':
        fivep, threep = start, stop - 1
    else:
        fivep, threep = stop - 1, start

    emit('5', nreads, chrom, strand, fivep)
    emit('3', nreads, chrom, strand, threep)
def genrefblks(readseq, chrom, start, stop, strand, cigar, nreads):
    """Print alignment-derived records for one read to standard output.

    The CIGAR string is walked operation by operation and one
    space-separated record is printed per line:

      P nreads chrom strand tleftlim trightlim
          only when the module-level ``transcriptome`` flag is true
      M nreads chrom strand position seq
          per match operation, restricted to the read bases that remain
          after excluding ENDTRIM positions at each end of the read
      D nreads chrom strand refpos length
      I nreads chrom strand position length
          deletions / insertions whose position lies inside
          [start + ENDTRIM, stop - ENDTRIM)
      5 nreads chrom strand position
      3 nreads chrom strand position
          the read ends, widened by leading / trailing soft clips
      E nreads num cmd readseq
          printed immediately before ValueError is raised for an
          operation this function does not handle

    Args:
        readseq: read bases; reverse-complemented here when strand is '-'.
        chrom: reference name, echoed into every record.
        start: reference coordinate at which the first CIGAR operation
            begins.
        stop: reference end coordinate; the last reported position is
            ``stop - 1``.
        strand: '+' or '-'.
        cigar: CIGAR string, tokenised by the module-level
            ``cigar_pattern`` into (count, operation) pairs.
        nreads: value echoed as the second field of every record.

    Raises:
        ValueError: if no CIGAR operation can be parsed from ``cigar`` or
            an operation other than M, S, N, D, I or H is met.
    """
    def emit(*fields):
        # A single pre-joined string prints the same text whether `print`
        # is the Python 2 statement or the Python 3 function.
        print(' '.join('%s' % (field,) for field in fields))

    cigarcommands = cigar_pattern.findall(cigar)
    if not cigarcommands:
        # Fail with a clear message before any record has been printed,
        # instead of an IndexError from cigarcommands[0] further down.
        raise ValueError('no CIGAR operations found in %r' % (cigar,))

    refpos = start
    readpos = 0

    if strand == '-':
        readseq = reverse_complement(readseq)

    # Trim limits are taken from the unadjusted start/stop, i.e. before the
    # soft-clip widening applied below.
    tleftlim, trightlim = start + ENDTRIM, stop - ENDTRIM
    qleftlim, qrightlim = ENDTRIM, len(readseq) - ENDTRIM

    if transcriptome:
        emit('P', nreads, chrom, strand, tleftlim, trightlim)

    if cigarcommands[0][1] == 'S':
        # shift start site for the first soft clipping
        start -= int(cigarcommands[0][0])
    if len(cigarcommands) > 1 and cigarcommands[-1][1] == 'S':
        # last soft clipping
        stop += int(cigarcommands[-1][0])

    for num, cmd in cigarcommands:
        num = int(num)
        if cmd == 'M':  # match: consumes read and reference
            mleft = max(qleftlim, readpos)
            mright = min(qrightlim, readpos + num)
            if mleft < mright:
                emit('M', nreads, chrom, strand, max(refpos, tleftlim),
                     readseq[mleft:mright])
            refpos += num
            readpos += num
        elif cmd == 'S':  # soft clip: consumes read only
            readpos += num
        elif cmd == 'N':  # skip: consumes reference only
            refpos += num
        elif cmd == 'D':  # deletion: consumes reference only
            if tleftlim <= refpos < trightlim:
                emit('D', nreads, chrom, strand, refpos, num)
            refpos += num
        elif cmd == 'I':  # insertion: consumes read only
            # Reported one position to the left on the minus strand.
            ppos = refpos if strand == '+' else refpos - 1
            if tleftlim <= ppos < trightlim:
                emit('I', nreads, chrom, strand, ppos, num)
            readpos += num
        elif cmd == 'H':  # hard clipping: consumes nothing
            pass
        else:
            emit('E', nreads, num, cmd, readseq)
            raise ValueError('unsupported CIGAR operation %r in %r'
                             % (cmd, cigar))

    if strand == '+':
        fivep, threep = start, stop - 1
    else:
        fivep, threep = stop - 1, start

    emit('5', nreads, chrom, strand, fivep)
    emit('3', nreads, chrom, strand, threep)
def process(nrlist):
    """Write one BED line and one FASTA record per accession in ``nrlist``.

    For each accession the exon blocks are fetched from ``mm9`` and joined
    into a single upper-cased sequence. A position is overwritten with
    base B when B's share of the per-position depth is at least
    MINPERCENTTOCALL and the depth is at least MINREADSTOCALL. Minus-strand
    sequences are reverse-complemented before being written.

    BED line (tab separated, appended to ``bedout``):
        accession, start, end, geneName, '.', '+'
    where, for accessions starting with 'NM_', start is the 5' UTR length
    and end is start + CDS length; otherwise the interval is
    0 .. total exon length.

    FASTA record (appended to ``faout``): a '>accession geneName' header
    followed by the module-level ``textwrap`` applied to the sequence.

    Args:
        nrlist: iterable of accession keys present in ``refFlat``.
    """
    # Rows of the count array are selected in A, C, G, T order.
    nucleotidetracks = [rseqarr.TRACKS.index(i) for i in 'ACGT']

    for nracc in nrlist:
        dbinfo = refFlat[nracc]
        chrom = dbinfo['chrom']
        strand = dbinfo['strand']
        exonblocks = dbinfo['exonBlocks']

        genomeseq = ''.join(
            mm9.get(chrom, blkstart, blkend)
            for blkstart, blkend in exonblocks).upper()

        if nracc.startswith('NM_'):
            # On the minus strand the 5' UTR is the right-hand UTR.
            utr5 = 'rightUtrBlocks' if strand == '-' else 'leftUtrBlocks'
            bedstart = blklength(dbinfo[utr5])
            bedend = bedstart + blklength(dbinfo['cdsBlocks'])
        else:
            bedstart = 0
            bedend = blklength(exonblocks)

        # NOTE(review): the column order of the count array is assumed to
        # match genomeseq (exon blocks in stored order, not yet
        # reverse-complemented) -- confirm against rseqarr.get_blocks,
        # which is also handed the strand.
        cntarray = rseqarr.get_blocks(chrom, exonblocks,
                                      strand)[nucleotidetracks]
        # Depth is floored at 1 so the division below never divides by 0,
        # and stored as float so the ratio is a true division.
        depthcnt = np.array(cntarray.sum(0).clip(1), 'd')
        confidentcalls = ((cntarray / depthcnt >= MINPERCENTTOCALL) *
                          (depthcnt >= MINREADSTOCALL))

        mutatedseq = list(genomeseq)
        # If several bases qualify at one position, the later one in
        # 'ACGT' order is the one that remains.
        for base, calls in zip('ACGT', confidentcalls):
            for pos in np.where(calls)[0]:
                mutatedseq[pos] = base
        mutatedseq = ''.join(mutatedseq)

        if strand == '-':
            mutatedseq = reverse_complement(mutatedseq)

        bedfields = [nracc, str(bedstart), str(bedend),
                     '%s' % dbinfo['geneName'], '.', '+']
        bedout.write('\t'.join(bedfields) + '\n')

        faout.write('>%s %s\n' % (nracc, dbinfo['geneName']))
        faout.write(textwrap(mutatedseq))
def process(nrlist):
    """Write one BED line and one FASTA record per accession in ``nrlist``.

    For each accession the exon blocks are fetched from ``mm9`` and joined
    into a single upper-cased sequence. A position is overwritten with
    base B when B's share of the per-position depth is at least
    MINPERCENTTOCALL and the depth is at least MINREADSTOCALL. Minus-strand
    sequences are reverse-complemented before being written.

    BED line (tab separated, appended to ``bedout``):
        accession, start, end, geneName, '.', '+'
    where, for accessions starting with 'NM_', start is the 5' UTR length
    and end is start + CDS length; otherwise the interval is
    0 .. total exon length.

    FASTA record (appended to ``faout``): a '>accession geneName' header
    followed by the module-level ``textwrap`` applied to the sequence.

    Args:
        nrlist: iterable of accession keys present in ``refFlat``.
    """
    # Rows of the count array are selected in A, C, G, T order.
    nucleotidetracks = [rseqarr.TRACKS.index(i) for i in 'ACGT']

    for nracc in nrlist:
        dbinfo = refFlat[nracc]
        chrom = dbinfo['chrom']
        strand = dbinfo['strand']
        exonblocks = dbinfo['exonBlocks']

        genomeseq = ''.join(
            mm9.get(chrom, blkstart, blkend)
            for blkstart, blkend in exonblocks).upper()

        if nracc.startswith('NM_'):
            # On the minus strand the 5' UTR is the right-hand UTR.
            utr5 = 'rightUtrBlocks' if strand == '-' else 'leftUtrBlocks'
            bedstart = blklength(dbinfo[utr5])
            bedend = bedstart + blklength(dbinfo['cdsBlocks'])
        else:
            bedstart = 0
            bedend = blklength(exonblocks)

        # NOTE(review): the column order of the count array is assumed to
        # match genomeseq (exon blocks in stored order, not yet
        # reverse-complemented) -- confirm against rseqarr.get_blocks,
        # which is also handed the strand.
        cntarray = rseqarr.get_blocks(chrom, exonblocks,
                                      strand)[nucleotidetracks]
        # Depth is floored at 1 so the division below never divides by 0,
        # and stored as float so the ratio is a true division.
        depthcnt = np.array(cntarray.sum(0).clip(1), 'd')
        confidentcalls = ((cntarray / depthcnt >= MINPERCENTTOCALL) *
                          (depthcnt >= MINREADSTOCALL))

        mutatedseq = list(genomeseq)
        # If several bases qualify at one position, the later one in
        # 'ACGT' order is the one that remains.
        for base, calls in zip('ACGT', confidentcalls):
            for pos in np.where(calls)[0]:
                mutatedseq[pos] = base
        mutatedseq = ''.join(mutatedseq)

        if strand == '-':
            mutatedseq = reverse_complement(mutatedseq)

        bedfields = [nracc, str(bedstart), str(bedend),
                     '%s' % dbinfo['geneName'], '.', '+']
        bedout.write('\t'.join(bedfields) + '\n')

        faout.write('>%s %s\n' % (nracc, dbinfo['geneName']))
        faout.write(textwrap(mutatedseq))
def iteralignments(self, strands='+-', withref=False):
    """Yield one dict per read name from the SAM lines in ``self.samfile``.

    Header lines (starting with '@') are consumed without yielding; for
    '@SQ' lines the sequence length is stored in ``self.seqlen`` under the
    sequence name. For every other line a record is built from that line,
    from the alternative hits listed in its ``XA`` option, and from any
    directly following lines that carry the same query name. The first
    line with a different name is handed back via ``self.samfile.push``.

    Args:
        strands: strands to keep; hits on any other strand are dropped.
        withref: when true, each hit tuple gets the reference subsequence
            from ``self.getsubseq`` appended (reverse-complemented for
            '-' hits).

    Yields:
        dict with keys 'qname', 'flags', 'mapq', 'seq', 'options' and
        'mapped'. 'seq' is reverse-complemented when the first line of the
        read has the reverse-strand flag set. 'mapped' is a list of
        (rname, start, stop, strand, editdist, cigar[, refseq]) tuples
        sorted by edit distance (-1 when the line has no NM option); stop
        is the 1-based last reference position, start is the 1-based
        leftmost position, or 0-based when ``self.zerobase`` is true.
    """
    def refspan(pos, cigar):
        # Reference interval of an alignment whose leftmost base is at the
        # 1-based position pos.
        reflen, _ = calculate_cigar_length(cigar)
        stop = pos + reflen - 1
        start = pos - 1 if self.zerobase else pos
        return start, stop

    def parse_alignment(fields):
        # Decode one alignment line into (flags, strand, options, hit);
        # hit is None when the read is unmapped or on a filtered strand.
        flags = int(fields[1])
        rname = fields[2]
        pos = int(fields[3])  # 1-based leftmost
        cigar = fields[5]
        options = dict(v.split(':', 1) for v in fields[11:])
        strand = '-' if flags & F_REVERSE_STRAND else '+'
        editdist = int(options.get('NM', 'i:-1')[2:])
        if rname == '*' or strand not in strands:
            return flags, strand, options, None
        start, stop = refspan(pos, cigar)
        return flags, strand, options, (rname, start, stop, strand,
                                        editdist, cigar)

    for line in self.samfile:
        # Strip only the line terminator: slicing off the last character
        # would eat real data on a final line that has no newline.
        fields = line.rstrip('\r\n').split('\t')

        if line.startswith('@'):
            if line.startswith('@SQ'):
                sqname = fields[1][3:]
                sqlen = [int(fl[3:]) for fl in fields[2:]
                         if fl[:3] == 'LN:'][0]
                self.seqlen[sqname] = sqlen
            continue

        qname = fields[0]
        mapq = int(fields[4])  # phred-scaled
        seq = fields[9]
        flags, strand, options, hit = parse_alignment(fields)
        if strand == '-':
            seq = reverse_complement(seq)
        mapped = [] if hit is None else [hit]

        # Alternative hits from the XA option:
        # "rname,<strand><pos>,cigar,editdist;" repeated.
        for altmatch in options.get('XA', 'Z:')[2:].split(';')[:-1]:
            xafields = altmatch.split(',')
            xastrand = xafields[1][0]
            xapos = int(xafields[1][1:])
            xarname = xafields[0]
            xacigar = xafields[2]
            xaeditdist = int(xafields[3])
            xastart, xastop = refspan(xapos, xacigar)
            if xastrand in strands:
                mapped.append((xarname, xastart, xastop, xastrand,
                               xaeditdist, xacigar))

        # Following lines with the same query name belong to this read.
        for altline in self.samfile:
            altfields = altline.rstrip('\r\n').split('\t')
            if altfields[0] != qname:
                self.samfile.push(altline)
                break
            althit = parse_alignment(altfields)[3]
            if althit is not None:
                mapped.append(althit)

        mapped.sort(key=lambda m: m[4])

        if withref:
            newmapped = []
            for m in mapped:
                # The original 1-based call passed (start - 1, stop); when
                # zerobase is set, start has already been shifted, so
                # subtracting again would prepend one extra base.
                # NOTE(review): assumes getsubseq takes a 0-based,
                # end-exclusive interval -- confirm.
                refstart = m[1] if self.zerobase else m[1] - 1
                subseq = self.getsubseq(m[0], refstart, m[2])
                if m[3] == '-':
                    subseq = reverse_complement(subseq)
                newmapped.append(m + (subseq,))
            mapped = newmapped

        yield {
            'qname': qname,
            'flags': flags,
            'mapq': mapq,
            'seq': seq,
            'options': options,
            'mapped': mapped,  # stop is 1-based; start too unless zerobase
        }