def getJsnps(seq, genes, gene2pos):
    '''Compare the J portion of a clone with its reference J gene.

    The J fragment is seq.nuc from seq.jindex to the end, upper-cased. It is
    searched (as a regular-expression pattern) in the upper-cased reference
    sequence genes[seq.jgene]. Every hit is checked for "productivity":
    extending the clone with the reference bases up to
    gene2pos[seq.jgene] + 3 must give an in-frame, stop-free translation
    ending in 'F'. If more than one hit passes, a message is written to
    stderr and the process exits with status 1.

    The comparison itself always uses the FIRST hit, whether or not it passed
    the productivity check. seq.jrefstart is set to that hit's start.

    Returns the result of getSnps(jfrag, refjfrag, startPos).
    Raises IndexError when the fragment is not found in the reference at all.
    '''
    fragment = seq.nuc[seq.jindex:].upper()
    reference = genes[seq.jgene].upper()

    # Start offsets of every (non-overlapping) occurrence of the fragment.
    hitStarts = [hit.start() for hit in re.finditer(fragment, reference)]
    fragLen = len(fragment)
    # NOTE(review): gene2pos[...] + 3 looks like the end of the codon that
    # closes the CDR3 on the J gene -- confirm against the caller.
    cdr3End = gene2pos[seq.jgene] + 3

    productiveHits = 0
    for begin in hitStarts:
        stop = begin + fragLen
        # The primer should be somewhere to the left (5' end) of the J gene
        # and should cover somewhere after the 10 leftmost nucleotides.
        if begin > 15 or stop < 10 or stop >= cdr3End:
            continue
        missingTail = reference[stop:cdr3End]
        if (len(seq.nuc) - seq.vindex + len(missingTail)) % 3 != 0:
            continue  # out of frame
        # Translate the completed CDR3.
        completed = seq.nuc + missingTail
        protein = iseqlib.nt2aa(completed[seq.vindex:])
        if protein[-1] != 'F' or '*' in protein:
            continue
        if seq.aa not in protein:
            sys.stderr.write('Warning: the new infered aa doesnot contain current aa\n')
        productiveHits += 1

    if productiveHits > 1:
        sys.stderr.write("Looking for SNPs if any on the J fragment. However, mapped to multiple Js. Sequence: %s, %s, %s\n" %(seq.id, seq.nuc, seq.aa))
        sys.exit(1)

    firstStart = hitStarts[0]
    # Never compare past the CDR3 end position on the reference.
    refEnd = min(firstStart + fragLen, gene2pos[seq.jgene] + 3)
    if refEnd > len(reference):
        sys.stderr.write("Clone goes beyond downstream of J gene\n")

    seq.jrefstart = firstStart
    return getSnps(fragment, reference[firstStart:refEnd], firstStart)
def fillInSeq(seq, genes, gene2pos):
    '''Complete a partial CDR3 using the reference J gene sequence.

    Earlier versions of the adaptiveTCR tsv files did not always contain the
    complete CDR3 (the reads only supported part of it). This function looks
    up the reference J gene the clone was mapped to and appends the missing
    bases so that the clone carries a full CDR3.

    seq      -- clone record; reads id, nuc, aa, vindex, jindex, jgene and,
                on success, rewrites nuc, aa, jindex, cdr3nuc, inframenuc
                and longaa in place.
    genes    -- mapping of J gene name to its reference nucleotide sequence.
    gene2pos -- mapping of J gene name to a position on that reference; the
                CDR3 is extended up to that position + 3.
                NOTE(review): presumably the start of the conserved F codon
                -- confirm against the caller.

    Returns 1 when exactly one productive extension exists (seq is updated),
    -1 when there are none or more than one (seq is left untouched and a
    message is written to stderr).
    '''
    jfrag = seq.nuc[seq.jindex:].upper()
    jseq = genes[seq.jgene].upper()
    pos = gene2pos[seq.jgene] + 3

    # Find where the jfrag is in the jseq. The fragment is a literal
    # sequence, so escape it rather than treating it as a regex pattern.
    starts = [m.start() for m in re.finditer(re.escape(jfrag), jseq)]

    # Collect every productive (full nucleotide, CDR3 amino acid) candidate.
    candidates = []
    for s in starts:
        e = s + len(jfrag)
        # The primer should be somewhere to the left (5' end) of the J gene
        # and should cover somewhere after the 10 leftmost nucleotides.
        if not (s <= 15 and 10 <= e < pos):
            continue
        extraseq = jseq[e:pos]
        if (len(seq.nuc) - seq.vindex + len(extraseq)) % 3 != 0:
            continue  # out of frame
        # Translate the completed CDR3.
        newNuc = seq.nuc + extraseq
        newaa = iseqlib.nt2aa(newNuc[seq.vindex:])
        # An empty translation cannot be productive; checking it first avoids
        # indexing into an empty result.
        if not newaa or newaa[-1] != 'F' or '*' in newaa:
            continue
        if seq.aa not in newaa:
            sys.stderr.write('Warning: the new infered aa doesnot contain current aa\n')
        candidates.append((newNuc, newaa))

    if not candidates:
        sys.stderr.write('Attempted to fill in the right side of the nuc and CDR3 sequences. Zero productive matches found. Sequence looked at was: %s, %s, %s\n' %(seq.id, seq.nuc, seq.aa))
        # starts holds ints; they must be converted before joining, otherwise
        # this diagnostic itself raises TypeError.
        sys.stderr.write("jfrag: %s, jseq: %s, matchesStarts: *%s*\n" %(jfrag, jseq, ','.join(map(str, starts))))
        return -1
    if len(candidates) > 1:
        sys.stderr.write('Attempted to fill in the right side of the nuc and CDR3 sequences. Multiple productive matches found - could not decide. Sequence looked at was: %s, %s, %s\n' %(seq.id, seq.nuc, seq.aa))
        return -1

    seq.nuc, seq.aa = candidates[0]
    seq.jindex = seq.vindex + len(seq.aa)*3
    seq.cdr3nuc = seq.nuc[seq.vindex:seq.jindex]
    # Longest stretch of seq.nuc in the same reading frame as vindex: start
    # at vindex % 3 and drop trailing bases that do not fill a codon.
    frame = seq.vindex % 3
    seq.inframenuc = seq.nuc[frame: len(seq.nuc) - ((len(seq.nuc) - frame) % 3)]
    seq.longaa = iseqlib.nt2aa(seq.inframenuc)
    return 1
def __init__(self, name, seq, count, vs, js, freq):
    '''Record a clone's fields and build its identifying header.

    name, seq, count and freq are stored as given. vs and js are stored
    sorted in self.vs / self.js. self.aa is iseqlib.nt2aa(seq).

    self.header is "aa|vs|js", where vs and js are comma-joined in the order
    they were passed in (not the sorted order).
    Header example: CASSLRRGGKPGELFF|TRBV5-4|TRBJ2-2
    '''
    self.name, self.seq = name, seq
    self.count, self.freq = count, freq
    self.vs = sorted(vs)
    self.js = sorted(js)

    vLabel = ','.join(vs)
    jLabel = ','.join(js)
    self.aa = iseqlib.nt2aa(self.seq)
    self.header = '|'.join((self.aa, vLabel, jLabel))
def __init__(self, name, seq, count, vs, js, freq, translate):
    '''Record a clone's fields and build its family-level header.

    name is stored and also split on ',' into self.samples. seq, count and
    freq are stored as given; vs and js are stored sorted.

    self.header is "sequence|vfamilies|jgenes":
      - sequence is iseqlib.nt2aa(seq) when translate is true, otherwise seq
        unchanged (self.seq always keeps the untranslated value);
      - vfamilies is the comma-joined result of getGeneFamilies(self.vs);
      - jgenes is js comma-joined in the order passed in (J genes are not
        collapsed into families).
    Header example: CASSLRRGGKPGELFF|TRBV5|TRBJ2-2
    '''
    self.name = name
    self.samples = name.split(',')
    self.seq, self.count, self.freq = seq, count, freq
    self.vs = sorted(vs)
    self.js = sorted(js)

    vLabel = ','.join(getGeneFamilies(self.vs))
    jLabel = ','.join(js)
    label = iseqlib.nt2aa(seq) if translate else seq
    self.header = '|'.join((label, vLabel, jLabel))
def __init__(self, line):
    '''Parse one tab-separated clone record.

    line must contain at least 27 tab-separated fields; otherwise an error
    is written to stderr and the process exits with status 1.

    Column -> attribute mapping (0-based): 0 id, 2 nuc, 3 aa, 4 normFreq,
    5 normCount, 6 freq, 7 count, 8 cdr3len, 9 vfam, 10 vgene, 11 vties,
    12 dgene, 13 jgene, 14 jties, 15 vdel, 16 d5del, 17 d3del, 18 jdel,
    19 n2ins, 20 n1ins, 21 status, 22 vindex, 23 n1index, 24 n2index,
    25 dindex, 26 jindex. Empty normFreq / normCount columns become -1.0 / -1.

    Derived attributes: cdr3nuc (nuc[vindex:jindex]), inframenuc (the part of
    nuc in the reading frame of vindex, trimmed to whole codons) and longaa
    (its translation). vpos2snp and jpos2snp start as None.
    '''
    cols = line.strip().split('\t')
    if len(cols) < 27:
        sys.stderr.write('Wrong tsv format. Expected 27 fields, only have %d\n%s\n' %(len(cols), line))
        sys.exit(1)

    def splitAmbiguous(gene, ties):
        # A gene column such as "A/B" lists alternatives: keep the first as
        # the gene and add the others to the ties list if not already there.
        # The part after '*' in the gene name is always dropped.
        alternatives = gene.split('/')
        if len(alternatives) > 1:
            tied = ties.split(', ')
            for other in alternatives[1:]:
                if other not in tied:
                    tied.append(other)
            gene = alternatives[0]
            ties = ', '.join(tied)
        return gene.split('*')[0], ties

    self.id = cols[0]
    self.nuc = cols[2]
    self.aa = cols[3]
    self.normFreq = float(cols[4]) if cols[4] != '' else -1.0
    self.normCount = int(cols[5]) if cols[5] != '' else -1
    self.freq = float(cols[6])
    self.count = int(cols[7])
    self.cdr3len = int(cols[8])

    self.vfam = cols[9]
    self.vgene, self.vties = splitAmbiguous(cols[10], cols[11])
    self.dgene = cols[12]
    self.jgene, self.jties = splitAmbiguous(cols[13], cols[14])

    # Deletion / insertion counts occupy columns 15-20.
    for column, attr in enumerate(('vdel', 'd5del', 'd3del', 'jdel', 'n2ins', 'n1ins'), 15):
        setattr(self, attr, int(cols[column]))
    self.status = cols[21]
    # Index columns occupy 22-26.
    for column, attr in enumerate(('vindex', 'n1index', 'n2index', 'dindex', 'jindex'), 22):
        setattr(self, attr, int(cols[column]))

    self.cdr3nuc = self.nuc[self.vindex:self.jindex]
    # Start at vindex % 3 and drop trailing bases that do not fill a codon.
    frame = self.vindex % 3
    total = len(self.nuc)
    self.inframenuc = self.nuc[frame: total - ((total - frame) % 3)]
    self.longaa = iseqlib.nt2aa(self.inframenuc)

    self.vpos2snp = None
    self.jpos2snp = None