Exemplo n.º 1
0
def sequenta_getaa(nuc):
    """Translate ``nuc`` to amino acids, special-casing a leading ``N``.

    If ``nuc`` starts with ``N``, the first three nucleotides are not
    translated: the result is the literal residue ``"X"`` followed by the
    translation of ``nuc[3:]``.  Otherwise the whole sequence is handed to
    ``libcommon.nt2aa``.

    Raises:
        AssertionError: if ``nuc`` starts with ``N`` but is shorter than
            three nucleotides.
    """
    if not re.match("N", nuc):
        return libcommon.nt2aa(nuc)

    # The first codon is replaced by "X" rather than translated.
    assert len(nuc) >= 3
    return "X" + libcommon.nt2aa(nuc[3:])
Exemplo n.º 2
0
    def run(self):
        """Enumerate the nucleotide events of a clone and pickle them in batches.

        ``self.clone`` is split on ``_`` into V gene, CDR3 amino-acid sequence
        and J gene.  Every file in ``self.vdir`` / ``self.jdir`` is named after
        a deletion count (the name is parsed with ``int``) and holds a
        gzip-compressed pickle of insertion sequences.  For each combination
        of V deletion, J deletion, V-D insertion and D-J insertion the CDR3
        nucleotide sequence is assembled and wrapped in ``lclone.Cdr3Clone``.

        Events are written as gzip-compressed pickles to
        ``"<self.outfile>_<batch index>"`` each time ``batchsize`` events have
        accumulated, and once more at the end for any remainder.  Nothing is
        written when no event is produced.

        Raises:
            AssertionError: if an assembled CDR3 does not translate (via
                ``lcommon.nt2aa``) to the clone's CDR3 amino-acid sequence.
        """
        # NOTE(review): pickle.load can execute arbitrary code; the input
        # directories must only contain trusted files.
        def load(path):
            # Close the gzip handle deterministically instead of relying on
            # garbage collection (``with`` on GzipFile needs Python >= 2.7).
            with gzip.open(path, 'rb') as handle:
                return pickle.load(handle)

        def dump(batch, index):
            with gzip.open("%s_%d" % (self.outfile, index), 'wb') as handle:
                pickle.dump(batch, handle)

        events = []
        batchsize = 100000
        currbatch = 0
        items = self.clone.split('_')
        v = items[0]
        cdr3_aa = items[1]
        j = items[2]

        d5del = self.devent.d5del
        d3del = self.devent.d3del
        # A ``[d5del:-0]`` slice would be empty, so zero 3' deletion is
        # handled separately.
        if d3del == 0:
            d_cdr3_nt = self.d_nt[d5del:]
        else:
            d_cdr3_nt = self.d_nt[d5del:-d3del]

        for vname in os.listdir(self.vdir):
            vd_ins_nts = load(os.path.join(self.vdir, vname))
            vdel = int(vname)
            self.logToMaster("vdel: %d; vd_ins_nts: %d" % (vdel, len(vd_ins_nts)))
            v_cdr3_nt = self.v_nt if vdel == 0 else self.v_nt[:-vdel]
            for jname in os.listdir(self.jdir):
                dj_ins_nts = load(os.path.join(self.jdir, jname))
                jdel = int(jname)
                self.logToMaster("jdel: %d; dj_ins_nts: %d" % (jdel, len(dj_ins_nts)))
                j_cdr3_nt = self.j_nt if jdel == 0 else self.j_nt[jdel:]

                for vd_ins in vd_ins_nts:
                    # The first nucleotide of the V-D insertion and the last
                    # of the D-J insertion are left out of the CDR3
                    # (presumably they overlap the flanking segments -
                    # TODO confirm against the producer of these files).
                    prefix = v_cdr3_nt + vd_ins[1:] + d_cdr3_nt
                    for dj_ins in dj_ins_nts:
                        cdr3_nt = prefix + dj_ins[:-1] + j_cdr3_nt
                        translated = lcommon.nt2aa(cdr3_nt)
                        if translated != cdr3_aa:
                            # Emit the mismatch before the assertion fires.
                            print(cdr3_nt)
                            print(translated)
                            print(cdr3_aa)
                        assert translated == cdr3_aa
                        event = lclone.Cdr3Clone(
                            1, cdr3_nt, v, j, d=self.d,
                            aa=cdr3_aa, vdel=vdel, jdel=jdel,
                            d5del=d5del, d3del=d3del,
                            vdins=vd_ins, djins=dj_ins)
                        events.append(event)
                        if len(events) >= batchsize:
                            dump(events, currbatch)
                            currbatch += 1
                            events = []
        if events:
            dump(events, currbatch)
Exemplo n.º 3
0
def find_min_vdel(v_nt, cdr3_aa):
    """Return the minimum number of V nucleotides that must be deleted.

    ``v_nt`` is translated with ``lcommon.nt2aa`` and ``left_max_match``
    supplies the matched amino-acid stretch between that translation and
    ``cdr3_aa``; each matched residue accounts for three retained V
    nucleotides.  When V nucleotides are left over and the CDR3 has a further
    residue, up to two more V nucleotides are kept if they are the start of
    some codon for that residue (two are tried first, then one).
    """
    matched_aa = left_max_match(lcommon.nt2aa(v_nt), cdr3_aa)
    n_matched = len(matched_aa)
    matched_nt_len = n_matched * 3
    min_vdel = len(v_nt) - matched_nt_len

    # Nothing more to rescue: V is fully used or the CDR3 is fully matched.
    if not (min_vdel > 0 and len(cdr3_aa) > n_matched):
        return min_vdel

    # The next one or two V nucleotides after the matched codons.
    if min_vdel >= 2:
        extra_nts = v_nt[matched_nt_len: matched_nt_len + 2]
    else:
        extra_nts = v_nt[matched_nt_len]

    next_aa = cdr3_aa[n_matched]
    codons = lcommon.aa2codons(next_aa)

    # Try to keep all candidate nucleotides as a codon prefix.
    if any(re.match(extra_nts, codon) for codon in codons):
        return min_vdel - len(extra_nts)

    # Fall back to keeping only the first of two candidates.
    if len(extra_nts) == 2:
        if any(re.match(extra_nts[0], codon) for codon in codons):
            return min_vdel - 1
    return min_vdel
Exemplo n.º 4
0
    def run(self):
        """Sum the likelihoods of all events for one V/J deletion pair.

        For every sequence in ``self.vjins_nts`` each occurrence of
        ``self.d_nts`` (found with ``re.finditer``) splits the sequence into a
        V-D insertion (text before the match) and a D-J insertion (text after
        it).  The resulting ``lclone.Cdr3Clone`` is scored with
        ``ntclone_likelihood``; the score is treated as a log10 value and
        ``10 ** llh`` is accumulated.  The total is written to
        ``self.outfile`` as a gzip-compressed pickle.

        Raises:
            AssertionError: if an insertion sequence is shorter than two
                nucleotides, or if the assembled CDR3 does not translate to
                ``self.cdr3_aa``.
        """
        d_nts = self.d_nts
        d_len = len(d_nts)
        # Compiled once; an empty ``d_nts`` matches at every position.
        d_pattern = re.compile(d_nts)
        v_hang_right = 0 if self.v_hang == 0 else 3 - self.v_hang

        # Adjust empty-D llh (TEMPORARY HACK kept from the original code):
        # a fixed log10 bonus per D gene when no D nucleotides remain.
        empty_d_adjust = None
        if d_nts == '':
            if self.d == 'TRBD1':
                empty_d_adjust = log10(13)
            elif self.d == 'TRBD2':
                empty_d_adjust = log10(17)

        total_llh = 0.0
        for vjins in self.vjins_nts:
            assert len(vjins) >= 2  # because it has the last v and the first j
            for mobj in d_pattern.finditer(vjins):
                # Special cases for 3- and 4-nt D remnants: some match
                # positions are skipped.
                # NOTE(review): the rationale for these position checks is
                # not visible in this block - confirm before changing them.
                shifted_start = mobj.start() - v_hang_right - 1
                if d_len == 3:
                    if shifted_start % 3 == 0:
                        continue
                elif d_len == 4:
                    if shifted_start != 1:
                        continue

                vdins = vjins[: mobj.start()]
                djins = vjins[mobj.end():]
                # The flanking nucleotides of ``vjins`` are the last V and
                # first J nucleotide, so they are not repeated in the CDR3.
                cdr3_nt = self.v_cdr3_nt + vjins[1:-1] + self.j_cdr3_nt
                assert lcommon.nt2aa(cdr3_nt) == self.cdr3_aa
                event = lclone.Cdr3Clone(
                    1, cdr3_nt, self.v, self.j, d=self.d,
                    aa=self.cdr3_aa, vdel=self.vdel, jdel=self.jdel,
                    d5del=self.d5del, d3del=self.d3del,
                    vdins=vdins, djins=djins)
                llh = ntclone_likelihood(event, self.model)
                if empty_d_adjust is not None:
                    llh += empty_d_adjust

                total_llh += 10 ** llh

        # Close the gzip stream explicitly so the trailer is always flushed
        # (``with`` on GzipFile needs Python >= 2.7).
        with gzip.open(self.outfile, 'wb') as handle:
            pickle.dump(total_llh, handle)
Exemplo n.º 5
0
def find_min_jdel(j_nt, cdr3_aa):
    """Return the minimum number of J nucleotides that must be deleted.

    The leading ``len(j_nt) % 3`` nucleotides are skipped so that the rest of
    ``j_nt`` is translated in whole codons.  ``right_max_match`` supplies the
    matched amino-acid stretch between that translation and ``cdr3_aa``; each
    matched residue accounts for three retained J nucleotides.  When J
    nucleotides are left over and the CDR3 has a further residue before the
    match, up to two more J nucleotides are kept if they are the end of some
    codon for that residue (two are tried first, then one).
    """
    frame_offset = len(j_nt) % 3  # 5' nucleotides that are not part of a codon
    matched_aa = right_max_match(lcommon.nt2aa(j_nt[frame_offset:]), cdr3_aa)
    n_matched = len(matched_aa)
    min_jdel = len(j_nt) - n_matched * 3

    # Nothing more to rescue: J is fully used or the CDR3 is fully matched.
    if not (min_jdel > 0 and len(cdr3_aa) > n_matched):
        return min_jdel

    # The one or two J nucleotides immediately 5' of the retained part.
    if min_jdel >= 2:
        extra_nts = j_nt[min_jdel - 2: min_jdel]
    else:
        extra_nts = j_nt[min_jdel - 1]

    prev_aa = cdr3_aa[len(cdr3_aa) - n_matched - 1]
    codons = lcommon.aa2codons(prev_aa)

    # Try to keep all candidate nucleotides as a codon suffix.
    if any(codon[-1 * len(extra_nts):] == extra_nts for codon in codons):
        return min_jdel - len(extra_nts)

    # Fall back to keeping only the nucleotide nearest the retained part.
    if len(extra_nts) == 2:
        if any(codon[2] == extra_nts[1] for codon in codons):
            return min_jdel - 1
    return min_jdel
Exemplo n.º 6
0
    def run(self):
        """Enumerate events over a range of V/J deletions and pickle them.

        ``self.clone`` is split on ``_``; the first and third fields are the V
        and J gene names.  For every ``vdel`` in
        ``[self.min_vdel, self.max_vdel]`` and ``jdel`` in
        ``[self.min_jdel, self.max_jdel]``, ``get_vjins_emptyd`` supplies the
        candidate V-J insertion sequences (``None`` means none for that
        pair).  Each occurrence of the D nucleotides
        (``self.devent.left_nts + self.devent.right_nts``) inside a candidate
        splits it into a V-D and a D-J insertion, from which a
        ``lclone.Cdr3Clone`` is built.

        Events are written as gzip-compressed pickles to
        ``"<self.outfile>_<batch index>"`` each time ``batchsize`` events have
        accumulated, and once more at the end for any remainder.

        Raises:
            AssertionError: if a candidate insertion is shorter than two
                nucleotides, or if the assembled CDR3 does not translate to
                ``self.cdr3_aa``.
        """
        def dump(batch, index):
            # Close the gzip stream explicitly so the trailer is always
            # flushed (``with`` on GzipFile needs Python >= 2.7).
            with gzip.open("%s_%d" % (self.outfile, index), 'wb') as handle:
                pickle.dump(batch, handle)

        items = self.clone.split('_')
        v = items[0]
        j = items[2]
        events = []
        batchsize = 100000
        currbatch = 0

        # The D nucleotides do not depend on vdel/jdel: compute them once.
        d_nts = self.devent.left_nts + self.devent.right_nts
        d_len = len(d_nts)
        d_pattern = re.compile(d_nts)

        for vdel in range(self.min_vdel, self.max_vdel + 1):
            v_cdr3_nt = self.v_nt if vdel == 0 else self.v_nt[:-vdel]
            v_hang = len(v_cdr3_nt) % 3
            v_hang_right = 0 if v_hang == 0 else 3 - v_hang
            for jdel in range(self.min_jdel, self.max_jdel + 1):
                j_cdr3_nt = self.j_nt if jdel == 0 else self.j_nt[jdel:]
                vjins_nts = get_vjins_emptyd(self.v_nt, vdel, self.j_nt, jdel,
                                             d_nts, self.cdr3_aa)
                if vjins_nts is None:
                    continue

                for vjins in vjins_nts:
                    assert len(vjins) >= 2  # because it has the last v and the first j
                    for mobj in d_pattern.finditer(vjins):
                        start = mobj.start()
                        end = mobj.end()
                        # Special cases for 3- and 4-nt D remnants: some
                        # match positions are skipped.
                        # NOTE(review): the rationale for these position
                        # checks is not visible in this block - confirm
                        # before changing them.
                        shifted_start = start - v_hang_right - 1
                        if d_len == 3:
                            if shifted_start % 3 == 0:
                                continue
                        elif d_len == 4:
                            if shifted_start != 1:
                                continue

                        vdins = vjins[:start]
                        djins = vjins[end:]
                        # The flanking nucleotides of ``vjins`` are the last
                        # V and first J nucleotide, so they are not repeated.
                        cdr3_nt = v_cdr3_nt + vjins[1:-1] + j_cdr3_nt
                        assert lcommon.nt2aa(cdr3_nt) == self.cdr3_aa
                        event = lclone.Cdr3Clone(
                            1, cdr3_nt, v, j, d=self.d,
                            aa=self.cdr3_aa, vdel=vdel, jdel=jdel,
                            d5del=self.devent.d5del, d3del=self.devent.d3del,
                            vdins=vdins, djins=djins)
                        events.append(event)

                        if len(events) >= batchsize:
                            dump(events, currbatch)
                            currbatch += 1
                            events = []
        if events:
            dump(events, currbatch)
Exemplo n.º 7
0
def find_devents2(d_nt, cdr3_aa, dels2freq):
    """Find the D-gene deletion events compatible with a CDR3.

    Every ``(d5del, d3del)`` pair with ``d5del + d3del <= len(d_nt)`` is
    considered, but only pairs present in ``dels2freq`` with a non-zero
    value are kept.  What remains of the D after the deletions decides the
    kind of event emitted:

    * nothing left: a single representative empty-D event, emitted only for
      ``d5del == len(d_nt) // 2``;
    * 1-2 nt left: one event carrying the leftover as ``left_nts`` with
      match positions ``-1, -1``;
    * 3+ nt left: for each of the three reading frames, one event per match
      returned by ``find_dmatches`` for the translated leftover, with the
      untranslated flanks as ``left_nts`` / ``right_nts``; leftovers of
      length 3 or 4 additionally get an untranslated event.

    Args:
        d_nt: D gene nucleotide sequence.
        cdr3_aa: CDR3 amino-acid sequence to match against.
        dels2freq: mapping from ``(d5del, d3del)`` to a frequency.

    Returns:
        A list of ``Devent`` objects.
    """
    devents = []
    dlen = len(d_nt)
    for d5del in range(dlen + 1):
        # The bound is inclusive of ``dlen - d5del``: with the previous
        # exclusive bound the two deletions could never consume the whole D,
        # so the empty-D branch below was unreachable.
        for d3del in range(dlen - d5del + 1):
            dels = (d5del, d3del)
            if dels not in dels2freq or dels2freq[dels] == 0:
                continue

            d_leftover_nt = d_nt[d5del: dlen - d3del]
            dlen2 = len(d_leftover_nt)
            if dlen2 == 0:
                # Floor division keeps this an integer comparison on both
                # Python 2 and Python 3.
                if d5del == dlen // 2:
                    devents.append(Devent(d5del, d3del, '', -1, -1, ''))
            elif dlen2 < 3:
                # Too short to translate: keep the raw nucleotides.
                devents.append(
                    Devent(d5del, d3del, d_leftover_nt, -1, -1, ''))
            else:
                for left in (0, 1, 2):  # each translation frame
                    d_aa = lcommon.nt2aa(d_leftover_nt[left:])
                    if not d_aa:  # empty translation
                        continue
                    matches = find_dmatches(d_aa, cdr3_aa)
                    if len(matches) > 0:
                        right = left + len(d_aa) * 3
                        left_nts = d_leftover_nt[:left]
                        right_nts = d_leftover_nt[right:]
                        for match in matches:
                            devents.append(
                                Devent(d5del, d3del, left_nts, match[0],
                                       match[1], right_nts))
                if dlen2 in (3, 4):
                    devents.append(
                        Devent(d5del, d3del, d_leftover_nt, -1, -1, ''))
    return devents
Exemplo n.º 8
0
 def run(self):
     """Sum the likelihoods of all insertion combinations and pickle the total.

     The model (``self.modelfile``), the V-D insertions (``self.vfile``)
     and the D-J insertions (``self.jfile``) are read from gzip-compressed
     pickles.  For each pair of insertions the CDR3 nucleotide sequence is
     assembled, wrapped in ``lclone.Cdr3Clone`` and scored with
     ``ntclone_likelihood``; the score is treated as a log10 value and
     ``10 ** llh`` is accumulated.  The total is written to
     ``self.outfile`` as a gzip-compressed pickle.

     Raises:
         AssertionError: if an assembled CDR3 does not translate (via
             ``lcommon.nt2aa``) to ``self.cdr3_aa``.
     """
     # NOTE(review): pickle.load can execute arbitrary code; the input
     # files must come from a trusted source.
     def load(path):
         # Close the gzip handle deterministically instead of relying on
         # garbage collection (``with`` on GzipFile needs Python >= 2.7).
         with gzip.open(path, 'rb') as handle:
             return pickle.load(handle)

     total_llh = 0.0
     model = load(self.modelfile)
     vd_ins_nts = load(self.vfile)
     dj_ins_nts = load(self.jfile)

     for vd_ins in vd_ins_nts:
         # The first nucleotide of the V-D insertion and the last of the
         # D-J insertion are left out of the CDR3 (presumably they overlap
         # the flanking segments - TODO confirm against the producer).
         prefix = self.v_cdr3_nt + vd_ins[1:] + self.d_cdr3_nt
         for dj_ins in dj_ins_nts:
             cdr3_nt = prefix + dj_ins[:-1] + self.j_cdr3_nt
             assert lcommon.nt2aa(cdr3_nt) == self.cdr3_aa
             event = lclone.Cdr3Clone(
                 1, cdr3_nt, self.v, self.j, d=self.d,
                 aa=self.cdr3_aa, vdel=self.vdel, jdel=self.jdel,
                 d5del=self.d5del, d3del=self.d3del,
                 vdins=vd_ins, djins=dj_ins)
             llh = ntclone_likelihood(event, model)
             total_llh += 10 ** llh

     with gzip.open(self.outfile, 'wb') as handle:
         pickle.dump(total_llh, handle)
Exemplo n.º 9
0
def adaptive_parseline(line, index2col):
    items = line.strip("\n").split("\t")
    if len(items) != len(index2col):
        sys.stderr.write(
            "Incosistent number of columns between the following\
                          line and the header line, skipped it:\n\
                          Line:\n%s\n"
            % line
        )
        return None

    col2val = {}
    valid_cols = adaptive_columns()
    for i, col in index2col.iteritems():
        if col in valid_cols:
            col2val[col] = items[i].replace("/", ", ")

    # Return None if line does not have minimum required fields.
    required_cols = [
        "count",
        "frequencyCount",
        "nucleotide",
        "vGeneName",
        "jGeneName",
        "vGeneNameTies",
        "jGeneNameTies",
    ]
    for c in required_cols:
        if c not in col2val:  # or col2val[c] in ["(undefined)", ""]:
            return None

    count = int(col2val["count"])
    freq = float(col2val["frequencyCount"]) / 100.0  # convert to non percentage
    nuc = col2val["nucleotide"]
    vgene = col2val["vGeneName"]
    if vgene == "unresolved":
        vgenes = col2val["vGeneNameTies"].split(",")
    else:
        vgenes = [vgene]
    jgene = col2val["jGeneName"]
    if jgene == "unresolved":
        jgenes = col2val["jGeneNameTies"].split(",")
    else:
        jgenes = [jgene]

    # Clone with required fields
    clone = Clone(count, freq, nuc, vgenes, jgenes)

    # Additional information if available
    # Gene info:
    if "dGeneName" in col2val:
        dgenestr = col2val["dGeneName"]
        if dgenestr == "unresolved":
            clone.dgenes = col2val["dGeneNameTies"].split(",")
        else:
            clone.dgenes = [dgenestr]

    if "sequenceStatus" in col2val:
        status = col2val["sequenceStatus"].lower()
        if status is not None and status == "in":
            clone.productive = True
        elif status == "out" or status == "stop":
            clone.productive = False
        else:
            sys.stderr.write("Unknown status: %s\n" % status)
    if "aminoAcid" in col2val:
        clone.cdr3aa = col2val["aminoAcid"]

    # Junctional info:
    offset = 0
    if "vIndex" in col2val:
        vindex = int(col2val["vIndex"])
        if clone.productive:
            # Make sure nuc is inframe:
            offset = vindex % 3
            nuclen = len(clone.nuc)
            endoffset = (nuclen - offset) % 3
            clone.nuc = clone.nuc[offset : nuclen - endoffset]
            clone.aa = libcommon.nt2aa(clone.nuc)
        if clone.cdr3aa:
            cdr3len = len(clone.cdr3aa) * 3
            endindex = max(len(clone.nuc), vindex + cdr3len)
            clone.cdr3nuc = clone.nuc[vindex:endindex]
    if "dIndex" in col2val:
        clone.firstdpos = int(col2val["dIndex"]) - offset
    if "n2Index" in col2val:
        n2index = int(col2val["n2Index"])
        if n2index != -1:
            clone.lastvpos = n2index - 1 - offset
        elif clone.firstdpos:  # No d5ins
            clone.lastvpos = clone.firstdpos - 1
    if "jIndex" in col2val:
        clone.firstjpos = int(col2val["jIndex"]) - offset
    if "n1Index" in col2val:
        n1index = int(col2val["n1Index"])
        if n1index != -1:
            clone.lastdpos = n1index - 1 - offset
        elif clone.firstjpos:  # No d3ins
            clone.lastdpos = clone.firstjpos - 1

    # Deletion info:
    if "vDeletion" in col2val:
        clone.vdel = int(col2val["vDeletion"])
    if "d5Deletion" in col2val:
        clone.d5del = int(col2val["d5Deletion"])
    if "d3Deletion" in col2val:
        clone.d3del = int(col2val["d3Deletion"])
    if "jDeletion" in col2val:
        clone.jdel = int(col2val["jDeletion"])

    return clone