Exemplo n.º 1
0
 def __init__(self, l):
     """Populate a gene record from one parsed row `l` of the gene table.

     `l` is indexed by the field names 'id', 'organism', 'chain',
     'region', 'nucseq', 'aligned_protseq', 'cdrs', 'cdr_columns' and
     'frame'.

     Raises:
         AssertionError: if `frame` is not one of '+1', '+2', '+3', '1',
             '2', '3'; if the first value returned by
             translation.get_translation(nucseq, frame) differs from the
             aligned protein sequence with gap characters removed; or if
             the listed CDR strings do not equal the aligned sequence
             sliced at `cdr_columns`.
     """
     self.id = l['id']
     self.organism = l['organism']
     self.chain = l['chain']
     self.region = l['region']
     self.nucseq = l['nucseq']
     self.alseq = l['aligned_protseq']
     self.cdrs = l['cdrs'].split(cdrs_sep) if l['cdrs'] else []
     ## these are still 1-indexed !!!!!!!!!!!!!!
     ## Each entry is a concrete [start, stop] list rather than the result
     ## of map(): on Python 3 map() returns a one-shot iterator that cannot
     ## be subscripted, which would break the sanity check below.
     self.cdr_columns = [
         [int(pos) for pos in x.split('-')]
         for x in l['cdr_columns'].split(cdrs_sep)
     ] if self.cdrs else []
     frame = l['frame']
     assert frame in ['+1', '+2', '+3', '1', '2', '3']
     self.nucseq_offset = int(
         frame[-1]) - 1  ## 0, 1 or 2 (0-indexed for python)
     self.protseq = translation.get_translation(self.nucseq, frame)[0]
     assert self.protseq == self.alseq.replace(gap_character, '')
     # sanity check: the 1-indexed, inclusive columns must slice out exactly
     # the CDR strings listed in the row
     if self.cdrs:
         assert self.cdrs == [
             self.alseq[x[0] - 1:x[1]] for x in self.cdr_columns
         ]
Exemplo n.º 2
0
                    id = ''
                    for line in open(fastafile, 'r'):
                        if line[0] == '>':
                            id = line[1:-1]
                            myfasta[np][id] = ''
                        else:
                            assert id
                            myfasta[np][id] += line[:-1]
                all_fasta_sd[organism][ab][vj] = myfasta
                for id in myfasta[prot]:
                    assert id in myfasta[nuc]
                    pseq = myfasta[prot][id]
                    nseq = myfasta[nuc][id]
                    myframe = -1
                    for i in range(3):
                        tseq = get_translation(nseq, '+{}'.format(i + 1))[0]
                        if pseq in tseq:
                            myframe = i + 3 * tseq.index(pseq)
                    assert myframe >= 0
                    num_after = len(nseq) - 3 * len(pseq) - myframe
                    all_offsets[organism][ab][vj][id] = (myframe, num_after)

    ## make a single tsv file with the following fields
    ##
    ## id organism region
    ## chain is A or B -- where A means alpha-like (VJ recombining) and B means beta-like (VDJ recombining)
    ## region is V D J
    ##
    ## cdrs: comma-separated list of protein sequences for cdr regions
    ##
    outfields = "id organism chain region nucseq frame aligned_protseq cdr_columns cdrs".split(
Exemplo n.º 3
0
def beta_cdr3_protseq_probability(theid,
                                  organism,
                                  v_gene,
                                  j_gene,
                                  cdr3_protseq,
                                  cdr3_nucseq='',
                                  error_threshold=0.05,
                                  verbose=False,
                                  allow_early_nucseq_mismatches=True,
                                  return_final_cdr3_nucseq=False):
    """Accumulate a probability for a beta-chain CDR3 over V/D/J trimmings.

    The function works in one of two modes:

    * protein mode (`cdr3_nucseq == ''`): the target is `cdr3_protseq` and
      each candidate is scored with get_coding_probability();
    * nucleotide mode (`cdr3_nucseq` given): `cdr3_protseq` must be empty,
      it is replaced by the '+1' translation of `cdr3_nucseq`, and each
      candidate scores 0.25 ** (number of 'n' positions) when all of its
      non-'n' positions agree with `cdr3_nucseq`, otherwise 0.

    Candidates are built as: kept V germline + 'n' * k + (trimmed D) +
    'n' * m + kept J germline, for every D gene in
    tcr_rearrangement.all_trbd_nucseq[organism], every pair of D trims and
    every split of the remaining insert. Each candidate contributes
    did_prob * coding_prob * trim_prob, where trim_prob comes from
    tcr_rearrangement.get_beta_trim_probs().

    Args:
        theid: identifier, used only in the log message.
        organism: key into all_genes and the tcr_rearrangement tables.
        v_gene: V gene id; its `chain` attribute must equal 'B'.
        j_gene: J gene id. For 'human'/'mouse' with j_gene[3] == 'B',
            j_gene[4] must be '1' or '2'; when it is '1' only the D gene
            with key 1 is considered.
        cdr3_protseq: target amino-acid sequence (protein mode).
        cdr3_nucseq: target nucleotide sequence (nucleotide mode); its
            length must be three times the length of its translation.
        error_threshold: once the total extra trim exceeds 2, enumeration
            stops when a trim level adds less than this fraction of the
            total accumulated before that level.
        verbose: print diagnostics to stdout.
        allow_early_nucseq_mismatches: nucleotide mode only. Selects the
            mismatch score handed to count_matches() and, when true,
            overwrites the V- and J-matched flanks of `cdr3_nucseq` with
            the germline sequence before scoring.
        return_final_cdr3_nucseq: also return the (possibly rewritten)
            `cdr3_nucseq`.

    Returns:
        `total_prob`, or `(total_prob, cdr3_nucseq)` when
        `return_final_cdr3_nucseq` is true.

    Raises:
        AssertionError: on any of the consistency checks described above.
    """

    def _emit(*items):
        # Version-neutral replacement for the Python 2 print statement:
        # str() of every item, joined by single spaces.
        print(' '.join(str(item) for item in items))

    nucleotide_match = (cdr3_nucseq != '')
    if nucleotide_match:
        assert not cdr3_protseq
        cdr3_protseq = translation.get_translation(cdr3_nucseq, '+1')[0]
        assert len(cdr3_nucseq) == 3 * len(cdr3_protseq)

    ab = 'B'
    assert all_genes[organism][v_gene].chain == ab

    v_nucseq = get_v_cdr3_nucseq(organism, v_gene)
    j_nucseq = get_j_cdr3_nucseq(organism, j_gene)

    ## what is the largest amount of these nucseqs we could preserve and still get cdr3_protseq
    max_v_germline = 0
    max_j_germline = 0

    len_v_nucseq = len(v_nucseq)
    len_j_nucseq = len(j_nucseq)
    len_cdr3_nucseq = len(cdr3_nucseq)
    len_cdr3_protseq = len(cdr3_protseq)

    if nucleotide_match:
        if allow_early_nucseq_mismatches:
            mismatch_score = default_mismatch_score_for_cdr3_nucseq_probabilities
        else:
            mismatch_score = -100
        max_v_germline = count_matches(v_nucseq, cdr3_nucseq, mismatch_score)

        ## J is matched from its 3' end, so compare the reversed strings
        max_j_germline = count_matches(j_nucseq[::-1], cdr3_nucseq[::-1],
                                       mismatch_score)

        if allow_early_nucseq_mismatches:  ## obliterate the mismatches now
            max_v, max_j = max_v_germline, max_j_germline
            extra = max_v + max_j - len_cdr3_nucseq
            if extra > 0:
                ## some overlap! split it between V and J (deterministic);
                ## floor division keeps these usable as slice bounds
                fake_v_trim = extra // 2
                fake_j_trim = extra - fake_v_trim
                max_v -= fake_v_trim
                max_j -= fake_j_trim
            old_cdr3_nucseq = cdr3_nucseq
            cdr3_nucseq = (v_nucseq[:max_v] +
                           cdr3_nucseq[max_v:len_cdr3_nucseq - max_j] +
                           j_nucseq[len_j_nucseq - max_j:])
            if old_cdr3_nucseq != cdr3_nucseq:
                ## NOTE(review): the tag says "cdr3a" although this is the
                ## beta routine; text left untouched in case log consumers
                ## match on it.
                Log('{} early_cdr3a_nucseq_mismatch: before {} after {}'.
                    format(theid, old_cdr3_nucseq, cdr3_nucseq))
                assert len(cdr3_nucseq) == len(old_cdr3_nucseq)

    else:
        ## V: extend one nucleotide at a time while the (partial) germline
        ## codon is still a prefix of some codon for the target residue
        for i in range(len_v_nucseq):
            i_aa = i // 3  ## which aa do we code for?
            len_codon = (i % 3) + 1
            if i_aa >= len_cdr3_protseq:
                break
            start = 3 * i_aa
            codon = v_nucseq[start:start + len_codon]
            target_aa = cdr3_protseq[i_aa]
            matched = any(
                c.startswith(codon) for c in reverse_genetic_code[target_aa])
            if verbose:
                _emit('V', codon, target_aa, matched)
            if not matched:
                break
            max_v_germline = i + 1

        ## J: same idea, walking in from the 3' end and testing suffixes
        for i in range(len_j_nucseq):
            i_aa = i // 3  ## which aa do we code for?
            len_codon = (i % 3) + 1
            if i_aa >= len_cdr3_protseq:
                break
            end = len_j_nucseq - 3 * i_aa
            codon = j_nucseq[max(0, end - len_codon):end]
            target_aa = cdr3_protseq[len_cdr3_protseq - 1 - i_aa]
            matched = any(
                c.endswith(codon) for c in reverse_genetic_code[target_aa])
            if verbose:
                _emit('J', codon, target_aa, matched)
            if not matched:
                break
            max_j_germline = i + 1

    if verbose:
        _emit('max_v_germline:', max_v_germline, len_v_nucseq)

    ## negative when the maximal V and J matches overlap
    min_insert = 3 * len_cdr3_protseq - max_v_germline - max_j_germline
    if verbose:
        _emit('max_j_germline:', max_j_germline, len_j_nucseq, cdr3_protseq,
              all_genes[organism][j_gene].protseq)

        _emit('min_insert:', min_insert, max_v_germline, max_j_germline)

    if organism in ['human', 'mouse'] and j_gene[3] == 'B':
        trbj_index = int(j_gene[4])  ## to decide which d genes to allow
        assert trbj_index in [1, 2]
    else:
        ## no D/J compatibility check
        trbj_index = 0

    total_prob = 0.0
    ## an overlap has to be trimmed away before any candidate can fit
    min_extra_trim = max(0, -1 * min_insert)

    dids = list(tcr_rearrangement.all_trbd_nucseq[organism])
    for extra_trim in range(min_extra_trim, 100):
        old_total_prob = total_prob
        total_prob_this_trim = 0.0
        for extra_v_trim in range(0, extra_trim + 1):
            extra_j_trim = extra_trim - extra_v_trim

            v_trim = len_v_nucseq - max_v_germline + extra_v_trim
            j_trim = len_j_nucseq - max_j_germline + extra_j_trim
            if v_trim > len_v_nucseq or j_trim > len_j_nucseq:
                continue

            n_insert = min_insert + extra_v_trim + extra_j_trim
            assert n_insert >= 0  ## b/c of min_extra_trim
            total_prob_this_insert = 0.0

            ## now we are looking to fit part of the D gene into this middle region and still code for the right aas
            for did in dids:
                if trbj_index == 1:
                    if did == 1:
                        did_prob = 1.0
                    else:
                        continue
                else:
                    did_prob = 1.0 / float(len(dids))
                d_nucseq = tcr_rearrangement.all_trbd_nucseq[organism][did]
                len_d_nucseq = len(d_nucseq)
                for d0_trim in range(len_d_nucseq + 1):
                    for d1_trim in range(len_d_nucseq + 1):
                        len_d_insert = len_d_nucseq - d0_trim - d1_trim
                        if len_d_insert < 0 or len_d_insert > n_insert:
                            continue
                        #if len_d_insert == 0 and d1_trim: continue ## only hit this one once!
                        d_insert = d_nucseq[d0_trim:len_d_nucseq - d1_trim]
                        num_n = n_insert - len_d_insert
                        for num_n_before_d in range(num_n + 1):
                            num_n_after_d = num_n - num_n_before_d
                            assert num_n_after_d >= 0

                            n_nucseq = (v_nucseq[:len_v_nucseq - v_trim] +
                                        'n' * num_n_before_d + d_insert +
                                        'n' * num_n_after_d +
                                        j_nucseq[j_trim:])

                            assert len(n_nucseq) == 3 * len_cdr3_protseq

                            trim_prob = tcr_rearrangement.get_beta_trim_probs(
                                organism, did, v_trim, d0_trim, d1_trim,
                                j_trim, num_n_before_d, num_n_after_d)
                            if not trim_prob:
                                continue

                            if nucleotide_match:
                                assert len(n_nucseq) == len_cdr3_nucseq
                                ## every fixed (non-'n') position must agree
                                matched = all(
                                    a == b or a == 'n'
                                    for a, b in zip(n_nucseq, cdr3_nucseq))
                                if matched:
                                    coding_prob = 0.25**num_n
                                else:
                                    coding_prob = 0.0
                            else:
                                coding_prob = get_coding_probability(
                                    n_nucseq, cdr3_protseq)
                            prob = did_prob * coding_prob * trim_prob

                            total_prob_this_insert += prob  ## just for status output
                            total_prob_this_trim += prob
                            total_prob += prob

                            if verbose and coding_prob:
                                _emit('coding_prob:', cdr3_protseq, "trims:",
                                      v_trim, d0_trim, d1_trim, j_trim,
                                      "inserts:", num_n_before_d,
                                      num_n_after_d, "d_insert:", d_insert,
                                      "total_prob:", total_prob, "prob:",
                                      prob, "coding_prob:", coding_prob,
                                      "trim_prob:", trim_prob, n_nucseq)

            if verbose:
                _emit('n_insert:', n_insert, extra_v_trim, extra_j_trim,
                      'total_prob:', total_prob, 'total_prob_this_insert:',
                      total_prob_this_insert)

        ## stop once another nucleotide of trimming adds a negligible amount
        if extra_trim > 2 and total_prob_this_trim < error_threshold * old_total_prob:
            break

    if return_final_cdr3_nucseq:
        return total_prob, cdr3_nucseq
    else:
        return total_prob
Exemplo n.º 4
0
def alpha_cdr3_protseq_probability(theid,
                                   organism,
                                   v_gene,
                                   j_gene,
                                   cdr3_protseq,
                                   cdr3_nucseq='',
                                   error_threshold=0.05,
                                   verbose=False,
                                   allow_early_nucseq_mismatches=True,
                                   return_final_cdr3_nucseq=False):
    """Accumulate a probability for an alpha-chain CDR3 over V/J trimmings.

    The function works in one of two modes:

    * protein mode (`cdr3_nucseq == ''`): the target is `cdr3_protseq` and
      each candidate is scored with get_coding_probability();
    * nucleotide mode (`cdr3_nucseq` given): `cdr3_protseq` must be empty,
      it is replaced by the '+1' translation of `cdr3_nucseq`, and each
      candidate scores 0.25 ** n_insert.

    Candidates are built as: kept V germline + 'n' * n_insert + kept J
    germline. Each contributes coding_prob * trim_prob, where trim_prob
    comes from tcr_rearrangement.get_alpha_trim_probs().

    Args:
        theid: identifier, used only in the log message.
        organism: key into all_genes and the tcr_rearrangement tables.
        v_gene: V gene id; its `chain` attribute must equal 'A'.
        j_gene: J gene id.
        cdr3_protseq: target amino-acid sequence (protein mode).
        cdr3_nucseq: target nucleotide sequence (nucleotide mode); its
            length must be three times the length of its translation.
        error_threshold: once the total extra trim exceeds 2, enumeration
            stops when a trim level adds less than this fraction of the
            total accumulated before that level.
        verbose: print diagnostics to stdout.
        allow_early_nucseq_mismatches: nucleotide mode only. Selects the
            mismatch score handed to count_matches() and, when true,
            overwrites the V- and J-matched flanks of `cdr3_nucseq` with
            the germline sequence before scoring.
        return_final_cdr3_nucseq: also return the (possibly rewritten)
            `cdr3_nucseq`.

    Returns:
        `total_prob`, or `(total_prob, cdr3_nucseq)` when
        `return_final_cdr3_nucseq` is true.

    Raises:
        AssertionError: on any of the consistency checks described above.
    """

    def _emit(*items):
        # Version-neutral replacement for the Python 2 print statement:
        # str() of every item, joined by single spaces.
        print(' '.join(str(item) for item in items))

    nucleotide_match = (cdr3_nucseq != '')
    if nucleotide_match:
        assert not cdr3_protseq
        cdr3_protseq = translation.get_translation(cdr3_nucseq, '+1')[0]
        assert len(cdr3_nucseq) == 3 * len(cdr3_protseq)

    ab = 'A'
    assert all_genes[organism][v_gene].chain == ab

    v_nucseq = get_v_cdr3_nucseq(organism, v_gene)
    j_nucseq = get_j_cdr3_nucseq(organism, j_gene)

    ## what is the largest amount of these nucseqs we could preserve and still get cdr3_protseq

    max_v_germline = 0
    max_j_germline = 0

    len_v_nucseq = len(v_nucseq)
    len_j_nucseq = len(j_nucseq)
    len_cdr3_protseq = len(cdr3_protseq)
    len_cdr3_nucseq = len(cdr3_nucseq)

    if nucleotide_match:
        if allow_early_nucseq_mismatches:
            mismatch_score = default_mismatch_score_for_cdr3_nucseq_probabilities
        else:
            mismatch_score = -100
        max_v_germline = count_matches(v_nucseq, cdr3_nucseq, mismatch_score)

        ## J is matched from its 3' end, so compare the reversed strings
        max_j_germline = count_matches(j_nucseq[::-1], cdr3_nucseq[::-1],
                                       mismatch_score)

        if allow_early_nucseq_mismatches:  ## obliterate the mismatches now
            max_v, max_j = max_v_germline, max_j_germline
            extra = max_v + max_j - len_cdr3_nucseq
            if extra > 0:
                ## some overlap! split it between V and J (deterministic);
                ## floor division keeps these usable as slice bounds
                fake_v_trim = extra // 2
                fake_j_trim = extra - fake_v_trim
                max_v -= fake_v_trim
                max_j -= fake_j_trim
            old_cdr3_nucseq = cdr3_nucseq
            cdr3_nucseq = (v_nucseq[:max_v] +
                           cdr3_nucseq[max_v:len_cdr3_nucseq - max_j] +
                           j_nucseq[len_j_nucseq - max_j:])
            if old_cdr3_nucseq != cdr3_nucseq:
                Log('{} early_cdr3a_nucseq_mismatch: {} {} before {} after {}'.
                    format(theid, v_gene, j_gene, old_cdr3_nucseq,
                           cdr3_nucseq))
                assert len(cdr3_nucseq) == len(old_cdr3_nucseq)
    else:
        ## V: extend one nucleotide at a time while the (partial) germline
        ## codon is still a prefix of some codon for the target residue
        for i in range(len_v_nucseq):
            i_aa = i // 3  ## which aa do we code for?
            len_codon = (i % 3) + 1
            if i_aa >= len_cdr3_protseq:
                break
            start = 3 * i_aa
            codon = v_nucseq[start:start + len_codon]
            target_aa = cdr3_protseq[i_aa]
            matched = any(
                c.startswith(codon) for c in reverse_genetic_code[target_aa])
            if verbose:
                _emit('V', codon, target_aa, matched)
            if not matched:
                break
            max_v_germline = i + 1

        ## how about J? same idea, walking in from the 3' end
        for i in range(len_j_nucseq):
            i_aa = i // 3  ## which aa do we code for?
            len_codon = (i % 3) + 1
            if i_aa >= len_cdr3_protseq:
                break
            end = len_j_nucseq - 3 * i_aa
            codon = j_nucseq[max(0, end - len_codon):end]
            target_aa = cdr3_protseq[len_cdr3_protseq - 1 - i_aa]
            matched = any(
                c.endswith(codon) for c in reverse_genetic_code[target_aa])
            if verbose:
                _emit('J', codon, target_aa, matched)
            if not matched:
                break
            max_j_germline = i + 1

    ## negative when the maximal V and J matches overlap
    min_insert = 3 * len_cdr3_protseq - max_v_germline - max_j_germline
    if verbose:
        _emit('max_v_germline:', max_v_germline, len_v_nucseq, v_nucseq,
              cdr3_nucseq)

        _emit('max_j_germline:', max_j_germline, len_j_nucseq, j_nucseq,
              cdr3_nucseq, all_genes[organism][j_gene].protseq)

        _emit('min_insert:', min_insert, max_v_germline, max_j_germline)

    total_prob = 0.0
    ## an overlap has to be trimmed away before any candidate can fit
    min_extra_trim = max(0, -1 * min_insert)
    for extra_trim in range(min_extra_trim, 100):
        old_total_prob = total_prob
        total_prob_this_trim = 0.0
        for extra_v_trim in range(0, extra_trim + 1):
            extra_j_trim = extra_trim - extra_v_trim

            v_trim = len_v_nucseq - max_v_germline + extra_v_trim
            j_trim = len_j_nucseq - max_j_germline + extra_j_trim
            if v_trim > len_v_nucseq or j_trim > len_j_nucseq:
                continue

            n_insert = min_insert + extra_v_trim + extra_j_trim
            n_nucseq = (v_nucseq[:len_v_nucseq - v_trim] + 'n' * n_insert +
                        j_nucseq[j_trim:])

            assert len(n_nucseq) == 3 * len_cdr3_protseq
            if nucleotide_match:
                coding_prob = 0.25**n_insert
            else:
                coding_prob = get_coding_probability(n_nucseq, cdr3_protseq)

            trim_prob = tcr_rearrangement.get_alpha_trim_probs(
                organism, v_trim, j_trim, n_insert)

            total_prob_this_trim += coding_prob * trim_prob
            total_prob += coding_prob * trim_prob

            if verbose:
                _emit('coding_prob:', cdr3_protseq, v_trim, j_trim, n_insert,
                      total_prob, coding_prob, trim_prob, n_nucseq)

        ## stop once another nucleotide of trimming adds a negligible amount
        if extra_trim > 2 and total_prob_this_trim < error_threshold * old_total_prob:
            break
    if return_final_cdr3_nucseq:
        return total_prob, cdr3_nucseq
    else:
        return total_prob
Exemplo n.º 5
0
                if ( a == '*' and not allow_stop_codons) or ( a == 'X' and not allow_X ):
                    Log('{} skipping: badseq: {} {}'.format(theid, cdr3a_protseq,cdr3b_protseq))
                    skip_me = True
                    break

        if skip_me:
            continue

    ## probs are computed by reps
    va_reps = l['va_reps'].split(';')
    ja_reps = l['ja_reps'].split(';')
    va_countreps = l['va_countreps'].split(';')
    ja_countreps = l['ja_countreps'].split(';')
    va_cdr3_nucseq = tcr_sampler.get_v_cdr3_nucseq( organism, va_gene )
    ja_cdr3_nucseq = tcr_sampler.get_j_cdr3_nucseq( organism, ja_gene )
    va_cdr3_protseq,codons = get_translation( va_cdr3_nucseq, '+1' )
    ja_cdr3_protseq,codons = get_translation( ja_cdr3_nucseq, '+{}'.format(1+len(ja_cdr3_nucseq)%3))

    if no_probabilities or not tcr_rearrangement.probs_data_exist( organism,'A'):
        ##all probabilities will be set to 1 if this flag is set
        aprob_nucseq = 1
        aprob_protseq = 1
    else:
        aprob_nucseq,new_cdr3a_nucseq = tcr_sampler.alpha_cdr3_protseq_probability( theid, organism, va_gene, ja_gene,
                                                                                cdr3_protseq='',
                                                                                cdr3_nucseq=cdr3a_nucseq,  verbose=verbose,
                                                                                return_final_cdr3_nucseq=True )

        if new_cdr3a_nucseq != cdr3a_nucseq: ## note note note
            print 'new_cdr3a_nucseq:',len(new_cdr3a_nucseq),new_cdr3a_nucseq
            print 'old_cdr3a_nucseq:',len(cdr3a_nucseq),cdr3a_nucseq
Exemplo n.º 6
0
def parse_unpaired_dna_sequence_blastn(organism,
                                       ab,
                                       blast_seq,
                                       info,
                                       verbose,
                                       nocleanup,
                                       hide_nucseq,
                                       extended_cdr3,
                                       return_all_good_hits=False,
                                       max_bit_score_delta_for_good_hits=50,
                                       max_missing_aas_at_cdr3_cterm=2):
    """Assign V/J genes and a CDR3 to one nucleotide read using blastn.

    The read is written to a temporary fasta file, blastall/blastn is run
    against the V and the J database for (organism, ab), and the top hits
    are used to pick a reading frame, build a query->V protein alignment
    and call parse_cdr3.parse_cdr3().

    Args:
        organism: key into all_genes / all_fasta / all_offsets.
        ab: chain letter; combined with 'V'/'J' to form the evalues keys
            and passed to the database and CDR3 helpers.
        blast_seq: nucleotide sequence of the read. Reads of 20 or fewer
            characters are not blasted.
        info: free-form label, only used in log and verbose output.
        verbose: print blast output and diagnostics to stdout.
        nocleanup: when true, leave the temporary files in place.
        hide_nucseq: when false, the CDR3 string is extended with
            '-' + its nucleotide sequence.
        extended_cdr3: forwarded to parse_cdr3; must be true on the path
            that reconstructs missing C-terminal residues (asserted).
        return_all_good_hits: also return the per-V/J lists of hits whose
            bit score is within `max_bit_score_delta_for_good_hits` of the
            top hit.
        max_bit_score_delta_for_good_hits: see above.
        max_missing_aas_at_cdr3_cterm: forwarded to parse_cdr3 and used as
            the upper bound when searching for a truncated CDR3 in the
            translated read.

    Returns:
        (genes, evalues, status) or, with `return_all_good_hits`,
        (genes, evalues, status, all_good_hits_with_scores).

        genes is a 7-tuple (v_gene, v_rep, v_mm, j_gene, j_rep, j_mm,
        cdr3); it stays at the 'UNK'/[100, 0]/'-' placeholder unless both
        a V and a J hit were found. evalues maps 'V'+ab and 'J'+ab to
        (evalue, bit_score_gap), defaulting to (1, 0). status is a
        possibly empty list of problem tags.
    """

    ## make this a little more unique
    blast_tmpfile = 'tmp%d%s%s%f%s.fa' % (len(blast_seq), organism, ab,
                                          random.random(), blast_seq[:3])
    #print 'blast_tmpfile:',blast_tmpfile
    #assert not exists(blast_tmpfile)

    ## placeholder result, returned as-is unless both V and J hits are found
    genes = ('UNK', 'UNK', [100, 0], 'UNK', 'UNK', [100, 0], '-')

    status = []

    evalues = {'V' + ab: (1, 0), 'J' + ab: (1, 0)}

    ## index 0 collects V hits, index 1 collects J hits
    all_good_hits_with_scores = [[], []]

    if verbose:
        print 'blast_seq:', info, ab, blast_seq

    if len(blast_seq) <= 20:
        status.append('short_{}_blast_seq_{}'.format(ab, len(blast_seq)))
    else:

        out = open(blast_tmpfile, 'w')
        out.write('>tmp\n%s\n' % blast_seq)
        out.close()

        ## now blast against V and J
        top_hits = []

        for ivj, vj in enumerate('VJ'):
            dbfile = get_blast_nucseq_database(
                organism, ab, vj)  # also ensures that it exists
            assert exists(dbfile)
            blastall_exe = path_to_blast_executables + '/blastall'
            assert exists(blastall_exe)
            ## NOTE(review): the command is a shell string that embeds the
            ## temp file name (which contains blast_seq[:3]) -- confirm the
            ## inputs can never carry shell metacharacters.
            cmd = '%s -F F -p blastn -i %s -d %s -v 100 -b 1 -o %s.blast'\
                  %( blastall_exe, blast_tmpfile, dbfile, blast_tmpfile )
            #print cmd
            system(cmd)

            if verbose:
                print 'blast:', info, ab, vj, '=' * 50
                print ''.join(open(blast_tmpfile + '.blast', 'r').readlines())
                print '=' * 80

            ## try parsing the results
            evalue_threshold = 1e-1
            identity_threshold = 20
            hits = blast.parse_blast_alignments(blast_tmpfile + '.blast',
                                                evalue_threshold,
                                                identity_threshold)
            hits_scores = get_all_hits_with_evalues_and_scores(
                blast_tmpfile + '.blast')  ## id,bitscore,evalue
            if hits and hits[hits.keys()[0]]:

                top_hit = hits[hits.keys()[0]][0]
                top_id, top_bit_score, top_evalue = hits_scores[0]

                ## keep every hit whose bit score is close enough to the best
                all_good_hits_with_scores[ivj] \
                    = [ x for x in hits_scores if top_bit_score-x[1] <= max_bit_score_delta_for_good_hits ]

                assert top_hit.hit_id == top_id
                ## figure out the score gap to the next non-equivalent hit,
                ## i.e. the first one whose rep differs from the top hit's;
                ## if there is none the gap is the full top bit score
                bit_score_gap = top_bit_score
                top_rep = all_genes[organism][top_id].rep
                for (id, bit_score, evalue) in hits_scores[1:]:
                    if all_genes[organism][id].rep != top_rep:
                        bit_score_gap = top_bit_score - bit_score
                        break
                evalues[vj + ab] = (top_hit.evalue, bit_score_gap)
                top_hits.append(top_hit)
            else:
                status.append('no_{}{}_blast_hits'.format(vj, ab))

        if len(top_hits) == 2:  ## hits in both v and j

            v_hit = top_hits[0]
            j_hit = top_hits[1]

            v_gene = v_hit.hit_id
            j_gene = j_hit.hit_id
            v_rep = all_genes[organism][v_gene].rep
            j_rep = all_genes[organism][j_gene].rep

            v_nucseq = all_fasta[organism][ab]['V'][nuc][v_hit.hit_id]
            j_nucseq = all_fasta[organism][ab]['J'][nuc][j_hit.hit_id]
            v_protseq = all_fasta[organism][ab]['V'][prot][v_hit.hit_id]

            ## this might fail if these guys are pseudo-genes...
            ## so filter out the non-aa-matching j genes...
            ##
            ## NOTE(review): these values are used in integer arithmetic
            ## below, so all_offsets presumably stores a plain frame offset
            ## per gene -- confirm against where all_offsets is filled.
            v_hitseq_frame = all_offsets[organism][ab]['V'][v_hit.hit_id]
            j_hitseq_frame = all_offsets[organism][ab]['J'][j_hit.hit_id]

            ## tricky if the hits are on different strands!
            ##
            assert v_hit.q_strand == 1  ## I think this is the blastn convention...
            assert j_hit.q_strand == 1

            if v_hit.h_strand != j_hit.h_strand:
                Log( ` ('ERR V/J strand mismatch:', v_hit.h_strand,
                        v_hit.evalue, j_hit.h_strand, j_hit.evalue) `)
                ## report the genes with a lower-case chain letter
                ## (TRAV -> TRaV, ...), placeholder mismatch info and no cdr3
                genes = (v_gene.replace('TRAV',
                                        'TRaV').replace('TRBV', 'TRbV'),
                         v_rep.replace('TRAV',
                                       'TRaV').replace('TRBV',
                                                       'TRbV'), [100, 0],
                         j_gene.replace('TRAJ',
                                        'TRaJ').replace('TRBJ', 'TRbJ'),
                         j_rep.replace('TRAJ',
                                       'TRaJ').replace('TRBJ',
                                                       'TRbJ'), [100, 0], '-')
                status.append('vj_{}_strand_mismatch'.format(ab))
            else:

                v_q2hmap = v_hit.q2hmap
                j_q2hmap = j_hit.q2hmap

                if v_hit.h_strand == -1:
                    ## switch stuff around...
                    ## have to mess with the alignment too
                    v_q2hmap = reverse_q2hmap(blast_seq, v_nucseq, v_hit)
                    j_q2hmap = reverse_q2hmap(blast_seq, j_nucseq, j_hit)

                    ## from here on blast_seq is the reverse complement
                    blast_seq = logo_tools.reverse_complement(blast_seq)
                    if verbose:
                        print 'reverse-comp blast_seq:', ab

                ## vote for the query reading frame implied by each aligned
                ## V position and keep the most frequent one
                ## NOTE(review): max() raises ValueError if no position has
                ## vpos >= 0 -- confirm that cannot happen for a real hit.
                q_vframes = {}
                for qpos, (vpos, vna) in v_q2hmap.iteritems():
                    if vpos >= 0:
                        f = (qpos - vpos + v_hitseq_frame) % 3
                        q_vframes[f] = q_vframes.get(f, 0) + 1
                q_vframe = max([(count, x)
                                for x, count in q_vframes.iteritems()])[1]

                ## same vote for the J alignment
                q_jframes = {}
                for qpos, (jpos, jna) in j_q2hmap.iteritems():
                    if jpos >= 0:
                        f = (qpos - jpos + j_hitseq_frame) % 3
                        q_jframes[f] = q_jframes.get(f, 0) + 1

                q_jframe = max([(count, x)
                                for x, count in q_jframes.iteritems()])[1]

                #q_frame_vstart = ( v_hitseq_frame + v_hit.q_start - v_hit.h_start )%3
                #q_frame_jstart = ( j_hitseq_frame + j_hit.q_start - j_hit.h_start )%3

                ## construct a protein sequence alignment between the
                ## translation of blast_seq and the V protein positions,
                ## using only nucleotides that agree with the winning frame
                q2v_align = {}
                for qpos, (vpos, vna) in sorted(v_q2hmap.iteritems()):
                    if vpos >= 0:
                        f = (qpos - vpos + v_hitseq_frame) % 3
                        if f != q_vframe: continue
                        ## relies on Python 2 integer (floor) division
                        v_protpos = (vpos - v_hitseq_frame) / 3
                        q_protpos = (qpos - q_vframe) / 3
                        if q_protpos in q2v_align:
                            if q2v_align[q_protpos] != v_protpos:
                                Log('indel?? {} {} {}'.format(
                                    organism, ab, info))

                        q2v_align[q_protpos] = v_protpos
                        ## this could be aligning a position that's not actually in the translated protein
                        ## sequence if there are 1 or 2 nucleotides at the end...

                if q_vframe != q_jframe:  ## out of frame
                    Log( ` ('ERR frame mismatch:', q_vframe, v_hit.evalue,
                            q_jframe, j_hit.evalue) `)
                    if verbose:
                        print 'frame mismatch', q_vframe, q_jframe
                    # genes = ( v_gene.replace('TRAV','TRaV' ).replace('TRBV','TRbV'),
                    #           v_rep .replace('TRAV','TRaV' ).replace('TRBV','TRbV'), [100,0],
                    #           j_gene.replace('TRAJ','TRaJ' ).replace('TRBJ','TRbJ'),
                    #           j_rep .replace('TRAJ','TRaJ' ).replace('TRBJ','TRbJ'), [100,0], '-' )
                    status.append('vj_{}_frame_mismatch'.format(ab))

                    ## fiddle with blast_seq
                    ## for each 'extra' nucleotide inserted between v and j, add two '#' characters after the nucleotide
                    last_v_align_pos = max(v_q2hmap.keys())
                    first_j_align_pos = min(j_q2hmap.keys())

                    ## add some '#' characters to blast_seq to get V and J back into frame
                    num_to_insert = (q_vframe - q_jframe) % 3
                    ## insert after the V alignment, at the midpoint of the
                    ## V-J gap when there is one
                    insertpos = max(last_v_align_pos + 1,
                                    (last_v_align_pos + first_j_align_pos) / 2)

                    blast_seq = blast_seq[:
                                          insertpos] + '#' * num_to_insert + blast_seq[
                                              insertpos:]

                    # num_inserted_nucleotides = (q_jframe - q_vframe)%3
                    # new_blast_seq = blast_seq[:last_q_align_pos+1]
                    # extra_seq = blast_seq[last_q_align_pos+1:]
                    # for i in range(num_inserted_nucleotides):
                    #     new_blast_seq += extra_seq[0] + '##'
                    #     extra_seq = extra_seq[1:]
                    # new_blast_seq += extra_seq
                    # blast_seq = new_blast_seq[:]

                qseq, codons = get_translation(blast_seq,
                                               '+%d' % (q_vframe + 1))
                cdr3, v_mm, j_mm, errors = parse_cdr3.parse_cdr3(
                    organism,
                    ab,
                    qseq,
                    v_hit.hit_id,
                    j_hit.hit_id,
                    q2v_align,
                    extended_cdr3=extended_cdr3,
                    max_missing_aas_at_cdr3_cterm=max_missing_aas_at_cdr3_cterm
                )
                if verbose:
                    print 'cdr3:', ab, cdr3, cdr3 in qseq, 'q_vframe:', q_vframe

                status.extend(errors)

                if cdr3 != '-':
                    ## the cdr3 sequence should be contained in qseq, unless qseq was missing 1-2 rsds at cterm
                    if not hide_nucseq:
                        if cdr3 in qseq:  ## the old way, without any missing C-term rsds of CDR3
                            offset = qseq.find(cdr3)
                            cdr3_codons = codons[offset:offset + len(cdr3)]
                            cdr3 += '-' + ''.join(cdr3_codons)
                        else:
                            ## find how many C-terminal residues have to be
                            ## dropped before the rest is found in qseq
                            num_missing_cterm_aas = 1
                            while num_missing_cterm_aas < max_missing_aas_at_cdr3_cterm and \
                                  cdr3[:-1*num_missing_cterm_aas] not in qseq:
                                num_missing_cterm_aas += 1
                            assert cdr3[:-1 * num_missing_cterm_aas] in qseq
                            ## this is a nuisance...
                            assert extended_cdr3  # it's the new default anyhow
                            ## take the missing nucleotides from the J gene:
                            ## the last 3*num_missing nucleotides of the part
                            ## of its nucseq that covers its first cdr
                            jg = all_genes[organism][j_hit.hit_id]
                            j_nucseq = jg.nucseq
                            j_cdr3len = len(jg.cdrs[0].replace(
                                gap_character, ''))
                            j_cdr3_nucseq = jg.nucseq[:jg.nucseq_offset +
                                                      3 * j_cdr3len]
                            missing_nucseq = j_cdr3_nucseq[
                                -3 * num_missing_cterm_aas:]
                            offset = qseq.find(cdr3[:-1 *
                                                    num_missing_cterm_aas])
                            cdr3_codons = codons[offset:offset + len(cdr3) -
                                                 num_missing_cterm_aas]
                            cdr3_nucseq = ''.join(cdr3_codons) + missing_nucseq
                            assert len(cdr3_nucseq) == 3 * len(cdr3)
                            assert get_translation(cdr3_nucseq,
                                                   '+1')[0] == cdr3
                            cdr3 += '-' + cdr3_nucseq

                        # if verbose:
                        #     cdr3_nucseq = ''.join( cdr3_codons ).upper()
                        #     nucseq_startpos = 3*offset + q_vframe
                        #     alt_nucseq = blast_seq[ nucseq_startpos:nucseq_startpos+len(cdr3_nucseq) ]
                        #     rc1 = logo_tools.reverse_complement(cdr3_nucseq)
                        #     rc2 = logo_tools.reverse_complement(blast_seq)
                        #     print 'cdr3_nucseq',ab,offset,cdr3_nucseq,cdr3_nucseq in blast_seq,\
                        #         blast_seq.index(cdr3_nucseq),alt_nucseq,rc1 in rc2
                if '#' in blast_seq:  ## sign of out-of-frame
                    v_gene = v_gene.replace('TRAV',
                                            'TRaV').replace('TRBV', 'TRbV')
                    v_rep = v_rep.replace('TRAV',
                                          'TRaV').replace('TRBV', 'TRbV')
                    j_gene = j_gene.replace('TRAJ',
                                            'TRaJ').replace('TRBJ', 'TRbJ')
                    j_rep = j_rep.replace('TRAJ',
                                          'TRaJ').replace('TRBJ', 'TRbJ')
                    ## NOTE(review): this unpacking needs exactly one '-' in
                    ## cdr3; with hide_nucseq set and a cdr3 found, no
                    ## '-nucseq' suffix was appended above -- confirm this
                    ## path cannot raise ValueError.
                    protseq, nucseq = cdr3.split('-')
                    if protseq and nucseq:
                        ## keep the '#' counts of the two halves in step
                        if protseq.count('#') != nucseq.count('#'):
                            assert nucseq.count('#') == 2
                            assert protseq.count('#') == 1
                            protseq = protseq.replace('#', '##')
                            cdr3 = '{}-{}'.format(protseq, nucseq)

                genes = (v_gene, v_rep, v_mm, j_gene, j_rep, j_mm, cdr3)

                if cdr3 != "-":
                    cdr3aa = cdr3.split("-")[0]
                    if len(cdr3aa) < 5:
                        status.append('cdr3{}_len_too_short'.format(ab))

    ## NOTE(review): cleanup is not in a try/finally, so any assertion
    ## failure above leaves the temporary files behind.
    if not nocleanup:
        files = glob(blast_tmpfile + '*')
        for file in files:
            remove(file)

    assert len(genes) == 7

    if return_all_good_hits:
        return genes, evalues, status, all_good_hits_with_scores  ## status is a list, may be empty
    else:
        return genes, evalues, status  ## status is a list, may be empty
Exemplo n.º 7
0
        if len(cdr3_nucseq) % 3:
            if woof:
                print 'OOF {} {} {} {} {:d} {:d} {} {} {} {}:{}:{}'\
                    .format( v_gene, v_rep, j_gene, j_rep,
                             v_score, j_score,
                             ','.join(all_v_genes),
                             ','.join(all_j_genes),
                             cdr3_nucseq,
                             logfile, fastq_file, seqid
                    )
            continue
        if len(cdr3_nucseq) / 3 < min_cdr3_len:
            continue

        ## in frame
        cdr3_protseq, codons = get_translation(cdr3_nucseq, '+1')

        if '*' in cdr3_protseq or 'X' in cdr3_protseq:
            continue

        original_cdr3_nucseq = cdr3_nucseq[:]
        original_cdr3_protseq = cdr3_protseq[:]

        if chain == 'A':
            if correct_cdr3_seqs:
                tmp_results = tcr_sampler.analyze_junction\
                              ( organism, v_gene, j_gene, cdr3_protseq, cdr3_nucseq,
                                return_corrected_cdr3_seqs = True,
                                mismatch_score = mismatch_score_for_correcting_cdr3_seqs )
                corrected_cdr3_nucseq, corrected_cdr3_protseq = list(
                    tmp_results)[-2:]