示例#1
0
def reconstruct_indelfo_from_indel_list(indel_list, line, iseq, debug=False):  # old-style files
    """
    Rebuild the indel info for sequence number <iseq> in <line> from an old-style <indel_list>, and store
    it in line['indelfos'][iseq].

    Each entry in <indel_list> is read for the keys 'pos', 'type', 'len', and 'seqstr'. We walk along
    line['input_seqs'][iseq] and line['naive_seq'] in parallel: at a position where an indel starts, an
    entry of type 'insertion' puts its seqstr in the gapped query string and gap characters in the gapped
    germline string, while any other type does the reverse. Mismatches between an entry's seqstr and the
    corresponding stretch of sequence are reported, but do not stop the reconstruction.

    If <indel_list> contains 'reversed_seq' (super-old format) we print an error and return without
    touching <line>. If <indel_list> is empty, line['indelfos'][iseq] is left as whatever
    get_empty_indel() returned. Always returns None.
    """
    if 'reversed_seq' in indel_list:  # handle super-old files
        print '%s encountered file with super old, unhandled indel format, proceeding, but indel info may be inconsistent' % (utils.color('red', 'error'))
        return

    line['indelfos'][iseq] = get_empty_indel()
    if len(indel_list) == 0:  # nothing to reconstruct, so leave the empty indelfo in place
        return

    positions = [indel['pos'] for indel in indel_list]
    if len(set(positions)) != len(positions):
        print '%s two indels at the same position, everything will be kinda messed up' % utils.color('red', 'error')
    pending = dict((indel['pos'], indel) for indel in indel_list)  # indels we haven't consumed yet, keyed by position (a repeated position keeps only the last entry)

    query_seq = line['input_seqs'][iseq]
    if debug:
        print len(query_seq), query_seq
        print len(line['naive_seq']), line['naive_seq']

    qr_pieces, gl_pieces = [], []
    qr_idx, gl_idx, cursor = 0, 0, 0  # <cursor> is the coordinate that gets compared to the indel positions
    while qr_idx < len(query_seq):
        if debug:
            print '  %3d  %3d' % (qr_idx, gl_idx),

        if cursor not in pending:  # no indel starts here, so copy one base from each sequence
            qr_base, gl_base = query_seq[qr_idx], line['naive_seq'][gl_idx]
            qr_pieces.append(qr_base)
            gl_pieces.append(gl_base)
            if debug:
                print '  %s    %s' % (qr_base, gl_base)
            qr_idx += 1
            gl_idx += 1
            cursor += 1
            continue

        indel = pending[cursor]
        if indel['type'] == 'insertion':  # extra bases in the query: gaps go in the germline string
            observed = query_seq[qr_idx : qr_idx + indel['len']]
            if indel['seqstr'] != observed:
                print '%s indel info seqstr doesn\'t match input seq str:' % utils.color('red', 'error')
                utils.color_mutants(indel['seqstr'], observed, align=True, print_result=True, extra_str='        ')
            gaps = [indel['len'] * utils.gap_chars[0]]
            qr_pieces.extend(indel['seqstr'].split())
            gl_pieces.extend(gaps)
            if debug:
                print '  %s    %s' % (indel['seqstr'].split(), gaps)
            qr_idx += indel['len']
        else:  # anything else: the bases are only in the germline, so gaps go in the query string
            observed = line['naive_seq'][gl_idx : gl_idx + indel['len']]
            if indel['seqstr'] != observed:
                print '%s indel info seqstr doesn\'t match naive seq str:' % utils.color('red', 'error')
                utils.color_mutants(indel['seqstr'], observed, align=True, print_result=True, extra_str='        ')
            gaps = [indel['len'] * utils.gap_chars[0]]
            qr_pieces.extend(gaps)
            gl_pieces.extend(indel['seqstr'].split())
            if debug:
                print '  %s    %s' % (gaps, indel['seqstr'].split())
            gl_idx += indel['len']
        del pending[cursor]
        cursor += indel['len']

    indelfo = line['indelfos'][iseq]
    indelfo['qr_gap_seq'] = ''.join(qr_pieces)
    indelfo['gl_gap_seq'] = ''.join(gl_pieces)
    indelfo['indels'] = indel_list
    indelfo['reversed_seq'] = line['indel_reversed_seqs'][iseq]
    indelfo['genes'] = dict((region, line[region + '_gene']) for region in utils.regions)
    if debug:
        print '  reconstructed indelfo'
        print get_dbg_str(indelfo)
示例#2
0
 def check_single_ifo(old_ifo, new_ifo):
     if debug:
         print '  len %d  pos %d  seqstr %s' % (
             old_ifo['len'], old_ifo['pos'], old_ifo['seqstr']),
     if new_ifo != old_ifo:
         if debug:
             print '  %s' % utils.color('red', 'nope')
         new_seqstr, old_seqstr = utils.color_mutants(
             old_ifo['seqstr'],
             new_ifo['seqstr'],
             return_ref=True,
             align=True)  #len(old_ifo['seqstr']) != len(new_ifo['seqstr']))
         if print_on_err:
             print '  pos %d --> %s    len %d --> %s    seqstr %s --> %s' % (
                 old_ifo['pos'],
                 utils.color(
                     None if new_ifo['pos'] == old_ifo['pos'] else 'red',
                     '%d' % new_ifo['pos']), old_ifo['len'],
                 utils.color(
                     None if new_ifo['len'] == old_ifo['len'] else 'red',
                     '%d' % new_ifo['len']), old_seqstr, new_seqstr)
         return False
     else:
         if debug:
             print '  %s' % utils.color('green', 'ok')
         return True
示例#3
0
文件: glutils.py 项目: Annak17/partis
def add_new_allele(glfo, newfo, remove_template_genes, debug=False):
    """
    Add a new allele to <glfo>, specified by <newfo> which is of the
    form: {'template-gene' : 'IGHV3-71*01', 'gene' : 'IGHV3-71*01+C35T.T47G', 'seq' : 'ACTG yadda yadda CGGGT'}
    If <remove_template_genes>, we also remove 'template-gene' from <glfo>.
    """

    template_gene = newfo['template-gene']
    region = utils.get_region(template_gene)
    if template_gene not in glfo['seqs'][region]:
        raise Exception('unknown template gene %s' % template_gene)

    new_gene = newfo['gene']

    if region == 'v':
        glfo['cyst-positions'][new_gene] = glfo['cyst-positions'][template_gene]
    elif region == 'j':
        glfo['tryp-positions'][new_gene] = glfo['tryp-positions'][template_gene]

    glfo['seqs'][region][new_gene] = newfo['seq']

    if debug:
        print '    adding new allele to glfo:'
        print '      template %s   %s' % (glfo['seqs'][region][template_gene], utils.color_gene(template_gene))
        print '           new %s   %s' % (utils.color_mutants(glfo['seqs'][region][template_gene], newfo['seq']), utils.color_gene(new_gene))

    if remove_template_genes:
        remove_gene(glfo, template_gene, debug=True)
示例#4
0
 def print_match(self, region, gene, query_seq, score, glbounds, qrbounds, codon_pos, warnings, skipping=False):
     if self.debug < 2:
         return
     out_str_list = []
     buff_str = (20 - len(gene)) * ' '
     tmp_val = score
     if self.args.apply_choice_probs_in_sw and self.get_choice_prob(region, gene) != 0.0:
         tmp_val = score / self.get_choice_prob(region, gene)
     if self.args.apply_choice_probs_in_sw:
         out_str_list.append('%8s%s%s%9.1e * %3.0f = %-6.1f' % (' ', utils.color_gene(gene), buff_str, self.get_choice_prob(region, gene), tmp_val, score))
     else:
         out_str_list.append('%8s%s%s%9s%3s %6.0f        ' % (' ', utils.color_gene(gene), '', '', buff_str, score))
     out_str_list.append('%4d%4d   %s\n' % (glbounds[0], glbounds[1], self.germline_seqs[region][gene][glbounds[0]:glbounds[1]]))
     out_str_list.append('%46s  %4d%4d' % ('', qrbounds[0], qrbounds[1]))
     out_str_list.append('   %s ' % (utils.color_mutants(self.germline_seqs[region][gene][glbounds[0]:glbounds[1]], query_seq[qrbounds[0]:qrbounds[1]])))
     if region != 'd':
         out_str_list.append('(%s %d)' % (utils.conserved_codon_names[region], codon_pos))
     if warnings[gene] != '':
         out_str_list.append('WARNING ' + warnings[gene])
     if skipping:
         out_str_list.append('skipping!')
     if self.args.outfname is None:
         print ''.join(out_str_list)
     else:
         out_str_list.append('\n')
         self.outfile.write(''.join(out_str_list))
示例#5
0
def add_new_allele(glfo, newfo, remove_template_genes=False, debug=False):
    """
    Add a new allele to <glfo>, specified by <newfo> which is of the
    form: {'gene' : 'IGHV3-71*01+C35T.T47G', 'seq' : 'ACTG yadda yadda CGGGT', 'template-gene' : 'IGHV3-71*01'}
    If <remove_template_genes>, we also remove 'template-gene' from <glfo>.
    """

    template_gene = newfo["template-gene"]
    region = utils.get_region(template_gene)
    if template_gene not in glfo["seqs"][region]:
        raise Exception("unknown template gene %s" % template_gene)

    new_gene = newfo["gene"]

    if region == "v":
        glfo["cyst-positions"][new_gene] = glfo["cyst-positions"][template_gene]
    elif region == "j":
        glfo["tryp-positions"][new_gene] = glfo["tryp-positions"][template_gene]

    glfo["seqs"][region][new_gene] = newfo["seq"]

    if debug:
        print "    adding new allele to glfo:"
        print "      template %s   %s" % (glfo["seqs"][region][template_gene], utils.color_gene(template_gene))
        print "           new %s   %s" % (
            utils.color_mutants(glfo["seqs"][region][template_gene], newfo["seq"]),
            utils.color_gene(new_gene),
        )

    if remove_template_genes:
        remove_gene(glfo, template_gene, debug=True)
示例#6
0
def add_new_allele(glfo, newfo, remove_template_genes=False, debug=False):
    """
    Add a new allele to <glfo>, specified by <newfo> which is of the
    form: {'gene' : 'IGHV3-71*01+C35T.T47G', 'seq' : 'ACTG yadda yadda CGGGT', 'template-gene' : 'IGHV3-71*01'}
    If <remove_template_genes>, we also remove 'template-gene' from <glfo>.
    """

    template_gene = newfo['template-gene']
    region = utils.get_region(template_gene)
    if template_gene not in glfo['seqs'][region]:
        raise Exception('unknown template gene %s' % template_gene)

    new_gene = newfo['gene']

    if region == 'v':
        glfo['cyst-positions'][new_gene] = glfo['cyst-positions'][template_gene]
    elif region == 'j':
        glfo['tryp-positions'][new_gene] = glfo['tryp-positions'][template_gene]

    glfo['seqs'][region][new_gene] = newfo['seq']

    if debug:
        print '    adding new allele to glfo:'
        print '      template %s   %s' % (glfo['seqs'][region][template_gene], utils.color_gene(template_gene))
        print '           new %s   %s' % (utils.color_mutants(glfo['seqs'][region][template_gene], newfo['seq']), utils.color_gene(new_gene))

    if remove_template_genes:
        remove_gene(glfo, template_gene, debug=True)
示例#7
0
    def add_new_allele(self, gene, fitfo, n_candidate_snps, debug=False):
        # figure out what the new nukes are
        old_seq = self.glfo['seqs'][utils.get_region(gene)][gene]
        new_seq = old_seq
        mutfo = {}
        for pos in sorted(fitfo['candidates'][n_candidate_snps]):
            obs_counts = {nuke : self.counts[gene][pos][n_candidate_snps][nuke] for nuke in utils.nukes}  # NOTE it's super important to only use the counts from sequences with <n_candidate_snps> total mutations
            sorted_obs_counts = sorted(obs_counts.items(), key=operator.itemgetter(1), reverse=True)
            original_nuke = self.mfreqer.counts[gene][pos]['gl_nuke']
            new_nuke = None
            for nuke, _ in sorted_obs_counts:  # take the most common one that isn't the existing gl nuke
                if nuke != original_nuke:
                    new_nuke = nuke
                    break
            print '   %3d  (%s --> %s)' % (pos, original_nuke, new_nuke),
            assert old_seq[pos] == original_nuke
            mutfo[pos] = {'original' : original_nuke, 'new' : new_nuke}
            new_seq = new_seq[:pos] + new_nuke + new_seq[pos+1:]

        new_name, mutfo = glutils.get_new_allele_name_and_change_mutfo(gene, mutfo)
        print ''
        print '          %s   %s' % (old_seq, utils.color_gene(gene))
        print '          %s   %s' % (utils.color_mutants(old_seq, new_seq), utils.color_gene(new_name))

        # and add it to the set of new alleles for this gene
        self.new_allele_info.append({
            'template-gene' : gene,
            'gene' : new_name,
            'seq' : new_seq,
            'aligned-seq' : None
        })
示例#8
0
def trim_and_remove_genes(region, gene, seq, glfo, template_glfo, debug=False):
    nearest_template_gene = glutils.find_nearest_gene_using_names(
        template_glfo, gene)
    nearest_template_seq = template_glfo['seqs'][region][nearest_template_gene]
    # extra_bases = glfo['cyst-positions'][gene] - template_glfo['cyst-positions'][nearest_template_gene]  # not right if there's some internal gaps in the alignment
    aligned_nearest_template_seq, aligned_seq = utils.align_seqs(
        nearest_template_seq, seq)

    if debug:
        print '    %s' % utils.color_gene(gene)
        utils.color_mutants(aligned_nearest_template_seq,
                            aligned_seq,
                            print_result=True,
                            ref_label='template ',
                            extra_str='       ')

    if aligned_seq[0] not in utils.gap_chars and aligned_nearest_template_seq[
            0] not in utils.gap_chars:
        if debug:
            print '      ok'
    elif aligned_seq[0] in utils.gap_chars:
        if debug:
            print '      %s, removing' % utils.color('red', 'too small')
        glutils.remove_gene(glfo, gene)
    else:
        if debug:
            print '        extra bases %s' % utils.color_gene(gene)
        extra_bases = len(aligned_nearest_template_seq) - len(
            aligned_nearest_template_seq.lstrip('-'))
        seq = seq[extra_bases:]
        if debug:
            print '          removed %d bases' % extra_bases
        if seq in glfo['seqs'][region].values():
            print '    trimmed seq already in glfo under name %s, so removing it' % ' '.join(
                [
                    utils.color_gene(g)
                    for g, s in glfo['seqs'][region].items() if s == seq
                ])
            glutils.remove_gene(glfo, gene, debug=True)
            return
        glfo['seqs'][region][gene] = seq
        glfo['cyst-positions'][gene] -= extra_bases
        # utils.color_mutants(nearest_template_seq, seq, print_result=True, ref_label='template ', align=True, extra_str='            ')
        assert utils.codon_unmutated('cyst',
                                     glfo['seqs'][region][gene],
                                     glfo['cyst-positions'][gene],
                                     debug=True)
示例#9
0
    def print_match(self, region, gene, query_seq, score, glbounds, qrbounds, codon_pos, warnings, skipping=False):
        out_str_list = []
        buff_str = (20 - len(gene)) * ' '
        out_str_list.append('%8s%s%s%9s%3s %6.0f        ' % (' ', utils.color_gene(gene), '', '', buff_str, score))
        out_str_list.append('%4d%4d   %s\n' % (glbounds[0], glbounds[1], self.glfo['seqs'][region][gene][glbounds[0]:glbounds[1]]))
        out_str_list.append('%46s  %4d%4d' % ('', qrbounds[0], qrbounds[1]))
        out_str_list.append('   %s ' % (utils.color_mutants(self.glfo['seqs'][region][gene][glbounds[0]:glbounds[1]], query_seq[qrbounds[0]:qrbounds[1]])))
        if region != 'd':
            out_str_list.append('(%s %d)' % (utils.conserved_codons[region], codon_pos))
        if warnings[gene] != '':
            out_str_list.append('WARNING ' + warnings[gene])
        if skipping:
            out_str_list.append('skipping!')

        print ''.join(out_str_list)
示例#10
0
def print_lines(nseq_info, ref_seq, namestr, namecolor):
    assert nseq_info == sorted(nseq_info,
                               key=operator.itemgetter(1),
                               reverse=True)
    total_prob = 0.
    for naive_seq, prob in nseq_info:
        print '  %s   %5.2f  %s' % (utils.color_mutants(
            naive_seq if ref_seq is None else ref_seq,
            naive_seq), prob, utils.color(namecolor, namestr))
        if ref_seq is None:
            ref_seq = naive_seq
        if 1. - total_prob < args.prob_to_ignore:
            break
        total_prob += prob
    return ref_seq
示例#11
0
 def print_match(self,
                 region,
                 gene,
                 query_seq,
                 score,
                 glbounds,
                 qrbounds,
                 codon_pos,
                 warnings,
                 skipping=False):
     if self.debug < 2:
         return
     out_str_list = []
     buff_str = (20 - len(gene)) * ' '
     tmp_val = score
     if self.args.apply_choice_probs_in_sw and self.get_choice_prob(
             region, gene) != 0.0:
         tmp_val = score / self.get_choice_prob(region, gene)
     if self.args.apply_choice_probs_in_sw:
         out_str_list.append(
             '%8s%s%s%9.1e * %3.0f = %-6.1f' %
             (' ', utils.color_gene(gene), buff_str,
              self.get_choice_prob(region, gene), tmp_val, score))
     else:
         out_str_list.append(
             '%8s%s%s%9s%3s %6.0f        ' %
             (' ', utils.color_gene(gene), '', '', buff_str, score))
     out_str_list.append(
         '%4d%4d   %s\n' %
         (glbounds[0], glbounds[1],
          self.germline_seqs[region][gene][glbounds[0]:glbounds[1]]))
     out_str_list.append('%46s  %4d%4d' % ('', qrbounds[0], qrbounds[1]))
     out_str_list.append('   %s ' % (utils.color_mutants(
         self.germline_seqs[region][gene][glbounds[0]:glbounds[1]],
         query_seq[qrbounds[0]:qrbounds[1]])))
     if region != 'd':
         out_str_list.append(
             '(%s %d)' % (utils.conserved_codon_names[region], codon_pos))
     if warnings[gene] != '':
         out_str_list.append('WARNING ' + warnings[gene])
     if skipping:
         out_str_list.append('skipping!')
     if self.args.outfname is None:
         print ''.join(out_str_list)
     else:
         out_str_list.append('\n')
         self.outfile.write(''.join(out_str_list))
示例#12
0
    seq = glfo['seqs'][args.region][gene]
    pos = codon_positions[gene]
    if pos < ref_pos:  # align the codon position in the case that this seq is shorter up to the codon
        seq = (ref_pos - pos) * '-' + seq
        pos += (ref_pos - pos)

    right_pad_str = ''  # i think i don't need this any more since i have the align option in color_mutants
    # if len(seq) < max_seq_len:
    #     right_pad_str = (max_seq_len - len(seq)) * ' '

    emph_positions = None if args.region == 'd' else [
        pos + i for i in range(3)
    ]
    colored_seq, isnps = utils.color_mutants(ref_seq,
                                             seq,
                                             return_isnps=True,
                                             emphasis_positions=emph_positions,
                                             align=True)
    seqstrs[igene] += '%s%s' % (colored_seq, right_pad_str)
    if len(isnps) > 0:
        snpstrs[igene] = '%2d (%s)' % (len(isnps), ' '.join(
            [str(i) for i in isnps]))


# ----------------------------------------------------------------------------------------
def print_str(gene, seqstr, snpstr):
    """
    Return (despite the name, this doesn't print anything) a line consisting of the colored gene name,
    <seqstr>, the colored gene name again, and <snpstr>, separated by two spaces.
    The gene name is formatted by utils.color_gene() with the module-level width <gene_str_width>.
    """
    left_label = utils.color_gene(gene, width=gene_str_width)
    right_label = utils.color_gene(gene, width=gene_str_width)
    fields = (left_label, seqstr, right_label, snpstr)
    return '%s  %s  %s  %s' % fields

    def parse_query_text(self, unique_id, query_info):
        if len(query_info
               ) == 0:  # one for the query sequence, then one for v, d, and j
            print 'no info for', unique_id
            return {}
        elif len(query_info) < 4:
            regions_ok = ''
            for info in query_info:
                for region in utils.regions:
                    if 'IGH' + region.upper() in info:
                        regions_ok += region
            for region in utils.regions:
                if region not in regions_ok:
                    print '    ERROR no %s matches' % region
                    return {}
            assert False  # shouldn't get here
        elif len(query_info) != 4:
            print 'info for', unique_id, 'all messed up'
            for info in query_info:
                print info
            sys.exit()

        full_qr_seq = query_info[0].replace('>', '').replace(
            unique_id, '')  # strip off the unique id
        full_qr_seq = ''.join(full_qr_seq.split()).upper(
        )  # strip off white space and uppercase it
        assert full_qr_seq == self.seqinfo[unique_id]['seq']

        line = {}
        line['unique_id'] = unique_id
        line['seq'] = full_qr_seq
        for ireg in range(len(utils.regions)):
            region = utils.regions[ireg]
            info = query_info[ireg + 1].splitlines()
            while unique_id not in info[
                    0]:  # remove the line marking cdr3 and framework regions
                info.pop(0)
            if len(info) <= 1:
                print info
            assert len(info) > 1
            assert len(info[0].split()) == 2
            qr_seq = info[0].split()[1].upper(
            )  # this line should be '<unique_id> .............<query_seq>'

            imatch = 1  # which match to take
            match_name = str(info[imatch].split()[2])
            # if 'IGHV3-69' in match_name:  # it's not right anyway
            #     line['failed'] = True
            #     return line
            # while unacceptable_match(match_name, self.germline_seqs):
            #         imatch += 1
            #         match_name = str(info[imatch].split()[2])
            #         print '    new match name: %s' % match_name

            gl_seq = info[imatch].split()[4].upper()
            if qr_seq.replace('.', '') not in self.seqinfo[unique_id]['seq']:
                if self.args.debug:
                    print '    qr_seq not foundin seqinfo'
                line['failed'] = True
                return line

            if self.args.debug:
                print '  ', region, match_name
                print '    gl', gl_seq
                print '      ', qr_seq

            # replace the dots (gaps) in the gl match
            new_qr_seq, new_gl_seq = [], []
            for inuke in range(min(len(qr_seq), len(gl_seq))):
                if gl_seq[inuke] == '.':
                    pass
                else:
                    new_qr_seq.append(
                        qr_seq[inuke]
                    )  # this should only be out of range if the v match extends through the whole query sequence, i.e. friggin never
                    new_gl_seq.append(gl_seq[inuke])
            for inuke in range(len(gl_seq), len(qr_seq)):
                new_qr_seq.append(qr_seq[inuke])
            for inuke in range(len(qr_seq), len(gl_seq)):
                new_gl_seq.append(gl_seq[inuke])
            qr_seq = ''.join(new_qr_seq)
            gl_seq = ''.join(new_gl_seq)

            # work out the erosions
            qr_ldots = qr_seq.rfind(
                '.') + 1  # first strip off any dots on the left of query seq
            qr_seq = qr_seq[qr_ldots:]
            gl_seq = gl_seq[qr_ldots:]
            gl_ldots = gl_seq.rfind(
                '.') + 1  # then remove dots on the left of the germline seq
            qr_seq = qr_seq[gl_ldots:]
            gl_seq = gl_seq[gl_ldots:]
            del_5p = qr_ldots + gl_ldots
            jf_insertion = ''
            if region == 'j':
                jf_insertion = qr_seq[len(gl_seq):]
            qr_seq = qr_seq[:len(
                gl_seq
            )]  # then strip the right-hand portion of the query sequence that isn't aligned to the germline
            del_3p = len(gl_seq) - len(
                qr_seq
            )  # then do the same for the germline overhanging on the right of the query
            gl_seq = gl_seq[:len(qr_seq)]
            assert len(gl_seq) == len(qr_seq)

            new_gl_seq = []
            for inuke in range(len(gl_seq)):  # replace dashes (matched bases)
                assert gl_seq[inuke] != '.'  # hoping there's no gaps in here
                if gl_seq[inuke] == '-':
                    new_gl_seq.append(qr_seq[inuke])
                else:
                    new_gl_seq.append(gl_seq[inuke])
            gl_seq = ''.join(new_gl_seq)

            if match_name not in self.germline_seqs[region]:
                print '    ERROR couldn\'t find %s in germlines' % match_name
                line['failed'] = True
                return line

            if self.germline_seqs[region][match_name].find(
                    gl_seq
            ) != del_5p:  # why the *@*!! can't they make this consistent?
                if self.germline_seqs[region][match_name].find(gl_seq) < 0:
                    print 'whooooaa'
                    print self.germline_seqs[region][match_name]
                    print gl_seq
                    sys.exit()
                del_5p += self.germline_seqs[region][match_name].find(gl_seq)

            try:
                assert del_5p + len(gl_seq) + del_3p + len(
                    jf_insertion) == len(
                        self.germline_seqs[region][match_name])
            except:
                print '    ERROR lengths failed for %s' % unique_id
                # print del_5p, len(gl_seq), del_3p, del_5p + len(gl_seq) + del_3p , len(self.germline_seqs[region][match_name])
                # print gl_seq
                # print self.germline_seqs[region][match_name]
                line['failed'] = True
                return line
                # assert False

            if self.args.debug:
                utils.color_mutants(gl_seq,
                                    qr_seq,
                                    ref_label='gl ',
                                    extra_str='    ',
                                    print_result=True,
                                    post_str='    del: %d %d' %
                                    (del_5p, del_3p))

            # try:
            #     match_name = joinparser.figure_out_which_damn_gene(self.germline_seqs, match_name, gl_seq, debug=self.args.debug)
            # except:
            #     print 'ERROR couldn\'t figure out the gene for %s' % match_name
            #     return {}

            line[region + '_gene'] = match_name
            line[region + '_qr_seq'] = qr_seq
            line[region + '_gl_seq'] = gl_seq
            line[region + '_5p_del'] = del_5p
            line[region + '_3p_del'] = del_3p
            if region == 'j':
                line['jf_insertion'] = jf_insertion

        return line
示例#14
0
    def make_single_tree(self, partitions, annotations, uid_set, get_fasttrees=False, n_max_cons_seqs=10, debug=False):
        # NOTE don't call this externally -- if you want a single tree, call make_trees() with <i_only_cluster> set
        def getline(uidstr, uid_set=None):
            """
            Return the annotation for the cluster <uidstr> (colon-separated uids). If there's no annotation
            with exactly that key, return the first annotation whose 'unique_ids' overlap with <uid_set>
            (which defaults to the uids in <uidstr>). Raises an Exception if nothing overlaps.
            """
            if uidstr in annotations:  # if we have this exact annotation
                return annotations[uidstr]
            else:
                if uid_set is None:
                    uid_set = set(uidstr.split(':'))  # should only get called if it's a singleton
                # note that for internal nodes in a fasttree-derived subtree, the uids will be out of order compared the the annotation keys
                for line in annotations.values():  # we may actually have the annotation for every subcluster (e.g. if --calculate-alternative-annotations was set), but in case we don't, this is fine
                    if len(uid_set & set(line['unique_ids'])) > 0:  # just take the first one with any overlap. Yeah, it's not necessarily the best, but its naive sequence probably isn't that different, and for just getting the fasttree it reeeeeeaaaallly doesn't matter
                        return line
            # report the key we were actually asked for (<uid> isn't defined in this function, so using it here either raised a NameError or reported some unrelated uid from the enclosing scope)
            raise Exception('couldn\'t find uid %s in annotations' % uidstr)
        def getseq(uid):
            line = getline(uid)
            return line['seqs'][line['unique_ids'].index(uid)]
        def lget(uid_list):
            return ':'.join(uid_list)

        # check for repeated uids (was only from seed uid, which shouldn't happen any more, but the code below throws an infinite loop if we do, so may as well be careful)
        for partition in partitions:
            if sum(len(c) for c in partition) > len(set(u for c in partition for u in c)):
                repeated_uids = [u for u, count in collections.Counter([u for c in partition for u in c]).items() if count > 1]
                raise Exception('found %d uid%s in more than one cluster (%s)' % (len(repeated_uids), utils.plural(len(repeated_uids)), ', '.join(repeated_uids)))

        default_edge_length = 999999  # it's nice to have the edges all set to something that's numeric (so the trees print), but also obvious wrong, if we forget to set somebody
        assert len(partitions[-1]) == 1
        root_label = lget(partitions[-1][0])  # we want the order of the uids in the label to correspond to the order in self.partitions
        tns = dendropy.TaxonNamespace([root_label])
        root_node = dendropy.Node(taxon=tns.get_taxon(root_label))
        root_node.uids = uid_set  # each node keeps track of the uids of its children
        dtree = dendropy.Tree(taxon_namespace=tns, seed_node=root_node)
        if debug:
            print '    starting tree with %d leaves' % len(uid_set)
        for ipart in reversed(range(len(partitions) - 1)):  # dendropy seems to only have fcns to build a tree from the root downward, so we loop starting with the last partition (- 1 is because the last partition is guaranteed to be just one cluster)
            for lnode in dtree.leaf_node_iter():  # look for leaf nodes that contain uids from two clusters in this partition, and add those as children
                tclusts = [c for c in partitions[ipart] if len(set(c) & lnode.uids) > 0]
                if len(tclusts) < 2:
                    continue
                for tclust in tclusts:
                    ttaxon = dendropy.Taxon(lget(tclust))
                    tns.add_taxon(ttaxon)
                    child = lnode.new_child(taxon=ttaxon, edge_length=default_edge_length)
                    child.uids = set(tclust)
                if debug:
                    print '      ipart %d' % ipart
                    print '        split node: %d --> %s      %s --> %s' % (len(lnode.uids), ' '.join([str(len(tc)) for tc in tclusts]), lnode.taxon.label, ' '.join([c.taxon.label for c in lnode.child_node_iter()]))

        # split existing leaves, which are probably not singletons (they're probably from the initial naive sequence collapse step) into subtrees such that each leaf is a singleton
        for lnode in dtree.leaf_node_iter():
            if len(lnode.uids) == 1:
                continue
            if get_fasttrees and len(lnode.uids) > 2:
                seqfos = [{'name' : uid, 'seq' : getseq(uid)} for uid in lnode.taxon.label.split(':')]  # may as well add them in the right order, although I don't think it matters
                subtree = treeutils.get_fasttree_tree(seqfos, getline(lnode.taxon.label, uid_set=lnode.uids)['naive_seq'], suppress_internal_node_taxa=True)  # note that the fasttree distances get ignored below (no idea if they'd be better than what we set down there, but they probably wouldn't be consistent, so I'd rather ignore them)
                for tmpnode in subtree.postorder_node_iter():
                    if tmpnode.is_leaf():
                        tmpnode.uids = set([tmpnode.taxon.label])
                    else:
                        tmpnode.uids = set([uid for c in tmpnode.child_node_iter() for uid in c.uids])
                        ttaxon = dendropy.Taxon(lget(tmpnode.uids))
                        subtree.taxon_namespace.add_taxon(ttaxon)
                        tmpnode.taxon = ttaxon  # ...and use the string of leaf nodes, even though they'll be in the wrong order (I think these get ignored when I call label_nodes() below, but it's still tidier to have them right in the meantime, and anyway since I'm suppressing internal taxa I think I need to set them to something)

                if debug:
                    print '   adding subtree with %d leaves from fastree at leaf node %s' % (len(seqfos), lnode.taxon.label)
                    print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=subtree))
                dtree.taxon_namespace.add_taxa(subtree.taxon_namespace)
                lnode.add_child(subtree.seed_node)
                assert len(lnode.child_edges()) == 1  # we're iterating over leaves, so this should always be true
                lnode.child_edges()[0].collapse()
            else:  # just add a star subtree
                for uid in lnode.taxon.label.split(':'):  # may as well add them in the right order, although I don't think it matters
                    ttaxon = dendropy.Taxon(uid)
                    tns.add_taxon(ttaxon)
                    child = lnode.new_child(taxon=ttaxon, edge_length=default_edge_length)
                    child.uids = set([uid])
                if debug:
                    print '      added %d singleton children for %s' % (len(lnode.uids), lnode.taxon.label)

        # in order to set edge lengths, we need node sequences, so first set leaf node seqs
        for lnode in dtree.leaf_node_iter():
            assert len(lnode.uids) == 1
            lnode.seq = getseq(lnode.taxon.label)
            lnode.n_descendent_leaves = 1  # keep track of how many leaf nodes contributed to each node's consensus sequence (these are leaves, so it's trivally 1). This is less accurate than keeping track of all the sequences, but also faster

        # then set internal node seqs as the consensus of their children, and set the distance as hamming distance to child seqs
        if debug:
            print '    adding edge lengths either from fasttree %s or cons seq %s' % (utils.color('blue', 'x'), utils.color('red', 'x'))
        min_edge_length = None  # setting this is nice for better debug viewing
        for node in dtree.postorder_internal_node_iter():  # includes root node
            child_cons_seq_counts = [c.n_descendent_leaves for c in node.child_node_iter()]
            total_descendent_leaves = sum(child_cons_seq_counts)
            if total_descendent_leaves > n_max_cons_seqs:  # if there's tons of descendent leaves, we don't want to pass them all to the consensus fcn since it's slow, so we choose them in proportion to their actual proportions, but scaled down to <n_max_cons_seqs>
                child_cons_seq_counts = [int(n_max_cons_seqs * csc / float(total_descendent_leaves)) for csc in child_cons_seq_counts]
                child_cons_seq_counts = [max(1, csc) for csc in child_cons_seq_counts]  # don't eliminate any sequences entirely (this makes the proportions less accurate (in some cases), but is the easy way to handle the case where there's a ton of singleton children
            if debug:
                print '  %s' % utils.color('green', node.taxon.label)
                csc_str = '  (reduced: %s)' % ' '.join([str(csc) for csc in child_cons_seq_counts]) if total_descendent_leaves > n_max_cons_seqs else ''
                print '      desc leaves per child: %s%s' % (' '.join(str(c.n_descendent_leaves) for c in node.child_node_iter()), csc_str)
            child_seqfos = [{'name' : cn.taxon.label + '-leaf-' + str(il), 'seq' : cn.seq} for cn, count in zip(node.child_node_iter(), child_cons_seq_counts) for il in range(count)]
            node.seq = utils.cons_seq(0.01, aligned_seqfos=child_seqfos, tie_resolver_seq=getline(root_label)['naive_seq'])  #, debug=debug)  # the consensus has an N at every position where the constituent sequences gave a tie. But Ns screw up the distances (especially because once we *get* an N, we can't get rid of it and it's propagated all the way up the tree), and in almost all cases the correct choice should be the naive base, so we use that
            node.n_descendent_leaves = total_descendent_leaves
            for edge in node.child_edge_iter():
                from_fasttree = False
                if edge.length == default_edge_length:  # otherwise it was set by fasttree, and it's probably better than what we'd get from this (it'd be nice to skip the cons seq stuff for the whole fasttree subtree, but then we don't have the cons seqs we need for later)
                    edge.length = utils.hamming_distance(edge.head_node.seq, node.seq) / float(len(node.seq))
                else:
                    from_fasttree = True
                if min_edge_length is not None:
                    edge.length = max(min_edge_length, edge.length)
                if debug:
                    print '       %6.3f   %s  %s' % (edge.length, utils.color('blue' if from_fasttree else 'red', 'x'), edge.head_node.taxon.label)

        if debug:
            print '        naive seq %s' % getline(root_label)['naive_seq'] # NOTE might be worthwhile to add an edge connecting seed node and the actual naive sequence (i.e. for cases where our approximate naive is off)
            print '    root cons seq %s' % utils.color_mutants(getline(root_label)['naive_seq'], dtree.seed_node.seq)

        for node in dtree.preorder_node_iter():
            del node.uids
            del node.seq
            del node.n_descendent_leaves

        treeutils.label_nodes(dtree, ignore_existing_internal_node_labels=True, ignore_existing_internal_taxon_labels=True, debug=debug)
        dtree.update_bipartitions()  # probably don't really need this
        if debug:
            print treeutils.utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=dtree, width=250))

        return dtree
示例#15
0
    def parse_query_text(self, unique_id, query_info):
        if len(query_info) == 0:  # one for the query sequence, then one for v, d, and j
            print 'no info for',unique_id
            return {}
        elif len(query_info) < 4:
            regions_ok = ''
            for info in query_info:
                for region in utils.regions:
                    if 'IGH' + region.upper() in info:
                        regions_ok += region
            for region in utils.regions:
                if region not in regions_ok:
                    print '    ERROR no %s matches' % region
                    return {}
            assert False  # shouldn't get here
        elif len(query_info) != 4:
            print 'info for', unique_id, 'all messed up'
            for info in query_info:
                print info
            sys.exit()

        full_qr_seq = query_info[0].replace('>', '').replace(unique_id, '')  # strip off the unique id
        full_qr_seq = ''.join(full_qr_seq.split()).upper()  # strip off white space and uppercase it
        assert full_qr_seq == self.seqinfo[unique_id]['seq']

        line = {}
        line['unique_id'] = unique_id
        line['seq'] = full_qr_seq
        for ireg in range(len(utils.regions)):
            region = utils.regions[ireg]
            info = query_info[ireg + 1].splitlines()
            while unique_id not in info[0]:  # remove the line marking cdr3 and framework regions
                info.pop(0)
            if len(info) <= 1:
                print info
            assert len(info) > 1
            assert len(info[0].split()) == 2
            qr_seq = info[0].split()[1].upper()  # this line should be '<unique_id> .............<query_seq>'

            true_gene = self.seqinfo[unique_id][region + '_gene']
            imatch = 1  # which match to take
            match_name = str(info[imatch].split()[2])
            while match_name in just_always_friggin_skip and len(info) > imatch+1 and len(info[imatch+1].split()) > 2:
                imatch += 1
                old_one = match_name
                match_name = str(info[imatch].split()[2])
                if self.args.debug:
                    print '    %s: taking next match: %s --> %s)' % (unique_id, utils.color_gene(old_one), utils.color_gene(match_name))

            infer_gene = match_name
            for gset in equivalent_genes:
                if match_name in gset and true_gene in gset and match_name != true_gene:  # if the true gene and the inferred gene are in the same equivalence set, treat it as correct, i.e. just pretend it inferred the right name
                    if self.args.debug:
                        print '   %s: replacing name %s with true name %s' % (unique_id, match_name, true_gene)
                    infer_gene = true_gene

            # ----------------------------------------------------------------------------------------
            # skipping bullshit
            def skip_gene(gene):
                print '    %s in list of genes to skip' % utils.color_gene(gene)
                if gene not in genes_actually_skipped:
                    genes_actually_skipped[gene] = 0
                genes_actually_skipped[gene] += 1
                line['skip_gene'] = True

            if infer_gene not in self.germline_seqs[region]:
                print '    couldn\'t find %s in germlines (skipping)' % infer_gene
                skip_gene(infer_gene)
                return line

            if infer_gene in just_always_friggin_skip:
                skip_gene(infer_gene)
                return line
            if true_gene in just_always_friggin_skip:
                skip_gene(true)
                return line

            if not self.args.dont_skip_or15_genes and '/OR1' in true_gene:
                skip_gene(true_gene)
                return line

            if self.args.skip_missing_genes:
                if infer_gene in genes_to_skip:
                    skip_gene(infer_gene)
                    return line
                if true_gene in genes_to_skip:
                    skip_gene(true_gene)
                    return line

            gl_seq = info[imatch].split()[4].upper()
            if qr_seq.replace('.', '') not in self.seqinfo[unique_id]['seq']:
                # if self.args.debug:
                print '    qr_seq not found in seqinfo'
                line['failed'] = True
                return line

            if self.args.debug:
                if utils.are_alleles(infer_gene, true_gene):
                    regionstr = utils.color('bold', utils.color('blue', region))
                    truestr = '(originally %s)' % match_name
                else:
                    regionstr = utils.color('bold', utils.color('red', region))
                    truestr = '(true: %s)' % utils.color_gene(true_gene).replace(region, '')
                print '  %s %s %s' % (regionstr, utils.color_gene(infer_gene).replace(region, ''), truestr)
                print '    gl', gl_seq
                print '      ', qr_seq

            # replace the dots (gaps) in the gl match
            new_qr_seq, new_gl_seq = [], []
            for inuke in range(min(len(qr_seq), len(gl_seq))):
                if gl_seq[inuke] == '.':
                    pass
                else:
                    new_qr_seq.append(qr_seq[inuke])  # this should only be out of range if the v match extends through the whole query sequence, i.e. friggin never
                    new_gl_seq.append(gl_seq[inuke])
            for inuke in range(len(gl_seq), len(qr_seq)):
                new_qr_seq.append(qr_seq[inuke])
            for inuke in range(len(qr_seq), len(gl_seq)):
                new_gl_seq.append(gl_seq[inuke])
            qr_seq = ''.join(new_qr_seq)
            gl_seq = ''.join(new_gl_seq)

            # work out the erosions
            qr_ldots = qr_seq.rfind('.') + 1  # first strip off any dots on the left of query seq
            qr_seq = qr_seq[qr_ldots : ]
            gl_seq = gl_seq[qr_ldots : ]
            gl_ldots = gl_seq.rfind('.') + 1  # then remove dots on the left of the germline seq
            qr_seq = qr_seq[gl_ldots : ]
            gl_seq = gl_seq[gl_ldots : ]
            del_5p = qr_ldots + gl_ldots
            jf_insertion = ''
            if region == 'j':
                jf_insertion = qr_seq[len(gl_seq) : ]
            qr_seq = qr_seq[ : len(gl_seq)]  # then strip the right-hand portion of the query sequence that isn't aligned to the germline
            del_3p = len(gl_seq) - len(qr_seq)  # then do the same for the germline overhanging on the right of the query
            gl_seq = gl_seq[ : len(qr_seq)]
            assert len(gl_seq) == len(qr_seq)

            new_gl_seq = []
            for inuke in range(len(gl_seq)):  # replace dashes (matched bases)
                assert gl_seq[inuke] != '.'  # hoping there's no gaps in here
                if gl_seq[inuke] == '-':
                    new_gl_seq.append(qr_seq[inuke])
                else:
                    new_gl_seq.append(gl_seq[inuke])
            gl_seq = ''.join(new_gl_seq)

            if self.germline_seqs[region][infer_gene].find(gl_seq) != del_5p:  # why the *@*!! can't they make this consistent?
                if self.germline_seqs[region][infer_gene].find(gl_seq) < 0:
                    print 'whooooaa'
                    print self.germline_seqs[region][infer_gene]
                    print gl_seq
                    line['failed'] = True
                    return line
                del_5p += self.germline_seqs[region][infer_gene].find(gl_seq)

            try:
                assert del_5p + len(gl_seq) + del_3p + len(jf_insertion) == len(self.germline_seqs[region][infer_gene])
            except:
                print '    ERROR lengths failed for %s' % unique_id
                # print del_5p, len(gl_seq), del_3p, del_5p + len(gl_seq) + del_3p , len(self.germline_seqs[region][infer_gene])
                # print gl_seq
                # print self.germline_seqs[region][infer_gene]
                line['failed'] = True
                return line
                # assert False

            if self.args.debug:
                utils.color_mutants(gl_seq, qr_seq, ref_label='gl ', extra_str='    ', print_result=True, post_str='    del: %d %d' % (del_5p, del_3p))

            # try:
            #     infer_gene = joinparser.figure_out_which_damn_gene(self.germline_seqs, infer_gene, gl_seq, debug=self.args.debug)
            # except:
            #     print 'ERROR couldn\'t figure out the gene for %s' % infer_gene
            #     return {}

            line[region + '_gene'] = infer_gene
            line[region + '_qr_seq'] = qr_seq
            line[region + '_gl_seq'] = gl_seq
            line[region + '_5p_del'] = del_5p
            line[region + '_3p_del'] = del_3p
            if region == 'j':
                line['jf_insertion'] = jf_insertion
            
        return line
示例#16
0
def read_sequence_file(infname, is_data, n_max_queries=-1, args=None, simglfo=None, quiet=False, more_input_info=None):
    # NOTE renamed this from get_seqfile_info() since I'm changing the return values, but I don't want to update the calls everywhere (e.g. in compareutils)
    yaml_glfo = None
    suffix = utils.getsuffix(infname)
    if suffix in delimit_info:
        seqfile = open(infname)  # closes on function exit. no, this isn't the best way to do this
        reader = csv.DictReader(seqfile, delimiter=delimit_info[suffix])
    elif suffix in ['.fa', '.fasta', '.fq', '.fastq', '.fastx']:
        add_info = args is not None and args.name_column is not None and 'fasta-info-index' in args.name_column
        reader = utils.read_fastx(infname, name_key='unique_ids', seq_key='input_seqs', add_info=add_info, sanitize_uids=True, n_max_queries=n_max_queries,  # NOTE don't use istarstop kw arg here, 'cause it f***s with the istartstop treatment in the loop below
                                  queries=(args.queries if (args is not None and not args.abbreviate) else None))  # NOTE also can't filter on args.queries here if we're also translating
    elif suffix == '.yaml':
        yaml_glfo, reader, _ = utils.read_yaml_output(infname, n_max_queries=n_max_queries, synth_single_seqs=True, dont_add_implicit_info=True)  # not really sure that long term I want to synthesize single seq lines, but for backwards compatibility it's nice a.t.m.
        if not is_data:
            simglfo = yaml_glfo  # doesn't replace the contents, of course, which is why we return it
    else:
        raise Exception('unhandled file extension \'%s\' on file \'%s\'' % (suffix, infname))

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    n_duplicate_uids = 0
    printed_simu_mismatch_warning = False
    n_queries_added = 0
    found_seed = False
    potential_names, used_names = None, None  # for abbreviating
    iname = None  # line number -- used as sequence id if there isn't a name column in the file
    iline = -1
    for line in reader:
        iline += 1
        if args is not None:
            if args.istartstop is not None:
                if iline < args.istartstop[0]:
                    continue
                if iline >= args.istartstop[1]:
                    break
            if args.name_column is not None:
                if 'infostrs' in line and args.name_column.split('-')[:3] == ['fasta', 'info', 'index']:
                    assert len(args.name_column.split('-')) == 4
                    line['unique_ids'] = line['infostrs'][int(args.name_column.split('-')[3])]
                else:
                    line['unique_ids'] = line[args.name_column]
                    del line[args.name_column]
            if args.seq_column is not None:
                line['input_seqs'] = line[args.seq_column]
                if args.seq_column != 'seqs':  # stupid god damn weird backwards compatibility edge case bullshit
                    del line[args.seq_column]
        if iname is None and 'unique_ids' not in line and 'unique_id' not in line:
            print '  %s: couldn\'t find a name (unique id) column, so using line number as the sequence label (you can set the name column with --name-column)' % (utils.color('yellow', 'warning'))
            iname = 0
        if iname is not None:
            line['unique_ids'] = '%09d' % iname
            iname += 1
        if 'input_seqs' not in line and 'seq' not in line:
            raise Exception('couldn\'t find a sequence column in %s (you can set this with --seq-column)' % infname)
        if suffix != '.yaml':
            utils.process_input_line(line)
        if len(line['unique_ids']) > 1:
            raise Exception('can\'t yet handle multi-seq csv input files')
        uid = line['unique_ids'][0]
        if uid in input_info:
            new_uid = uid
            iid = 2
            while new_uid in input_info:
                new_uid = uid + '-' + str(iid)
                iid += 1
            if n_duplicate_uids == 0:
                print '  %s duplicate uid(s) in input file, so renaming by appending integer string, e.g. \'%s\' --> \'%s\'' % (utils.color('yellow', 'warning'), uid, new_uid)
            n_duplicate_uids += 1
            uid = new_uid  # if you decide you want to change it also in <reco_info>, don't forget to also modify the tree (and maybe other stuff, hence why I don't want to do it)
        inseq = line['input_seqs'][0]

        # # it would be nice to check here for forbidden characters (in addition to in the .fa code above), but it's hard because we won't have read the csv properly above if it has them
        # if any(fc in uid for fc in utils.forbidden_characters):
        #     raise Exception('found a forbidden character (one of %s) in sequence id \'%s\'' % (' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), uid))
        if args is not None:
            if args.abbreviate:  # note that this changes <uid>, but doesn't modify <line>
                uid, potential_names, used_names = utils.choose_new_uid(potential_names, used_names)
            if args.queries is not None and uid not in args.queries:
                continue
            if args.reco_ids is not None and line['reco_id'] not in args.reco_ids:
                continue
            if args.seed_unique_id is not None and uid == args.seed_unique_id:
                found_seed = True

        if uid in input_info:
            raise Exception('found uid \'%s\' twice in input file %s' % (uid, infname))

        if any(c not in utils.alphabet for c in inseq):  # NOTE should really be integrated with sanitize_seqs arg in utils.read_fastx()
            unexpected_chars = set([ch for ch in inseq if ch not in utils.alphabet])
            raise Exception('unexpected character%s %s (not among %s) in input sequence with id %s:\n  %s' % (utils.plural(len(unexpected_chars)), ', '.join([('\'%s\'' % ch) for ch in unexpected_chars]), utils.alphabet, uid, inseq))

        # da business
        input_info[uid] = {'unique_ids' : [uid, ], 'seqs' : [inseq, ]}

        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s' % infname)
            reco_info[uid] = line  # this used to be deepcopy'd, but it's really slow and i'm really pretty sure it's not necessary
            if uid != line['unique_ids'][0] and not printed_simu_mismatch_warning:
                print '     note: uid in simulation info %s doesn\'t match input file uid %s (latter was probably changed above). Simulation info will be internally consistent, but the key indexing that info in <reco_info> will be different, since it corresponds to the newly chosen uid above.' % (uid, line['unique_ids'][0])
                printed_simu_mismatch_warning = True
            if simglfo is not None:
                utils.add_implicit_info(simglfo, reco_info[uid])
            for line_key in utils.input_metafile_keys.values():
                if line_key in reco_info[uid]:  # this is kind of weird to copy from sim info to input info, but it makes sense because affinity is really meta info (the only other place affinity could come from is --input-metafname below). Where i'm defining meta info more or less as any input info besides name and sequence (i think the distinction is only really important because we want to support fastas, which can't [shouldn't!] handle anything else))
                    input_info[uid][line_key] = copy.deepcopy(reco_info[uid][line_key])  # note that the args.input_metafname stuff below should print a warning if you've also specified that (which you shouldn't, if it's simulation)

        n_queries_added += 1
        if n_max_queries > 0 and n_queries_added >= n_max_queries:
            if not quiet:  # just adding <quiet>, and too lazy to decide what other print statements it should effect, this is the only one I care about right now
                print '  --n-max-queries: stopped after reading %d queries from input file' % len(input_info)
            break

    if n_duplicate_uids > 0:
        print '  %s renamed %d duplicate uids from %s' % (utils.color('yellow', 'warning'), n_duplicate_uids, infname)

    if more_input_info is not None:  # if you use this on simulation, the extra queries that aren't in <reco_info> may end up breaking something down the line (but I don't imagine this really getting used on simulation)
        if len(set(more_input_info) & set(input_info)) > 0:  # check for sequences in both places
            common_uids = set(more_input_info) & set(input_info)
            print '  note: found %d queries in both --infname and --queries-to-include-fname: %s' % (len(common_uids), ' '.join(common_uids))  # not necessarily a problem, but you probably *shouldn't* have sequences floating around in two different files
            differing_seqs = [q for q in common_uids if more_input_info[q]['seqs'][0] != input_info[q]['seqs'][0]]
            if len(differing_seqs) > 0:  # if they have different sequences, though, that's a problem
                for q in differing_seqs:
                    print q
                    utils.color_mutants(input_info[q]['seqs'][0], more_input_info[q]['seqs'][0], align_if_necessary=True, print_result=True, ref_label='  --infname  ', seq_label='  --queries-to-include-fname  ')
                raise Exception('inconsistent sequences for %d of the queries in both --infname and --queries-to-include-fname (see preceding lines)' % len(differing_seqs))
        if args is not None and args.seed_unique_id is not None and args.seed_unique_id in more_input_info:
            found_seed = True
        input_info.update(more_input_info)
    if args is not None and args.input_metafname is not None:
        read_input_metafo(args.input_metafname, input_info.values(), debug=True)
    post_process(input_info, reco_info, args, infname, found_seed, is_data, iline)

    if len(input_info) == 0:
        raise Exception('didn\'t read any sequences from %s' % infname)

    return input_info, reco_info, yaml_glfo
    def parse_query_text(self, unique_id, query_info):
        if len(query_info
               ) == 0:  # one for the query sequence, then one for v, d, and j
            print 'no info for', unique_id
            return {}
        elif len(query_info) < 4:
            regions_ok = ''
            for info in query_info:
                for region in utils.regions:
                    if 'IGH' + region.upper() in info:
                        regions_ok += region
            for region in utils.regions:
                if region not in regions_ok:
                    print '    ERROR no %s matches' % region
                    return {}
            assert False  # shouldn't get here
        elif len(query_info) != 4:
            print 'info for', unique_id, 'all messed up'
            for info in query_info:
                print info
            sys.exit()

        full_qr_seq = query_info[0].replace('>', '').replace(
            unique_id, '')  # strip off the unique id
        full_qr_seq = ''.join(full_qr_seq.split()).upper(
        )  # strip off white space and uppercase it
        assert full_qr_seq == self.seqinfo[unique_id]['seq']

        line = {}
        line['unique_id'] = unique_id
        line['seq'] = full_qr_seq
        for ireg in range(len(utils.regions)):
            region = utils.regions[ireg]
            info = query_info[ireg + 1].splitlines()
            while unique_id not in info[
                    0]:  # remove the line marking cdr3 and framework regions
                info.pop(0)
            if len(info) <= 1:
                print info
            assert len(info) > 1
            assert len(info[0].split()) == 2
            qr_seq = info[0].split()[1].upper(
            )  # this line should be '<unique_id> .............<query_seq>'

            true_gene = self.seqinfo[unique_id][region + '_gene']
            imatch = 1  # which match to take
            match_name = str(info[imatch].split()[2])
            while match_name in just_always_friggin_skip and len(
                    info) > imatch + 1 and len(info[imatch + 1].split()) > 2:
                imatch += 1
                old_one = match_name
                match_name = str(info[imatch].split()[2])
                if self.args.debug:
                    print '    %s: taking next match: %s --> %s)' % (
                        unique_id, utils.color_gene(old_one),
                        utils.color_gene(match_name))

            infer_gene = match_name
            for gset in equivalent_genes:
                if match_name in gset and true_gene in gset and match_name != true_gene:  # if the true gene and the inferred gene are in the same equivalence set, treat it as correct, i.e. just pretend it inferred the right name
                    if self.args.debug:
                        print '   %s: replacing name %s with true name %s' % (
                            unique_id, match_name, true_gene)
                    infer_gene = true_gene

            # ----------------------------------------------------------------------------------------
            # skipping bullshit
            def skip_gene(gene):
                print '    %s in list of genes to skip' % utils.color_gene(
                    gene)
                if gene not in genes_actually_skipped:
                    genes_actually_skipped[gene] = 0
                genes_actually_skipped[gene] += 1
                line['skip_gene'] = True

            if infer_gene not in self.germline_seqs[region]:
                print '    couldn\'t find %s in germlines (skipping)' % infer_gene
                skip_gene(infer_gene)
                return line

            if infer_gene in just_always_friggin_skip:
                skip_gene(infer_gene)
                return line
            if true_gene in just_always_friggin_skip:
                skip_gene(true)
                return line

            if not self.args.dont_skip_or15_genes and '/OR1' in true_gene:
                skip_gene(true_gene)
                return line

            if self.args.skip_missing_genes:
                if infer_gene in genes_to_skip:
                    skip_gene(infer_gene)
                    return line
                if true_gene in genes_to_skip:
                    skip_gene(true_gene)
                    return line

            gl_seq = info[imatch].split()[4].upper()
            if qr_seq.replace('.', '') not in self.seqinfo[unique_id]['seq']:
                # if self.args.debug:
                print '    qr_seq not found in seqinfo'
                line['failed'] = True
                return line

            if self.args.debug:
                if utils.are_alleles(infer_gene, true_gene):
                    regionstr = utils.color('bold',
                                            utils.color('blue', region))
                    truestr = '(originally %s)' % match_name
                else:
                    regionstr = utils.color('bold', utils.color('red', region))
                    truestr = '(true: %s)' % utils.color_gene(
                        true_gene).replace(region, '')
                print '  %s %s %s' % (regionstr,
                                      utils.color_gene(infer_gene).replace(
                                          region, ''), truestr)
                print '    gl', gl_seq
                print '      ', qr_seq

            # replace the dots (gaps) in the gl match
            new_qr_seq, new_gl_seq = [], []
            for inuke in range(min(len(qr_seq), len(gl_seq))):
                if gl_seq[inuke] == '.':
                    pass
                else:
                    new_qr_seq.append(
                        qr_seq[inuke]
                    )  # this should only be out of range if the v match extends through the whole query sequence, i.e. friggin never
                    new_gl_seq.append(gl_seq[inuke])
            for inuke in range(len(gl_seq), len(qr_seq)):
                new_qr_seq.append(qr_seq[inuke])
            for inuke in range(len(qr_seq), len(gl_seq)):
                new_gl_seq.append(gl_seq[inuke])
            qr_seq = ''.join(new_qr_seq)
            gl_seq = ''.join(new_gl_seq)

            # work out the erosions
            qr_ldots = qr_seq.rfind(
                '.') + 1  # first strip off any dots on the left of query seq
            qr_seq = qr_seq[qr_ldots:]
            gl_seq = gl_seq[qr_ldots:]
            gl_ldots = gl_seq.rfind(
                '.') + 1  # then remove dots on the left of the germline seq
            qr_seq = qr_seq[gl_ldots:]
            gl_seq = gl_seq[gl_ldots:]
            del_5p = qr_ldots + gl_ldots
            jf_insertion = ''
            if region == 'j':
                jf_insertion = qr_seq[len(gl_seq):]
            qr_seq = qr_seq[:len(
                gl_seq
            )]  # then strip the right-hand portion of the query sequence that isn't aligned to the germline
            del_3p = len(gl_seq) - len(
                qr_seq
            )  # then do the same for the germline overhanging on the right of the query
            gl_seq = gl_seq[:len(qr_seq)]
            assert len(gl_seq) == len(qr_seq)

            new_gl_seq = []
            for inuke in range(len(gl_seq)):  # replace dashes (matched bases)
                assert gl_seq[inuke] != '.'  # hoping there's no gaps in here
                if gl_seq[inuke] == '-':
                    new_gl_seq.append(qr_seq[inuke])
                else:
                    new_gl_seq.append(gl_seq[inuke])
            gl_seq = ''.join(new_gl_seq)

            if self.germline_seqs[region][infer_gene].find(
                    gl_seq
            ) != del_5p:  # why the *@*!! can't they make this consistent?
                if self.germline_seqs[region][infer_gene].find(gl_seq) < 0:
                    print 'whooooaa'
                    print self.germline_seqs[region][infer_gene]
                    print gl_seq
                    line['failed'] = True
                    return line
                del_5p += self.germline_seqs[region][infer_gene].find(gl_seq)

            try:
                assert del_5p + len(gl_seq) + del_3p + len(
                    jf_insertion) == len(
                        self.germline_seqs[region][infer_gene])
            except:
                print '    ERROR lengths failed for %s' % unique_id
                # print del_5p, len(gl_seq), del_3p, del_5p + len(gl_seq) + del_3p , len(self.germline_seqs[region][infer_gene])
                # print gl_seq
                # print self.germline_seqs[region][infer_gene]
                line['failed'] = True
                return line
                # assert False

            if self.args.debug:
                utils.color_mutants(gl_seq,
                                    qr_seq,
                                    ref_label='gl ',
                                    extra_str='    ',
                                    print_result=True,
                                    post_str='    del: %d %d' %
                                    (del_5p, del_3p))

            # try:
            #     infer_gene = joinparser.figure_out_which_damn_gene(self.germline_seqs, infer_gene, gl_seq, debug=self.args.debug)
            # except:
            #     print 'ERROR couldn\'t figure out the gene for %s' % infer_gene
            #     return {}

            line[region + '_gene'] = infer_gene
            line[region + '_qr_seq'] = qr_seq
            line[region + '_gl_seq'] = gl_seq
            line[region + '_5p_del'] = del_5p
            line[region + '_3p_del'] = del_3p
            if region == 'j':
                line['jf_insertion'] = jf_insertion

        return line
示例#18
0
#     print '%s  %3d' % (utils.color_gene(g, width=20), len(s) - glfo['cyst-positions'][g])
# sys.exit()

# base = '4-59'
# a1, a2 = '12', '01'
# gene1, gene2 = 'IGHV' + base + '*' + a1, 'IGHV' + base + '*' + a2

genes = ['IG' + args.chain.upper() + args.region.upper() + args.base + '*' + al for al in args.alleles]
if args.other_genes is not None:
    genes += args.other_genes

codon_positions = glfo[utils.conserved_codons[args.chain][args.region] + '-positions'] if args.region != 'd' else None

def print_str(gene, seq):
    """ Return the string for one output row: the result of utils.color_gene() for <gene> (with width 15), three spaces, then <seq>. """
    gene_label = utils.color_gene(gene, width=15)
    return '%s   %s' % (gene_label, seq)

ref_gene = genes[0]
ref_seq = glfo['seqs'][args.region][ref_gene]
print print_str(ref_gene, utils.color_mutants(ref_seq, ref_seq, emphasis_positions=None if args.region == 'd' else [codon_positions[ref_gene] + i for i in range(3)])), '   (reference)'

for igene in range(1, len(genes)):
    gene = genes[igene]
    seq = glfo['seqs'][args.region][gene]
    min_length = min(len(seq), len(ref_seq))
    colored_seq = utils.color_mutants(ref_seq[:min_length], seq[:min_length], print_isnps=True, emphasis_positions=None if args.region == 'd' else [codon_positions[gene] + i for i in range(3)])
    print print_str(gene, colored_seq)
    if min_length < len(ref_seq) and igene == 0:
        print 'extra for %s: %s' % (utils.color_gene(ref_gene), ref_seq[min_length:])
    if min_length < len(seq):
        print 'extra for %s: %s' % (utils.color_gene(gene), seq[min_length:])
示例#19
0
            sorted_clusters = sorted(
                [c for c in partitions[if2] if keyfcn(c) is not None],
                key=keyfcn
            )  # make a list of the clusters in the other partition that's sorted by how similar their naive sequence are
            nearest_cluster_lists[label1][label2].append(sorted_clusters)

            extra_str = ''
            inner_loop_str = ''
            if len(sorted_clusters) == 0:
                # extra_str = utils.color('yellow', '-', width=3)
                inner_loop_str = utils.color('yellow', '-    -', width=8)
            size_index_str = '%s %3d' % (utils.color(
                'blue',
                '%4d' % len(cluster1)), partitions[if1].index(cluster1))
            print '  %-3s%s   %8s        %-30s%3s' % (
                extra_str, size_index_str, inner_loop_str,
                cdr3_translation(info1), extra_str)
            for nclust in sorted_clusters:
                nclust_naive_cdr3 = cdr3_translation(
                    annotations[if2][getkey(nclust)])
                hdist = naive_hdist_or_none(info1,
                                            annotations[if2][getkey(nclust)])
                print '               %s %4d   %2s   %-30s' % (
                    utils.color('blue', '%4d' % len(nclust)),
                    partitions[if2].index(nclust),
                    '%d' % hdist if hdist > 0 else '',
                    utils.color_mutants(cdr3_translation(info1),
                                        nclust_naive_cdr3,
                                        amino_acid=True))
示例#20
0
    def parse_query_text(self, unique_id, query_info):
        if len(query_info) == 0:  # one for the query sequence, then one for v, d, and j
            print 'no info for',unique_id
            return {}
        elif len(query_info) < 4:
            regions_ok = ''
            for info in query_info:
                for region in utils.regions:
                    if 'IGH' + region.upper() in info:
                        regions_ok += region
            for region in utils.regions:
                if region not in regions_ok:
                    print '    ERROR no %s matches' % region
                    return {}
            assert False  # shouldn't get here
        elif len(query_info) != 4:
            print 'info for', unique_id, 'all messed up'
            for info in query_info:
                print info
            sys.exit()

        full_qr_seq = query_info[0].replace('>', '').replace(unique_id, '')  # strip off the unique id
        full_qr_seq = ''.join(full_qr_seq.split()).upper()  # strip off white space and uppercase it
        assert full_qr_seq == self.seqinfo[unique_id]['seq']

        line = {}
        line['unique_id'] = unique_id
        line['seq'] = full_qr_seq
        for ireg in range(len(utils.regions)):
            region = utils.regions[ireg]
            info = query_info[ireg + 1].splitlines()
            while unique_id not in info[0]:  # remove the line marking cdr3 and framework regions
                info.pop(0)
            if len(info) <= 1:
                print info
            assert len(info) > 1
            assert len(info[0].split()) == 2
            qr_seq = info[0].split()[1].upper()  # this line should be '<unique_id> .............<query_seq>'

            imatch = 1  # which match to take
            match_name = str(info[imatch].split()[2])
            # if 'IGHV3-69' in match_name:  # it's not right anyway
            #     line['failed'] = True
            #     return line
            # while unacceptable_match(match_name, self.germline_seqs):
            #         imatch += 1
            #         match_name = str(info[imatch].split()[2])
            #         print '    new match name: %s' % match_name

            gl_seq = info[imatch].split()[4].upper()
            if qr_seq.replace('.', '') not in self.seqinfo[unique_id]['seq']:
                if self.args.debug:
                    print '    qr_seq not foundin seqinfo'
                line['failed'] = True
                return line

            if self.args.debug:
                print '  ', region, match_name
                print '    gl', gl_seq
                print '      ', qr_seq

            # replace the dots (gaps) in the gl match
            new_qr_seq, new_gl_seq = [], []
            for inuke in range(min(len(qr_seq), len(gl_seq))):
                if gl_seq[inuke] == '.':
                    pass
                else:
                    new_qr_seq.append(qr_seq[inuke])  # this should only be out of range if the v match extends through the whole query sequence, i.e. friggin never
                    new_gl_seq.append(gl_seq[inuke])
            for inuke in range(len(gl_seq), len(qr_seq)):
                new_qr_seq.append(qr_seq[inuke])
            for inuke in range(len(qr_seq), len(gl_seq)):
                new_gl_seq.append(gl_seq[inuke])
            qr_seq = ''.join(new_qr_seq)
            gl_seq = ''.join(new_gl_seq)

            # work out the erosions
            qr_ldots = qr_seq.rfind('.') + 1  # first strip off any dots on the left of query seq
            qr_seq = qr_seq[qr_ldots : ]
            gl_seq = gl_seq[qr_ldots : ]
            gl_ldots = gl_seq.rfind('.') + 1  # then remove dots on the left of the germline seq
            qr_seq = qr_seq[gl_ldots : ]
            gl_seq = gl_seq[gl_ldots : ]
            del_5p = qr_ldots + gl_ldots
            jf_insertion = ''
            if region == 'j':
                jf_insertion = qr_seq[len(gl_seq) : ]
            qr_seq = qr_seq[ : len(gl_seq)]  # then strip the right-hand portion of the query sequence that isn't aligned to the germline
            del_3p = len(gl_seq) - len(qr_seq)  # then do the same for the germline overhanging on the right of the query
            gl_seq = gl_seq[ : len(qr_seq)]
            assert len(gl_seq) == len(qr_seq)

            new_gl_seq = []
            for inuke in range(len(gl_seq)):  # replace dashes (matched bases)
                assert gl_seq[inuke] != '.'  # hoping there's no gaps in here
                if gl_seq[inuke] == '-':
                    new_gl_seq.append(qr_seq[inuke])
                else:
                    new_gl_seq.append(gl_seq[inuke])
            gl_seq = ''.join(new_gl_seq)

            if match_name not in self.germline_seqs[region]:
                print '    ERROR couldn\'t find %s in germlines' % match_name
                line['failed'] = True
                return line

            if self.germline_seqs[region][match_name].find(gl_seq) != del_5p:  # why the *@*!! can't they make this consistent?
                if self.germline_seqs[region][match_name].find(gl_seq) < 0:
                    print 'whooooaa'
                    print self.germline_seqs[region][match_name]
                    print gl_seq
                    sys.exit()
                del_5p += self.germline_seqs[region][match_name].find(gl_seq)

            try:
                assert del_5p + len(gl_seq) + del_3p + len(jf_insertion) == len(self.germline_seqs[region][match_name])
            except:
                print '    ERROR lengths failed for %s' % unique_id
                # print del_5p, len(gl_seq), del_3p, del_5p + len(gl_seq) + del_3p , len(self.germline_seqs[region][match_name])
                # print gl_seq
                # print self.germline_seqs[region][match_name]
                line['failed'] = True
                return line
                # assert False

            if self.args.debug:
                utils.color_mutants(gl_seq, qr_seq, ref_label='gl ', extra_str='    ', print_result=True, post_str='    del: %d %d' % (del_5p, del_3p))

            # try:
            #     match_name = joinparser.figure_out_which_damn_gene(self.germline_seqs, match_name, gl_seq, debug=self.args.debug)
            # except:
            #     print 'ERROR couldn\'t figure out the gene for %s' % match_name
            #     return {}

            line[region + '_gene'] = match_name
            line[region + '_qr_seq'] = qr_seq
            line[region + '_gl_seq'] = gl_seq
            line[region + '_5p_del'] = del_5p
            line[region + '_3p_del'] = del_3p
            if region == 'j':
                line['jf_insertion'] = jf_insertion
            
        return line
示例#21
0
def reconstruct_indelfo_from_indel_list(indel_list,
                                        line,
                                        iseq,
                                        debug=False):  # old-style files
    if 'reversed_seq' in indel_list:  # handle super-old files
        print '%s encountered file with super old, unhandled indel format, proceeding, but indel info may be inconsistent' % (
            utils.color('red', 'error'))
        return

    line['indelfos'][iseq] = get_empty_indel()
    if len(indel_list) == 0:
        return

    ifo_positions = [ifo['pos'] for ifo in indel_list]
    if len(ifo_positions) != len(set(ifo_positions)):
        print '%s two indels at the same position, everything will be kinda messed up' % utils.color(
            'red', 'error')
    ifos_by_pos = {ifo['pos']: ifo for ifo in indel_list}
    qr_gap_seq, gl_gap_seq = [], []
    iqr, igl, iindel = 0, 0, 0
    if debug:
        print len(line['input_seqs'][iseq]), line['input_seqs'][iseq]
        print len(line['naive_seq']), line['naive_seq']
    while iqr < len(line['input_seqs'][iseq]):
        if debug:
            print '  %3d  %3d' % (iqr, igl),
        if igl >= len(
                line['naive_seq']
        ):  # if the pos is longer than the qr seq, we won't fall off the end (so i'm ignoring that case here), but we presumably will miss an indel and crash somewhere else. Note that I can't just check before the loop, since indel positions can be longer than the initial sequence lengths (i.e. before adding other indels)
            offending_indels = [
                ifo for p, ifo in ifos_by_pos.items()
                if p >= len(line['naive_seq'])
            ]
            print '%s %s indel position beyond end of sequence len %d (setting to invalid): %s' % (
                utils.color('red', 'error'), ':'.join(line['unique_ids']),
                len(line['naive_seq']), offending_indels)
            raise IndelfoReconstructionError(
            )  # no, I don't like doing it this way, I don't like using exceptions for control flow. But, this only happens when we read in an old file with ridiculous inconsistent info (so I can't fix the underlying wrong info), and there's no way to know it's inconsistent until we get to here (so I can't just throw it away earlier)
        if iindel in ifos_by_pos:
            ifo = ifos_by_pos[iindel]
            if ifo['type'] == 'insertion':
                if ifo['seqstr'] != line['input_seqs'][iseq][iqr:iqr +
                                                             ifo['len']]:
                    print '%s indel info seqstr doesn\'t match input seq str:' % utils.color(
                        'red', 'error')
                    utils.color_mutants(ifo['seqstr'],
                                        line['input_seqs'][iseq][iqr:iqr +
                                                                 ifo['len']],
                                        align=True,
                                        print_result=True,
                                        extra_str='        ')
                qr_gap_seq += ifo['seqstr'].split()
                gl_gap_seq += [ifo['len'] * utils.gap_chars[0]]
                if debug:
                    print '  %s    %s' % (ifo['seqstr'].split(),
                                          [ifo['len'] * utils.gap_chars[0]])
                iqr += ifo['len']
            else:
                if ifo['seqstr'] != line['naive_seq'][igl:igl + ifo['len']]:
                    print '%s indel info seqstr doesn\'t match naive seq str:' % utils.color(
                        'red', 'error')
                    utils.color_mutants(ifo['seqstr'],
                                        line['naive_seq'][igl:igl +
                                                          ifo['len']],
                                        align=True,
                                        print_result=True,
                                        extra_str='        ')
                qr_gap_seq += [ifo['len'] * utils.gap_chars[0]]
                gl_gap_seq += ifo['seqstr'].split()
                if debug:
                    print '  %s    %s' % ([ifo['len'] * utils.gap_chars[0]
                                           ], ifo['seqstr'].split())
                igl += ifo['len']
            del ifos_by_pos[iindel]
            iindel += ifo['len']
        else:
            qr_gap_seq += [line['input_seqs'][iseq][iqr]]
            gl_gap_seq += [line['naive_seq'][igl]]
            if debug:
                print '  %s    %s' % (line['input_seqs'][iseq][iqr],
                                      line['naive_seq'][igl])
            iqr += 1
            igl += 1
            iindel += 1

    line['indelfos'][iseq]['qr_gap_seq'] = ''.join(qr_gap_seq)
    line['indelfos'][iseq]['gl_gap_seq'] = ''.join(gl_gap_seq)
    line['indelfos'][iseq]['indels'] = indel_list
    line['indelfos'][iseq]['reversed_seq'] = line['indel_reversed_seqs'][iseq]
    line['indelfos'][iseq]['genes'] = {
        r: line[r + '_gene']
        for r in utils.regions
    }
    if debug:
        print '  reconstructed indelfo'
        print get_dbg_str(line['indelfos'][iseq])
示例#22
0
    def hamming_distance_to_true_naive(self, true_line, line, query_name, restrict_to_region='', normalize=False, padfo=None, debug=False):
        """
        Hamming distance between the inferred naive sequence and the tue naive sequence.
        <restrict_to_region> if set, restrict the comparison to the section of the *true* sequence assigned to the given region.
        NOTE this will not in general correspond to the similarly-assigned region in the inferred naive sequence.
        if <normalize> divide by sequence length
        """

        true_naive_seq = true_line['naive_seq']
        inferred_naive_seq = line['naive_seq']
        if len(true_naive_seq) != len(inferred_naive_seq):
            print '%20s    true      inf' % ''
            for k in true_line:
                print '%20s   %s' % (k, true_line[k]),
                if k in line:
                    print '   %s' % line[k]
                else:
                    print '    NOPE'
            for k in line:
                if k not in true_line:
                    print '  not in true line   %20s    %s' % (k, line[k])
            raise Exception('%s true and inferred sequences not the same length\n   %s\n   %s\n' % (line['unique_id'], true_naive_seq, inferred_naive_seq))

        # assert False # read through this whole damn thing and make sure it's ok

        left_hack_add_on = ''
        right_hack_add_on = ''
        # if len(true_line['seq']) > len(utils.remove_ambiguous_ends(line['seq'], line['fv_insertion'], line['jf_insertion'])):  # ihhhmmm doesn't report the bits of the sequence it erodes off the ends, so we have to add them back on
        # # if len(true_naive_seq) > len(inferred_naive_seq):  # hm, now why did I use line['seq'] stuff before?
        #     assert False
        #     start = true_line['seq'].find(line['seq'])
        #     assert start >= 0
        #     end = len(line['seq']) + start
        #     left_hack_add_on = true_line['seq'][: start]
        #     right_hack_add_on = true_line['seq'][ end :]
        #     # extra_penalty = len(left_hack_add_on) + len(right_hack_add_on)
        #     inferred_naive_seq = 'N'*len(left_hack_add_on) + inferred_naive_seq + 'N'*len(right_hack_add_on)
        #     if debug:
        #         print '  adding to inferred naive seq'


        if padfo is not None:  # remove N padding from the inferred sequence
            if debug:
                print 'removing padfo'
                print inferred_naive_seq
            if inferred_naive_seq[padfo['padleft'] : ].count('N') == padfo['padleft']:  # this fails to happen if reset_effective_erosions_and_effective_insertions already removed the Ns
                inferred_naive_seq = inferred_naive_seq[padfo['padleft'] : ]
            elif debug:  # NOTE if no debug, we just fall through, which isok
                print 'tried to remove non Ns!\n   %s\n   padleft %d\n' % (inferred_naive_seq, padfo['padleft'])
            if padfo['padright'] > 0:
                if inferred_naive_seq[ : padfo['padright']].count('N') == padfo['padright']:  # this fails to happen if reset_effective_erosions_and_effective_insertions already removed the Ns
                    inferred_naive_seq = inferred_naive_seq[ : -padfo['padright']]
                elif debug:  # NOTE if no debug, we just fall through, which isok
                    print 'tried to remove non Ns!\n   %s\n   padright %d\n' % (inferred_naive_seq, padfo['padright'])
            if debug:
                print padfo['padleft'] * ' ' + inferred_naive_seq + padfo['padleft'] * ' '

        bounds = None
        if restrict_to_region != '':
            bounds = true_line['regional_bounds'][restrict_to_region]
            if debug:
                print 'restrict to %s' % restrict_to_region
                utils.color_mutants(true_naive_seq, inferred_naive_seq, print_result=True, extra_str='      ')
                utils.color_mutants(true_naive_seq[bounds[0] : bounds[1]], inferred_naive_seq[bounds[0] : bounds[1]], print_result=True, extra_str='      ' + bounds[0]*' ')
            true_naive_seq = true_naive_seq[bounds[0] : bounds[1]]
            inferred_naive_seq = inferred_naive_seq[bounds[0] : bounds[1]]

        if len(true_naive_seq) != len(inferred_naive_seq):
            raise Exception('still not the same lengths for %s\n  %s\n  %s' % (query_name, true_naive_seq, inferred_naive_seq))
        fraction, len_excluding_ambig = utils.hamming_fraction(true_naive_seq, inferred_naive_seq, return_len_excluding_ambig=True)
        total_distance = int(fraction * len_excluding_ambig)
        if len(true_naive_seq) == 0:
            print 'WARNING zero length sequence in hamming_distance_to_true_naive'
            return 0
        if normalize:
            return int(100 * (float(total_distance) / len(true_naive_seq)))
        else:
            return total_distance
示例#23
0
# codon positions are only looked up for non-d regions
if args.region == 'd':
    codon_positions = None
else:
    codon_positions = glfo[utils.conserved_codons[args.locus][args.region] + '-positions']


def print_str(gene, seq):
    """ Return the string for one output row: the result of utils.color_gene() for <gene> (with width 15), three spaces, then <seq>. """
    gene_label = utils.color_gene(gene, width=15)
    return '%s   %s' % (gene_label, seq)


ref_gene = genes[0]
ref_seq = glfo['seqs'][args.region][ref_gene]
print print_str(
    ref_gene,
    utils.color_mutants(ref_seq,
                        ref_seq,
                        emphasis_positions=None if args.region == 'd' else
                        [codon_positions[ref_gene] + i
                         for i in range(3)])), '   (reference)'

for igene in range(1, len(genes)):
    gene = genes[igene]
    seq = glfo['seqs'][args.region][gene]
    min_length = min(len(seq), len(ref_seq))
    colored_seq = utils.color_mutants(
        ref_seq[:min_length],
        seq[:min_length],
        print_isnps=True,
        emphasis_positions=None if args.region == 'd' else
        [codon_positions[gene] + i for i in range(3)])
    print print_str(gene, colored_seq)
    if min_length < len(ref_seq) and igene == 0: