def print_stuff(line):
    cluster_index = sorted_clusters.index(cluster)
    naive_cdr3, matureiseq0_cdr3 = utils.subset_sequences(
        line, iseq=0, restrict_to_region='cdr3'
    )  # line['naive_seq'][(line['codon_positions']['v']):((line['codon_positions']['j'])+3)] #get nt sequence of CDR3 from first base of cysteine through last base of tryptophan
    # mature_cdr3_seqs = []  # trying to translate the consensus cdr3 so I can search these with my seed seqs
    #     for iseq in range(len(line['unique_ids'])):
    #         naive_cdr3_seq, mature_cdr3_seq = utils.subset_sequences(line, iseq=iseq, restrict_to_region='cdr3')
    #         mature_cdr3_seqs.append(mature_cdr3_seq)
    # translated_cdr3 = Seq().... not done
    cdr3_aa = '%-30s' % Seq(naive_cdr3).translate()
    if any('-ig' in s for s in line['unique_ids']):
        cdr3_aa = utils.color('red', cdr3_aa, width=30)
    print '%4s     %s %s %s %5d %5d %5d %7.3f   %8.4f     %2d   %s %4.2f' % (
        cluster_index,
        utils.color_gene(line['v_gene'], width=15),
        utils.color_gene(line['d_gene'], width=15),
        utils.color_gene(line['j_gene'], width=10),
        len(line['unique_ids']),
        numpy.mean(line['n_mutations']),
        numpy.median(line['n_mutations']),
        numpy.mean(line['mut_freqs']),
        float(len(cluster)) / n_total,
        (line['cdr3_length'] / 3),
        cdr3_aa,
        utils.fay_wu_h(line, debug=False),
    )
示例#2
0
 def print_match(self, region, gene, query_seq, score, glbounds, qrbounds, codon_pos, warnings, skipping=False):
     if self.debug < 2:
         return
     out_str_list = []
     buff_str = (20 - len(gene)) * ' '
     tmp_val = score
     if self.args.apply_choice_probs_in_sw and self.get_choice_prob(region, gene) != 0.0:
         tmp_val = score / self.get_choice_prob(region, gene)
     if self.args.apply_choice_probs_in_sw:
         out_str_list.append('%8s%s%s%9.1e * %3.0f = %-6.1f' % (' ', utils.color_gene(gene), buff_str, self.get_choice_prob(region, gene), tmp_val, score))
     else:
         out_str_list.append('%8s%s%s%9s%3s %6.0f        ' % (' ', utils.color_gene(gene), '', '', buff_str, score))
     out_str_list.append('%4d%4d   %s\n' % (glbounds[0], glbounds[1], self.germline_seqs[region][gene][glbounds[0]:glbounds[1]]))
     out_str_list.append('%46s  %4d%4d' % ('', qrbounds[0], qrbounds[1]))
     out_str_list.append('   %s ' % (utils.color_mutants(self.germline_seqs[region][gene][glbounds[0]:glbounds[1]], query_seq[qrbounds[0]:qrbounds[1]])))
     if region != 'd':
         out_str_list.append('(%s %d)' % (utils.conserved_codon_names[region], codon_pos))
     if warnings[gene] != '':
         out_str_list.append('WARNING ' + warnings[gene])
     if skipping:
         out_str_list.append('skipping!')
     if self.args.outfname is None:
         print ''.join(out_str_list)
     else:
         out_str_list.append('\n')
         self.outfile.write(''.join(out_str_list))
示例#3
0
def add_new_allele(glfo, newfo, remove_template_genes=False, debug=False):
    """
    Add a new allele to <glfo>, specified by <newfo> which is of the
    form: {'gene' : 'IGHV3-71*01+C35T.T47G', 'seq' : 'ACTG yadda yadda CGGGT', 'template-gene' : 'IGHV3-71*01'}
    If <remove_template_genes>, we also remove 'template-gene' from <glfo>.
    """

    template_gene = newfo["template-gene"]
    region = utils.get_region(template_gene)
    if template_gene not in glfo["seqs"][region]:
        raise Exception("unknown template gene %s" % template_gene)

    new_gene = newfo["gene"]

    if region == "v":
        glfo["cyst-positions"][new_gene] = glfo["cyst-positions"][template_gene]
    elif region == "j":
        glfo["tryp-positions"][new_gene] = glfo["tryp-positions"][template_gene]

    glfo["seqs"][region][new_gene] = newfo["seq"]

    if debug:
        print "    adding new allele to glfo:"
        print "      template %s   %s" % (glfo["seqs"][region][template_gene], utils.color_gene(template_gene))
        print "           new %s   %s" % (
            utils.color_mutants(glfo["seqs"][region][template_gene], newfo["seq"]),
            utils.color_gene(new_gene),
        )

    if remove_template_genes:
        remove_gene(glfo, template_gene, debug=True)
示例#4
0
    def make_transition_plot(self, gene_name, model):
        """ NOTE shares a lot with make_mutefreq_plot() in python/paramutils.py """
        fig, ax = plotting.mpl_init()
        fig.set_size_inches(plotting.plot_ratios[utils.get_region(gene_name)])

        ibin = 0
        print utils.color_gene(utils.unsanitize_name(gene_name))
        legend_colors = set()  # add a color to this the first time you plot it
        for state in model.states:

            # bin label
            ax.text(-0.5 + ibin, -0.075, paramutils.simplify_state_name(state.name), rotation='vertical', size=8)

            sorted_to_states = {}
            for name in state.transitions.keys():
                if name.find('IG') == 0:
                    sorted_to_states[name] = int(paramutils.simplify_state_name(name))
                else:
                    sorted_to_states[name] = name
            sorted_to_states = sorted(sorted_to_states.items(), key=operator.itemgetter(1))

            total = 0.0
            for to_state, simple_to_state in sorted_to_states:

                prob = state.transitions[to_state]

                alpha = 0.6
                width = 3

                if 'insert' in str(simple_to_state):
                    label = 'insert'
                    color = '#3498db'  # blue
                elif str(simple_to_state) == 'end':
                    label = 'end'
                    color = 'red'
                else:  # regional/internal states
                    assert to_state.find('IG') == 0
                    label = 'internal'
                    color = 'green'

                label_to_use = None
                if color not in legend_colors:
                    label_to_use = label
                    legend_colors.add(color)

                # horizontal line at height total+prob
                ax.plot([-0.5 + ibin, 0.5 + ibin], [total + prob, total + prob], color=color, linewidth=width, alpha=alpha, label=label_to_use)

                # vertical line from total to total + prob
                ax.plot([ibin, ibin], [total + 0.01, total + prob], color=color, alpha=alpha, linewidth=width)

                midpoint = 0.5*(prob + 2*total)
                # ax.text(ibin, midpoint, paramutils.simplify_state_name(to_state))  # nicely labels the midpoint of the chunk between lines, but there isn't really room for it

                total += prob
    
            ibin += 1

        ax.get_xaxis().set_visible(False)
        plotting.mpl_finish(ax, self.base_plotdir + '/transitions', gene_name, ybounds=(-0.01, 1.01), xbounds=(-3, len(model.states) + 3), leg_loc=(0.95, 0.1), adjust={'left' : 0.1, 'right' : 0.8}, leg_prop={'size' : 8})
示例#5
0
    def add_new_allele(self, gene, fitfo, n_candidate_snps, debug=False):
        # figure out what the new nukes are
        old_seq = self.glfo['seqs'][utils.get_region(gene)][gene]
        new_seq = old_seq
        mutfo = {}
        for pos in sorted(fitfo['candidates'][n_candidate_snps]):
            obs_counts = {nuke : self.counts[gene][pos][n_candidate_snps][nuke] for nuke in utils.nukes}  # NOTE it's super important to only use the counts from sequences with <n_candidate_snps> total mutations
            sorted_obs_counts = sorted(obs_counts.items(), key=operator.itemgetter(1), reverse=True)
            original_nuke = self.mfreqer.counts[gene][pos]['gl_nuke']
            new_nuke = None
            for nuke, _ in sorted_obs_counts:  # take the most common one that isn't the existing gl nuke
                if nuke != original_nuke:
                    new_nuke = nuke
                    break
            print '   %3d  (%s --> %s)' % (pos, original_nuke, new_nuke),
            assert old_seq[pos] == original_nuke
            mutfo[pos] = {'original' : original_nuke, 'new' : new_nuke}
            new_seq = new_seq[:pos] + new_nuke + new_seq[pos+1:]

        new_name, mutfo = glutils.get_new_allele_name_and_change_mutfo(gene, mutfo)
        print ''
        print '          %s   %s' % (old_seq, utils.color_gene(gene))
        print '          %s   %s' % (utils.color_mutants(old_seq, new_seq), utils.color_gene(new_name))

        # and add it to the set of new alleles for this gene
        self.new_allele_info.append({
            'template-gene' : gene,
            'gene' : new_name,
            'seq' : new_seq,
            'aligned-seq' : None
        })
示例#6
0
def get_dbg_str(indelfo):
    if len(indelfo['qr_gap_seq']) != len(indelfo['gl_gap_seq']):
        print indelfo['qr_gap_seq']
        print indelfo['gl_gap_seq']
        raise Exception('different length qr and gl gap seqs (see previous lines)')
    qrprintstr, glprintstr = [], []
    for ich in range(len(indelfo['qr_gap_seq'])):
        qrb, glb = indelfo['qr_gap_seq'][ich], indelfo['gl_gap_seq'][ich]
        qrcolor, glcolor = None, None
        if qrb in utils.gap_chars or glb in utils.gap_chars:
            qrcolor = 'light_blue'
            glcolor = 'light_blue'
        elif qrb in utils.ambiguous_bases:
            qrcolor = 'light_blue'
        elif glb in utils.ambiguous_bases:
            glcolor = 'light_blue'
        elif qrb != glb:
            qrcolor = 'red'
        qrprintstr.append(utils.color(qrcolor, qrb if qrb not in utils.gap_chars else '*'))  # change it to a start just cause that's what it originally was... at some point should switch to just leaving it whatever gap char it was
        glprintstr.append(utils.color(glcolor, glb if glb not in utils.gap_chars else '*'))
    qrprintstr = ''.join(qrprintstr)
    glprintstr = ''.join(glprintstr)

    gene_str = ''
    gwidth = str(len('query'))
    if 'v' in indelfo['genes']:
        gene_str = utils.color_gene(indelfo['genes']['v'], width=int(gwidth), leftpad=True)
        gwidth = str(utils.len_excluding_colors(gene_str))
    dj_gene_str = ' '.join([utils.color_gene(indelfo['genes'][r]) for r in 'dj' if r in indelfo['genes']])
    dbg_str_list = [('  %' + gwidth + 's  %s  %s') % (gene_str, glprintstr, dj_gene_str),
                    ('  %' + gwidth + 's  %s') % ('query', qrprintstr)]
    for idl in indelfo['indels']:
        dbg_str_list.append('%10s: %d base%s at %d (%s)' % (idl['type'], idl['len'], utils.plural(idl['len']), idl['pos'], idl['seqstr']))
    return '\n'.join(dbg_str_list)
示例#7
0
    def reassign_template_counts(self, msa_info, new_alleles, debug=False):
        """
        Fill <self.adjusted_glcounts> (gene : summed counts) using the per-cluster gene counts from
        self.get_glcounts(), moving the counts for a template gene over to its new allele in any
        cluster whose consensus sequence is closer to the new allele than to the template.

        <msa_info>: iterable of cluster dicts; this function reads the keys 'seqfos' (whose length is used as the cluster size) and 'cons_seq'
        <new_alleles>: dict, indexed by new allele name, of dicts with (at least) the keys 'template-gene', 'gene', and 'seq'
        <debug>: if set, print a table with one row for each template-gene cluster (of size at least 5), and then the final counts

        Returns None. If <new_alleles> is empty, returns immediately without touching <self.adjusted_glcounts>.
        """
        # XXX need to update family_groups here
        if len(new_alleles) == 0:
            return

        if debug:
            print '              template  new'
            print '      size      snps    snps    assigned',
            if self.reco_info is not None:
                print '         true',
            print ''

        dbg_print = debug  # don't print all the tiny clusters
        # map from each template gene to the name of the new allele that was inferred from it
        # NOTE(review): if two new alleles share a template gene, only one of them ends up in this dict -- confirm that's ok
        templates = {newfo['template-gene'] : newfo['gene'] for newfo in new_alleles.values()}
        self.adjusted_glcounts = {}
        # loop over clusters from largest to smallest (so that once we hit a cluster smaller than 5, debug printing stays off for the rest)
        for clusterfo in sorted(msa_info, key=lambda cfo: len(cfo['seqfos']), reverse=True):
            sorted_glcounts, true_sorted_glcounts = self.get_glcounts(clusterfo)  # it would be nice to not re-call this for the clusters we already called it on above
            for gene, counts in sorted_glcounts:  # <gene> is the one assigned by sw before allele clustering
                if debug and len(clusterfo['seqfos']) < 5:
                    if dbg_print:
                        print '     not printing clusters smaller than 5'
                    dbg_print = False

                if gene not in self.adjusted_glcounts:  # add it before we decide whether to switch it, so a template gene with zero counts will be in there with zero counts
                    self.adjusted_glcounts[gene] = 0
                if gene in templates:  # if this was a template for a new allele, we have to decide whether to apportion some or all of the sequences in this cluster to that new allele
                    template_gene = gene
                    template_cpos = utils.cdn_pos(self.glfo, self.region, template_gene)
                    cons_seq = clusterfo['cons_seq']
                    template_seq = self.glfo['seqs'][self.region][template_gene]
                    new_allele_seq = new_alleles[templates[template_gene]]['seq']

                    # only compare up to the shortest of the three sequences (and no further than the template's conserved codon position)
                    compare_len = min([template_cpos, len(cons_seq), len(template_seq), len(new_allele_seq)])  # NOTE this doesn't account for indels, i.e. the template and consensus sequences are in general different lengths, but that's ok, it'll just inflate the hamming distance for sequences that differ from consensus by indels, and all we care is finding the one that doesn't have any indels
                    n_template_snps = utils.hamming_distance(cons_seq[:compare_len], template_seq[:compare_len])
                    n_new_snps = utils.hamming_distance(cons_seq[:compare_len], new_allele_seq[:compare_len])

                    if debug and dbg_print:
                        print '    %5d      %3d     %3d' % (len(clusterfo['seqfos']), n_template_snps, n_new_snps),

                    if n_new_snps < n_template_snps:  # reassign to the new allele
                        gene = templates[template_gene]
                        if gene not in self.adjusted_glcounts:  # add it before we decide whether to switch it, so a template gene with zero counts will be in there with zero counts
                            self.adjusted_glcounts[gene] = 0

                    if debug and dbg_print:
                        print '    %s' % utils.color_gene(gene, width=15),
                        if self.reco_info is not None:
                            true_gene = true_sorted_glcounts[0][0]  # NOTE this is the most *common* simulated gene in the cluster, not necessarily the one corresponding to these particular sequences... but clusters with new alleles should generally be dominated by one gene, so oh, well
                            if true_gene == gene:
                                print '    %s' % utils.color('green', 'ok'),
                            else:
                                print '    %s' % utils.color_gene(true_gene, width=15),
                        print ''

                # at this point <gene> is either the original sw-assigned gene, or the new allele it got reassigned to
                self.adjusted_glcounts[gene] += counts

        if debug:
            print '  final counts:'
            for gene, counts in sorted(self.adjusted_glcounts.items(), key=operator.itemgetter(1), reverse=True):
                print '    %4d  %s' % (counts, utils.color_gene(gene))
示例#8
0
    def make_transition_plot(self, gene_name, model):
        """ NOTE shares a lot with make_mutefreq_plot() in python/paramutils.py """
        fig, ax = plotting.mpl_init()
        fig.set_size_inches(plotting.plot_ratios[utils.get_region(gene_name)])

        ibin = 0
        print utils.color_gene(utils.unsanitize_name(gene_name))
        legend_colors = set()  # add a color to this the first time you plot it
        for state in model.states:

            # bin label
            ax.text(-0.5 + ibin, -0.075, paramutils.simplify_state_name(state.name), rotation='vertical', size=8)

            sorted_to_states = {}
            for name in state.transitions.keys():
                if name.find('IG') == 0 or name.find('TR') == 0:
                    sorted_to_states[name] = int(paramutils.simplify_state_name(name))
                else:
                    sorted_to_states[name] = name
            sorted_to_states = sorted(sorted_to_states.items(), key=operator.itemgetter(1))

            total = 0.0
            for to_state, simple_to_state in sorted_to_states:

                prob = state.transitions[to_state]

                alpha = 0.6
                width = 3

                if 'insert' in str(simple_to_state):
                    label = 'insert'
                    color = '#3498db'  # blue
                elif str(simple_to_state) == 'end':
                    label = 'end'
                    color = 'red'
                else:  # regional/internal states
                    assert to_state.find('IG') == 0 or to_state.find('TR') == 0
                    label = 'internal'
                    color = 'green'

                label_to_use = None
                if color not in legend_colors:
                    label_to_use = label
                    legend_colors.add(color)

                # horizontal line at height total+prob
                ax.plot([-0.5 + ibin, 0.5 + ibin], [total + prob, total + prob], color=color, linewidth=width, alpha=alpha, label=label_to_use)

                # vertical line from total to total + prob
                ax.plot([ibin, ibin], [total + 0.01, total + prob], color=color, alpha=alpha, linewidth=width)

                midpoint = 0.5*(prob + 2*total)
                # ax.text(ibin, midpoint, paramutils.simplify_state_name(to_state))  # nicely labels the midpoint of the chunk between lines, but there isn't really room for it

                total += prob
    
            ibin += 1

        ax.get_xaxis().set_visible(False)
        plotting.mpl_finish(ax, self.base_plotdir + '/transitions', gene_name, ybounds=(-0.01, 1.01), xbounds=(-3, len(model.states) + 3), leg_loc=(0.95, 0.1), adjust={'left' : 0.1, 'right' : 0.8}, leg_prop={'size' : 8})
示例#9
0
    def finalize(self, sorted_gene_counts, debug=False):
        """
        Decide which genes to keep, setting <self.genes_to_keep> and <self.genes_to_remove> (the latter is
        every gene in self.glfo['seqs'][self.region] that isn't in the former), then set <self.finalized>.

        <sorted_gene_counts>: iterable of (gene, counts) pairs
        <debug>: if set, print a table of kept and removed genes for each class

        The genes are split into classes by self.separate_into_classes(), and within each class we skip
        genes below self.args.min_allele_prevalence_fraction, keep the first remaining gene, and then keep
        further genes (that differ in sequence from the first gene in the class) until we have
        self.args.n_alleles_per_gene from the class. If self.args.n_max_total_alleles is set, no genes
        are added once that many have been kept in total.

        Can only be called once (asserts that <self.finalized> isn't already set).
        NOTE the 'keeping N / M genes' summary line is printed regardless of <debug>.
        """
        # NOTE <sorted_gene_counts> is usually/always floats instead of integers
        assert not self.finalized
        easycounts = {gene : counts for gene, counts in sorted_gene_counts}
        total_counts = sum([counts for counts in easycounts.values()])

        self.genes_to_keep = set()

        if debug:
            print '  removing least likely genes (%.1f total counts)' % total_counts
            print '     %-20s    %5s (%s)      removed genes (counts)' % ('genes to keep', 'counts', 'snps'),
            # NOTE count_str() only exists if <debug> is set (and it's only called from within debug blocks below)
            def count_str(cnt):
                if cnt < 10.:
                    return '%.1f' % cnt
                else:
                    return '%.0f' % cnt

        class_counts = self.separate_into_classes(sorted_gene_counts, easycounts)
        for iclass in range(len(class_counts)):
            gclass = class_counts[iclass]
            n_from_this_class = 0
            for ig in range(len(gclass)):
                gfo = gclass[ig]
                # NOTE this only breaks out of the loop over genes in this class (the loop over classes continues, but each one will break here right away)
                if self.args.n_max_total_alleles is not None and len(self.genes_to_keep) >= self.args.n_max_total_alleles:  # command line can specify the total number of alleles
                    break

                # NOTE(review): this presumably raises ZeroDivisionError if <total_counts> is zero -- confirm that can't happen
                if float(gfo['counts']) / total_counts < self.args.min_allele_prevalence_fraction:  # always skip everybody that's super uncommon
                    pass
                elif ig == 0:  # keep the first one from this class
                    self.genes_to_keep.add(gfo['gene'])
                    n_from_this_class += 1
                elif utils.hamming_distance(gclass[0]['seq'], gclass[ig]['seq']) == 0:  # don't keep it if it's indistinguishable from the most common one (the matches are probably mostly really the best one)
                    pass  # don't keep it
                elif n_from_this_class < self.args.n_alleles_per_gene:  # always keep the most common <self.args.n_alleles_per_gene> in each class
                    self.genes_to_keep.add(gfo['gene'])
                    n_from_this_class += 1
                else:
                    pass  # don't keep it

                # each kept gene starts a new line in the table (the removed genes get appended to the end of the last line for the class)
                if debug and gfo['gene'] in self.genes_to_keep:
                    snpstr = ' ' if ig == 0 else '(%d)' % utils.hamming_distance(gclass[0]['seq'], gfo['seq'])
                    print '\n       %-s  %7s  %-3s' % (utils.color_gene(gfo['gene'], width=20), count_str(gfo['counts']), snpstr),
            if debug:
                if n_from_this_class == 0:  # placeholder row for classes from which we didn't keep anybody
                    print '\n       %-s  %7s  %-3s' % (utils.color('blue', 'none', width=20, padside='right'), '-', ''),
                removedfo = [gfo for gfo in gclass if gfo['gene'] not in self.genes_to_keep]
                if len(removedfo) > 0:
                    removal_strs = ['%s (%s)' % (utils.color_gene(gfo['gene']), count_str(gfo['counts'])) for gfo in removedfo]
                    print '        %s' % '  '.join(removal_strs),
        if debug:
            print ''  # finish off the last line of the table

        self.genes_to_remove = set(self.glfo['seqs'][self.region]) - self.genes_to_keep

        print '    keeping %d / %d %s gene%s' % (len(self.genes_to_keep), len(self.glfo['seqs'][self.region]), self.region, utils.plural(len(self.genes_to_keep)))
        # print '    removing %d %s genes: %d with no matches, %d with unconvincing matches' % (len(self.genes_to_remove), self.region, len(set(self.glfo['seqs'][self.region]) - set(easycounts)), len(set(easycounts) - self.genes_to_keep))

        self.finalized = True
示例#10
0
def remove_gene(glfo, gene, debug=False):
    """ remove <gene> from <glfo> """
    region = utils.get_region(gene)
    if gene in glfo["seqs"][region]:
        if debug:
            print "  removing %s from glfo" % utils.color_gene(gene)
        del glfo["seqs"][region][gene]
        if region in utils.conserved_codons[glfo["chain"]]:
            del glfo[utils.conserved_codons[glfo["chain"]][region] + "-positions"][gene]
    else:
        if debug:
            print "  can't remove %s from glfo, it's not there" % utils.color_gene(gene)
示例#11
0
def remove_gene(glfo, gene, debug=False):
    """ remove <gene> from <glfo> """
    region = utils.get_region(gene)
    if gene in glfo['seqs'][region]:
        if debug:
            print '  removing %s from glfo' % utils.color_gene(gene)
        del glfo['seqs'][region][gene]
        if region in utils.conserved_codons[glfo['locus']]:
            del glfo[utils.conserved_codons[glfo['locus']][region] + '-positions'][gene]
    else:
        if debug:
            print '  can\'t remove %s from glfo, it\'s not there' % utils.color_gene(gene)
示例#12
0
    def find_partial_failures(self, fostream_name):
        """
        Scan the file at <fostream_name> (with '.fostream' removed from the name) for partially failed queries.

        Each non-blank line has its double quotes removed and is split on ';', and the first field is
        taken as the unique id. Lines without an 'NA' field are treated as ok, and their ids are
        collected for the return value. For each line that does have an 'NA' field, and whose id is in
        both <self.sim_need> and <self.siminfo>, we pull out the first IGH[VDJ] gene name for each
        region, pass the result to self.perfplotter.add_partial_fail(), store it in <self.failtails>,
        increment <self.n_partially_failed>, and remove the id from <self.sim_need>.

        Returns the list of ids from the ok lines.
        NOTE returns None (rather than the list) if <self.sim_need> is empty when we go to read a line.
        NOTE calls sys.exit() if it finds a gene that isn't in <self.germline_seqs>.
        """
        unique_ids = []
        # use a context manager so the file gets closed no matter how we leave (early return, sys.exit(), or an exception)
        with open(fostream_name.replace('.fostream', '')) as fostream:
            for line in fostream:
                if len(self.sim_need) == 0:
                    return
                if len(line.strip()) == 0:  # skip blank lines
                    continue

                fields = line.replace('"', '').split(';')
                unique_id = fields[0]

                if 'NA' not in fields:  # skip lines that were ok
                    unique_ids.append(unique_id)
                    continue
                if unique_id not in self.sim_need:
                    continue
                if unique_id not in self.siminfo:
                    continue  # not looking for this <unique_id> a.t.m.

                info = {'unique_id' : unique_id}
                for stuff in fields:
                    for region in utils.regions:  # add the first instance of IGH[VDJ] (if it's there at all)
                        gene_prefix = 'IGH' + region.upper()
                        if gene_prefix not in stuff or region + '_gene' in info:
                            continue
                        genes = re.findall(gene_prefix + '[^ ][^ ]*', stuff)
                        if len(genes) == 0:
                            print 'ERROR no %s genes in %s' % (region, stuff)
                        gene = genes[0]  # NOTE if <genes> is empty this raises IndexError (right after the error message above)
                        if gene not in self.germline_seqs[region]:
                            print 'ERROR bad gene %s for %s' % (gene, unique_id)
                            sys.exit()
                        info[region + '_gene'] = gene

                self.perfplotter.add_partial_fail(self.siminfo[unique_id], info)
                if self.args.debug:
                    print '%-20s  partial fail %s %s %s' % (
                        unique_id,
                        utils.color_gene(info['v_gene']) if 'v_gene' in info else '',
                        utils.color_gene(info['d_gene']) if 'd_gene' in info else '',
                        utils.color_gene(info['j_gene']) if 'j_gene' in info else '',
                    ),
                    print '  (true %s %s %s)' % tuple([self.siminfo[unique_id][region + '_gene'] for region in utils.regions])
                self.failtails[unique_id] = info
                self.n_partially_failed += 1
                self.sim_need.remove(unique_id)

        return unique_ids
示例#13
0
    def find_partial_failures(self, fostream_name):
        """
        Scan the file at <fostream_name> (with ".fostream" removed from the name) for partially failed queries.

        Each non-blank line has its double quotes removed and is split on ";", and the first field is
        taken as the unique id. Lines without an "NA" field are treated as ok, and their ids are
        collected for the return value. For each line that does have an "NA" field, and whose id is in
        both <self.sim_need> and <self.siminfo>, we pull out the first IGH[VDJ] gene name for each
        region, pass the result to self.perfplotter.add_partial_fail(), store it in <self.failtails>,
        increment <self.n_partially_failed>, and remove the id from <self.sim_need>.

        Returns the list of ids from the ok lines.
        NOTE returns None (rather than the list) if <self.sim_need> is empty when we go to read a line.
        NOTE calls sys.exit() if it finds a gene that isn't in <self.germline_seqs>.
        """
        unique_ids = []
        # use a context manager so the file gets closed no matter how we leave (early return, sys.exit(), or an exception)
        with open(fostream_name.replace(".fostream", "")) as fostream:
            for line in fostream:
                if len(self.sim_need) == 0:
                    return
                if len(line.strip()) == 0:  # skip blank lines
                    continue

                fields = line.replace('"', "").split(";")
                unique_id = fields[0]

                if "NA" not in fields:  # skip lines that were ok
                    unique_ids.append(unique_id)
                    continue
                if unique_id not in self.sim_need:
                    continue
                if unique_id not in self.siminfo:
                    continue  # not looking for this <unique_id> a.t.m.

                info = {"unique_id": unique_id}
                for stuff in fields:
                    for region in utils.regions:  # add the first instance of IGH[VDJ] (if it's there at all)
                        gene_prefix = "IGH" + region.upper()
                        if gene_prefix not in stuff or region + "_gene" in info:
                            continue
                        genes = re.findall(gene_prefix + "[^ ][^ ]*", stuff)
                        if len(genes) == 0:
                            print "ERROR no %s genes in %s" % (region, stuff)
                        gene = genes[0]  # NOTE if <genes> is empty this raises IndexError (right after the error message above)
                        if gene not in self.germline_seqs[region]:
                            print "ERROR bad gene %s for %s" % (gene, unique_id)
                            sys.exit()
                        info[region + "_gene"] = gene

                self.perfplotter.add_partial_fail(self.siminfo[unique_id], info)
                if self.args.debug:
                    print "%-20s  partial fail %s %s %s" % (
                        unique_id,
                        utils.color_gene(info["v_gene"]) if "v_gene" in info else "",
                        utils.color_gene(info["d_gene"]) if "d_gene" in info else "",
                        utils.color_gene(info["j_gene"]) if "j_gene" in info else "",
                    ),
                    print "  (true %s %s %s)" % tuple(
                        [self.siminfo[unique_id][region + "_gene"] for region in utils.regions]
                    )
                self.failtails[unique_id] = info
                self.n_partially_failed += 1
                self.sim_need.remove(unique_id)

        return unique_ids
示例#14
0
    def write_hmm_input(self, csv_fname, sw_info, parameter_dir, preclusters=None, hmm_type='', pair_hmm=False, stripped=False):
        print '    writing input'
        csvfile = opener('w')(csv_fname)
        start = time.time()

        # write header
        header = ['names', 'k_v_min', 'k_v_max', 'k_d_min', 'k_d_max', 'only_genes', 'seqs']  # I wish I had a good c++ csv reader 
        csvfile.write(' '.join(header) + '\n')

        skipped_gene_matches = set()
        assert hmm_type != ''
        if hmm_type == 'k=1':  # single vanilla hmm
            nsets = [[qn] for qn in self.input_info.keys()]
        elif hmm_type == 'k=2':  # pair hmm
            nsets = self.get_pairs(preclusters)
        elif hmm_type == 'k=preclusters':  # run the k-hmm on each cluster in <preclusters>
            assert preclusters != None
            nsets = [ val for key, val in preclusters.id_clusters.items() if len(val) > 1 ]  # <nsets> is a list of sets (well, lists) of query names
            # nsets = []
            # for cluster in preclusters.id_clusters.values():
            #     nsets += itertools.combinations(cluster, 5)
        elif hmm_type == 'k=nsets':  # run on *every* combination of queries which has length <self.args.n_sets>
            if self.args.all_combinations:
                nsets = itertools.combinations(self.input_info.keys(), self.args.n_sets)
            else:  # put the first n together, and the second group of n (not the self.input_info is and OrderedDict)
                nsets = []
                keylist = self.input_info.keys()
                this_set = []
                for iquery in range(len(keylist)):
                    if iquery % self.args.n_sets == 0:  # every nth query, start a new group
                        if len(this_set) > 0:
                            nsets.append(this_set)
                        this_set = []
                    this_set.append(keylist[iquery])
                if len(this_set) > 0:
                    nsets.append(this_set)
        else:
            assert False

        for query_names in nsets:
            non_failed_names = self.remove_sw_failures(query_names, sw_info)
            if len(non_failed_names) == 0:
                continue
            combined_query = self.combine_queries(sw_info, non_failed_names, parameter_dir, stripped=stripped, skipped_gene_matches=skipped_gene_matches)
            if len(combined_query) == 0:  # didn't find all regions
                continue
            csvfile.write('%s %d %d %d %d %s %s\n' %  # NOTE csv.DictWriter can handle tsvs, so this should really be switched to use that
                          (':'.join([str(qn) for qn in non_failed_names]),
                           combined_query['k_v']['min'], combined_query['k_v']['max'],
                           combined_query['k_d']['min'], combined_query['k_d']['max'],
                           ':'.join(combined_query['only_genes']),
                           ':'.join(combined_query['seqs'])))

        if len(skipped_gene_matches) > 0:
            print '    not found in %s, i.e. were never the best sw match for any query, so removing from consideration for hmm:' % (parameter_dir)
            for region in utils.regions:
                print '      %s: %s' % (region, ' '.join([utils.color_gene(gene) for gene in skipped_gene_matches if utils.get_region(gene) == region]))

        csvfile.close()
        print '        input write time: %.3f' % (time.time()-start)
示例#15
0
 def skip_gene(gene):
     if self.args.debug:
         print '    %s in list of genes to skip' % utils.color_gene(gene)
     if gene not in genes_actually_skipped:
         genes_actually_skipped[gene] = 0
     genes_actually_skipped[gene] += 1
     qr_info['skip_gene'] = True
示例#16
0
 def sim_gene_count_str(kgene):
     """
     Figure out the simulation genes (and how many times each occurs) for the uids that were assigned to <kgene>.
     Returns a string with the counts, from most to least common, followed by the corresponding
     colored gene names. A count is blue if its simulation gene is <kgene>, otherwise red.
     Returns an empty string if either <annotations> or <self.reco_info> is None.
     NOTE <annotations>, <region>, and <self> come from the enclosing scope.
     """
     if annotations is None or self.reco_info is None:
         return ''

     gene_key = region + '_gene'

     # first find the uids that we assigned to <kgene>
     uids_this_gene = []
     for uid, line in annotations.items():
         if line[gene_key] == kgene:
             uids_this_gene.append(uid)

     # then count up their simulation genes (note that self.simcounts doesn't have this per-uid information)
     sim_genes = {}
     for uid in uids_this_gene:
         sgene = self.reco_info[uid][gene_key]
         sim_genes[sgene] = sim_genes.get(sgene, 0) + 1

     sorted_sim_gene_counts = sorted(sim_genes.items(), key=operator.itemgetter(1), reverse=True)

     colored_counts = []
     for sgene, count in sorted_sim_gene_counts:
         count_color = 'blue' if sgene == kgene else 'red'
         colored_counts.append(utils.color(count_color, str(count)))
     colored_genes = [utils.color_gene(sgene) for sgene, _ in sorted_sim_gene_counts]

     return '%s   %s' % (' '.join(colored_counts), ' '.join(colored_genes))
示例#17
0
文件: glutils.py 项目: Annak17/partis
def add_new_allele(glfo, newfo, remove_template_genes, debug=False):
    """
    Add a new allele to <glfo>, specified by <newfo> which is of the
    form: {'template-gene' : 'IGHV3-71*01', 'gene' : 'IGHV3-71*01+C35T.T47G', 'seq' : 'ACTG yadda yadda CGGGT'}
    If <remove_template_genes>, we also remove 'template-gene' from <glfo>.
    """

    template_gene = newfo['template-gene']
    region = utils.get_region(template_gene)
    if template_gene not in glfo['seqs'][region]:
        raise Exception('unknown template gene %s' % template_gene)

    new_gene = newfo['gene']

    if region == 'v':
        glfo['cyst-positions'][new_gene] = glfo['cyst-positions'][template_gene]
    elif region == 'j':
        glfo['tryp-positions'][new_gene] = glfo['tryp-positions'][template_gene]

    glfo['seqs'][region][new_gene] = newfo['seq']

    if debug:
        print '    adding new allele to glfo:'
        print '      template %s   %s' % (glfo['seqs'][region][template_gene], utils.color_gene(template_gene))
        print '           new %s   %s' % (utils.color_mutants(glfo['seqs'][region][template_gene], newfo['seq']), utils.color_gene(new_gene))

    if remove_template_genes:
        remove_gene(glfo, template_gene, debug=True)
 def skip_gene(gene):
     print '    %s in list of genes to skip' % utils.color_gene(
         gene)
     if gene not in genes_actually_skipped:
         genes_actually_skipped[gene] = 0
     genes_actually_skipped[gene] += 1
     line['skip_gene'] = True
示例#19
0
def choose_some_alleles(region, genes_to_use, allelic_groups, n_alleles_per_gene, debug=False):
    """
    Choose a gene (i.e. a primary and sub-version) from <allelic_groups>, and its attendant alleles.

    A number of alleles is drawn at random from <n_alleles_per_gene[region]>, then one (primary version, sub-version)
    pair is drawn uniformly from those in <allelic_groups[region]> that have at least that many alleles, and finally
    that many of its alleles are drawn (without replacement) and added to <genes_to_use>.

    NOTE modifies both <genes_to_use> (the new alleles are added to it) and <allelic_groups> (the chosen sub-version
    is removed, as is its primary version if that leaves it empty).

    Raises Exception if there are no alleles left for <region>.
    """

    if len(allelic_groups[region]) == 0:
        raise Exception('ran out of %s alleles (either --n-genes-per-region or --n-alleles-per-gene are probably too big)' % region)

    available_versions = None
    while available_versions is None or len(available_versions) == 0:  # NOTE keeps redrawing <n_alleles> until at least one version has enough alleles
        if available_versions is not None:
            print('  %s couldn\'t find any versions that have %d alleles, so trying again' % (utils.color('red', 'warning'), n_alleles))
        n_alleles = numpy.random.choice(n_alleles_per_gene[region])
        available_versions = [(pv, subv) for pv in allelic_groups[region] for subv in allelic_groups[region][pv] if len(allelic_groups[region][pv][subv]) >= n_alleles]

    # Draw an index rather than an element, since numpy.random.choice() can't handle a list of tuples.
    # randint()'s upper bound is *exclusive*, so we pass the full length: passing length - 1 meant that the last
    # entry in <available_versions> could never be chosen (this form also works when there's only one entry).
    ichoice = numpy.random.randint(len(available_versions))
    primary_version, sub_version = available_versions[ichoice]
    new_alleles = set(numpy.random.choice(list(allelic_groups[region][primary_version][sub_version]), size=n_alleles, replace=False))
    if debug:
        print('      %8s %5s   %s' % (primary_version, sub_version, ' '.join([utils.color_gene(g, width=15) for g in new_alleles])))

    assert len(new_alleles & genes_to_use) == 0  # make sure none of the new alleles are already in <genes_to_use>
    genes_to_use |= new_alleles  # actually add them to the final set

    # remove stuff we've used from <allelic_groups>
    del allelic_groups[region][primary_version][sub_version]  # remove this sub-version (we don't want any more alleles from it)
    if len(allelic_groups[region][primary_version]) == 0:
        del allelic_groups[region][primary_version]
示例#20
0
def add_new_allele(glfo, newfo, remove_template_genes=False, debug=False):
    """
    Add a new allele to <glfo>, specified by <newfo> which is of the
    form: {'gene' : 'IGHV3-71*01+C35T.T47G', 'seq' : 'ACTG yadda yadda CGGGT', 'template-gene' : 'IGHV3-71*01'}
    If <remove_template_genes>, we also remove 'template-gene' from <glfo>.
    """

    template_gene = newfo['template-gene']
    region = utils.get_region(template_gene)
    if template_gene not in glfo['seqs'][region]:
        raise Exception('unknown template gene %s' % template_gene)

    new_gene = newfo['gene']

    if region == 'v':
        glfo['cyst-positions'][new_gene] = glfo['cyst-positions'][template_gene]
    elif region == 'j':
        glfo['tryp-positions'][new_gene] = glfo['tryp-positions'][template_gene]

    glfo['seqs'][region][new_gene] = newfo['seq']

    if debug:
        print '    adding new allele to glfo:'
        print '      template %s   %s' % (glfo['seqs'][region][template_gene], utils.color_gene(template_gene))
        print '           new %s   %s' % (utils.color_mutants(glfo['seqs'][region][template_gene], newfo['seq']), utils.color_gene(new_gene))

    if remove_template_genes:
        remove_gene(glfo, template_gene, debug=True)
示例#21
0
def trim_and_remove_genes(region, gene, seq, glfo, template_glfo, debug=False):
    nearest_template_gene = glutils.find_nearest_gene_using_names(
        template_glfo, gene)
    nearest_template_seq = template_glfo['seqs'][region][nearest_template_gene]
    # extra_bases = glfo['cyst-positions'][gene] - template_glfo['cyst-positions'][nearest_template_gene]  # not right if there's some internal gaps in the alignment
    aligned_nearest_template_seq, aligned_seq = utils.align_seqs(
        nearest_template_seq, seq)

    if debug:
        print '    %s' % utils.color_gene(gene)
        utils.color_mutants(aligned_nearest_template_seq,
                            aligned_seq,
                            print_result=True,
                            ref_label='template ',
                            extra_str='       ')

    if aligned_seq[0] not in utils.gap_chars and aligned_nearest_template_seq[
            0] not in utils.gap_chars:
        if debug:
            print '      ok'
    elif aligned_seq[0] in utils.gap_chars:
        if debug:
            print '      %s, removing' % utils.color('red', 'too small')
        glutils.remove_gene(glfo, gene)
    else:
        if debug:
            print '        extra bases %s' % utils.color_gene(gene)
        extra_bases = len(aligned_nearest_template_seq) - len(
            aligned_nearest_template_seq.lstrip('-'))
        seq = seq[extra_bases:]
        if debug:
            print '          removed %d bases' % extra_bases
        if seq in glfo['seqs'][region].values():
            print '    trimmed seq already in glfo under name %s, so removing it' % ' '.join(
                [
                    utils.color_gene(g)
                    for g, s in glfo['seqs'][region].items() if s == seq
                ])
            glutils.remove_gene(glfo, gene, debug=True)
            return
        glfo['seqs'][region][gene] = seq
        glfo['cyst-positions'][gene] -= extra_bases
        # utils.color_mutants(nearest_template_seq, seq, print_result=True, ref_label='template ', align=True, extra_str='            ')
        assert utils.codon_unmutated('cyst',
                                     glfo['seqs'][region][gene],
                                     glfo['cyst-positions'][gene],
                                     debug=True)
示例#22
0
 def print_match(self,
                 region,
                 gene,
                 query_seq,
                 score,
                 glbounds,
                 qrbounds,
                 codon_pos,
                 warnings,
                 skipping=False):
     if self.debug < 2:
         return
     out_str_list = []
     buff_str = (20 - len(gene)) * ' '
     tmp_val = score
     if self.args.apply_choice_probs_in_sw and self.get_choice_prob(
             region, gene) != 0.0:
         tmp_val = score / self.get_choice_prob(region, gene)
     if self.args.apply_choice_probs_in_sw:
         out_str_list.append(
             '%8s%s%s%9.1e * %3.0f = %-6.1f' %
             (' ', utils.color_gene(gene), buff_str,
              self.get_choice_prob(region, gene), tmp_val, score))
     else:
         out_str_list.append(
             '%8s%s%s%9s%3s %6.0f        ' %
             (' ', utils.color_gene(gene), '', '', buff_str, score))
     out_str_list.append(
         '%4d%4d   %s\n' %
         (glbounds[0], glbounds[1],
          self.germline_seqs[region][gene][glbounds[0]:glbounds[1]]))
     out_str_list.append('%46s  %4d%4d' % ('', qrbounds[0], qrbounds[1]))
     out_str_list.append('   %s ' % (utils.color_mutants(
         self.germline_seqs[region][gene][glbounds[0]:glbounds[1]],
         query_seq[qrbounds[0]:qrbounds[1]])))
     if region != 'd':
         out_str_list.append(
             '(%s %d)' % (utils.conserved_codon_names[region], codon_pos))
     if warnings[gene] != '':
         out_str_list.append('WARNING ' + warnings[gene])
     if skipping:
         out_str_list.append('skipping!')
     if self.args.outfname is None:
         print ''.join(out_str_list)
     else:
         out_str_list.append('\n')
         self.outfile.write(''.join(out_str_list))
示例#23
0
def parse_ramesh_seqs(glseqs, outdir, debug=False):
    for locus in glseqs:
        glutils.remove_glfo_files(outdir, locus)
        # write to a glfo dir without extra info
        for region in glseqs[locus]:
            fn = glutils.get_fname(outdir, locus, region)
            if not os.path.exists(os.path.dirname(fn)):
                os.makedirs(os.path.dirname(fn))
            with open(fn, 'w') as ofile:
                for gene, seq in glseqs[locus][region].items():
                    ofile.write('>%s\n%s\n' % (gene, seq))

        # figure out extra info
        template_glfo = glutils.read_glfo('data/germlines/macaque', locus)
        glfo = glutils.read_glfo(outdir,
                                 locus,
                                 template_glfo=template_glfo,
                                 remove_bad_genes=True,
                                 debug=True)

        # trim non-coding stuff upstream of v (and remove non-full-length ones)
        gene_groups = {}
        for region in ['v']:
            group_labels = sorted(
                set([utils.gene_family(g) for g in glfo['seqs'][region]]))
            gene_groups[region] = [(glabel, {
                g: glfo['seqs'][region][g]
                for g in glfo['seqs'][region] if utils.gene_family(g) == glabel
            }) for glabel in group_labels]
        for region in [r for r in utils.regions if r in gene_groups]:
            if debug:
                print '%s' % utils.color('reverse_video',
                                         utils.color('green', region))
            for group_label, group_seqs in gene_groups[
                    region]:  # ok, this isn't really doing anything any more
                if debug:
                    print '  %s' % utils.color('blue', group_label)
                for gene, seq in group_seqs.items():
                    trim_and_remove_genes(region,
                                          gene,
                                          seq,
                                          glfo,
                                          template_glfo,
                                          debug=debug)

        # remove any seqs with ambiguous bases
        for region in [r for r in utils.regions if r in glfo['seqs']]:
            for gene, seq in glfo['seqs'][region].items():
                if utils.ambig_frac(seq) > 0.:
                    if debug:
                        print '   %d ambiguous bases: %s' % (
                            len(seq) * utils.ambig_frac(seq),
                            utils.color_gene(gene))
                    glutils.remove_gene(glfo, gene)

        # glutils.print_glfo(glfo)

        # write final result
        glutils.write_glfo(outdir, glfo, debug=True)
示例#24
0
    def finalize(self, debug=False):
        assert not self.finalized

        self.mfreqer.finalize()

        start = time.time()
        gene_results = {'not_enough_obs_to_fit' : set(), 'didnt_find_anything_with_fit' : set(), 'new_allele' : set()}
        if debug:
            print '\nlooking for new alleles:'
        for gene in sorted(self.mfreqer.counts):
            if utils.get_region(gene) != 'v':
                continue
            if debug:
                print '\n%s (observed %d %s)' % (utils.color_gene(gene), self.gene_obs_counts[gene], utils.plural_str('time', self.gene_obs_counts[gene]))

            positions_to_try_to_fit, xyvals = self.get_positions_to_fit(gene, gene_results, debug=debug)
            if positions_to_try_to_fit is None:
                continue

            fitfo = {n : {} for n in ('min_snp_ratios', 'candidates')}
            for istart in range(1, self.n_max_snps):
                if debug:
                    if istart == 1:
                        print '                                 resid. / ndof'
                        print '             position   ratio   (m=0 / m>%5.2f)       muted / obs ' % self.big_y_icpt_bounds[0]
                    print '  %d %s' % (istart, utils.plural_str('snp', istart))

                subxyvals = {pos : {k : v[istart : istart + self.max_fit_length] for k, v in xyvals[pos].items()} for pos in positions_to_try_to_fit}
                self.fit_istart(gene, istart, positions_to_try_to_fit, subxyvals, fitfo, debug=debug)
                if istart not in fitfo['candidates']:  # if it didn't get filled, we didn't have enough observations to do the fit
                    break

            istart_candidates = []
            if debug:
                print '  evaluating each snp hypothesis'
                print '    snps       min ratio'
            for istart in fitfo['candidates']:
                if debug:
                    print '    %2d     %9s' % (istart, fstr(fitfo['min_snp_ratios'][istart])),
                if self.is_a_candidate(gene, fitfo, istart, debug=debug):
                    istart_candidates.append(istart)

            if len(istart_candidates) > 0:
                n_candidate_snps = min(istart_candidates)  # add the candidate with the smallest number of snps to the germline set, and run again (if the firs
                gene_results['new_allele'].add(gene)
                print '\n    found a new allele candidate separated from %s by %d %s at %s:' % (utils.color_gene(gene), n_candidate_snps,
                                                                                                utils.plural_str('snp', n_candidate_snps), utils.plural_str('position', n_candidate_snps)),
                self.add_new_allele(gene, fitfo, n_candidate_snps, debug=debug)
            else:
                gene_results['didnt_find_anything_with_fit'].add(gene)
                if debug:
                    print '  no new alleles'

        if debug:
            print 'found new alleles for %d %s (there were also %d without new alleles, and %d without enough observations to fit)' % (len(gene_results['new_allele']), utils.plural_str('gene', len(gene_results['new_allele'])),
                                                                                                                                       len(gene_results['didnt_find_anything_with_fit']), len(gene_results['not_enough_obs_to_fit']))
            print '      allele finding time: %.1f' % (time.time()-start)

        self.finalized = True
示例#25
0
    def find_partial_failures(self, fostream_name):
        """
        Look for partially failed queries in the file whose name is <fostream_name> with '.fostream' removed.

        Each non-blank line is stripped of double quotes and split on ';', with the first field taken as the unique id.
        Lines without an 'NA' field are treated as ok, and their unique ids are collected. For lines with an 'NA'
        field whose unique id is in both self.sim_need and self.siminfo, the first IGH[VDJ] gene found for each region
        is recorded, the result is passed to self.perfplotter.add_partial_fail() and stored in self.failtails, and the
        unique id is removed from self.sim_need.

        Returns the list of unique ids from the ok lines, or None if self.sim_need was found to be empty before we
        got to the end of the file.
        """
        unique_ids = []
        # use a context manager so the file is closed when we're done with it (including on the early return below)
        with open(fostream_name.replace('.fostream', '')) as fostream_file:
            for line in fostream_file:
                if len(self.sim_need) == 0:
                    return  # NOTE(review): returns None here rather than <unique_ids> -- confirm that callers expect this
                if len(line.strip()) == 0:  # skip blank lines
                    continue

                line = line.replace('"', '')
                line = line.split(';')

                unique_id = line[0]

                if 'NA' not in line:  # skip lines that were ok
                    unique_ids.append(unique_id)
                    continue
                if unique_id not in self.sim_need:
                    continue
                if unique_id not in self.siminfo:
                    continue  # not looking for this <unique_id> a.t.m.

                info = {}
                info['unique_id'] = unique_id
                for stuff in line:
                    for region in utils.regions:  # add the first instance of IGH[VDJ] (if it's there at all)
                        if 'IGH'+region.upper() in stuff and region+'_gene' not in info:
                            genes = re.findall('IGH' + region.upper() + '[^ ][^ ]*', stuff)
                            if len(genes) == 0:
                                print 'ERROR no %s genes in %s' % (region, stuff)
                            gene = genes[0]  # NOTE(review): raises IndexError if <genes> is empty (i.e. right after the error message above)
                            if gene not in self.germline_seqs[region]:
                                print 'ERROR bad gene %s for %s' % (gene, unique_id)
                                sys.exit()
                            info[region + '_gene'] = gene
                self.perfplotter.add_partial_fail(self.siminfo[unique_id], info)
                if self.args.debug:
                    print '%-20s  partial fail %s %s %s' % (unique_id,
                                                         utils.color_gene(info['v_gene']) if 'v_gene' in info else '',
                                                         utils.color_gene(info['d_gene']) if 'd_gene' in info else '',
                                                         utils.color_gene(info['j_gene']) if 'j_gene' in info else ''),
                    print '  (true %s %s %s)' % tuple([self.siminfo[unique_id][region + '_gene'] for region in utils.regions])
                self.failtails[unique_id] = info
                self.n_partially_failed += 1
                self.sim_need.remove(unique_id)

        return unique_ids
 def skip_gene(gene):
     if self.args.debug:
         print '    %s in list of genes to skip' % utils.color_gene(
             gene)
     if gene not in genes_actually_skipped:
         genes_actually_skipped[gene] = 0
     genes_actually_skipped[gene] += 1
     qr_info['skip_gene'] = True
示例#27
0
文件: glutils.py 项目: Annak17/partis
def remove_gene(glfo, gene, debug=False):
    """ remove <gene> from <glfo> """
    if debug:
        print '  removing %s from glfo' % utils.color_gene(gene)
    region = utils.get_region(gene)
    if region in utils.conserved_codons:
        del glfo[utils.conserved_codons[region] + '-positions'][gene]
    del glfo['seqs'][region][gene]
示例#28
0
 def set_per_gene_support(self, true_line, inf_line, region):
     if inf_line[region + '_per_gene_support'].keys()[0] != inf_line[region + '_gene']:
         print '   WARNING best-supported gene %s not same as viterbi gene %s' % (utils.color_gene(inf_line[region + '_per_gene_support'].keys()[0]), utils.color_gene(inf_line[region + '_gene']))
     support = inf_line[region + '_per_gene_support'].values()[0]  # sorted, ordered dict with gene : logprob key-val pairs
     if true_line[region + '_gene'] == inf_line[region + '_gene']:  # NOTE this requires allele to be correct, but set_bool_column() does not
         self.hists[region + '_allele_right_vs_per_gene_support'].fill(support)
     else:
         self.hists[region + '_allele_wrong_vs_per_gene_support'].fill(support)
示例#29
0
def get_single_performance(region, outdir, method, debug=False):
    sglfo = glutils.read_glfo(outdir + '/germlines/simulation', locus=args.locus)
    iglfo = glutils.read_glfo(outdir + '/' + method + '/sw/germline-sets', locus=args.locus)
    glutils.synchronize_glfos(ref_glfo=sglfo, new_glfo=iglfo, region=region)
    missing_alleles = set(sglfo['seqs'][region]) - set(iglfo['seqs'][region])
    spurious_alleles = set(iglfo['seqs'][region]) - set(sglfo['seqs'][region])
    if debug:
        if len(missing_alleles) > 0:
            print '    %2d  missing %s' % (len(missing_alleles), ' '.join([utils.color_gene(g) for g in missing_alleles]))
        if len(spurious_alleles) > 0:
            print '    %2d spurious %s' % (len(spurious_alleles), ' '.join([utils.color_gene(g) for g in spurious_alleles]))
        if len(missing_alleles) == 0 and len(spurious_alleles) == 0:
            print '    none missing'
    return {
        'missing' : len(missing_alleles),
        'spurious' : len(spurious_alleles),
        'total' : len([g for g in sglfo['seqs'][region] if '+' in g]),  # anybody with a '+' should be a new allele
    }
示例#30
0
 def set_per_gene_support(self, true_line, inf_line, region):
     if inf_line[region +
                 '_per_gene_support'].keys()[0] != inf_line[region +
                                                            '_gene']:
         print '   WARNING best-supported gene %s not same as viterbi gene %s' % (
             utils.color_gene(
                 inf_line[region + '_per_gene_support'].keys()[0]),
             utils.color_gene(inf_line[region + '_gene']))
     support = inf_line[region + '_per_gene_support'].values()[
         0]  # sorted, ordered dict with gene : logprob key-val pairs
     if true_line[region + '_gene'] == inf_line[
             region +
             '_gene']:  # NOTE this requires allele to be correct, but set_bool_column() does not
         self.hists[region +
                    '_allele_right_vs_per_gene_support'].fill(support)
     else:
         self.hists[region +
                    '_allele_wrong_vs_per_gene_support'].fill(support)
示例#31
0
def print_results(gl_sets):
    tmpfo = {
        'missing': set(gl_sets['sim']) - set(gl_sets['inf']),
        'spurious': set(gl_sets['inf']) - set(gl_sets['sim']),
        'ok': set(gl_sets['inf']) & set(gl_sets['sim'])
    }
    for name, genes in tmpfo.items():
        print '    %9s %2d: %s' % (name, len(genes), ' '.join(
            [utils.color_gene(g) for g in genes]))
示例#32
0
def build_v_gene_set(glfo, introns):
    total_d_counts = {}
    refseqs = {}
    for d_gene, counts in introns.items():
        total_d_counts[d_gene] = sum(counts.values())
    for d_gene, _ in sorted(total_d_counts.items(), key=operator.itemgetter(1), reverse=True):
        counts = introns[d_gene]

        # first decide on the reference sequences
        refseq, column_counts = None, None
        for seq in sorted(counts, key=len, reverse=True):
            if refseq is None:  # first one, i.e. the longest
                refseq = seq
                column_counts = [{n : 0 for n in utils.nukes} for i in range(len(refseq))]
            ioffset = len(refseq) - len(seq)
            partial_refseq = refseq[ioffset:]
            assert len(partial_refseq) == len(seq)
            for ibase in range(ioffset, len(refseq)):
                column_counts[ibase][seq[ibase - ioffset]] += counts[seq]

        refseqs[d_gene] = []
        for basecounts in column_counts:
            most_common_base = sorted(basecounts.items(), key=operator.itemgetter(1), reverse=True)[0][0]
            refseqs[d_gene].append(most_common_base)
        refseqs[d_gene] = ''.join(refseqs[d_gene])

        n_ok = 0
        mutecounts = {}
        for seq in sorted(counts, key=len, reverse=True):
            # print '    %3d   %150s' % (count, seq)
            partial_refseq = refseqs[d_gene][len(refseqs[d_gene]) - len(seq):]
            if seq == partial_refseq:
                n_ok += counts[seq]
            else:
                # utils.color_mutants(partial_refseq, seq, print_result=True, extra_str='                ')
                n_mutes = utils.hamming_distance(partial_refseq, seq)
                if n_mutes not in mutecounts:
                    mutecounts[n_mutes] = 0
                mutecounts[n_mutes] += counts[seq]
        print '  %s   %4d / %-4d ok' % (utils.color_gene(d_gene, width=10), n_ok, n_ok + sum(mutecounts.values())),
        if len(mutecounts) > 0:
            print '(mean of %.1f mutations among the other %d' % (numpy.average(mutecounts.keys(), weights=mutecounts.values()), sum(mutecounts.values())),
        print ''

    # add the intronic v genes to glfo
    for d_gene, refseq in refseqs.items():
        glfo['seqs']['v'][utils.generate_dummy_v(d_gene)] = refseq
        glfo['cyst-positions'][utils.generate_dummy_v(d_gene)] = len(refseq) - 3

    # write a glfo dir with everything
    glutils.write_glfo(outdir + '/germlines/imgt-and-intronic', glfo, debug=True)

    # remove the original v genes, and write a glfo dir with just the intronic ones
    glutils.remove_genes(glfo, [g for g in glfo['seqs']['v'] if 'xDx' not in g], debug=True)
    glutils.write_glfo(outdir + '/germlines/intronic', glfo, debug=True)
示例#33
0
def make_mutefreq_plot(plotdir, gene_name, positions):
    import plotting
    """ NOTE shares a lot with make_transition_plot() in bin/plot-hmms.py. """
    nuke_colors = {'A' : 'red', 'C' : 'blue', 'G' : 'orange', 'T' : 'green'}
    fig, ax = plotting.mpl_init()
    fig.set_size_inches(plotting.plot_ratios[utils.get_region(gene_name)])

    ibin = 0
    print utils.color_gene(utils.unsanitize_name(gene_name))
    legend_colors = set()
    for info in positions:
        posname = info['name']

        # make label below bin
        ax.text(-0.5 + ibin, -0.075, simplify_state_name(posname), rotation='vertical', size=8)

        total = 0.0
        alpha = 0.6
        for nuke, prob in sorted(info['nuke_freqs'].items(), key=operator.itemgetter(1), reverse=True):
            color = nuke_colors[nuke]

            label_to_use = None
            if color not in legend_colors:
                label_to_use = nuke
                legend_colors.add(color)

            # horizontal line at height total+prob
            ax.plot([-0.5 + ibin, 0.5 + ibin], [total + prob, total + prob], color=color, alpha=alpha, linewidth=3, label=label_to_use)

            # vertical line from total to total + prob
            ax.plot([ibin, ibin], [total + 0.01, total + prob], color=color, alpha=alpha, linewidth=3)

            # # write [ACGT] at midpoint between total and total+prob
            # midpoint = 0.5*(prob + 2*total)
            # ... *redacted*

            total += prob

        ibin += 1

    ax.get_xaxis().set_visible(False)
    plotting.mpl_finish(ax, plotdir, gene_name, ybounds=(-0.01, 1.01), xbounds=(-3, len(positions) + 3), leg_loc=(0.95, 0.1), adjust={'left' : 0.1, 'right' : 0.8}, leg_prop={'size' : 8})
示例#34
0
 def getvalstr(gene, val):
     """
     Return the string for one table entry: <val> as a percentage, followed by the name of <gene> (shortened if
     we're writing latex, and otherwise colored). If <gene> is None, or if it's a d gene from a locus that doesn't
     have d genes, returns a placeholder entry with dashes instead.
     """
     no_gene = gene is None
     if not no_gene and utils.get_region(gene) == 'd':
         no_gene = not utils.has_d_gene(utils.get_locus(gene))
     if no_gene:
         latex_pad = 4 * ' ' if latex else ''
         return '%s  %5.2s  %s  %-16s%s' % (cstr, ' - ', cstr, ' - ', latex_pad)

     if not latex:
         gstr = utils.color_gene(gene, width=18)
     else:
         gstr = utils.shorten_gene_name(gene, use_one_based_indexing=True, n_max_mutstrs=5)
         if emph_genes is not None and gene in emph_genes:  # emphasized genes get bold red
             gstr = '\\color{red}{\\textbf{%s}}' % gstr
     percent = 100 * val
     return '%s  %s%5.2f%s %s %-20s' % (cstr, estr, percent, estr, cstr, gstr)
示例#35
0
def write_inf_glfo(
    args
):  # read default glfo, restrict it to the specified alleles, and write to somewhere where all the methods can read it
    # NOTE this dir should *not* be modified by any of the methods
    inf_glfo = glutils.read_glfo('data/germlines/human',
                                 locus=args.locus,
                                 only_genes=args.inf_v_genes + args.dj_genes)
    print '  writing initial inference glfo with %d v: %s' % (len(
        inf_glfo['seqs']['v']), ' '.join(
            [utils.color_gene(g) for g in inf_glfo['seqs']['v']]))
    glutils.write_glfo(args.inf_glfo_dir, inf_glfo)
示例#36
0
def print_data_pair_results(gl_sets):
    assert len(gl_sets) == 2  # would need to update
    ds_1, ds_2 = gl_sets.keys()
    tmpfo = {
        ds_1: set(gl_sets[ds_1]) - set(gl_sets[ds_2]),
        ds_2: set(gl_sets[ds_2]) - set(gl_sets[ds_1]),
        'both': set(gl_sets[ds_2]) & set(gl_sets[ds_1])
    }
    for name, genes in tmpfo.items():
        print '    %9s %2d: %s' % (name, len(genes), ' '.join(
            [utils.color_gene(g) for g in genes]))
示例#37
0
def make_mutefreq_plot(plotdir, gene_name, positions):
    """ NOTE shares a lot with make_transition_plot() in bin/plot-hmms.py. """
    nuke_colors = {'A' : 'red', 'C' : 'blue', 'G' : 'orange', 'T' : 'green'}
    fig, ax = plotting.mpl_init()
    fig.set_size_inches(plotting.plot_ratios[utils.get_region(gene_name)])

    ibin = 0
    print utils.color_gene(utils.unsanitize_name(gene_name))
    legend_colors = set()
    for info in positions:
        posname = info['name']

        # make label below bin
        ax.text(-0.5 + ibin, -0.075, simplify_state_name(posname), rotation='vertical', size=8)

        total = 0.0
        alpha = 0.6
        for nuke, prob in sorted(info['nuke_freqs'].items(), key=operator.itemgetter(1), reverse=True):
            color = nuke_colors[nuke]

            label_to_use = None
            if color not in legend_colors:
                label_to_use = nuke
                legend_colors.add(color)

            # horizontal line at height total+prob
            ax.plot([-0.5 + ibin, 0.5 + ibin], [total + prob, total + prob], color=color, alpha=alpha, linewidth=3, label=label_to_use)

            # vertical line from total to total + prob
            ax.plot([ibin, ibin], [total + 0.01, total + prob], color=color, alpha=alpha, linewidth=3)

            # # write [ACGT] at midpoint between total and total+prob
            # midpoint = 0.5*(prob + 2*total)
            # ... *redacted*

            total += prob

        ibin += 1

    ax.get_xaxis().set_visible(False)
    plotting.mpl_finish(ax, plotdir, gene_name, ybounds=(-0.01, 1.01), xbounds=(-3, len(positions) + 3), leg_loc=(0.95, 0.1), adjust={'left' : 0.1, 'right' : 0.8}, leg_prop={'size' : 8})
示例#38
0
 def print_gene_choice(self):
     print '    chose:  gene             length'
     for region in utils.regions:
         print '        %s  %-18s %-3d' % (
             region, utils.color_gene(self.genes[region], width=18),
             len(self.original_seqs[region])),
         if region in self.pre_erosion_codon_positions:
             print ' (%s: %d)' % (
                 utils.conserved_codons[self.glfo['locus']][region],
                 self.pre_erosion_codon_positions[region])
         else:
             print ''
示例#39
0
    def __init__(self, base_indir, outdir, gene_name, glfo, args, debug=False):
        self.region = utils.get_region(gene_name)
        self.raw_name = gene_name  # i.e. unsanitized
        self.germline_seqs = glfo['seqs']  # all germline alleles
        self.germline_seq = self.germline_seqs[self.region][gene_name]  # germline sequence for this hmm
        self.indir = base_indir
        self.args = args
        self.debug = debug
        self.codon_positions = {r : glfo[c + '-positions'] for r, c in utils.conserved_codons[args.locus].items()}

        # parameters with values that I more or less made up
        self.precision = '16'  # number of digits after the decimal for probabilities
        self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
        self.n_max_to_interpolate = args.min_observations_to_write
        self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25}  # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths

        self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

        self.outdir = outdir
        self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

        self.insertions = []
        if self.region == 'v':
            self.insertions.append('fv')
        elif self.region == 'd':
            self.insertions.append('vd')
        elif self.region == 'j':
            self.insertions.append('dj')
            self.insertions.append('jf')

        assert len(utils.ambiguous_bases) == 1 and utils.ambiguous_bases[0] == 'N'  # maybe need to update some stuff below if this changes

        if self.debug:
            print '%s' % utils.color_gene(gene_name)

        self.n_occurences = utils.read_single_gene_count(self.indir, gene_name, debug=self.debug)  # how many times did we observe this gene in data?
        replacement_genes = None
        if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            if self.debug:
                print '      only saw it %d times (wanted %d), so use info from all other genes' % (self.n_occurences, self.args.min_observations_to_write)
            replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, debug=self.debug)

        self.erosion_probs = self.read_erosion_info(gene_name, replacement_genes)
        self.insertion_probs, self.insertion_content_probs = self.read_insertion_info(gene_name, replacement_genes)
        self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, locus=self.args.locus, approved_genes=replacement_genes)  # actual info in <self.mute_obs> isn't actually used a.t.m.

        self.track = Track('nukes', utils.nukes)
        self.saniname = utils.sanitize_name(gene_name)
        self.hmm = HMM(self.saniname, self.track.getdict())  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
        self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name))  # if we really didn't see this gene at all, take pity on it and kick it an eps
        tmp_mean_freq_hist = Hist(fname=self.indir + '/all-mean-mute-freqs.csv')
        self.hmm.extras['overall_mute_freq'] = tmp_mean_freq_hist.get_mean()
示例#40
0
    def __init__(self, base_indir, outdir, gene_name, glfo, args, debug=False):
        self.region = utils.get_region(gene_name)
        self.raw_name = gene_name  # i.e. unsanitized
        self.germline_seqs = glfo['seqs']  # all germline alleles
        self.germline_seq = self.germline_seqs[self.region][gene_name]  # germline sequence for this hmm
        self.indir = base_indir
        self.args = args
        self.debug = debug
        self.codon_positions = {r : glfo[c + '-positions'] for r, c in utils.conserved_codons[args.chain].items()}

        # parameters with values that I more or less made up
        self.precision = '16'  # number of digits after the decimal for probabilities
        self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
        self.n_max_to_interpolate = args.min_observations_to_write
        self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25}  # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths

        self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

        self.outdir = outdir
        self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

        self.insertions = []
        if self.region == 'v':
            self.insertions.append('fv')
        elif self.region == 'd':
            self.insertions.append('vd')
        elif self.region == 'j':
            self.insertions.append('dj')
            self.insertions.append('jf')

        assert len(utils.ambiguous_bases) == 1 and utils.ambiguous_bases[0] == 'N'  # maybe need to update some stuff below if this changes

        if self.debug:
            print '%s' % utils.color_gene(gene_name)

        self.n_occurences = utils.read_single_gene_count(self.indir, gene_name, debug=self.debug)  # how many times did we observe this gene in data?
        replacement_genes = None
        if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            if self.debug:
                print '      only saw it %d times (wanted %d), so use info from all other genes' % (self.n_occurences, self.args.min_observations_to_write)
            replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, debug=self.debug)

        self.erosion_probs = self.read_erosion_info(gene_name, replacement_genes)
        self.insertion_probs, self.insertion_content_probs = self.read_insertion_info(gene_name, replacement_genes)
        self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, chain=self.args.chain, approved_genes=replacement_genes)  # actual info in <self.mute_obs> isn't actually used a.t.m.

        self.track = Track('nukes', utils.nukes)
        self.saniname = utils.sanitize_name(gene_name)
        self.hmm = HMM(self.saniname, self.track.getdict())  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
        self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name))  # if we really didn't see this gene at all, take pity on it and kick it an eps
        tmp_mean_freq_hist = Hist(fname=self.indir + '/all-mean-mute-freqs.csv')
        self.hmm.extras['overall_mute_freq'] = tmp_mean_freq_hist.get_mean()
示例#41
0
def print_stuff(line):
    cluster_index = sorted_clusters.index(cluster)
    naive_cdr3, matureiseq0_cdr3 = utils.subset_sequences(line, iseq=0, restrict_to_region='cdr3') # returns the CDR3 nt sequence for naive, and the first mutated sequence (iseq0); CDR3 = first base of cysteine through last base of tryptophan

    # mature_cdr3_seqs = []  # trying to translate the consensus cdr3 so I can search these with my seed seqs
    # for iseq in range(len(line['unique_ids'])):
    #     naive_cdr3_seq, mature_cdr3_seq = utils.subset_sequences(line, iseq=iseq, restrict_to_region='cdr3')
    #     mature_cdr3_seqs.append(mature_cdr3_seq)
    # mature_cdr3_seqs
    # translated_cdr3 = mature_cdr3_seqs.translate()

    cdr3_aa = '%-30s' % Seq(naive_cdr3).translate()
    # If a cluster contains one of our seed seqs, color this CDR3 red
    if any('-ig' in s for s in line['unique_ids']):
        cdr3_aa = utils.color('red', cdr3_aa, width=30)
    if args.cdr3 in cdr3_aa: # Only print clusters with naive CDR3 that matches our specified --cdr3 argument
        print 'index    genes                                        size    n muts    SHM     rep frac     CDR3                                FayWuH'
        print '                                                            mean  med                        len  seq'
        print '%4s     %s %s %s %5d %5d %5d %7.3f   %8.4f     %2d   %s %4.2f' % (
                cluster_index,
                utils.color_gene(line['v_gene'], width=15),
                utils.color_gene(line['d_gene'], width=15),
                utils.color_gene(line['j_gene'], width=10),
                len(line['unique_ids']),
                numpy.mean(line['n_mutations']),
                numpy.median(line['n_mutations']),
                numpy.mean(line['mut_freqs']),
                float(len(cluster)) / n_total,
                (line['cdr3_length']/3),
                cdr3_aa,
                utils.fay_wu_h(line, debug=False),
                )
        # print 'number of mutations per sequence in cluster', sorted(line['n_mutations'])
        print len(line['naive_seq']), 'length of naive seq'
        # utils.print_reco_event(utils.synthesize_single_seq_line(line, iseq=0))  # print ascii-art representation of the rearrangement event
        print 'unique_ids: ', getkey(line['unique_ids'])
        print
        print utils.print_reco_event(line)
示例#42
0
def read_mute_counts(
    indir,
    gene,
    locus,
    extra_genes=None,
    debug=False
):  # NOTE I'm adding the <extra_genes> arg in a hackish way because i need this to not crash in one specific instance (running bin/test-germline-inference.py) where the file for <gene> doesn't exist, but I don't remember/understand how this fcn and the following function work well enough to do this more sensibly
    # NOTE also that this new hack that allows a different gene's counts to be used might break something later on if the genes have different lengths? I have no idea
    # ----------------------------------------------------------------------------------------
    def read_single_file(gtmp):
        mfname = indir + '/mute-freqs/' + utils.sanitize_name(gtmp) + '.csv'
        if not os.path.exists(mfname):
            return None
        observed_counts = {}
        with open(mfname, 'r') as mutefile:
            reader = csv.DictReader(mutefile)
            for line in reader:
                pos = int(line['position'])
                assert pos not in observed_counts
                observed_counts[pos] = {
                    n: int(line[n + '_obs'])
                    for n in utils.nukes
                }
        if debug:
            print '    read %d per-base mute counts from %s' % (
                len(observed_counts), mfname)
        return observed_counts

    # ----------------------------------------------------------------------------------------
    if extra_genes is not None:  # I don't want to fix it cause it'd be kinda hard, and also I don't think it ever happens under normal circumstances -- it's only called with this arg from simulation, in which case you should always have parameters for the gene you're asking for
        print '%s Reading per-base mutation counts for genes (%s) in addition to the desired one (%s), which doesn\'t really make sense, since the counts will be wrong at the positions at which the genes differ.' % (
            utils.color('red', 'warning'), utils.color_genes(extra_genes),
            utils.color_gene(gene))
        print '   This should only happen if you\'re doing something weird, probably running simulation asking for genes for which you don\'t have parameters.'
        print '   If this is the case and you only care that it doesn\'t crash, and not that the mutation model is particularly accurate, this is fine.'
    if gene == glutils.dummy_d_genes[locus]:
        return {}

    if extra_genes is None:
        approved_genes = [gene]
    else:
        assert gene not in extra_genes
        approved_genes = [gene] + extra_genes

    for gtmp in approved_genes:
        observed_counts = read_single_file(gtmp)
        if observed_counts is not None:  # HACK this just uses the first one that's there (in the vast majority of cases it'll just be <gene> -- i think the only way it can be missing is if you hard code a specific gene (e.g. in bin/test-germline-inference.py) and it isn't in the parameter directory you passed
            break
    return observed_counts  # raw per-{ACGT} counts for each position, summed over genes ("raw" as in not a weighted average over a bunch of genes as in read_mute_freqs_with_weights())
示例#43
0
def read_allele_prevalence_freqs(fname, debug=False):
    # NOTE kinda weird to mash all the regions into one file here (as compared to parametercounter), but it seems to make more sense
    allele_prevalence_freqs = {r: {} for r in utils.regions}
    with open(fname) as pfile:
        reader = csv.DictReader(pfile)
        for line in reader:
            allele_prevalence_freqs[utils.get_region(line["gene"])][line["gene"]] = float(line["freq"])
    for region in utils.regions:
        if len(allele_prevalence_freqs[region]) == 0:
            continue
        if debug:
            for gene, freq in allele_prevalence_freqs[region].items():
                print "%14.8f   %s" % (freq, utils.color_gene(gene))
        assert utils.is_normed(allele_prevalence_freqs[region])
    return allele_prevalence_freqs
示例#44
0
def read_allele_prevalence_freqs(fname, debug=False):
    # NOTE kinda weird to mash all the regions into one file here (as compared to parametercounter), but it seems to make more sense
    allele_prevalence_freqs = {r : {} for r in utils.regions}
    with open(fname) as pfile:
        reader = csv.DictReader(pfile)
        for line in reader:
            allele_prevalence_freqs[utils.get_region(line['gene'])][line['gene']] = float(line['freq'])
    for region in utils.regions:
        if len(allele_prevalence_freqs[region]) == 0:
            continue
        if debug:
            for gene, freq in allele_prevalence_freqs[region].items():
                print '%14.8f   %s' % (freq, utils.color_gene(gene))
        assert utils.is_normed(allele_prevalence_freqs[region])
    return allele_prevalence_freqs
示例#45
0
    def print_match(self, region, gene, query_seq, score, glbounds, qrbounds, codon_pos, warnings, skipping=False):
        out_str_list = []
        buff_str = (20 - len(gene)) * ' '
        out_str_list.append('%8s%s%s%9s%3s %6.0f        ' % (' ', utils.color_gene(gene), '', '', buff_str, score))
        out_str_list.append('%4d%4d   %s\n' % (glbounds[0], glbounds[1], self.glfo['seqs'][region][gene][glbounds[0]:glbounds[1]]))
        out_str_list.append('%46s  %4d%4d' % ('', qrbounds[0], qrbounds[1]))
        out_str_list.append('   %s ' % (utils.color_mutants(self.glfo['seqs'][region][gene][glbounds[0]:glbounds[1]], query_seq[qrbounds[0]:qrbounds[1]])))
        if region != 'd':
            out_str_list.append('(%s %d)' % (utils.conserved_codons[region], codon_pos))
        if warnings[gene] != '':
            out_str_list.append('WARNING ' + warnings[gene])
        if skipping:
            out_str_list.append('skipping!')

        print ''.join(out_str_list)
示例#46
0
def choose_some_alleles(region, genes_to_use, allelic_groups, n_alleles_per_gene, debug=False):
    """
    Choose a gene (i.e. a primary and sub-version) from <allelic_groups>, and its attendant alleles.

    The number of alleles is drawn from <n_alleles_per_gene[region]>, then a (primary version, sub-version) pair
    with at least that many alleles is chosen uniformly at random, and that many of its alleles are drawn without
    replacement. If no pair has enough alleles, a new number of alleles is drawn and we try again.

    NOTE modifies both arguments in place: the new alleles are added to the set <genes_to_use>, and the chosen
    sub-version is removed from <allelic_groups> (along with its primary version, if that leaves it empty).

    Raises Exception if there's nothing left in <allelic_groups[region]>.
    """
    if len(allelic_groups[region]) == 0:
        raise Exception(
            "ran out of %s alleles (either --n-genes-per-region or --n-alleles-per-gene are probably too big)" % region
        )

    available_versions = None
    while available_versions is None or len(available_versions) == 0:
        if available_versions is not None:  # i.e. the previous try didn't find anything
            print("  %s couldn't find any versions that have %d alleles, so trying again" % (
                utils.color("red", "warning"),
                n_alleles,
            ))
        n_alleles = numpy.random.choice(n_alleles_per_gene[region])
        available_versions = [
            (pv, subv)
            for pv in allelic_groups[region]
            for subv in allelic_groups[region][pv]
            if len(allelic_groups[region][pv][subv]) >= n_alleles
        ]

    # numpy.random.choice() can't handle a list of tuples, so draw an index instead.
    # NOTE randint()'s upper bound is exclusive, so it has to be the full length (not length - 1), otherwise the last version can never be chosen.
    # With only one version there's nothing to choose, so don't touch the random number generator.
    ichoice = numpy.random.randint(0, len(available_versions)) if len(available_versions) > 1 else 0
    primary_version, sub_version = available_versions[ichoice]
    new_alleles = set(
        numpy.random.choice(list(allelic_groups[region][primary_version][sub_version]), size=n_alleles, replace=False)
    )
    if debug:
        print("      %8s %5s   %s" % (
            primary_version,
            sub_version,
            " ".join([utils.color_gene(g, width=15) for g in new_alleles]),
        ))

    assert len(new_alleles & genes_to_use) == 0  # make sure none of the new alleles are already in <genes_to_use>
    genes_to_use |= new_alleles  # actually add them to the final set

    # remove stuff we've used from <allelic_groups>
    del allelic_groups[region][primary_version][sub_version]  # remove this sub-version (we don't want any more alleles from it)
    if len(allelic_groups[region][primary_version]) == 0:
        del allelic_groups[region][primary_version]
示例#47
0
def read_fasta_file(seqs, fname, skip_pseudogenes, aligned=False):
    n_skipped_pseudogenes = 0
    seq_to_gene_map = {}
    for seqfo in utils.read_fastx(fname):
        # first get gene name
        if seqfo['name'][:2] != 'IG' and seqfo['name'][:2] != 'TR':  # if it's an imgt file, with a bunch of header info (and the accession number first)
            gene = seqfo['infostrs'][imgt_info_indices.index('gene')]
            functionality = seqfo['infostrs'][imgt_info_indices.index('functionality')]
            if functionality not in functionalities:
                raise Exception('unexpected functionality %s in %s' % (functionality, fname))
            if skip_pseudogenes and functionality in pseudogene_funcionalities:
                n_skipped_pseudogenes += 1
                continue
        else:  # plain fasta with just the gene name after the '>'
            gene = seqfo['name']
        utils.split_gene(gene)  # just to check if it's a valid gene name
        if not aligned and utils.get_region(gene) != utils.get_region(os.path.basename(fname)):  # if <aligned> is True, file name is expected to be whatever
            raise Exception('gene %s from %s has unexpected region %s' % (gene, os.path.basename(fname), utils.get_region(gene)))
        if gene in seqs[utils.get_region(gene)]:
            raise Exception('gene name %s appears twice in %s' % (gene, fname))

        # then the sequence
        seq = seqfo['seq']
        if not aligned:
            seq = utils.remove_gaps(seq)
        if 'Y' in seq:
            print '      replacing Y --> N (%d of \'em) in %s' % (seq.count('Y'), utils.color_gene(gene))
            seq = seq.replace('Y', 'N')
        if len(seq.strip(''.join(utils.expected_characters))) > 0:  # return the empty string if it only contains expected characters
            raise Exception('unexpected character %s in %s (expected %s)' % (seq.strip(''.join(utils.expected_characters)), seq, ' '.join(utils.expected_characters)))
        if seq not in seq_to_gene_map:
            seq_to_gene_map[seq] = []
        seq_to_gene_map[seq].append(gene)

        seqs[utils.get_region(gene)][gene] = seq

    tmpcounts = [len(gl) for gl in seq_to_gene_map.values()]  # number of names corresponding to each sequence (should all be ones)
    if tmpcounts.count(1) != len(tmpcounts):
        print '  mutliple names in %s for the following sequences:' % fname
        for seq, genelist in seq_to_gene_map.items():
            if len(genelist) > 1:
                print '    %-50s   %s' % (' '.join(genelist), seq)
        raise Exception('please de-duplicate the fasta and re-run.')

    if n_skipped_pseudogenes > 0:
        print '    skipped %d %s pseudogenes (leaving %d)' % (n_skipped_pseudogenes, utils.get_region(os.path.basename(fname)), len(seqs[utils.get_region(os.path.basename(fname))]))
示例#48
0
 def print_cluster(self, iclust, clusterfo, sorted_glcounts, new_seq, true_sorted_glcounts, mean_cluster_mfreqs, has_indels):
     if iclust > 0:
         print ''
     print '    %-3d  %4d   %6.3f' % (iclust, len(clusterfo['seqfos']), mean_cluster_mfreqs['v'] / mean_cluster_mfreqs['j']),
     for igene in range(len(sorted_glcounts)):
         if igene > 0:
             print '%22s' % '',
         gene, counts = sorted_glcounts[igene]
         print '   %-s %4d      %2d%s' % (utils.color_gene(gene, width=20), counts, utils.hamming_distance(new_seq, self.glfo['seqs'][self.region][gene], align=True), ' (%s)' % utils.color('blue', 'x') if has_indels else '   '),
         if igene < len(sorted_glcounts) - 1 or self.reco_info is not None:
             print ''
     if self.reco_info is not None:
         for igene in range(len(true_sorted_glcounts)):
             gene, counts = true_sorted_glcounts[igene]
             print '%17s       %s %-s %4d %s    %2d   ' % ('', utils.color('green', '['), utils.color_gene(gene[:23], width=20), counts, utils.color('green', ']'), utils.hamming_distance(new_seq, self.simglfo['seqs'][self.region][gene], align=True)),
             if igene < len(true_sorted_glcounts) - 1:
                 print ''
示例#49
0
def check_allele_prevalence_freqs(outfname, glfo, allele_prevalence_fname, only_region=None):
    allele_prevalence_freqs = read_allele_prevalence_freqs(allele_prevalence_fname)
    counts = {r: {g: 0 for g in glfo["seqs"][r]} for r in utils.regions}
    with open(outfname) as outfile:
        reader = csv.DictReader(outfile)
        for line in reader:
            for region in utils.regions:
                counts[region][line[region + "_gene"]] += 1
    print "   checking allele prevalence freqs"
    for region in utils.regions:
        if only_region is not None and region != only_region:
            continue
        total = sum(counts[region].values())
        print "       %s   obs / tot  =  freq    expected" % region
        for gene in glfo["seqs"][region]:
            print "          %4d / %-4d = %.3f    %.3f   %s" % (
                counts[region][gene],
                total,
                float(counts[region][gene]) / total,
                allele_prevalence_freqs[region][gene],
                utils.color_gene(gene, width=15),
            )
    def write_hmms(self, parameter_dir, sw_matches):
        print 'writing hmms with info from %s' % parameter_dir
        start = time.time()
        from hmmwriter import HmmWriter
        hmm_dir = parameter_dir + '/hmms'
        utils.prep_dir(hmm_dir, '*.yaml')

        gene_list = self.args.only_genes
        if gene_list == None:  # if specific genes weren't specified, do the ones for which we have matches
            gene_list = []
            for region in utils.regions:
                for gene in self.germline_seqs[region]:
                    if sw_matches == None or gene in sw_matches:  # shouldn't be None really, but I'm testing something
                        gene_list.append(gene)

        for gene in gene_list:
            if self.args.debug:
                print '  %s' % utils.color_gene(gene)
            writer = HmmWriter(
                parameter_dir, hmm_dir, gene, self.args.naivety,
                self.germline_seqs[utils.get_region(gene)][gene], self.args)
            writer.write()

        print '    time to write hmms: %.3f' % (time.time() - start)
示例#51
0
    def write_hmms(self, parameter_dir, sw_matches):
        print 'writing hmms with info from %s' % parameter_dir
        start = time.time()
        from hmmwriter import HmmWriter
        hmm_dir = parameter_dir + '/hmms'
        utils.prep_dir(hmm_dir, '*.yaml')

        gene_list = self.args.only_genes
        if gene_list == None:  # if specific genes weren't specified, do the ones for which we have matches
            gene_list = []
            for region in utils.regions:
                for gene in self.germline_seqs[region]:
                    if sw_matches == None or gene in sw_matches:  # shouldn't be None really, but I'm testing something
                        gene_list.append(gene)

        for gene in gene_list:
            if self.args.debug:
                print '  %s' % utils.color_gene(gene)
            writer = HmmWriter(parameter_dir, hmm_dir, gene, self.args.naivety,
                               self.germline_seqs[utils.get_region(gene)][gene],
                               self.args)
            writer.write()

        print '    time to write hmms: %.3f' % (time.time()-start)
示例#52
0
    def __init__(self, args):
        self.args = args

        self.germline_seqs = utils.read_germlines(self.args.datadir)

        perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, 'imgt')

        # get sequence info that was passed to imgt
        self.seqinfo = {}
        with opener('r')(self.args.simfname) as simfile:
            reader = csv.DictReader(simfile)
            iline = 0
            for line in reader:
                if self.args.queries != None and line['unique_id'] not in self.args.queries:
                    continue
                if len(re.findall('_[FP]', line['j_gene'])) > 0:
                    line['j_gene'] = line['j_gene'].replace(re.findall('_[FP]', line['j_gene'])[0], '')
                self.seqinfo[line['unique_id']] = line
                iline += 1
                if self.args.n_queries > 0 and iline >= self.args.n_queries:
                    break

        paragraphs, csv_info = None, None
        if self.args.infname != None and '.html' in self.args.infname:
            print 'reading', self.args.infname
            with opener('r')(self.args.infname) as infile:
                soup = BeautifulSoup(infile)
                paragraphs = soup.find_all('pre')

        summarydir = self.args.indir[ : self.args.indir.rfind('/')]  # one directoy up from <indir>, which has the detailed per-sequence files
        summary_fname = glob.glob(summarydir + '/1_Summary_*.txt')
        assert len(summary_fname) == 1
        summary_fname = summary_fname[0]
        get_genes_to_skip(summary_fname, self.germline_seqs)

        n_failed, n_skipped, n_total, n_not_found, n_found = 0, 0, 0, 0, 0
        for unique_id in self.seqinfo:
            if self.args.debug:
                print unique_id,
            imgtinfo = []
            # print 'true'
            # utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id])
            if self.args.infname != None and '.html' in self.args.infname:
                for pre in paragraphs:  # NOTE this loops over everything an awful lot of times. Shouldn't really matter for now, though
                    if unique_id in pre.text:
                        imgtinfo.append(pre.text)
            else:
                n_total += 1
                assert self.args.infname == None
                infnames = glob.glob(self.args.indir + '/' + unique_id + '*')
                assert len(infnames) <= 1
                if len(infnames) != 1:
                    if self.args.debug:
                        print ' couldn\'t find it'
                    n_not_found += 1
                    continue
                n_found += 1
                with opener('r')(infnames[0]) as infile:
                    full_text = infile.read()
                    if len(re.findall('[123]. Alignment for [VDJ]-GENE', full_text)) < 3:
                        failregions = re.findall('No [VDJ]-GENE has been identified', full_text)
                        if self.args.debug and len(failregions) > 0:
                            print '    ', failregions
                        n_failed += 1
                        continue

                    # loop over the paragraphs I want
                    position = full_text.find(unique_id)  # don't need this one
                    for ir in range(4):
                        position = full_text.find(unique_id, position+1)
                        pgraph = full_text[position : full_text.find('\n\n', position+1)]
                        if 'insertion(s) and/or deletion(s) which are not dealt in this release' in pgraph:
                            ir -= 1
                            continue
                        imgtinfo.append(pgraph)  # query seq paragraph

            if len(imgtinfo) == 0:
                print '%s no info' % unique_id
                continue
            else:
                if self.args.debug:
                    print ''
            line = self.parse_query_text(unique_id, imgtinfo)
            if 'skip_gene' in line:
                # assert self.args.skip_missing_genes
                n_skipped += 1
                continue
            try:
                assert 'failed' not in line
                joinparser.add_insertions(line, debug=self.args.debug)
                joinparser.resolve_overlapping_matches(line, debug=False, germlines=self.germline_seqs)
            except (AssertionError, KeyError):
                print '    giving up'
                n_failed += 1
                perfplotter.add_partial_fail(self.seqinfo[unique_id], line)
                # print '    perfplotter: not sure what to do with a fail'
                continue
            perfplotter.evaluate(self.seqinfo[unique_id], line)
            if self.args.debug:
                utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id], label='true:')
                utils.print_reco_event(self.germline_seqs, line, label='inferred:')

        perfplotter.plot()
        print 'failed: %d / %d = %f' % (n_failed, n_total, float(n_failed) / n_total)
        print 'skipped: %d / %d = %f' % (n_skipped, n_total, float(n_skipped) / n_total)
        print '    ',
        for g, n in genes_actually_skipped.items():
            print '  %d %s' % (n, utils.color_gene(g))
        print ''
        if n_not_found > 0:
            print '  not found: %d / %d = %f' % (n_not_found, n_not_found + n_found, n_not_found / float(n_not_found + n_found))
示例#53
0
 def skip_gene(gene):
     print '    %s in list of genes to skip' % utils.color_gene(gene)
     if gene not in genes_actually_skipped:
         genes_actually_skipped[gene] = 0
     genes_actually_skipped[gene] += 1
     line['skip_gene'] = True
示例#54
0
    def get_positions_to_fit(self, gene, gene_results, debug=False):
        """
        Work out which positions in <gene> have enough observations to be worth fitting.

        Resets <self.fitted_positions[gene]> to an empty set, then keeps each position in <self.mfreqer.counts[gene]>
        whose summed 'obs' exceeds <self.n_muted_min> or whose summed 'total' exceeds <self.n_total_min>.

        Returns (positions to try to fit, {position : xyvals for every position}), and stores the xyvals of the
        kept positions in <self.plotvals[gene]>. If too few positions are kept, <gene> is added to
        <gene_results['not_enough_obs_to_fit']> and (None, None) is returned.
        """
        self.fitted_positions[gene] = set()

        positions = sorted(self.mfreqer.counts[gene].keys())
        xyvals = {pos : self.get_allele_finding_xyvals(gene, pos) for pos in positions}
        positions_to_try_to_fit = [pos for pos in positions if sum(xyvals[pos]['obs']) > self.n_muted_min or sum(xyvals[pos]['total']) > self.n_total_min]  # ignore positions with neither enough mutations or total observations
        if len(positions_to_try_to_fit) < self.n_max_snps - 1 + self.min_non_candidate_positions_to_fit:
            gene_results['not_enough_obs_to_fit'].add(gene)
            if debug:
                print('          not enough positions with enough observations to fit %s' % utils.color_gene(gene))
            return None, None  # NOTE this has to happen whether or not <debug> is set, otherwise we'd go on to fit a gene that we just flagged as not having enough observations
        if debug and len(positions) > len(positions_to_try_to_fit):
            print('          skipping %d / %d positions (with fewer than %d mutations and %d observations)' % (len(positions) - len(positions_to_try_to_fit), len(positions), self.n_muted_min, self.n_total_min))

        self.plotvals[gene] = {pos : xyvals[pos] for pos in positions_to_try_to_fit}

        return positions_to_try_to_fit, xyvals
示例#55
0
    def parse_query_text(self, unique_id, query_info):
        if len(query_info) == 0:  # one for the query sequence, then one for v, d, and j
            print 'no info for',unique_id
            return {}
        elif len(query_info) < 4:
            regions_ok = ''
            for info in query_info:
                for region in utils.regions:
                    if 'IGH' + region.upper() in info:
                        regions_ok += region
            for region in utils.regions:
                if region not in regions_ok:
                    print '    ERROR no %s matches' % region
                    return {}
            assert False  # shouldn't get here
        elif len(query_info) != 4:
            print 'info for', unique_id, 'all messed up'
            for info in query_info:
                print info
            sys.exit()

        full_qr_seq = query_info[0].replace('>', '').replace(unique_id, '')  # strip off the unique id
        full_qr_seq = ''.join(full_qr_seq.split()).upper()  # strip off white space and uppercase it
        assert full_qr_seq == self.seqinfo[unique_id]['seq']

        line = {}
        line['unique_id'] = unique_id
        line['seq'] = full_qr_seq
        for ireg in range(len(utils.regions)):
            region = utils.regions[ireg]
            info = query_info[ireg + 1].splitlines()
            while unique_id not in info[0]:  # remove the line marking cdr3 and framework regions
                info.pop(0)
            if len(info) <= 1:
                print info
            assert len(info) > 1
            assert len(info[0].split()) == 2
            qr_seq = info[0].split()[1].upper()  # this line should be '<unique_id> .............<query_seq>'

            true_gene = self.seqinfo[unique_id][region + '_gene']
            imatch = 1  # which match to take
            match_name = str(info[imatch].split()[2])
            while match_name in just_always_friggin_skip and len(info) > imatch+1 and len(info[imatch+1].split()) > 2:
                imatch += 1
                old_one = match_name
                match_name = str(info[imatch].split()[2])
                if self.args.debug:
                    print '    %s: taking next match: %s --> %s)' % (unique_id, utils.color_gene(old_one), utils.color_gene(match_name))

            infer_gene = match_name
            for gset in equivalent_genes:
                if match_name in gset and true_gene in gset and match_name != true_gene:  # if the true gene and the inferred gene are in the same equivalence set, treat it as correct, i.e. just pretend it inferred the right name
                    if self.args.debug:
                        print '   %s: replacing name %s with true name %s' % (unique_id, match_name, true_gene)
                    infer_gene = true_gene

            # ----------------------------------------------------------------------------------------
            # skipping bullshit
            def skip_gene(gene):
                print '    %s in list of genes to skip' % utils.color_gene(gene)
                if gene not in genes_actually_skipped:
                    genes_actually_skipped[gene] = 0
                genes_actually_skipped[gene] += 1
                line['skip_gene'] = True

            if infer_gene not in self.germline_seqs[region]:
                print '    couldn\'t find %s in germlines (skipping)' % infer_gene
                skip_gene(infer_gene)
                return line

            if infer_gene in just_always_friggin_skip:
                skip_gene(infer_gene)
                return line
            if true_gene in just_always_friggin_skip:
                skip_gene(true)
                return line

            if not self.args.dont_skip_or15_genes and '/OR1' in true_gene:
                skip_gene(true_gene)
                return line

            if self.args.skip_missing_genes:
                if infer_gene in genes_to_skip:
                    skip_gene(infer_gene)
                    return line
                if true_gene in genes_to_skip:
                    skip_gene(true_gene)
                    return line

            gl_seq = info[imatch].split()[4].upper()
            if qr_seq.replace('.', '') not in self.seqinfo[unique_id]['seq']:
                # if self.args.debug:
                print '    qr_seq not found in seqinfo'
                line['failed'] = True
                return line

            if self.args.debug:
                if utils.are_alleles(infer_gene, true_gene):
                    regionstr = utils.color('bold', utils.color('blue', region))
                    truestr = '(originally %s)' % match_name
                else:
                    regionstr = utils.color('bold', utils.color('red', region))
                    truestr = '(true: %s)' % utils.color_gene(true_gene).replace(region, '')
                print '  %s %s %s' % (regionstr, utils.color_gene(infer_gene).replace(region, ''), truestr)
                print '    gl', gl_seq
                print '      ', qr_seq

            # replace the dots (gaps) in the gl match
            new_qr_seq, new_gl_seq = [], []
            for inuke in range(min(len(qr_seq), len(gl_seq))):
                if gl_seq[inuke] == '.':
                    pass
                else:
                    new_qr_seq.append(qr_seq[inuke])  # this should only be out of range if the v match extends through the whole query sequence, i.e. friggin never
                    new_gl_seq.append(gl_seq[inuke])
            for inuke in range(len(gl_seq), len(qr_seq)):
                new_qr_seq.append(qr_seq[inuke])
            for inuke in range(len(qr_seq), len(gl_seq)):
                new_gl_seq.append(gl_seq[inuke])
            qr_seq = ''.join(new_qr_seq)
            gl_seq = ''.join(new_gl_seq)

            # work out the erosions
            qr_ldots = qr_seq.rfind('.') + 1  # first strip off any dots on the left of query seq
            qr_seq = qr_seq[qr_ldots : ]
            gl_seq = gl_seq[qr_ldots : ]
            gl_ldots = gl_seq.rfind('.') + 1  # then remove dots on the left of the germline seq
            qr_seq = qr_seq[gl_ldots : ]
            gl_seq = gl_seq[gl_ldots : ]
            del_5p = qr_ldots + gl_ldots
            jf_insertion = ''
            if region == 'j':
                jf_insertion = qr_seq[len(gl_seq) : ]
            qr_seq = qr_seq[ : len(gl_seq)]  # then strip the right-hand portion of the query sequence that isn't aligned to the germline
            del_3p = len(gl_seq) - len(qr_seq)  # then do the same for the germline overhanging on the right of the query
            gl_seq = gl_seq[ : len(qr_seq)]
            assert len(gl_seq) == len(qr_seq)

            new_gl_seq = []
            for inuke in range(len(gl_seq)):  # replace dashes (matched bases)
                assert gl_seq[inuke] != '.'  # hoping there's no gaps in here
                if gl_seq[inuke] == '-':
                    new_gl_seq.append(qr_seq[inuke])
                else:
                    new_gl_seq.append(gl_seq[inuke])
            gl_seq = ''.join(new_gl_seq)

            if self.germline_seqs[region][infer_gene].find(gl_seq) != del_5p:  # why the *@*!! can't they make this consistent?
                if self.germline_seqs[region][infer_gene].find(gl_seq) < 0:
                    print 'whooooaa'
                    print self.germline_seqs[region][infer_gene]
                    print gl_seq
                    line['failed'] = True
                    return line
                del_5p += self.germline_seqs[region][infer_gene].find(gl_seq)

            try:
                assert del_5p + len(gl_seq) + del_3p + len(jf_insertion) == len(self.germline_seqs[region][infer_gene])
            except:
                print '    ERROR lengths failed for %s' % unique_id
                # print del_5p, len(gl_seq), del_3p, del_5p + len(gl_seq) + del_3p , len(self.germline_seqs[region][infer_gene])
                # print gl_seq
                # print self.germline_seqs[region][infer_gene]
                line['failed'] = True
                return line
                # assert False

            if self.args.debug:
                utils.color_mutants(gl_seq, qr_seq, ref_label='gl ', extra_str='    ', print_result=True, post_str='    del: %d %d' % (del_5p, del_3p))

            # try:
            #     infer_gene = joinparser.figure_out_which_damn_gene(self.germline_seqs, infer_gene, gl_seq, debug=self.args.debug)
            # except:
            #     print 'ERROR couldn\'t figure out the gene for %s' % infer_gene
            #     return {}

            line[region + '_gene'] = infer_gene
            line[region + '_qr_seq'] = qr_seq
            line[region + '_gl_seq'] = gl_seq
            line[region + '_5p_del'] = del_5p
            line[region + '_3p_del'] = del_3p
            if region == 'j':
                line['jf_insertion'] = jf_insertion
            
        return line
示例#56
0
            linefo = [p.replace('>', '').strip() for p in line.split('|')]
            gene = None
            for piece in linefo:
                if piece[:2] == 'IG':
                    gene = piece
            if gene is None:
                raise Exception('couldn\'t fine gene in %s' % line)

            if len(linefo) > 1:
                functionality = linefo[glutils.imgt_info_indices.index('functionality')]
                if functionality not in glutils.functionalities:
                    raise Exception('unexpected functionality %s' % functionality)
                if functionality == 'P':
                    n_skipped_pseudogenes += 1
                    continue

            genes[fname].add(gene)
        if n_skipped_pseudogenes > 0:
            print '    skipped %d pseudogenes' % n_skipped_pseudogenes

readfile(args.file1)
readfile(args.file2)

print 'file1: %d' % len(genes[args.file1])
print 'file2: %d' % len(genes[args.file2])
print 'both: %d' % len(genes[args.file1] & genes[args.file2])
only_file1 = genes[args.file1] - genes[args.file2]
print 'only file1: %d  (%s)' % (len(only_file1), ' '.join([utils.color_gene(g) for g in only_file1]))
only_file2 = genes[args.file2] - genes[args.file1]
print 'only file2: %d  (%s)' % (len(only_file2), ' '.join([utils.color_gene(g) for g in only_file2]))
示例#57
0
def get_genes_to_skip(fname, germlines, method='imgt', debug=False):
    with opener('r')(fname) as infile:
        if method == 'imgt':
            reader = csv.DictReader(infile, delimiter='\t')
            imgt_genes = set()  # genes that imgt spit out at least once
            iline = 0
            no_matches = {region:0 for region in utils.regions}
            for line in reader:
                iline += 1
                for region in utils.regions:
                    matchstr = line[region.upper() + '-GENE and allele']
                    if len(matchstr) == 0:
                        no_matches[region] += 1
                        # print '    no %s match' % region
                        continue
                    try:
                        gene = matchstr.split()[1]
                    except IndexError:
                        raise Exception('match problem in %s: %s' % (region, matchstr))
    
                    # print '%12s %s' % (gene in germlines[region], utils.color_gene(gene))
                    imgt_genes.add(gene)
    
                # if len(imgt_genes) > 10:
                #     # for g in imgt_genes:
                #     #     print utils.color_gene(g),
                #     break

            print 'read %d lines, no match (v/d/j): %d/%d/%d' % tuple([iline, ] + [no_matches[region] for region in utils.regions])

        elif method == 'igblast':
            filestr = infile.read()
            imgt_genes = set(re.findall('IGH[VDJ][^*]*\*[0-9][0-9]', filestr))  # ok, igblast genes, but it's not so bad to leave the variable name like that...
        else:
            raise Exception('bad method %s' % method)

        print '%s genes: ' % method,
        if debug:
            print ''
            for g in sorted(imgt_genes):
                print '  ', utils.color_gene(g)
        else:
            print len(imgt_genes)

        print '\nin %s output, not in simulation: ' % method
        for gene in sorted(imgt_genes):
            if gene not in germlines[utils.get_region(gene)]:
                if debug:
                    print '  ', utils.color_gene(gene)
                genes_to_skip.add(gene)
        if not debug:
            print len(genes_to_skip)

        print '\nin simulation, not in %s output: ' % method
        for region in utils.regions:
            for gene in sorted(germlines[region]):
                if gene not in imgt_genes:
                    if debug:
                        print '  ', utils.color_gene(gene)
                    genes_to_skip.add(gene)
        if not debug:
            print len(genes_to_skip)
        simulation_genes = set(germlines['v']) | set(germlines['d']) | set(germlines['j'])
        genes_to_use = imgt_genes & simulation_genes
        # print '\ngenes to use: %s' % len(genes_to_use)
        # if debug:
        #     for g in sorted(genes_to_use):
        #         print '  ', utils.color_gene(g)

        if len(genes_to_use & genes_to_skip) > 0:
            raise Exception('non zero intersection: %d' % len(genes_to_use & genes_to_skip))
示例#58
0
def read_fasta_file(seqs, fname, skip_pseudogenes, aligned=False):
    n_skipped_pseudogenes = 0
    seq_to_gene_map = {}
    for seq_record in SeqIO.parse(fname, "fasta"):
        linefo = [p.strip() for p in seq_record.description.split("|")]

        # first get gene name
        if linefo[0][:2] != "IG":  # if it's an imgt file, with a bunch of header info (and the accession number first)
            gene = linefo[imgt_info_indices.index("gene")]
            functionality = linefo[imgt_info_indices.index("functionality")]
            if functionality not in functionalities:
                raise Exception("unexpected functionality %s in %s" % (functionality, fname))
            if skip_pseudogenes and functionality in pseudogene_funcionalities:
                n_skipped_pseudogenes += 1
                continue
        else:  # plain fasta with just the gene name after the '>'
            gene = linefo[0]
        utils.split_gene(gene)  # just to check if it's a valid gene name
        if not aligned and utils.get_region(gene) != utils.get_region(
            os.path.basename(fname)
        ):  # if <aligned> is True, file name is expected to be whatever
            raise Exception(
                "gene %s from %s has unexpected region %s" % (gene, os.path.basename(fname), utils.get_region(gene))
            )
        if gene in seqs[utils.get_region(gene)]:
            raise Exception("gene name %s appears twice in %s" % (gene, fname))

        # then the sequence
        seq = str(seq_record.seq).upper()
        if not aligned:
            seq = utils.remove_gaps(seq)
        if "Y" in seq:
            print "      replacing Y --> N (%d of 'em) in %s" % (seq.count("Y"), utils.color_gene(gene))
            seq = seq.replace("Y", "N")
        if (
            len(seq.strip("".join(utils.expected_characters))) > 0
        ):  # return the empty string if it only contains expected characters
            raise Exception(
                "unexpected character %s in %s (expected %s)"
                % (seq.strip("".join(utils.expected_characters)), seq, " ".join(utils.expected_characters))
            )
        if seq not in seq_to_gene_map:
            seq_to_gene_map[seq] = []
        seq_to_gene_map[seq].append(gene)

        seqs[utils.get_region(gene)][gene] = seq

    tmpcounts = [
        len(gl) for gl in seq_to_gene_map.values()
    ]  # number of names corresponding to each sequence (should all be ones)
    if tmpcounts.count(1) != len(tmpcounts):
        print "  mutliple names in %s for the following sequences:" % fname
        for seq, genelist in seq_to_gene_map.items():
            if len(genelist) > 1:
                print "    %-50s   %s" % (" ".join(genelist), seq)
        raise Exception("please de-duplicate the fasta and re-run.")

    if n_skipped_pseudogenes > 0:
        print "    skipped %d %s pseudogenes (leaving %d)" % (
            n_skipped_pseudogenes,
            utils.get_region(os.path.basename(fname)),
            len(seqs[utils.get_region(os.path.basename(fname))]),
        )
示例#59
0
def get_new_alignments(glfo, region, debug=False):
    aligned_seqs = {}

    genes_with_alignments = set(
        aligned_seqs
    )  # used to already have some sequences aligned, and may as well keep around the code to handle that case
    genes_without_alignments = set(glfo["seqs"][region]) - set(aligned_seqs)
    if len(genes_without_alignments) == 0:
        if debug:
            print "  no missing %s alignments" % region
        return

    if debug:
        print "        missing alignments for %d %s genes" % (len(genes_without_alignments), region)
        if len(aligned_seqs) > 0:
            print "      existing alignments:"
            for g, seq in aligned_seqs.items():
                print "    %s   %s" % (seq, utils.color_gene(g))

    # find the longest aligned sequence, so we can pad everybody else with dots on the right out to that length
    biggest_length = None
    for gene in genes_with_alignments:
        if biggest_length is None or len(aligned_seqs[gene]) > biggest_length:
            biggest_length = len(aligned_seqs[gene])

    tmpdir = tempfile.mkdtemp()
    already_aligned_fname = tmpdir + "/already-aligned.fasta"
    not_aligned_fname = tmpdir + "/not-aligned.fasta"
    msa_table_fname = tmpdir + "/msa-table.txt"
    aligned_and_not_fnamefname = tmpdir + "/aligned-and-not.fasta"
    mafft_outfname = tmpdir + "/everybody-aligned.fasta"
    with open(already_aligned_fname, "w") as tmpfile, open(msa_table_fname, "w") as msafile:
        mysterious_index = 1
        msa_str = ""
        for gene in genes_with_alignments:
            dotstr = "." * (biggest_length - len(aligned_seqs[gene]))
            alistr = aligned_seqs[gene] + dotstr
            tmpfile.write(">%s\n%s\n" % (gene, alistr.replace(".", "-")))
            msa_str += " " + str(mysterious_index)
            mysterious_index += 1
        msafile.write("%s # %s\n" % (msa_str, already_aligned_fname))
    with open(not_aligned_fname, "w") as tmpfile:
        for gene in genes_without_alignments:
            tmpfile.write(">%s\n%s\n" % (gene, glfo["seqs"][region][gene]))

    check_call("cat " + already_aligned_fname + " " + not_aligned_fname + " >" + aligned_and_not_fnamefname, shell=True)

    # actually run mafft
    cmd = (
        "mafft --merge " + msa_table_fname + " " + aligned_and_not_fnamefname + " >" + mafft_outfname
    )  # options=  # "--localpair --maxiterate 1000"
    if debug:
        print "          RUN %s" % cmd
    proc = Popen(cmd, shell=True, stderr=PIPE)
    _, err = proc.communicate()  # debug info goes to err

    if debug and False:  # aw, screw it, I don't even know what any of mafft's output means
        # deal with debug info (for err -- out gets redirected to a file)
        err = err.replace("\r", "\n")
        printstrs = []
        for errstr in err.split("\n"):  # remove the stupid progress bar things
            matches = re.findall("[0-9][0-9]* / [0-9][0-9]*", errstr)
            if len(matches) == 1 and errstr.strip() == matches[0]:
                continue
            if len(errstr) == 0:
                continue
            printstrs.append(errstr)
        print "        " + "\n        ".join(printstrs)

    # deal with fasta output
    for seq_record in SeqIO.parse(mafft_outfname, "fasta"):
        gene = seq_record.name.split("|")[0]
        seq = str(seq_record.seq).upper()
        if (
            gene not in glfo["seqs"][region]
        ):  # only really possible if there's a bug in the preceding fifty lines, but oh well, you can't be too careful
            raise Exception("unexpected gene %s in mafft output" % gene)
        aligned_seqs[gene] = seq  # overwrite the old alignment with the new one
    if debug and False:  # too damn verbose with all the v genes
        print "  new alignments:"
        for g, seq in aligned_seqs.items():
            print "    %s   %s  %s" % (seq, utils.color_gene(g), "<--- new" if g in genes_without_alignments else "")

    os.remove(already_aligned_fname)
    os.remove(not_aligned_fname)
    os.remove(msa_table_fname)
    os.remove(aligned_and_not_fnamefname)
    os.remove(mafft_outfname)
    os.rmdir(tmpdir)

    return aligned_seqs
示例#60
0
def remove_genes(glfo, genes, debug=False):
    """ remove <genes> from <glfo> """
    if debug:
        print "  removing %s from glfo" % " ".join([utils.color_gene(g) for g in genes])
    for gene in genes:
        remove_gene(glfo, gene)