Exemplo n.º 1
0
    def reassign_template_counts(self, msa_info, new_alleles, debug=False):
        # XXX need to update family_groups here
        if len(new_alleles) == 0:
            return

        if debug:
            print '              template  new'
            print '      size      snps    snps    assigned',
            if self.reco_info is not None:
                print '         true',
            print ''

        dbg_print = debug  # don't print all the tiny clusters
        templates = {newfo['template-gene'] : newfo['gene'] for newfo in new_alleles.values()}
        self.adjusted_glcounts = {}
        for clusterfo in sorted(msa_info, key=lambda cfo: len(cfo['seqfos']), reverse=True):
            sorted_glcounts, true_sorted_glcounts = self.get_glcounts(clusterfo)  # it would be nice to not re-call this for the clusters we already called it on above
            for gene, counts in sorted_glcounts:  # <gene> is the one assigned by sw before allele clustering
                if debug and len(clusterfo['seqfos']) < 5:
                    if dbg_print:
                        print '     not printing clusters smaller than 5'
                    dbg_print = False

                if gene not in self.adjusted_glcounts:  # add it before we decide whether to switch it, so a template gene with zero counts will be in there with zero counts
                    self.adjusted_glcounts[gene] = 0
                if gene in templates:  # if this was a template for a new allele, we have to decide whether to apportion some or all of the sequences in this cluster to that new allele
                    template_gene = gene
                    template_cpos = utils.cdn_pos(self.glfo, self.region, template_gene)
                    cons_seq = clusterfo['cons_seq']
                    template_seq = self.glfo['seqs'][self.region][template_gene]
                    new_allele_seq = new_alleles[templates[template_gene]]['seq']

                    compare_len = min([template_cpos, len(cons_seq), len(template_seq), len(new_allele_seq)])  # NOTE this doesn't account for indels, i.e. the template and consensus sequences are in general different lengths, but that's ok, it'll just inflate the hamming distance for sequences that differ from consensus by indels, and all we care is finding the one that doesn't have any indels
                    n_template_snps = utils.hamming_distance(cons_seq[:compare_len], template_seq[:compare_len])
                    n_new_snps = utils.hamming_distance(cons_seq[:compare_len], new_allele_seq[:compare_len])

                    if debug and dbg_print:
                        print '    %5d      %3d     %3d' % (len(clusterfo['seqfos']), n_template_snps, n_new_snps),

                    if n_new_snps < n_template_snps:  # reassign to the new allele
                        gene = templates[template_gene]
                        if gene not in self.adjusted_glcounts:  # add it before we decide whether to switch it, so a template gene with zero counts will be in there with zero counts
                            self.adjusted_glcounts[gene] = 0

                    if debug and dbg_print:
                        print '    %s' % utils.color_gene(gene, width=15),
                        if self.reco_info is not None:
                            true_gene = true_sorted_glcounts[0][0]  # NOTE this is the most *common* simulated gene in the cluster, not necessarily the one corresponding to these particular sequences... but clusters with new alleles should generally be dominated by one gene, so oh, well
                            if true_gene == gene:
                                print '    %s' % utils.color('green', 'ok'),
                            else:
                                print '    %s' % utils.color_gene(true_gene, width=15),
                        print ''

                self.adjusted_glcounts[gene] += counts

        if debug:
            print '  final counts:'
            for gene, counts in sorted(self.adjusted_glcounts.items(), key=operator.itemgetter(1), reverse=True):
                print '    %4d  %s' % (counts, utils.color_gene(gene))
Exemplo n.º 2
0
    def try_scratch_erode_insert(self, tmpline, debug=False):
        utils.remove_all_implicit_info(tmpline)
        for erosion in utils.real_erosions:  # includes various contortions to avoid eroding the entire gene
            region = erosion[0]
            gene_length = len(self.glfo['seqs'][region][tmpline[region + '_gene']])
            if region == 'd' and not utils.has_d_gene(self.args.locus):  # dummy d genes: always erode the whole thing from the left
                assert gene_length == 1 and tmpline['d_gene'] == glutils.dummy_d_genes[self.args.locus]
                tmpline[erosion + '_del'] = 1 if '5p' in erosion else 0
            else:
                max_erosion = max(0, gene_length/2 - 2)  # heuristic
                if region in utils.conserved_codons[self.args.locus]:  # make sure not to erode a conserved codon
                    codon_pos = utils.cdn_pos(self.glfo, region, tmpline[region + '_gene'])
                    if '3p' in erosion:
                        n_bases_to_codon = gene_length - codon_pos - 3
                    elif '5p' in erosion:
                        n_bases_to_codon = codon_pos
                    max_erosion = min(max_erosion, n_bases_to_codon)
                tmpline[erosion + '_del'] = min(max_erosion, numpy.random.geometric(1. / utils.scratch_mean_erosion_lengths[erosion]) - 1)
        for bound in utils.boundaries:
            mean_length = utils.scratch_mean_insertion_lengths[self.args.locus][bound]
            length = 0 if mean_length == 0 else numpy.random.geometric(1. / mean_length) - 1
            probs = [self.insertion_content_probs[bound][n] for n in utils.nukes]
            tmpline[bound + '_insertion'] = ''.join(numpy.random.choice(utils.nukes, size=length, p=probs))

        if debug:
            print '    erosions:  %s' % ('   '.join([('%s %d' % (e, tmpline[e + '_del'])) for e in utils.real_erosions]))
            print '    insertions:  %s' % ('   '.join([('%s %s' % (b, tmpline[b + '_insertion'])) for b in utils.boundaries]))

        # have to add some things by hand so utils.add_implicit_info() doesn't barf (this duplicates code later on in recombinator)
        gl_seqs = {r : self.glfo['seqs'][r][tmpline[r + '_gene']] for r in utils.regions}
        for erosion in utils.real_erosions:
            region = erosion[0]
            e_length = tmpline[erosion + '_del']
            if '5p' in erosion:
                gl_seqs[region] = gl_seqs[region][e_length:]
            elif '3p' in erosion:
                gl_seqs[region] = gl_seqs[region][:len(gl_seqs[region]) - e_length]
        tmpline['seqs'] = [gl_seqs['v'] + tmpline['vd_insertion'] + gl_seqs['d'] + tmpline['dj_insertion'] + gl_seqs['j'], ]
        tmpline['unique_ids'] = [None]  # this is kind of hackey, but some things in the implicit info adder use it to get the number of sequences
        tmpline['input_seqs'] = copy.deepcopy(tmpline['seqs'])  # NOTE has to be updated _immediately_ so seqs and input_seqs don't get out of sync
        tmpline['indelfos'] = [indelutils.get_empty_indel(), ]
        utils.add_implicit_info(self.glfo, tmpline)
        assert len(tmpline['in_frames']) == 1
Exemplo n.º 3
0
    def get_alleles(self, swfo, plotdir=None, debug=False):
        print 'clustering for new alleles'

        # NOTE do *not* modify <self.glfo> (in the future it would be nice to just modify <self.glfo>, but for now we need it to be super clear in partitiondriver what is happening to <self.glfo>)
        default_initial_glfo = self.glfo
        if self.args.default_initial_germline_dir is not None:  # if this is set, we want to take any new allele names from this directory's glfo if they're in there
            default_initial_glfo = glutils.read_glfo(self.args.default_initial_germline_dir, self.glfo['locus'])
            glfo_to_modify = copy.deepcopy(default_initial_glfo)  # so we can add new genes to it, so we can check for equivalency more easily TODO fix that shit, obviously
        else:
            print '  %s --default-initial-germline-dir isn\'t set, so new allele names won\'t correspond to existing names' % utils.color('yellow', 'warning')

        qr_seqs, threshold = self.choose_clonal_representatives(swfo, debug=debug)
        if qr_seqs is None:
            return {}

        # self.check_for_donuts(debug=debug)

        if not self.args.kmeans_allele_cluster:
            clusterfos, msa_info = self.vsearch_cluster_v_seqs(qr_seqs, threshold, debug=debug)
        else:
            clusterfos = self.kmeans_cluster_v_seqs(qr_seqs, swfo, plotdir=plotdir, debug=debug)
            msa_info = clusterfos

        # and finally loop over each cluster, deciding if it corresponds to a new allele
        if debug:
            print '  looping over %d clusters with %d sequences' % (len(clusterfos), sum([len(cfo['seqfos']) for cfo in clusterfos]))
            print '   rank  seqs   v/j mfreq                 seqs      snps (%s)' % utils.color('blue', 'indels')
        new_alleles = {}
        n_existing_gene_clusters = 0
        for iclust in range(len(clusterfos)):
            clusterfo = clusterfos[iclust]

            # dot_products = [utils.dot_product(clusterfo['cons_seq'], seq1, seq2) for seq1, seq2 in itertools.combinations([seqfo['seq'] for seqfo in clusterfo['seqfos']], 2)]
            # mean_dot_product = numpy.average(dot_products)

            # choose the most common existing gene to use as a template (the most similar gene might be a better choice, but deciding on "most similar" would involve adjudicating between snps and indels, and it shouldn't really matter)
            sorted_glcounts, true_sorted_glcounts = self.get_glcounts(clusterfo)
            template_gene, template_counts = sorted_glcounts[0]
            template_seq = self.glfo['seqs'][self.region][template_gene]
            template_cpos = utils.cdn_pos(self.glfo, self.region, template_gene)

            assert '.' not in clusterfo['cons_seq']  # make sure you haven't switched to something that doesn't use '-' for gap chars
            new_seq = clusterfo['cons_seq'].replace('-', '')  # I'm not sure that I completely understand the dashes in this sequence, but it seems to be right to just remove 'em

            aligned_template_seq, aligned_new_seq = utils.align_seqs(template_seq, clusterfo['cons_seq'])
            has_indels = '-' in aligned_template_seq.strip('-') or '-' in aligned_new_seq.strip('-')  # only counts internal indels
            cluster_mfreqs = {r : [self.mfreqs[r][seqfo['name']] for seqfo in clusterfo['seqfos']] for r in self.mfreqs}  # regional mfreqs for each sequence in the cluster corresponding to the initially-assigned existing gene
            mean_cluster_mfreqs = {r : numpy.mean(cluster_mfreqs[r]) for r in cluster_mfreqs}

            equiv_name, equiv_seq = glutils.find_equivalent_gene_in_glfo(glfo_to_modify, new_seq, template_cpos)
            if equiv_name is not None:
                new_name = equiv_name
                new_seq = equiv_seq
            else:
                new_name, _ = glutils.choose_new_allele_name(template_gene, new_seq, indelfo={'indels' : ['xxx', 'xxx', 'xxx']} if has_indels else None)  # the fcn just checks to see if it's non-None and of length greater than zero...TODO it would be nice to figure out actual snp and indel info

            if debug:
                self.print_cluster(iclust, clusterfo, sorted_glcounts, new_seq, true_sorted_glcounts, mean_cluster_mfreqs, has_indels)

            if new_name in self.glfo['seqs'][self.region]:  # note that this only looks in <self.glfo>, not in <new_alleles>
                n_existing_gene_clusters += 1
                if debug:
                    print 'existing %s' % utils.color_gene(new_name)
                continue

            if new_name in new_alleles:  # already added it NOTE might make more sense to use <glfo_to_modify> here instead of <new_alleles> (or just not have freaking both of them)
                if debug:
                    print '%s (%s)' % (utils.color_gene(new_name), utils.color('red', 'new'))
                continue
            assert new_seq not in new_alleles.values()  # if it's the same seq, it should've got the same damn name

            if not has_indels:  # we assume that the presence of indels somewhat precludes false positives, which is equivalent to an assumption about the rarity of shm indels
                if self.too_close_to_existing_glfo_gene(clusterfo, new_seq, template_seq, template_cpos, template_gene, debug=debug):  # presumably if it were really close to another (non-template) existing glfo gene, that one would've been the template
                    continue

                if mean_cluster_mfreqs['j'] > 0. and self.mean_mfreqs['j'] > 0.:
                    this_cluster_ratio = mean_cluster_mfreqs['v'] / mean_cluster_mfreqs['j']
                    overall_ratio = self.mean_mfreqs['v'] / self.mean_mfreqs['j']
                    if this_cluster_ratio / overall_ratio < self.mfreq_ratio_threshold:
                        if debug:
                            print 'v / j cluster mfreqs too small %6.3f / %6.3f = %6.3f < %6.3f' % (this_cluster_ratio, overall_ratio, this_cluster_ratio / overall_ratio, self.mfreq_ratio_threshold)
                        continue

            if self.too_close_to_already_added_gene(new_seq, new_alleles, debug=debug):  # this needs to be applied even if there are indels, since the indels are with respect to the (existing glfo) template gene, not to the [potentially] previously-added gene
                continue

            print '%s %s%s' % (utils.color('red', 'new'), utils.color_gene(new_name), ' (exists in default germline dir)' if new_name in default_initial_glfo['seqs'][self.region] else '')
            new_alleles[new_name] = {'template-gene' : template_gene, 'gene' : new_name, 'seq' : new_seq}
            if new_alleles[new_name]['gene'] not in glfo_to_modify['seqs'][self.region]:  # if it's in <default_initial_glfo> it'll already be in there
                glutils.add_new_allele(glfo_to_modify, new_alleles[new_name])  # just so we can check for equivalency

        if debug:
            print '  %d / %d clusters consensed to existing genes' % (n_existing_gene_clusters, len(msa_info))

        self.reassign_template_counts(msa_info, new_alleles, debug=False)
        for new_name, newfo in new_alleles.items():
            # print '%s  %s  %.1f / %.1f = %.4f' % (new_name, newfo['template-gene'], self.adjusted_glcounts[newfo['template-gene']], float(sum(self.adjusted_glcounts.values())), self.adjusted_glcounts[newfo['template-gene']] / float(sum(self.adjusted_glcounts.values())))
            if self.adjusted_glcounts[newfo['template-gene']] / float(sum(self.adjusted_glcounts.values())) < self.args.min_allele_prevalence_fraction:  # NOTE self.adjusted_glcounts only includes large clusters, and the constituents of those clusters are clonal representatives, so this isn't quite the same as in alleleremover
                newfo['remove-template-gene'] = True

        return new_alleles
Exemplo n.º 4
0
    print '%s:' % utils.color('yellow', name)
    glfos.append(glutils.read_glfo(gldir, args.locus, debug=True))

# For each region that's present in the first glfo, find the genes that are in only one of the two glfos, collect them in a temporary glfo, and print a summary.
for region in [r for r in utils.regions if r in glfos[0]['seqs']]:
    # gene names for this region in each glfo (the two-name unpacking means <glfos> has to have exactly two entries)
    aset, bset = [set(g['seqs'][region]) for g in glfos]

    tmpfo = glutils.get_empty_glfo(
        args.locus)  # make a new glfo that will only have non-shared genes
    # pair each label with the genes that are only in its glfo, and with the glfo itself (which is where the sequence and codon position are looked up)
    for glabel, gset, gfo in zip(
            args.names, [aset - bset, bset - aset],
            glfos):  # <gset> is the genes that're only in <glabel>
        for ogene in gset:
            # the gene is added under the name '<gene>+<label>', with an explicitly specified codon position
            glutils.add_new_allele(tmpfo, {
                'gene': '+'.join([ogene, glabel]),
                'seq': gfo['seqs'][region][ogene],
                'cpos': utils.cdn_pos(gfo, region, ogene)
            },
                                   use_template_for_codon_info=False)

    # eh, maybe this doesn't really add anything?
    # # add the nearest genes that they both have for comparison NOTE this gives one comparison gene for *each* gene, so usually you get a bunch of comparison/'both' genes in each block in the ascii output
    # for bgene in aset & bset:
    #     _, nearest_gene, _ = glutils.find_nearest_gene_with_same_cpos(glfos[0], glfos[0]['seqs'][region][bgene], new_cpos=utils.cdn_pos(glfos[0], region, bgene))  # i think it doesn't matter which glfo we get it from, so arbitrarily choose the first one
    #     glutils.add_new_allele(tmpfo, {'gene' : '+'.join([nearest_gene, 'both']), 'seq' : glfos[0]['seqs'][region][nearest_gene], 'cpos' : utils.cdn_pos(glfos[0], region, bgene)}, use_template_for_codon_info=False)

    # summary: for each of the two names, the number of genes that are only in that glfo, followed by the (sorted) genes themselves
    print '%s: only in:\n      %12s: %2d  %s\n      %12s: %2d  %s' % (
        utils.color('green', region), args.names[0], len(aset - bset),
        utils.color_genes(sorted(aset - bset)), args.names[1],
        len(bset - aset), utils.color_genes(sorted(bset - aset)))
    # NOTE(review): this message mentions genes labeled 'both', but the code that added them is commented out above -- confirm that the text is still accurate
    if len(tmpfo['seqs'][region]) > 0:
        print ' comparing to nearest genes that were in both (labeled \'both\'):'
Exemplo n.º 5
0
    def plot(self, plotdir, only_csv=False, only_overall=False):
        import plotting
        if not self.finalized:
            self.finalize()

        overall_plotdir = plotdir + '/overall'

        for gene in self.freqs:
            if only_overall:
                continue
            freqs = self.freqs[gene]
            if len(freqs) == 0:
                if gene not in glutils.dummy_d_genes.values():
                    print '    %s no mutefreqer obs for %s' % (utils.color(
                        'red', 'warning'), utils.color_gene(gene))
                continue
            sorted_positions = sorted(freqs.keys())
            genehist = Hist(sorted_positions[-1] - sorted_positions[0] + 1,
                            sorted_positions[0] - 0.5,
                            sorted_positions[-1] + 0.5,
                            xtitle='position',
                            ytitle='mut freq',
                            title=gene)
            for position in sorted_positions:
                hi_diff = abs(freqs[position]['freq'] -
                              freqs[position]['freq_hi_err'])
                lo_diff = abs(freqs[position]['freq'] -
                              freqs[position]['freq_lo_err'])
                err = 0.5 * (hi_diff + lo_diff)
                genehist.set_ibin(genehist.find_bin(position),
                                  freqs[position]['freq'],
                                  error=err)
            xline = None
            figsize = [7, 4]
            if utils.get_region(gene) in utils.conserved_codons[
                    self.glfo['locus']]:
                xline = utils.cdn_pos(self.glfo, utils.get_region(gene), gene)
            if utils.get_region(gene) == 'v':
                figsize[0] *= 3.5
            elif utils.get_region(gene) == 'j':
                figsize[0] *= 2
            plotting.draw_no_root(self.per_gene_mean_rates[gene],
                                  plotdir=plotdir + '/per-gene/' +
                                  utils.get_region(gene),
                                  plotname=utils.sanitize_name(gene),
                                  errors=True,
                                  write_csv=True,
                                  only_csv=only_csv,
                                  shift_overflows=True)
            # per-position plots:
            plotting.draw_no_root(genehist,
                                  plotdir=plotdir + '/per-gene-per-position/' +
                                  utils.get_region(gene),
                                  plotname=utils.sanitize_name(gene),
                                  errors=True,
                                  write_csv=True,
                                  xline=xline,
                                  figsize=figsize,
                                  only_csv=only_csv,
                                  shift_overflows=True)
            # # per-position, per-base plots:
            # paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info)  # needs translation to mpl UPDATE fcn is fixed, but I can't be bothered uncommenting this at the moment

        # make mean mute freq hists
        for rstr in ['all', 'cdr3'] + utils.regions:
            if rstr == 'all':
                bounds = (0.0, 0.4)
            else:
                bounds = (0.0, 0.6 if rstr == 'd' else 0.4)
            plotting.draw_no_root(self.mean_rates[rstr],
                                  plotname=rstr + '_mean-freq',
                                  plotdir=overall_plotdir,
                                  stats='mean',
                                  bounds=bounds,
                                  write_csv=True,
                                  only_csv=only_csv,
                                  shift_overflows=True)
            plotting.draw_no_root(self.mean_n_muted[rstr],
                                  plotname=rstr + '_mean-n-muted',
                                  plotdir=overall_plotdir,
                                  stats='mean',
                                  write_csv=True,
                                  only_csv=only_csv,
                                  shift_overflows=True)

        if not only_csv:  # write html file and fix permissiions
            for substr in self.subplotdirs:
                plotting.make_html(plotdir + '/' + substr)
Exemplo n.º 6
0
    # then add any that are only in the second one
    for gene, seq in glfos[1]['seqs'][region].items():
        if gene not in glfos[0]['seqs'][region]:
            # look up the codon position in the second glfo's '<codon>-positions' dict, or use None if utils.cdn() returns None for this region
            cpos = glfos[1][utils.cdn(glfos[1], region) +
                            '-positions'][gene] if utils.cdn(
                                glfos[1], region) is not None else None
            # added under the name '<gene>+<second name>'
            glutils.add_new_allele(
                tmp_glfo, {
                    'gene': '+'.join([gene, args.names[1]]),
                    'seq': seq,
                    'cpos': cpos
                },
                use_template_for_codon_info=False
            )  # can't use template cause we might've deleted it in the first loop

    # then add the nearest genes that they both have for comparison
    # NOTE in python 2 .items() returns a list, so the loop runs over a snapshot taken before any of the additions made inside it (presumably add_new_allele() adds to this same dict -- confirm before porting to python 3)
    for gene, seq in tmp_glfo['seqs'][region].items():
        _, nearest_gene, _ = glutils.find_nearest_gene_with_same_cpos(
            glfos[0], seq, new_cpos=utils.cdn_pos(tmp_glfo, region, gene)
        )  # i think it doesn't matter which glfo we get it from, so arbitrarily choose the first one
        # the nearest gene is added under the name '<nearest gene>+both', with <gene> as its template
        glutils.add_new_allele(
            tmp_glfo, {
                'gene': '+'.join([nearest_gene, 'both']),
                'seq': glfos[0]['seqs'][region][nearest_gene],
                'template-gene': gene
            })

    # only print the comparison if there's something in it
    if len(tmp_glfo['seqs'][region]) > 0:
        print ' comparing to nearest genes that were in both (labeled \'both\'):'
        glutils.print_glfo(tmp_glfo, only_region=region)