示例#1
0
 def read_mute_freq_stuff(self, gene_or_insert_name):
     if self.args.mutate_from_scratch:  # XXX GODDAMMIT i remember putting this 'xxx' here for a reason and I have no f*****g clue what it was
         self.all_mute_freqs[gene_or_insert_name] = {
             'overall_mean': self.args.flat_mute_freq
         }
     elif gene_or_insert_name[:2] in utils.boundaries:
         replacement_genes = utils.find_replacement_genes(
             self.parameter_dir, min_counts=-1, all_from_region='v')
         self.all_mute_freqs[
             gene_or_insert_name], _ = paramutils.read_mute_info(
                 self.parameter_dir,
                 this_gene=gene_or_insert_name,
                 locus=self.args.locus,
                 approved_genes=replacement_genes)
     else:
         gene_counts = utils.read_overall_gene_probs(
             self.parameter_dir,
             only_gene=gene_or_insert_name,
             normalize=False,
             expect_zero_counts=True)
         replacement_genes = None
         if gene_counts < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us NOTE if <gene_or_insert_name> isn't in the dict, it's because it's <args.datadir> but not in the parameter dir UPDATE not using datadir like this any more, so previous statement may not be true
             replacement_genes = utils.find_replacement_genes(
                 self.parameter_dir,
                 min_counts=self.args.min_observations_to_write,
                 gene_name=gene_or_insert_name)
         self.all_mute_freqs[
             gene_or_insert_name], _ = paramutils.read_mute_info(
                 self.parameter_dir,
                 this_gene=gene_or_insert_name,
                 locus=self.args.locus,
                 approved_genes=replacement_genes)
示例#2
0
 def read_mute_freq_stuff(self, gene_or_insert_name):
     if gene_or_insert_name[:2] in utils.boundaries:
         replacement_genes = utils.find_replacement_genes(self.parameter_dir, min_counts=-1, all_from_region='v')
         self.all_mute_freqs[gene_or_insert_name], _ = paramutils.read_mute_info(self.parameter_dir, this_gene=gene_or_insert_name, approved_genes=replacement_genes)
     else:
         gene_counts = utils.read_overall_gene_probs(self.parameter_dir, only_gene=gene_or_insert_name, normalize=False, expect_zero_counts=True)
         replacement_genes = None
         if gene_counts < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us NOTE if <gene_or_insert_name> isn't in the dict, it's because it's <args.datadir> but not in the parameter dir UPDATE not using datadir like this any more, so previous statement may not be true
             replacement_genes = utils.find_replacement_genes(self.parameter_dir, min_counts=self.args.min_observations_to_write, gene_name=gene_or_insert_name, single_gene=False)
         self.all_mute_freqs[gene_or_insert_name], _ = paramutils.read_mute_info(self.parameter_dir, this_gene=gene_or_insert_name, approved_genes=replacement_genes)
示例#3
0
 def read_mute_freq_stuff(self, gene_or_insert_name):
     if self.args.mutate_from_scratch:  # XXX GODDAMMIT i remember putting this 'xxx' here for a reason and I have no f*****g clue what it was
         self.all_mute_freqs[gene_or_insert_name] = {'overall_mean' : self.args.flat_mute_freq}
     elif gene_or_insert_name[:2] in utils.boundaries:
         replacement_genes = utils.find_replacement_genes(self.parameter_dir, min_counts=-1, all_from_region='v')
         self.all_mute_freqs[gene_or_insert_name], _ = paramutils.read_mute_info(self.parameter_dir, this_gene=gene_or_insert_name, chain=self.args.chain, approved_genes=replacement_genes)
     else:
         gene_counts = utils.read_overall_gene_probs(self.parameter_dir, only_gene=gene_or_insert_name, normalize=False, expect_zero_counts=True)
         replacement_genes = None
         if gene_counts < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us NOTE if <gene_or_insert_name> isn't in the dict, it's because it's <args.datadir> but not in the parameter dir UPDATE not using datadir like this any more, so previous statement may not be true
             replacement_genes = utils.find_replacement_genes(self.parameter_dir, min_counts=self.args.min_observations_to_write, gene_name=gene_or_insert_name)
         self.all_mute_freqs[gene_or_insert_name], _ = paramutils.read_mute_info(self.parameter_dir, this_gene=gene_or_insert_name, chain=self.args.chain, approved_genes=replacement_genes)
示例#4
0
    def read_mute_freq_stuff(self, gene):
        assert gene[:
                    2] not in utils.boundaries  # make sure <gene> isn't actually an insertion (we used to pass insertions in here separately, but now they're smooshed onto either end of d)
        if self.args.mutate_from_scratch:
            self.all_mute_freqs[gene] = {
                'overall_mean': self.args.scratch_mute_freq
            }
        else:
            approved_genes = [gene]

            # ok this is kind of dumb, but I need to figure out how many counts there are for this gene, even when we have only an shm parameter dir
            tmp_reco_param_dir = self.reco_parameter_dir if self.reco_parameter_dir is not None else self.shm_parameter_dir  # will crash if the shm parameter dir doesn't have gene count info... but we should only end up using it on data/recombinator/scratch-parameters
            gene_counts = utils.read_overall_gene_probs(
                tmp_reco_param_dir,
                only_gene=gene,
                normalize=False,
                expect_zero_counts=True)
            if gene_counts < self.args.min_observations_per_gene:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us NOTE if <gene> isn't in the dict, it's because it's in <args.datadir> but not in the parameter dir UPDATE not using datadir like this any more, so previous statement may not be true
                approved_genes += utils.find_replacement_genes(
                    tmp_reco_param_dir,
                    min_counts=self.args.min_observations_per_gene,
                    gene_name=gene)

            self.all_mute_freqs[
                gene] = paramutils.read_mute_freqs_with_weights(
                    self.shm_parameter_dir, approved_genes)
示例#5
0
    def __init__(self, base_indir, outdir, gene_name, naivety, glfo, args):
        self.region = utils.get_region(gene_name)
        self.raw_name = gene_name  # i.e. unsanitized
        self.germline_seqs = glfo['seqs']  # all germline alleles
        self.germline_seq = self.germline_seqs[self.region][gene_name]  # germline sequence for this hmm
        self.indir = base_indir
        self.args = args
        self.cyst_positions = glfo['cyst-positions']
        self.tryp_positions = glfo['tryp-positions']

        # parameters with values that I more or less made up
        self.precision = '16'  # number of digits after the decimal for probabilities
        self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
        self.n_max_to_interpolate = 20
        self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25}  # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths

        self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

        # self.insert_mute_prob = 0.0
        # self.mean_mute_freq = 0.0

        self.outdir = outdir
        self.naivety = naivety
        self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

        # self.insertions = [ insert for insert in utils.index_keys if re.match(self.region + '._insertion', insert) or re.match('.' + self.region + '_insertion', insert)]  OOPS that's not what I want to do
        self.insertions = []
        if self.region == 'v':
            self.insertions.append('fv')
        elif self.region == 'd':
            self.insertions.append('vd')
        elif self.region == 'j':
            self.insertions.append('dj')
            self.insertions.append('jf')

        self.erosion_probs = {}
        self.insertion_probs = {}
        self.insertion_content_probs = {}

        self.n_occurences = utils.read_overall_gene_probs(self.indir, only_gene=gene_name, normalize=False)  # how many times did we observe this gene in data?
        replacement_genes = None
        if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            if self.args.debug:
                print '    only saw it %d times, use info from other genes' % self.n_occurences
            replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, single_gene=False, debug=self.args.debug)

        self.read_erosion_info(gene_name, replacement_genes)  # try this exact gene, but...

        self.read_insertion_info(gene_name, replacement_genes)

        if self.naivety == 'M':  # mutate if not naive
            self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, approved_genes=replacement_genes)

        self.track = Track('nukes', utils.nukes)
        self.saniname = utils.sanitize_name(gene_name)
        self.hmm = HMM(self.saniname, self.track.getdict())  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
        self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name))  # if we really didn't see this gene at all, take pity on it and kick it an eps
        mean_freq_hist = Hist(fname=self.indir + '/all-mean-mute-freqs.csv')
        self.hmm.extras['overall_mute_freq'] = mean_freq_hist.get_mean()
示例#6
0
    def __init__(self, base_indir, outdir, gene_name, naivety, germline_seq, args):
        self.indir = base_indir
        self.args = args

        # parameters with values that I more or less made up
        self.precision = '16'  # number of digits after the decimal for probabilities
        self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
        self.n_max_to_interpolate = 20
        self.allow_unphysical_insertions = self.args.allow_unphysical_insertions # allow fv and jf insertions. NOTE this slows things down by a factor of 6 or so
        # self.allow_external_deletions = args.allow_external_deletions       # allow v left and j right deletions. I.e. if your reads extend beyond v or j boundaries

        self.v_3p_del_pseudocount_limit = 10  # add at least one entry 

        # self.insert_mute_prob = 0.0
        # self.mean_mute_freq = 0.0

        self.outdir = outdir
        self.region = utils.get_region(gene_name)
        self.naivety = naivety
        self.germline_seq = germline_seq
        self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

        # self.insertions = [ insert for insert in utils.index_keys if re.match(self.region + '._insertion', insert) or re.match('.' + self.region + '_insertion', insert)]  OOPS that's not what I want to do
        self.insertions = []
        if self.region == 'v':
            if self.allow_unphysical_insertions:
                self.insertions.append('fv')
        elif self.region == 'd':
            self.insertions.append('vd')
        elif self.region == 'j':
            self.insertions.append('dj')
            if self.allow_unphysical_insertions:
                self.insertions.append('jf')

        self.erosion_probs = {}
        self.insertion_probs = {}

        self.n_occurences = utils.read_overall_gene_probs(self.indir, only_gene=gene_name, normalize=False)  # how many times did we observe this gene in data?
        replacement_genes = None
        if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            if self.args.debug:
                print '    only saw it %d times, use info from other genes' % self.n_occurences
            replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, single_gene=False, debug=self.args.debug)

        self.read_erosion_info(gene_name, replacement_genes)  # try this exact gene, but...

        self.read_insertion_info(gene_name, replacement_genes)

        if self.naivety == 'M':  # mutate if not naive
            self.mute_freqs = paramutils.read_mute_info(self.indir, this_gene=gene_name, approved_genes=replacement_genes)

        self.track = Track('nukes', list(utils.nukes))
        self.saniname = utils.sanitize_name(gene_name)
        self.hmm = HMM(self.saniname, {'nukes':list(utils.nukes)})  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
        self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name))  # if we really didn't see this gene at all, take pity on it and kick it an eps
示例#7
0
    def write_mute_freqs(self, region, gene_name, seq, reco_event, reco_seq_fname, is_insertion=False):
        """ Read position-by-position mute freqs from disk for <gene_name>, renormalize, then write to a file for bppseqgen. """
        replacement_genes = None
        if is_insertion:
            replacement_genes = utils.find_replacement_genes(self.args.parameter_dir, min_counts=-1, all_from_region='v')
        else:
            n_occurences = utils.read_overall_gene_probs(self.args.parameter_dir, only_gene=gene_name, normalize=False)  # how many times did we observe this gene in data?
            if n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
                # print '    only saw %s %d times, use info from other genes' % (utils.color_gene(gene_name), n_occurences)
                replacement_genes = utils.find_replacement_genes(self.args.parameter_dir, min_counts=self.args.min_observations_to_write, gene_name=gene_name, single_gene=False)

        mute_freqs, mute_counts = paramutils.read_mute_info(self.args.parameter_dir, this_gene=gene_name, approved_genes=replacement_genes)
        rates = []  # list with a relative mutation rate for each position in <seq>
        total = 0.0
        # assert len(mute_freqs) == len(seq)  # only equal length if no erosions NO oh right but mute_freqs only covers areas we could align to...
        for inuke in range(len(seq)):  # append a freq for each nuke
            position = inuke + dict(reco_event.erosions.items() + reco_event.effective_erosions.items())[region + '_5p']
            freq = 0.0
            if position in mute_freqs:
                freq = mute_freqs[position]
            else:
                freq = mute_freqs['overall_mean']
            rates.append(freq)
            total += freq

        # normalize to the number of sites (i.e. so an average site is given value 1.0)
        assert total != 0.0  # I am not hip enough to divide by zero
        for inuke in range(len(seq)):
            rates[inuke] *= float(len(seq)) / total
        total = 0.0

        # and... double check it, just for shits and giggles
        for inuke in range(len(seq)):
            total += rates[inuke]
        assert utils.is_normed(total / float(len(seq)))
        assert len(rates) == len(seq)  # you just can't be too careful. what if gremlins ate a few while python wasn't looking?

        # write the input file for bppseqgen, one base per line
        with opener('w')(reco_seq_fname) as reco_seq_file:
            reco_seq_file.write('state\trate\n')
            for inuke in range(len(seq)):
                reco_seq_file.write('%s\t%.15f\n' % (seq[inuke], rates[inuke]))
    def write_mute_freqs(self, region, gene_name, seq, reco_event, reco_seq_fname, is_insertion=False):
        """ Read position-by-position mute freqs from disk for <gene_name>, renormalize, then write to a file for bppseqgen. """
        replacement_genes = None
        if is_insertion:
            replacement_genes = utils.find_replacement_genes(self.args.parameter_dir, min_counts=-1, all_from_region='v')
        else:
            n_occurences = utils.read_overall_gene_probs(self.args.parameter_dir, only_gene=gene_name, normalize=False)  # how many times did we observe this gene in data?
            if n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
                # print '    only saw %s %d times, use info from other genes' % (utils.color_gene(gene_name), n_occurences)
                replacement_genes = utils.find_replacement_genes(self.args.parameter_dir, min_counts=self.args.min_observations_to_write, gene_name=gene_name, single_gene=False)

        mute_freqs, mute_counts = paramutils.read_mute_info(self.args.parameter_dir, this_gene=gene_name, approved_genes=replacement_genes)
        rates = []  # list with a relative mutation rate for each position in <seq>
        total = 0.0
        # assert len(mute_freqs) == len(seq)  # only equal length if no erosions NO oh right but mute_freqs only covers areas we could align to...
        for inuke in range(len(seq)):  # append a freq for each nuke
            position = inuke + dict(reco_event.erosions.items() + reco_event.effective_erosions.items())[region + '_5p']
            freq = 0.0
            if position in mute_freqs:
                freq = mute_freqs[position]
            else:
                freq = mute_freqs['overall_mean']
            rates.append(freq)
            total += freq

        # normalize to the number of sites (i.e. so an average site is given value 1.0)
        assert total != 0.0  # I am not hip enough to divide by zero
        for inuke in range(len(seq)):
            rates[inuke] *= float(len(seq)) / total
        total = 0.0

        # and... double check it, just for shits and giggles
        for inuke in range(len(seq)):
            total += rates[inuke]
        assert utils.is_normed(total / float(len(seq)))
        assert len(rates) == len(seq)  # you just can't be too careful. what if gremlins ate a few while python wasn't looking?

        # write the input file for bppseqgen, one base per line
        with opener('w')(reco_seq_fname) as reco_seq_file:
            reco_seq_file.write('state\trate\n')
            for inuke in range(len(seq)):
                reco_seq_file.write('%s\t%.15f\n' % (seq[inuke], rates[inuke]))
示例#9
0
    def __init__(self, base_indir, outdir, gene_name, glfo, args, debug=False):
        self.region = utils.get_region(gene_name)
        self.raw_name = gene_name  # i.e. unsanitized
        self.germline_seqs = glfo['seqs']  # all germline alleles
        self.germline_seq = self.germline_seqs[self.region][gene_name]  # germline sequence for this hmm
        self.indir = base_indir
        self.args = args
        self.debug = debug
        self.codon_positions = {r : glfo[c + '-positions'] for r, c in utils.conserved_codons[args.locus].items()}

        # parameters with values that I more or less made up
        self.precision = '16'  # number of digits after the decimal for probabilities
        self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
        self.n_max_to_interpolate = args.min_observations_to_write
        self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25}  # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths

        self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

        self.outdir = outdir
        self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

        self.insertions = []
        if self.region == 'v':
            self.insertions.append('fv')
        elif self.region == 'd':
            self.insertions.append('vd')
        elif self.region == 'j':
            self.insertions.append('dj')
            self.insertions.append('jf')

        assert len(utils.ambiguous_bases) == 1 and utils.ambiguous_bases[0] == 'N'  # maybe need to update some stuff below if this changes

        if self.debug:
            print '%s' % utils.color_gene(gene_name)

        self.n_occurences = utils.read_single_gene_count(self.indir, gene_name, debug=self.debug)  # how many times did we observe this gene in data?
        replacement_genes = None
        if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            if self.debug:
                print '      only saw it %d times (wanted %d), so use info from all other genes' % (self.n_occurences, self.args.min_observations_to_write)
            replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, debug=self.debug)

        self.erosion_probs = self.read_erosion_info(gene_name, replacement_genes)
        self.insertion_probs, self.insertion_content_probs = self.read_insertion_info(gene_name, replacement_genes)
        self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, locus=self.args.locus, approved_genes=replacement_genes)  # actual info in <self.mute_obs> isn't actually used a.t.m.

        self.track = Track('nukes', utils.nukes)
        self.saniname = utils.sanitize_name(gene_name)
        self.hmm = HMM(self.saniname, self.track.getdict())  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
        self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name))  # if we really didn't see this gene at all, take pity on it and kick it an eps
        tmp_mean_freq_hist = Hist(fname=self.indir + '/all-mean-mute-freqs.csv')
        self.hmm.extras['overall_mute_freq'] = tmp_mean_freq_hist.get_mean()
示例#10
0
    def __init__(self, base_indir, outdir, gene_name, glfo, args, debug=False):
        self.region = utils.get_region(gene_name)
        self.raw_name = gene_name  # i.e. unsanitized
        self.germline_seqs = glfo['seqs']  # all germline alleles
        self.germline_seq = self.germline_seqs[self.region][gene_name]  # germline sequence for this hmm
        self.indir = base_indir
        self.args = args
        self.debug = debug
        self.codon_positions = {r : glfo[c + '-positions'] for r, c in utils.conserved_codons[args.chain].items()}

        # parameters with values that I more or less made up
        self.precision = '16'  # number of digits after the decimal for probabilities
        self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
        self.n_max_to_interpolate = args.min_observations_to_write
        self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25}  # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths

        self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

        self.outdir = outdir
        self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

        self.insertions = []
        if self.region == 'v':
            self.insertions.append('fv')
        elif self.region == 'd':
            self.insertions.append('vd')
        elif self.region == 'j':
            self.insertions.append('dj')
            self.insertions.append('jf')

        assert len(utils.ambiguous_bases) == 1 and utils.ambiguous_bases[0] == 'N'  # maybe need to update some stuff below if this changes

        if self.debug:
            print '%s' % utils.color_gene(gene_name)

        self.n_occurences = utils.read_single_gene_count(self.indir, gene_name, debug=self.debug)  # how many times did we observe this gene in data?
        replacement_genes = None
        if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            if self.debug:
                print '      only saw it %d times (wanted %d), so use info from all other genes' % (self.n_occurences, self.args.min_observations_to_write)
            replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, debug=self.debug)

        self.erosion_probs = self.read_erosion_info(gene_name, replacement_genes)
        self.insertion_probs, self.insertion_content_probs = self.read_insertion_info(gene_name, replacement_genes)
        self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, chain=self.args.chain, approved_genes=replacement_genes)  # actual info in <self.mute_obs> isn't actually used a.t.m.

        self.track = Track('nukes', utils.nukes)
        self.saniname = utils.sanitize_name(gene_name)
        self.hmm = HMM(self.saniname, self.track.getdict())  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
        self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name))  # if we really didn't see this gene at all, take pity on it and kick it an eps
        tmp_mean_freq_hist = Hist(fname=self.indir + '/all-mean-mute-freqs.csv')
        self.hmm.extras['overall_mute_freq'] = tmp_mean_freq_hist.get_mean()
示例#11
0
 def read_mute_freq_stuff(self, gene):
     assert gene[:
                 2] not in utils.boundaries  # make sure <gene> isn't actually an insertion (we used to pass insertions in here separately, but now they're smooshed onto either end of d)
     if self.args.mutate_from_scratch:
         self.all_mute_freqs[gene] = {
             'overall_mean':
             self.args.default_scratch_mute_freq if
             self.args.flat_mute_freq is None else self.args.flat_mute_freq
         }
     else:
         gene_counts = utils.read_overall_gene_probs(
             self.parameter_dir,
             only_gene=gene,
             normalize=False,
             expect_zero_counts=True)
         approved_genes = [gene]
         if gene_counts < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us NOTE if <gene> isn't in the dict, it's because it's <args.datadir> but not in the parameter dir UPDATE not using datadir like this any more, so previous statement may not be true
             approved_genes += utils.find_replacement_genes(
                 self.parameter_dir,
                 min_counts=self.args.min_observations_to_write,
                 gene_name=gene)
         self.all_mute_freqs[
             gene] = paramutils.read_mute_freqs_with_weights(
                 self.parameter_dir, approved_genes)
示例#12
0
    def __init__(self, base_indir, outdir, gene_name, glfo, args, debug=False):
        self.region = utils.get_region(gene_name)
        self.raw_name = gene_name  # i.e. unsanitized
        self.germline_seqs = glfo['seqs']  # all germline alleles
        self.germline_seq = self.germline_seqs[self.region][
            gene_name]  # germline sequence for this hmm
        self.indir = base_indir
        self.args = args
        self.debug = debug
        self.codon_positions = {
            r: glfo[c + '-positions']
            for r, c in utils.conserved_codons[args.locus].items()
        }

        # parameters with values that I more or less made up
        self.precision = '16'  # number of digits after the decimal for probabilities
        self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
        self.n_max_to_interpolate = args.min_observations_to_write
        self.min_mean_unphysical_insertion_length = {
            'fv': 1.5,
            'jf': 25
        }  # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths
        self.mute_freq_bounds = {
            'lo': 0.01,
            'hi': 0.5
        }  # don't let any position mutate less frequently than 1% of the time, or more frequently than half the time
        self.enforced_flat_mfreq_length = {  # i.e. distance over which the mute freqs are typically screwed up. I'm not really sure why these vary so much, but it's probably to do with how the s-w step works
            'v_3p' : 9,
            'd_5p' : 9,
            'd_3p' : 9,
            'j_5p' : 20,
        }

        self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

        self.outdir = outdir
        self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

        self.insertions = []
        if self.region == 'v':
            self.insertions.append('fv')
        elif self.region == 'd':
            self.insertions.append('vd')
        elif self.region == 'j':
            self.insertions.append('dj')
            self.insertions.append('jf')

        assert len(utils.ambiguous_bases) == 1 and utils.ambiguous_bases[
            0] == 'N'  # maybe need to update some stuff below if this changes

        if self.debug:
            print '%s' % utils.color_gene(gene_name)

        self.n_occurences = utils.read_single_gene_count(
            self.indir, gene_name, debug=self.debug
        )  # how many times did we observe this gene in data?
        approved_genes = [gene_name]
        # NOTE this never happens any more, since partitiondriver.cache_parameters() resets <args.min_observations_to_write> if it's arger than 10*(number of sequences)
        if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average also over all the genes that find_replacement_genes() gives us
            if self.debug:
                print '      only saw it %d times (wanted %d), so use info from all other genes' % (
                    self.n_occurences, self.args.min_observations_to_write)
            approved_genes += utils.find_replacement_genes(
                self.indir,
                self.args.min_observations_to_write,
                gene_name,
                debug=self.debug)

        self.erosion_probs = self.read_erosion_info(approved_genes)
        self.insertion_probs, self.insertion_content_probs = self.read_insertion_info(
            approved_genes)
        self.mute_freqs = paramutils.read_mute_freqs_with_weights(
            self.indir, approved_genes)  # weighted averages over genes
        self.mute_counts = paramutils.read_mute_counts(
            self.indir, gene_name, self.args.locus)  # raw per-{ACGT} counts
        self.process_mutation_info(
        )  # smooth/interpolation/whatnot for <self.mute_freqs> and <self.mute_counts>
        # NOTE i'm using a hybrid approach with mute_freqs and mute_counts -- the only thing I get from mute_counts is the ratios of the different bases, whereas the actual freq comes from mute_freqs (which has all the corrections/smooth/bullshit)

        self.track = Track('nukes', utils.nukes)
        self.saniname = utils.sanitize_name(gene_name)
        self.hmm = HMM(
            self.saniname, self.track.getdict()
        )  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
        self.hmm.extras['gene_prob'] = max(
            self.eps,
            utils.read_overall_gene_probs(self.indir, only_gene=gene_name)
        )  # if we really didn't see this gene at all, take pity on it and kick it an eps
        tmp_mean_freq_hist = Hist(fname=self.indir +
                                  '/all-mean-mute-freqs.csv')
        self.hmm.extras['overall_mute_freq'] = tmp_mean_freq_hist.get_mean()
        self.hmm.extras['per_gene_mute_freq'] = self.mute_freqs[
            'unweighted_overall_mean']  # the other (weighted) one might be technically more accurate, depending on what you want, but it's probably not what anyone is expecting, so we write the unweighted one
示例#13
0
    def __init__(self, base_indir, outdir, gene_name, naivety, germline_seq,
                 args):
        self.indir = base_indir
        self.args = args

        # parameters with values that I more or less made up
        self.precision = '16'  # number of digits after the decimal for probabilities
        self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
        self.n_max_to_interpolate = 20
        self.allow_unphysical_insertions = self.args.allow_unphysical_insertions  # allow fv and jf insertions. NOTE this slows things down by a factor of 6 or so
        # self.allow_external_deletions = args.allow_external_deletions       # allow v left and j right deletions. I.e. if your reads extend beyond v or j boundaries

        self.v_3p_del_pseudocount_limit = 10  # add at least one entry

        # self.insert_mute_prob = 0.0
        # self.mean_mute_freq = 0.0

        self.outdir = outdir
        self.region = utils.get_region(gene_name)
        self.naivety = naivety
        self.germline_seq = germline_seq
        self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

        # self.insertions = [ insert for insert in utils.index_keys if re.match(self.region + '._insertion', insert) or re.match('.' + self.region + '_insertion', insert)]  OOPS that's not what I want to do
        self.insertions = []
        if self.region == 'v':
            if self.allow_unphysical_insertions:
                self.insertions.append('fv')
        elif self.region == 'd':
            self.insertions.append('vd')
        elif self.region == 'j':
            self.insertions.append('dj')
            if self.allow_unphysical_insertions:
                self.insertions.append('jf')

        self.erosion_probs = {}
        self.insertion_probs = {}

        self.n_occurences = utils.read_overall_gene_probs(
            self.indir, only_gene=gene_name, normalize=False
        )  # how many times did we observe this gene in data?
        replacement_genes = None
        if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            if self.args.debug:
                print '    only saw it %d times, use info from other genes' % self.n_occurences
            replacement_genes = utils.find_replacement_genes(
                self.indir,
                self.args.min_observations_to_write,
                gene_name,
                single_gene=False,
                debug=self.args.debug)

        self.read_erosion_info(
            gene_name, replacement_genes)  # try this exact gene, but...

        self.read_insertion_info(gene_name, replacement_genes)

        if self.naivety == 'M':  # mutate if not naive
            self.mute_freqs = paramutils.read_mute_info(
                self.indir,
                this_gene=gene_name,
                approved_genes=replacement_genes)

        self.track = Track('nukes', list(utils.nukes))
        self.saniname = utils.sanitize_name(gene_name)
        self.hmm = HMM(
            self.saniname, {'nukes': list(utils.nukes)}
        )  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
        self.hmm.extras['gene_prob'] = max(
            self.eps,
            utils.read_overall_gene_probs(self.indir, only_gene=gene_name)
        )  # if we really didn't see this gene at all, take pity on it and kick it an eps
示例#14
0
    def __init__(self, base_indir, outdir, gene_name, naivety, germline_seqs, args, cyst_positions, tryp_positions):
        self.region = utils.get_region(gene_name)
        self.raw_name = gene_name  # i.e. unsanitized
        self.germline_seqs = germline_seqs  # all germline alleles
        self.germline_seq = self.germline_seqs[self.region][gene_name]  # germline sequence for this hmm
        self.indir = base_indir
        self.args = args
        self.cyst_positions = cyst_positions
        self.tryp_positions = tryp_positions

        # parameters with values that I more or less made up
        self.precision = '16'  # number of digits after the decimal for probabilities
        self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
        self.n_max_to_interpolate = 20
        # self.allow_external_deletions = args.allow_external_deletions       # allow v left and j right deletions. I.e. if your reads extend beyond v or j boundaries
        self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25}  # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths

        self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

        # self.insert_mute_prob = 0.0
        # self.mean_mute_freq = 0.0

        self.outdir = outdir
        self.naivety = naivety
        self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

        # self.insertions = [ insert for insert in utils.index_keys if re.match(self.region + '._insertion', insert) or re.match('.' + self.region + '_insertion', insert)]  OOPS that's not what I want to do
        self.insertions = []
        if self.region == 'v':
            if not self.args.dont_allow_unphysical_insertions:
                self.insertions.append('fv')
        elif self.region == 'd':
            self.insertions.append('vd')
        elif self.region == 'j':
            self.insertions.append('dj')
            if not self.args.dont_allow_unphysical_insertions:
                self.insertions.append('jf')

        self.erosion_probs = {}
        self.insertion_probs = {}
        self.insertion_content_probs = {}

        self.n_occurences = utils.read_overall_gene_probs(self.indir, only_gene=gene_name, normalize=False)  # how many times did we observe this gene in data?
        replacement_genes = None
        if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            if self.args.debug:
                print '    only saw it %d times, use info from other genes' % self.n_occurences
            replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, single_gene=False, debug=self.args.debug)

        self.read_erosion_info(gene_name, replacement_genes)  # try this exact gene, but...

        self.read_insertion_info(gene_name, replacement_genes)

        if self.naivety == 'M':  # mutate if not naive
            self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, approved_genes=replacement_genes)

        self.track = Track('nukes', utils.nukes)
        self.saniname = utils.sanitize_name(gene_name)
        self.hmm = HMM(self.saniname, self.track.getdict())  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
        self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name))  # if we really didn't see this gene at all, take pity on it and kick it an eps
        mean_freq_hist = plotting.make_hist_from_bin_entry_file(self.indir + '/all-mean-mute-freqs.csv')
        self.hmm.extras['overall_mute_freq'] = mean_freq_hist.GetMean()