def read_mute_freqs(self, mute_freq_dir):
    """Fill self.branch_lengths from the mean mutation frequency files.

    For 'all' plus every entry of utils.regions, the file
    <mute_freq_dir>/<type>-mean-mute-freqs.csv is read into a histogram and
    the following is stored under self.branch_lengths[<type>]:

        'lengths' -- centre of each regular bin, converted to float
        'probs'   -- content of each regular bin (same order as 'lengths')
        'mean'    -- the histogram's GetMean()

    NOTE these are mutation frequencies, not branch lengths; they are used as
    a stand-in for now.

    A warning is printed if the underflow or overflow bin of a histogram is
    non-empty. The summed bin contents of every histogram must satisfy
    utils.is_normed(), otherwise the assert fails. If self.args.debug is set,
    each mean is printed together with its ratio to the 'all' mean.
    """
    categories = ['all'] + utils.regions

    for category in categories:
        csv_path = mute_freq_dir + '/' + category + '-mean-mute-freqs.csv'

        # register the (still empty) record before reading, so the lists below
        # are the very objects stored in self.branch_lengths
        bin_centers, bin_probs = [], []
        record = {'lengths': bin_centers, 'probs': bin_probs}
        self.branch_lengths[category] = record

        hist = plotting.make_hist_from_bin_entry_file(csv_path, category + '-mute-freqs')
        record['mean'] = hist.GetMean()

        # bin 0 and bin nbins+1 are the under/overflow bins
        spilled = hist.GetBinContent(0) > 0.0 or hist.GetBinContent(hist.GetNbinsX() + 1) > 0.0
        if spilled:
            # single-argument parenthesized print: same output as the print statement
            print('WARNING nonzero under/overflow bins read from %s' % csv_path)

        total = 0.0
        for bin_index in range(1, hist.GetNbinsX() + 1):  # regular bins only
            center = float(hist.GetBinCenter(bin_index))
            content = hist.GetBinContent(bin_index)
            bin_centers.append(center)
            bin_probs.append(content)
            total += content
        assert utils.is_normed(total)

    if self.args.debug:
        print(' mean branch lengths')
        for category in categories:
            mean = self.branch_lengths[category]['mean']
            print(' %4s %7.3f (ratio %7.3f)' % (category, mean, mean / self.branch_lengths['all']['mean']))
def read_mute_freqs(self, mute_freq_dir):
    """Read one mean-mute-freq histogram per mutation type into self.branch_lengths.

    The mutation types are 'all' followed by utils.regions. Each type's file,
    <mute_freq_dir>/<mtype>-mean-mute-freqs.csv, is turned into a histogram by
    plotting.make_hist_from_bin_entry_file(). For every type this records the
    bin centres ('lengths'), the bin contents ('probs') and the histogram mean
    ('mean') in self.branch_lengths[mtype].

    NOTE the stored values are mutation frequencies standing in for branch
    lengths, which is ok for now.

    Prints a warning when an under/overflow bin is populated, asserts that the
    regular bin contents pass utils.is_normed(), and, when self.args.debug is
    set, prints the mean of each type and its ratio to the 'all' mean.
    """
    mtypes = ['all'] + utils.regions
    file_suffix = '-mean-mute-freqs.csv'

    for mtype in mtypes:
        infname = mute_freq_dir + '/' + mtype + file_suffix

        info = {}
        self.branch_lengths[mtype] = info
        info['lengths'], info['probs'] = [], []

        mutehist = plotting.make_hist_from_bin_entry_file(infname, mtype + '-mute-freqs')
        info['mean'] = mutehist.GetMean()

        # the overflow bin is only inspected when the underflow bin is empty
        underflow = mutehist.GetBinContent(0)
        if underflow > 0.0 or mutehist.GetBinContent(mutehist.GetNbinsX() + 1) > 0.0:
            print('WARNING nonzero under/overflow bins read from %s' % infname)

        # walk bins 1..nbins, i.e. skip the under/overflow bins at 0 and nbins+1
        check_sum = 0.0
        ibin = 1
        last_bin = mutehist.GetNbinsX()
        while ibin <= last_bin:
            branch_length = float(mutehist.GetBinCenter(ibin))
            prob = mutehist.GetBinContent(ibin)
            info['lengths'].append(branch_length)
            info['probs'].append(prob)
            check_sum += prob
            ibin += 1
        assert utils.is_normed(check_sum)

    if not self.args.debug:
        return

    print(' mean branch lengths')
    for mtype in mtypes:
        this_mean = self.branch_lengths[mtype]['mean']
        overall_mean = self.branch_lengths['all']['mean']
        print(' %4s %7.3f (ratio %7.3f)' % (mtype, this_mean, this_mean / overall_mean))
def __init__(self, base_indir, outdir, gene_name, naivety, germline_seqs, args, cyst_positions, tryp_positions):
    """Gather everything needed to write the HMM for a single germline gene.

    Arguments (described only by how this constructor uses them):
        base_indir      -- parameter directory; kept as self.indir, passed to the
                           utils/paramutils readers, and read for all-mean-mute-freqs.csv
        outdir          -- kept as self.outdir
        gene_name       -- unsanitized gene name; its region comes from utils.get_region()
        naivety         -- mutation info is read only when this equals 'M'
        germline_seqs   -- all germline alleles, indexed as [region][gene_name]
        args            -- options object; dont_allow_unphysical_insertions,
                           min_observations_to_write and debug are read here
        cyst_positions, tryp_positions -- kept unchanged on the instance
    """
    self.region = utils.get_region(gene_name)
    self.raw_name = gene_name  # i.e. before sanitizing
    self.germline_seqs = germline_seqs  # every germline allele
    self.germline_seq = germline_seqs[self.region][gene_name]  # the sequence this hmm is for
    self.indir = base_indir
    self.args = args
    self.cyst_positions, self.tryp_positions = cyst_positions, tryp_positions

    # tuning values that were more or less made up
    self.precision = '16'  # digits after the decimal point for probabilities
    self.eps = 1e-6  # NOTE utils defines an eps as well; in principle the two should be combined
    self.n_max_to_interpolate = 20
    # 'jf' has to be quite a bit bigger than 'fv': besides the variation in J length from the tryp
    # position to the end, it also has to absorb the difference in cdr3 lengths
    self.min_mean_unphysical_insertion_length = {'fv': 1.5, 'jf': 25}
    # closer to the end of the gene than this, the erosion probability must not be zero
    self.erosion_pseudocount_length = 10

    self.outdir = outdir
    self.naivety = naivety
    # first state that can be entered from init; writing (with add_internal_state) starts there
    self.smallest_entry_index = -1

    # Insertion names are spelled out per region (matching them by regex against utils.index_keys
    # was tried and was not what was wanted). The 'fv' and 'jf' entries are dropped when
    # args.dont_allow_unphysical_insertions is set; that option is not consulted for region 'd'.
    region = self.region
    if region == 'v':
        self.insertions = [] if self.args.dont_allow_unphysical_insertions else ['fv']
    elif region == 'd':
        self.insertions = ['vd']
    elif region == 'j':
        self.insertions = ['dj'] if self.args.dont_allow_unphysical_insertions else ['dj', 'jf']
    else:
        self.insertions = []

    self.erosion_probs, self.insertion_probs, self.insertion_content_probs = {}, {}, {}

    # how many times was this gene observed in data?
    self.n_occurences = utils.read_overall_gene_probs(self.indir, only_gene=gene_name, normalize=False)
    seen_too_rarely = self.n_occurences < self.args.min_observations_to_write
    fallback_genes = None
    if seen_too_rarely:
        # not enough observations: average over the genes find_replacement_genes() returns
        if self.args.debug:
            print(' only saw it %d times, use info from other genes' % self.n_occurences)
        fallback_genes = utils.find_replacement_genes(
            self.indir,
            self.args.min_observations_to_write,
            gene_name,
            single_gene=False,
            debug=self.args.debug,
        )

    # both readers are asked for this exact gene, with the fallback genes (if any) alongside
    self.read_erosion_info(gene_name, fallback_genes)
    self.read_insertion_info(gene_name, fallback_genes)

    if self.naivety == 'M':  # mutate if not naive
        mute_info = paramutils.read_mute_info(self.indir, this_gene=gene_name, approved_genes=fallback_genes)
        self.mute_freqs, self.mute_obs = mute_info

    self.track = Track('nukes', utils.nukes)
    self.saniname = utils.sanitize_name(gene_name)
    # hand over the track as a dict, not a Track object, to keep the yaml file a bit more readable
    self.hmm = HMM(self.saniname, self.track.getdict())

    # a gene that was never seen still gets eps rather than zero
    overall_gene_prob = utils.read_overall_gene_probs(self.indir, only_gene=gene_name)
    self.hmm.extras['gene_prob'] = max(self.eps, overall_gene_prob)

    overall_mute_hist = plotting.make_hist_from_bin_entry_file(self.indir + '/all-mean-mute-freqs.csv')
    self.hmm.extras['overall_mute_freq'] = overall_mute_hist.GetMean()