def read_annotation_performance(self, version_stype, input_stype, debug=False):
    """
    Fill self.perf_info[version_stype] with the annotation performance numbers for one code/input combination.

    version_stype is the code version and input_stype the input data version, e.g. 'ref', 'new' is the
    reference code version (last commit) run on the then-new simulation and parameters.
    For each of the 'sw' and 'hmm' methods we store one gene-call value per region (read from the
    second data row of the region's gene csv) and the mean of the hamming_to_true_naive histogram.
    Does nothing if args.quick is set and this test isn't in self.quick_tests.
    """
    ptest = 'annotate-' + input_stype + '-simu'
    if args.quick and ptest not in self.quick_tests:
        return
    if debug:
        print(' version %s input %s annotation' % (version_stype, input_stype))

    def read_performance_file(fname, column, only_ibin=None):
        # floats from <column>, restricted to data row <only_ibin> if it's set; a lone value is returned bare rather than in a list
        with open(fname) as csvfile:
            picked = [float(row[column]) for irow, row in enumerate(csv.DictReader(csvfile))
                      if only_ibin is None or irow == only_ibin]
        return picked[0] if len(picked) == 1 else picked

    perfdir = self.dirs[version_stype] + '/' + self.perfdirs[input_stype]
    for method in ['sw', 'hmm']:
        if debug:
            print('%s %s' % (' ', method))
        method_dir = perfdir + '/' + method
        key_base = input_stype + '-' + method

        # fraction of genes correct
        for region in utils.regions:
            gene_fname = method_dir + '/gene-call/' + region + '_gene.csv'
            fraction_correct = read_performance_file(gene_fname, 'contents', only_ibin=1)
            if debug:
                print(' %s %.3f' % (region, fraction_correct))
            self.perf_info[version_stype][key_base + '-' + region + '_gene_correct'] = fraction_correct

        # hamming fraction
        hamming_hist = Hist(fname=method_dir + '/mutation/hamming_to_true_naive.csv')
        if debug:
            print(' mean hamming %.2f' % hamming_hist.get_mean())
        self.perf_info[version_stype][key_base + '-mean_hamming'] = hamming_hist.get_mean()
def read_mute_freqs(self, mute_freq_dir):
    """
    Read the mean mutation frequency histograms in mute_freq_dir and use them to fill self.branch_lengths.

    For 'all' and for each region in utils.regions, reads <mute_freq_dir>/<mtype>-mean-mute-freqs.csv and stores
    in self.branch_lengths[mtype]:
      'mean'    : the histogram's mean (taken before renormalizing)
      'lengths' : each non-overflow bin center, passed through self.convert_observed_changes_to_branch_length()
      'probs'   : the corresponding (renormalized) bin contents

    Raises Exception if the probs for any mtype fail utils.is_normed().
    """
    # NOTE these are mute freqs, not branch lengths, but it's ok for now
    for mtype in ['all',] + utils.regions:
        infname = mute_freq_dir + '/' + mtype + '-mean-mute-freqs.csv'
        info = self.branch_lengths[mtype] = {'lengths' : [], 'probs' : []}
        mutehist = Hist(fname=infname)
        info['mean'] = mutehist.get_mean()

        mutehist.normalize(include_overflows=False, overflow_eps_to_ignore=1e-2)  # if it was written with overflows included, it'll need to be renormalized
        bin_centers = mutehist.get_bin_centers()  # nothing in the loop below modifies the hist, so only ask for these once (rather than once per bin)
        check_sum = 0.0
        for ibin in range(1, mutehist.n_bins + 1):  # ignore under/overflow bins
            prob = mutehist.bin_contents[ibin]
            info['lengths'].append(self.convert_observed_changes_to_branch_length(float(bin_centers[ibin])))
            info['probs'].append(prob)
            check_sum += prob
        if not utils.is_normed(check_sum):
            raise Exception('not normalized %f' % check_sum)

    if self.args.debug:
        print(' mean branch lengths')
        overall_mean = self.branch_lengths['all']['mean']
        for mtype in ['all',] + utils.regions:
            mean = self.branch_lengths[mtype]['mean']
            print(' %4s %7.3f (ratio %7.3f)' % (mtype, mean, mean / overall_mean))
def peruse_naive_seqs():
    """
    Summarize and plot the mean of the hamming-to-true-naive histogram for each value in n_set_list.

    For each n_set in the module-level n_set_list, reads <baseplotdir>/<n_set>/hmm/hamming_to_true_naive.csv
    and takes its mean. The (n_set, mean) pairs are printed on a single line, then the means are plotted
    against n_set_list and the axes handed to plotting.mpl_finish() with baseplotdir and the name 'means'.
    """
    from hist import Hist

    means = []
    for n_set in n_set_list:
        plotdir = baseplotdir + '/' + str(n_set)
        hist = Hist(fname=plotdir + '/hmm/hamming_to_true_naive.csv')
        means.append(hist.get_mean())  # only work out each mean once, and use it for both the printout and the plot

    # one newline-terminated summary line (a trailing-comma print would leave the line open, so whatever gets printed next ends up glued onto it)
    print(' '.join('%2d %.2f' % (n_set, mean) for n_set, mean in zip(n_set_list, means)))

    import plotting  # imported here (rather than at the top of the function) to match the original ordering
    _, ax = plotting.mpl_init()
    ax.plot(n_set_list, means, marker='.')
    plotting.mpl_finish(ax, baseplotdir, 'means', xlabel='N simultaneous seqs', ylabel='mean hamming to true naive', ybounds=(0, None))
def __init__(self, base_indir, outdir, gene_name, naivety, glfo, args):
    """
    Collect all the info needed to write the hmm for gene_name.

    Parameter files are read from base_indir, while outdir is only stored here. glfo supplies the germline
    sequences ('seqs') and the 'cyst-positions' and 'tryp-positions' entries; args needs (at least) the
    attributes min_observations_to_write and debug.
    If the gene was observed fewer than args.min_observations_to_write times, the genes from
    utils.find_replacement_genes() are also passed to the erosion/insertion/mutation readers.
    Mutation info is only read if naivety is 'M'.
    """
    # ----------------------------------------------------------------------------------------
    # what we were handed
    self.raw_name = gene_name  # i.e. unsanitized
    self.region = utils.get_region(gene_name)
    self.germline_seqs = glfo['seqs']  # all germline alleles
    self.germline_seq = self.germline_seqs[self.region][gene_name]  # germline sequence for this hmm
    self.cyst_positions = glfo['cyst-positions']
    self.tryp_positions = glfo['tryp-positions']
    self.indir = base_indir
    self.outdir = outdir
    self.args = args
    self.naivety = naivety

    # ----------------------------------------------------------------------------------------
    # parameters with values that I more or less made up
    self.precision = '16'  # number of digits after the decimal for probabilities
    self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
    self.n_max_to_interpolate = 20
    # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths
    self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25}
    self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

    # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there
    self.smallest_entry_index = -1

    # insertions that belong to each region (any other region gets none)
    insertions_by_region = {'v' : ['fv'], 'd' : ['vd'], 'j' : ['dj', 'jf']}
    self.insertions = insertions_by_region.get(self.region, [])

    self.erosion_probs = {}
    self.insertion_probs = {}
    self.insertion_content_probs = {}

    # ----------------------------------------------------------------------------------------
    # how many times did we observe this gene in data?
    self.n_occurences = utils.read_overall_gene_probs(self.indir, only_gene=gene_name, normalize=False)
    too_rare = self.n_occurences < self.args.min_observations_to_write
    replacement_genes = None
    if too_rare:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
        if self.args.debug:
            print(' only saw it %d times, use info from other genes' % self.n_occurences)
        replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, single_gene=False, debug=self.args.debug)

    self.read_erosion_info(gene_name, replacement_genes)  # try this exact gene, but...
    self.read_insertion_info(gene_name, replacement_genes)

    if self.naivety == 'M':  # mutate if not naive
        self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, approved_genes=replacement_genes)

    # ----------------------------------------------------------------------------------------
    # the (so far empty) hmm itself
    self.track = Track('nukes', utils.nukes)
    self.saniname = utils.sanitize_name(gene_name)
    self.hmm = HMM(self.saniname, self.track.getdict())  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
    gene_prob = utils.read_overall_gene_probs(self.indir, only_gene=gene_name)
    self.hmm.extras['gene_prob'] = max(self.eps, gene_prob)  # if we really didn't see this gene at all, take pity on it and kick it an eps
    self.hmm.extras['overall_mute_freq'] = Hist(fname=self.indir + '/all-mean-mute-freqs.csv').get_mean()
def __init__(self, base_indir, outdir, gene_name, glfo, args, debug=False):
    """
    Collect all the info needed to write the hmm for gene_name.

    Parameter files are read from base_indir, while outdir is only stored here. glfo supplies the germline
    sequences ('seqs') and the '<codon>-positions' entries for the conserved codons of args.locus; args also
    needs min_observations_to_write.
    If the gene was observed fewer than args.min_observations_to_write times, the genes from
    utils.find_replacement_genes() are also passed to the erosion/insertion/mutation readers.
    Set debug to print the gene name and a note when replacement genes are used.
    """
    # ----------------------------------------------------------------------------------------
    # what we were handed
    self.raw_name = gene_name  # i.e. unsanitized
    self.region = utils.get_region(gene_name)
    self.germline_seqs = glfo['seqs']  # all germline alleles
    self.germline_seq = self.germline_seqs[self.region][gene_name]  # germline sequence for this hmm
    self.indir = base_indir
    self.outdir = outdir
    self.args = args
    self.debug = debug
    self.codon_positions = {region : glfo[codon + '-positions'] for region, codon in utils.conserved_codons[args.locus].items()}

    # ----------------------------------------------------------------------------------------
    # parameters with values that I more or less made up
    self.precision = '16'  # number of digits after the decimal for probabilities
    self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
    self.n_max_to_interpolate = args.min_observations_to_write
    # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths
    self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25}
    self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

    # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there
    self.smallest_entry_index = -1

    # insertions that belong to each region (any other region gets none)
    insertions_by_region = {'v' : ['fv'], 'd' : ['vd'], 'j' : ['dj', 'jf']}
    self.insertions = insertions_by_region.get(self.region, [])

    assert len(utils.ambiguous_bases) == 1 and utils.ambiguous_bases[0] == 'N'  # maybe need to update some stuff below if this changes

    if self.debug:
        print('%s' % utils.color_gene(gene_name))

    # ----------------------------------------------------------------------------------------
    # how many times did we observe this gene in data?
    self.n_occurences = utils.read_single_gene_count(self.indir, gene_name, debug=self.debug)
    too_rare = self.n_occurences < self.args.min_observations_to_write
    replacement_genes = None
    if too_rare:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
        if self.debug:
            print(' only saw it %d times (wanted %d), so use info from all other genes' % (self.n_occurences, self.args.min_observations_to_write))
        replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, debug=self.debug)

    self.erosion_probs = self.read_erosion_info(gene_name, replacement_genes)
    self.insertion_probs, self.insertion_content_probs = self.read_insertion_info(gene_name, replacement_genes)
    # actual info in <self.mute_obs> isn't actually used a.t.m.
    self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, locus=self.args.locus, approved_genes=replacement_genes)

    # ----------------------------------------------------------------------------------------
    # the (so far empty) hmm itself
    self.track = Track('nukes', utils.nukes)
    self.saniname = utils.sanitize_name(gene_name)
    self.hmm = HMM(self.saniname, self.track.getdict())  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
    gene_prob = utils.read_overall_gene_probs(self.indir, only_gene=gene_name)
    self.hmm.extras['gene_prob'] = max(self.eps, gene_prob)  # if we really didn't see this gene at all, take pity on it and kick it an eps
    self.hmm.extras['overall_mute_freq'] = Hist(fname=self.indir + '/all-mean-mute-freqs.csv').get_mean()
def __init__(self, base_indir, outdir, gene_name, glfo, args, debug=False):
    """
    Collect all the info needed to write the hmm for gene_name.

    Parameter files are read from base_indir, while outdir is only stored here. glfo supplies the germline
    sequences ('seqs') and the '<codon>-positions' entries for the conserved codons of args.chain; args also
    needs min_observations_to_write.
    If the gene was observed fewer than args.min_observations_to_write times, the genes from
    utils.find_replacement_genes() are also passed to the erosion/insertion/mutation readers.
    Set debug to print the gene name and a note when replacement genes are used.
    """
    # ----------------------------------------------------------------------------------------
    # what we were handed
    self.raw_name = gene_name  # i.e. unsanitized
    self.region = utils.get_region(gene_name)
    self.germline_seqs = glfo['seqs']  # all germline alleles
    self.germline_seq = self.germline_seqs[self.region][gene_name]  # germline sequence for this hmm
    self.indir = base_indir
    self.outdir = outdir
    self.args = args
    self.debug = debug
    self.codon_positions = {region : glfo[codon + '-positions'] for region, codon in utils.conserved_codons[args.chain].items()}

    # ----------------------------------------------------------------------------------------
    # parameters with values that I more or less made up
    self.precision = '16'  # number of digits after the decimal for probabilities
    self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
    self.n_max_to_interpolate = args.min_observations_to_write
    # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths
    self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25}
    self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

    # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there
    self.smallest_entry_index = -1

    # insertions that belong to each region (any other region gets none)
    insertions_by_region = {'v' : ['fv'], 'd' : ['vd'], 'j' : ['dj', 'jf']}
    self.insertions = insertions_by_region.get(self.region, [])

    assert len(utils.ambiguous_bases) == 1 and utils.ambiguous_bases[0] == 'N'  # maybe need to update some stuff below if this changes

    if self.debug:
        print('%s' % utils.color_gene(gene_name))

    # ----------------------------------------------------------------------------------------
    # how many times did we observe this gene in data?
    self.n_occurences = utils.read_single_gene_count(self.indir, gene_name, debug=self.debug)
    too_rare = self.n_occurences < self.args.min_observations_to_write
    replacement_genes = None
    if too_rare:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
        if self.debug:
            print(' only saw it %d times (wanted %d), so use info from all other genes' % (self.n_occurences, self.args.min_observations_to_write))
        replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, debug=self.debug)

    self.erosion_probs = self.read_erosion_info(gene_name, replacement_genes)
    self.insertion_probs, self.insertion_content_probs = self.read_insertion_info(gene_name, replacement_genes)
    # actual info in <self.mute_obs> isn't actually used a.t.m.
    self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, chain=self.args.chain, approved_genes=replacement_genes)

    # ----------------------------------------------------------------------------------------
    # the (so far empty) hmm itself
    self.track = Track('nukes', utils.nukes)
    self.saniname = utils.sanitize_name(gene_name)
    self.hmm = HMM(self.saniname, self.track.getdict())  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
    gene_prob = utils.read_overall_gene_probs(self.indir, only_gene=gene_name)
    self.hmm.extras['gene_prob'] = max(self.eps, gene_prob)  # if we really didn't see this gene at all, take pity on it and kick it an eps
    self.hmm.extras['overall_mute_freq'] = Hist(fname=self.indir + '/all-mean-mute-freqs.csv').get_mean()
def read_annotation_performance(self, version_stype, input_stype, debug=False):
    """
    Fill self.perf_info[version_stype] with the annotation performance numbers for one code/input combination.

    version_stype is the code version and input_stype the input data version, e.g. 'ref', 'new' is the
    reference code version (last commit) run on the then-new simulation and parameters.
    For each of the 'sw' and 'hmm' methods we store one gene-call value per region (read from the
    second data row of the region's gene csv) and the mean of the hamming_to_true_naive histogram.
    Does nothing if args.quick is set and this test isn't in self.quick_tests.
    """
    ptest = "annotate-" + input_stype + "-simu"
    if args.quick and ptest not in self.quick_tests:
        return
    if debug:
        print(" version %s input %s annotation" % (version_stype, input_stype))

    def read_performance_file(fname, column, only_ibin=None):
        # floats from <column>, restricted to data row <only_ibin> if it's set; a lone value is returned bare rather than in a list
        with open(fname) as csvfile:
            picked = [float(row[column]) for irow, row in enumerate(csv.DictReader(csvfile))
                      if only_ibin is None or irow == only_ibin]
        return picked[0] if len(picked) == 1 else picked

    perfdir = self.dirs[version_stype] + "/" + self.perfdirs[input_stype]
    for method in ["sw", "hmm"]:
        if debug:
            print("%s %s" % (" ", method))
        method_dir = perfdir + "/" + method
        key_base = input_stype + "-" + method

        # fraction of genes correct
        for region in utils.regions:
            gene_fname = method_dir + "/plots/" + region + "_gene.csv"
            fraction_correct = read_performance_file(gene_fname, "contents", only_ibin=1)
            if debug:
                print(" %s %.3f" % (region, fraction_correct))
            self.perf_info[version_stype][key_base + "-" + region + "_gene_correct"] = fraction_correct

        # hamming fraction
        hamming_hist = Hist(fname=method_dir + "/plots/hamming_to_true_naive.csv")
        if debug:
            print(" mean hamming %.2f" % hamming_hist.get_mean())
        self.perf_info[version_stype][key_base + "-mean_hamming"] = hamming_hist.get_mean()
def read_annotation_performance(self, version_stype, input_stype, debug=False):
    """
    Fill self.perf_info[version_stype] with the annotation performance numbers for one code/input combination.

    version_stype is the code version and input_stype the input data version, e.g. 'ref', 'new' is the
    reference code version (last commit) run on the then-new simulation and parameters.
    For each of the 'sw' and 'hmm' methods we store one gene-call value per region (read from the
    second data row of the region's gene csv) and the mean of the hamming_to_true_naive histogram.
    Does nothing if args.quick is set and this test isn't in self.quick_tests.
    """
    ptest = 'annotate-' + input_stype + '-simu'
    if args.quick and ptest not in self.quick_tests:
        return
    if debug:
        print(' version %s input %s annotation' % (version_stype, input_stype))

    def read_performance_file(fname, column, only_ibin=None):
        # floats from <column>, restricted to data row <only_ibin> if it's set; a lone value is returned bare rather than in a list
        with open(fname) as csvfile:
            rows = enumerate(csv.DictReader(csvfile))
            wanted = [float(row[column]) for irow, row in rows if only_ibin is None or irow == only_ibin]
        if len(wanted) == 1:
            return wanted[0]
        return wanted

    perfdir = self.dirs[version_stype] + '/' + self.perfdirs[input_stype]
    for method in ['sw', 'hmm']:
        if debug:
            print('%s %s' % (' ', method))
        this_dir = perfdir + '/' + method
        this_key = input_stype + '-' + method

        # fraction of genes correct
        for region in utils.regions:
            fraction_correct = read_performance_file(this_dir + '/gene-call/' + region + '_gene.csv', 'contents', only_ibin=1)
            if debug:
                print(' %s %.3f' % (region, fraction_correct))
            self.perf_info[version_stype][this_key + '-' + region + '_gene_correct'] = fraction_correct

        # hamming fraction
        hamming_hist = Hist(fname=this_dir + '/mutation/hamming_to_true_naive.csv')
        if debug:
            print(' mean hamming %.2f' % hamming_hist.get_mean())
        self.perf_info[version_stype][this_key + '-mean_hamming'] = hamming_hist.get_mean()
def __init__(self, base_indir, outdir, gene_name, glfo, args, debug=False):
    """
    Collect all the info needed to write the hmm for gene_name.

    Parameter files are read from base_indir, while outdir is only stored here. glfo supplies the germline
    sequences ('seqs') and the '<codon>-positions' entries for the conserved codons of args.locus; args also
    needs min_observations_to_write.
    Erosion, insertion and mutation frequency info is read for the list of approved genes: gene_name itself,
    plus whatever utils.find_replacement_genes() returns if gene_name was observed fewer than
    args.min_observations_to_write times. The raw mutation counts are read for gene_name only.
    Set debug to print the gene name and a note when replacement genes are used.
    """
    # ----------------------------------------------------------------------------------------
    # what we were handed
    self.raw_name = gene_name  # i.e. unsanitized
    self.region = utils.get_region(gene_name)
    self.germline_seqs = glfo['seqs']  # all germline alleles
    self.germline_seq = self.germline_seqs[self.region][gene_name]  # germline sequence for this hmm
    self.indir = base_indir
    self.outdir = outdir
    self.args = args
    self.debug = debug
    self.codon_positions = {region : glfo[codon + '-positions'] for region, codon in utils.conserved_codons[args.locus].items()}

    # ----------------------------------------------------------------------------------------
    # parameters with values that I more or less made up
    self.precision = '16'  # number of digits after the decimal for probabilities
    self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
    self.n_max_to_interpolate = args.min_observations_to_write
    # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths
    self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25}
    # don't let any position mutate less frequently than 1% of the time, or more frequently than half the time
    self.mute_freq_bounds = {'lo' : 0.01, 'hi' : 0.5}
    # i.e. distance over which the mute freqs are typically screwed up. I'm not really sure why these vary so much, but it's probably to do with how the s-w step works
    self.enforced_flat_mfreq_length = {
        'v_3p' : 9,
        'd_5p' : 9,
        'd_3p' : 9,
        'j_5p' : 20,
    }
    self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

    # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there
    self.smallest_entry_index = -1

    # insertions that belong to each region (any other region gets none)
    insertions_by_region = {'v' : ['fv'], 'd' : ['vd'], 'j' : ['dj', 'jf']}
    self.insertions = insertions_by_region.get(self.region, [])

    assert len(utils.ambiguous_bases) == 1 and utils.ambiguous_bases[0] == 'N'  # maybe need to update some stuff below if this changes

    if self.debug:
        print('%s' % utils.color_gene(gene_name))

    # ----------------------------------------------------------------------------------------
    # how many times did we observe this gene in data?
    self.n_occurences = utils.read_single_gene_count(self.indir, gene_name, debug=self.debug)

    approved_genes = [gene_name]
    # NOTE this never happens any more, since partitiondriver.cache_parameters() resets <args.min_observations_to_write> if it's larger than 10*(number of sequences)
    too_rare = self.n_occurences < self.args.min_observations_to_write
    if too_rare:  # if we didn't see it enough, average also over all the genes that find_replacement_genes() gives us
        if self.debug:
            print(' only saw it %d times (wanted %d), so use info from all other genes' % (self.n_occurences, self.args.min_observations_to_write))
        approved_genes.extend(utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, debug=self.debug))

    self.erosion_probs = self.read_erosion_info(approved_genes)
    self.insertion_probs, self.insertion_content_probs = self.read_insertion_info(approved_genes)

    # NOTE i'm using a hybrid approach with mute_freqs and mute_counts -- the only thing I get from mute_counts is the ratios of the different bases, whereas the actual freq comes from mute_freqs (which has all the corrections/smooth/bullshit)
    self.mute_freqs = paramutils.read_mute_freqs_with_weights(self.indir, approved_genes)  # weighted averages over genes
    self.mute_counts = paramutils.read_mute_counts(self.indir, gene_name, self.args.locus)  # raw per-{ACGT} counts
    self.process_mutation_info()  # smooth/interpolation/whatnot for <self.mute_freqs> and <self.mute_counts>

    # ----------------------------------------------------------------------------------------
    # the (so far empty) hmm itself
    self.track = Track('nukes', utils.nukes)
    self.saniname = utils.sanitize_name(gene_name)
    self.hmm = HMM(self.saniname, self.track.getdict())  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
    gene_prob = utils.read_overall_gene_probs(self.indir, only_gene=gene_name)
    self.hmm.extras['gene_prob'] = max(self.eps, gene_prob)  # if we really didn't see this gene at all, take pity on it and kick it an eps
    self.hmm.extras['overall_mute_freq'] = Hist(fname=self.indir + '/all-mean-mute-freqs.csv').get_mean()
    # the other (weighted) one might be technically more accurate, depending on what you want, but it's probably not what anyone is expecting, so we write the unweighted one
    self.hmm.extras['per_gene_mute_freq'] = self.mute_freqs['unweighted_overall_mean']