def check(self):
    total = 0.0
    for _, prob in self.transitions.iteritems():
        assert prob >= 0.0
        total += prob
    assert utils.is_normed(total)

    if self.name == 'init':  # no emissions for 'init' state
        return

    if self.emissions is not None:
        total = 0.0
        for _, prob in self.emissions['probs'].iteritems():
            assert prob >= 0.0
            total += prob
        assert utils.is_normed(total)

    if self.pair_emissions is not None:
        total = 0.0
        for letter1 in self.pair_emissions['probs']:
            for _, prob in self.pair_emissions['probs'][letter1].iteritems():
                assert prob >= 0.0
                total += prob
        assert utils.is_normed(total)

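# Every snippet in this file leans on utils.is_normed(). The call sites pass plain floats
# (optionally with a <this_eps> tolerance), but also lists (set_branch_lengths below) and
# dicts (read_allele_prevalence_freqs below), so it has to dispatch on type. A minimal
# sketch consistent with those call sites -- the default tolerance is a guess, and the
# real partis implementation may differ:
import math

def is_normed(probs, this_eps=1e-4):
    if hasattr(probs, 'keys'):  # dict: check the sum of its values
        return is_normed(sum(probs.values()), this_eps=this_eps)
    elif hasattr(probs, '__iter__'):  # list/tuple: check its sum
        return is_normed(sum(probs), this_eps=this_eps)
    else:  # plain number: it has to be within <this_eps> of 1.
        return math.fabs(probs - 1.0) < this_eps
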
def read_mute_freqs(self, parameter_dir):  # NOTE these are mute freqs, not branch lengths, but it's ok for now
    branch_lengths = {}
    for mtype in ['all', ] + utils.regions:
        branch_lengths[mtype] = {n: [] for n in ('lengths', 'probs')}
        mutehist = self.get_mute_hist(mtype, parameter_dir)
        branch_lengths[mtype]['mean'] = mutehist.get_mean()
        mutehist.normalize(include_overflows=False, expect_overflows=True)  # if it was written with overflows included, it'll need to be renormalized
        check_sum = 0.0
        for ibin in range(1, mutehist.n_bins + 1):  # ignore under/overflow bins
            freq = mutehist.get_bin_centers()[ibin]
            branch_length = self.convert_observed_changes_to_branch_length(float(freq))
            prob = mutehist.bin_contents[ibin]
            branch_lengths[mtype]['lengths'].append(branch_length)
            branch_lengths[mtype]['probs'].append(prob)
            check_sum += branch_lengths[mtype]['probs'][-1]
        if not utils.is_normed(check_sum):
            raise Exception('not normalized %f' % check_sum)

    return branch_lengths

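# The structure returned above pairs each candidate branch length with its probability.
# A downstream consumer would presumably sample from it along these lines (illustrative
# sketch only -- the region key and function name are assumptions, not partis API):
import numpy

def sample_branch_length(branch_lengths, region='v'):
    bl = branch_lengths[region]
    return numpy.random.choice(bl['lengths'], p=bl['probs'])  # valid because 'probs' was checked to sum to 1
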
def get_rescaled_trees(self, treestr, branch_length_ratios, debug=False):
    """ Trees are generated with the mean branch length observed in data over the whole sequence, because we want to use
    topologically the same tree for the whole sequence. But we observe different branch lengths for each region, so we
    need to rescale the tree for v, d, and j """
    rescaled_trees = {}
    if debug:
        print '      rescaling tree:'
    for region in utils.regions:
        # rescale the tree
        rescaled_trees[region] = treegenerator.rescale_tree(treestr, branch_length_ratios[region])
        if debug:
            print '         %s by %f (new depth %f): %s -> %s' % (region, branch_length_ratios[region], treegenerator.get_leaf_node_depths(rescaled_trees[region])['t1'], treestr, rescaled_trees[region])
        # and then check it NOTE can remove this eventually
        initial_depths = {}
        for node, depth in treegenerator.get_leaf_node_depths(treestr).items():
            initial_depths[node] = depth
        for node, depth in treegenerator.get_leaf_node_depths(rescaled_trees[region]).items():
            depth_ratio = depth / initial_depths[node]
            assert utils.is_normed(depth_ratio / branch_length_ratios[region], this_eps=1e-6)
    return rescaled_trees

def read_insertion_content(self, insertion):
    self.insertion_content_probs[insertion] = {}
    if self.args.insertion_base_content:
        with opener('r')(self.indir + '/' + insertion + '_insertion_content.csv') as icfile:
            reader = csv.DictReader(icfile)
            total = 0
            for line in reader:
                self.insertion_content_probs[insertion][line[insertion + '_insertion_content']] = int(line['count'])
                total += int(line['count'])
            for nuke in utils.nukes:
                if nuke not in self.insertion_content_probs[insertion]:
                    print '    %s not in insertion content probs, adding with zero' % nuke
                    self.insertion_content_probs[insertion][nuke] = 0
                self.insertion_content_probs[insertion][nuke] /= float(total)
    else:
        self.insertion_content_probs[insertion] = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}

    assert utils.is_normed(self.insertion_content_probs[insertion])
    if self.args.debug:
        print '  insertion content for', insertion, self.insertion_content_probs[insertion]

def read_vdj_version_freqs(self, fname):
    """ Read the frequencies at which various VDJ combinations appeared in data """
    with opener('r')(fname) as infile:
        in_data = csv.DictReader(infile)
        total = 0.0
        for line in in_data:  # NOTE do *not* assume the file is sorted
            # if int(line['cdr3_length']) == -1:
            #     continue  # couldn't find conserved codons when we were inferring things
            if self.args.only_genes is not None:  # are we restricting ourselves to a subset of genes?
                if line['v_gene'] not in self.args.only_genes:
                    continue  # oops, don't change this to a loop, 'cause you won't continue out of the right thing then
                if line['d_gene'] not in self.args.only_genes:
                    continue
                if line['j_gene'] not in self.args.only_genes:
                    continue
            total += float(line['count'])
            index = tuple(line[column] for column in utils.index_columns)
            assert index not in self.version_freq_table
            self.version_freq_table[index] = float(line['count'])

    if len(self.version_freq_table) == 0:
        print 'ERROR didn\'t find any matching gene combinations'
        assert False

    # then normalize
    test_total = 0.0
    for index in self.version_freq_table:
        self.version_freq_table[index] /= total
        test_total += self.version_freq_table[index]
    assert utils.is_normed(test_total, this_eps=1e-8)
    assert len(self.version_freq_table) < 1e8  # if it gets *too* large, choose_vdj_combo() below isn't going to work because of numerical underflow. Note there's nothing special about 1e8, it's just that I'm pretty sure we're fine *up* to that point, and once we get beyond it we should think about doing things differently

def read_vdj_version_freqs(self):
    """ Read the frequencies at which various VDJ combinations appeared in data """
    if self.args.rearrange_from_scratch:
        return None

    version_freq_table = {}
    fname = self.reco_parameter_dir + '/' + utils.get_parameter_fname('all', 'r')
    with open(fname) as infile:
        in_data = csv.DictReader(infile)
        total = 0.0
        for line in in_data:  # NOTE do *not* assume the file is sorted
            skip = False
            for region in utils.regions:
                if line[region + '_gene'] not in self.glfo['seqs'][region]:
                    skip = True
                    break
            if skip:
                continue
            total += float(line['count'])
            index = self.freqtable_index(line)
            assert index not in version_freq_table
            version_freq_table[index] = float(line['count'])

    if len(version_freq_table) == 0:
        raise Exception('didn\'t find any gene combinations in %s' % fname)

    # then normalize
    test_total = 0.0
    for index in version_freq_table:
        version_freq_table[index] /= total
        test_total += version_freq_table[index]
    assert utils.is_normed(test_total, this_eps=1e-8)
    assert len(version_freq_table) < 1e8  # if it gets *too* large, choose_vdj_combo() below isn't going to work because of numerical underflow. Note there's nothing special about 1e8, it's just that I'm pretty sure we're fine *up* to that point, and once we get beyond it we should think about doing things differently
    return version_freq_table

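# In miniature, the count -> frequency normalization that both read_vdj_version_freqs()
# variants perform (the gene names are made up for illustration; the real index tuples
# come from utils.index_columns or freqtable_index()):
counts = {('IGHV1-2*02', 'IGHD3-10*01', 'IGHJ4*02'): 12.0,
          ('IGHV1-2*02', 'IGHD2-15*01', 'IGHJ6*02'): 8.0}
total = sum(counts.values())
version_freqs = dict((index, count / total) for index, count in counts.items())
assert abs(sum(version_freqs.values()) - 1.0) < 1e-8  # the is_normed(test_total, this_eps=1e-8) check
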
def read_insertion_content(self, insertion):
    icontentprobs = {}  # NOTE this is only the probs for <insertion>, even though name is the same as in the previous function
    if insertion in utils.boundaries:  # i.e. if it's a real insertion
        with opener('r')(self.indir + '/' + insertion + '_insertion_content.csv') as icfile:
            reader = csv.DictReader(icfile)
            total = 0
            for line in reader:
                icontentprobs[line[insertion + '_insertion_content']] = int(line['count'])
                total += int(line['count'])

            if total == 0. and self.debug:
                print '\n    WARNING zero insertion content probs read from %s, so setting to uniform distribution' % (self.indir + '/' + insertion + '_insertion_content.csv')
            for nuke in utils.nukes:
                if total == 0.:
                    icontentprobs[nuke] = 1. / len(utils.nukes)
                else:
                    if nuke not in icontentprobs:
                        print '    %s not in insertion content probs, adding with zero' % nuke
                        icontentprobs[nuke] = 0
                    icontentprobs[nuke] /= float(total)
    else:  # just return uniform probs for effective (fv and jf) insertions
        icontentprobs = {n: 0.25 for n in utils.nukes}

    assert utils.is_normed(icontentprobs)
    return icontentprobs

def read_mute_freqs(self, parameter_dir):  # NOTE these are mute freqs, not branch lengths, but it's ok for now
    branch_lengths = {}
    for mtype in ['all', ] + utils.regions:
        branch_lengths[mtype] = {n: [] for n in ('lengths', 'probs')}
        mutehist = self.get_mute_hist(mtype, parameter_dir)
        branch_lengths[mtype]['mean'] = mutehist.get_mean()
        mutehist.normalize(include_overflows=False, expect_overflows=True)  # if it was written with overflows included, it'll need to be renormalized
        check_sum = 0.0
        for ibin in range(1, mutehist.n_bins + 1):  # ignore under/overflow bins
            freq = mutehist.get_bin_centers()[ibin]
            branch_length = self.convert_observed_changes_to_branch_length(float(freq))
            prob = mutehist.bin_contents[ibin]
            branch_lengths[mtype]['lengths'].append(branch_length)
            branch_lengths[mtype]['probs'].append(prob)
            check_sum += branch_lengths[mtype]['probs'][-1]
        if not utils.is_normed(check_sum):
            raise Exception('not normalized %f' % check_sum)

    if self.args.debug:
        print '  mean branch lengths'
        for mtype in ['all', ] + utils.regions:
            print '    %4s %7.3f (ratio %7.3f)' % (mtype, branch_lengths[mtype]['mean'], branch_lengths[mtype]['mean'] / branch_lengths['all']['mean'])

    return branch_lengths

def read_erosion_info(self, this_gene, approved_genes=None):
    # NOTE that d erosion lengths depend on each other... but I don't think that's modellable with an hmm. At least for the moment we integrate over the other erosion
    if approved_genes is None:
        approved_genes = [this_gene, ]
    eprobs = {}
    genes_used = set()
    for erosion in utils.all_erosions:
        if erosion[0] != self.region:
            continue
        eprobs[erosion] = {}
        if this_gene == glutils.dummy_d_genes[self.args.locus]:
            eprobs[erosion][0] = 1.  # always erode zero bases
            continue
        deps = utils.column_dependencies[erosion + '_del']
        with open(self.indir + '/' + utils.get_parameter_fname(column=erosion + '_del', deps=deps), 'r') as infile:
            reader = csv.DictReader(infile)
            for line in reader:
                # first see if we want to use this line (if <region>_gene isn't in the line, this erosion doesn't depend on gene version)
                if self.region + '_gene' in line and line[self.region + '_gene'] not in approved_genes:  # NOTE you'll need to change this if you want it to depend on another region's genes
                    continue
                # then skip nonsense erosions that're too long for this gene, but were ok for another
                if int(line[erosion + '_del']) >= len(self.germline_seq):
                    continue
                # then add in this erosion's counts
                n_eroded = int(line[erosion + '_del'])
                if n_eroded not in eprobs[erosion]:
                    eprobs[erosion][n_eroded] = 0.0
                eprobs[erosion][n_eroded] += float(line['count'])
                if self.region + '_gene' in line:
                    genes_used.add(line[self.region + '_gene'])

        if len(eprobs[erosion]) == 0:
            raise Exception('didn\'t read any %s erosion probs from %s' % (erosion, self.indir + '/' + utils.get_parameter_fname(column=erosion + '_del', deps=deps)))

        # do some smoothingy things NOTE that we normalize *after* interpolating
        if erosion in utils.real_erosions:  # for real erosions, don't interpolate if we have lots of information about neighboring bins (i.e. we're pretty confident this bin should actually be zero)
            n_max = self.n_max_to_interpolate
        else:  # for fake erosions, always interpolate
            n_max = -1
        # print '   interpolate erosions'
        interpolate_bins(eprobs[erosion], n_max, bin_eps=self.eps, max_bin=len(self.germline_seq))
        self.add_pseudocounts(eprobs[erosion])

        # and finally, normalize
        total = 0.0
        for _, val in eprobs[erosion].iteritems():
            total += val
        test_total = 0.0
        for n_eroded in eprobs[erosion]:
            eprobs[erosion][n_eroded] /= total
            test_total += eprobs[erosion][n_eroded]
        assert utils.is_normed(test_total)

    if len(genes_used) > 1 and self.debug:  # if length is 1, we will have just used the actual gene
        print '    used erosion info from:', ' '.join(genes_used)

    return eprobs

def read_erosion_info(self, this_gene, approved_genes=None):
    # NOTE that d erosion lengths depend on each other... but I don't think that's modellable with an hmm. At least for the moment we integrate over the other erosion
    if approved_genes is None:
        approved_genes = [this_gene, ]
    eprobs = {}
    genes_used = set()
    for erosion in utils.real_erosions + utils.effective_erosions:
        if erosion[0] != self.region:
            continue
        eprobs[erosion] = {}
        if this_gene == glutils.dummy_d_genes[self.args.chain]:
            eprobs[erosion][0] = 1.  # always erode zero bases
            continue
        deps = utils.column_dependencies[erosion + '_del']
        with opener('r')(self.indir + '/' + utils.get_parameter_fname(column=erosion + '_del', deps=deps)) as infile:
            reader = csv.DictReader(infile)
            for line in reader:
                # first see if we want to use this line (if <region>_gene isn't in the line, this erosion doesn't depend on gene version)
                if self.region + '_gene' in line and line[self.region + '_gene'] not in approved_genes:  # NOTE you'll need to change this if you want it to depend on another region's genes
                    continue
                # then skip nonsense erosions that're too long for this gene, but were ok for another
                if int(line[erosion + '_del']) >= len(self.germline_seq):
                    continue
                # then add in this erosion's counts
                n_eroded = int(line[erosion + '_del'])
                if n_eroded not in eprobs[erosion]:
                    eprobs[erosion][n_eroded] = 0.0
                eprobs[erosion][n_eroded] += float(line['count'])
                if self.region + '_gene' in line:
                    genes_used.add(line[self.region + '_gene'])

        if len(eprobs[erosion]) == 0:
            raise Exception('didn\'t read any %s erosion probs from %s' % (erosion, self.indir + '/' + utils.get_parameter_fname(column=erosion + '_del', deps=deps)))

        # do some smoothingy things NOTE that we normalize *after* interpolating
        if erosion in utils.real_erosions:  # for real erosions, don't interpolate if we have lots of information about neighboring bins (i.e. we're pretty confident this bin should actually be zero)
            n_max = self.n_max_to_interpolate
        else:  # for fake erosions, always interpolate
            n_max = -1
        # print '   interpolate erosions'
        interpolate_bins(eprobs[erosion], n_max, bin_eps=self.eps, max_bin=len(self.germline_seq))
        self.add_pseudocounts(eprobs[erosion])

        # and finally, normalize
        total = 0.0
        for _, val in eprobs[erosion].iteritems():
            total += val
        test_total = 0.0
        for n_eroded in eprobs[erosion]:
            eprobs[erosion][n_eroded] /= total
            test_total += eprobs[erosion][n_eroded]
        assert utils.is_normed(test_total)

    if len(genes_used) > 1 and self.debug:  # if length is 1, we will have just used the actual gene
        print '    used erosion info from:', ' '.join(genes_used)

    return eprobs

def read_insertion_info(self, this_gene, approved_genes=None):
    if approved_genes is None:  # if we aren't explicitly passed a list of genes to use, we just use the gene for which we're actually writing the hmm
        approved_genes = [this_gene, ]

    genes_used = set()
    for insertion in self.insertions:
        self.insertion_probs[insertion] = {}
        deps = utils.column_dependencies[insertion + '_insertion']
        with opener('r')(self.indir + '/' + utils.get_parameter_fname(column=insertion + '_insertion', deps=deps)) as infile:
            reader = csv.DictReader(infile)
            for line in reader:
                # first see if we want to use this line (if <region>_gene isn't in the line, this insertion doesn't depend on gene version)
                if self.region + '_gene' in line and line[self.region + '_gene'] not in approved_genes:  # NOTE you'll need to change this if you want it to depend on another region's genes
                    continue
                # then add in this insertion's counts
                n_inserted = int(line[insertion + '_insertion'])
                if n_inserted not in self.insertion_probs[insertion]:
                    self.insertion_probs[insertion][n_inserted] = 0.0
                self.insertion_probs[insertion][n_inserted] += float(line['count'])
                if self.region + '_gene' in line:
                    genes_used.add(line[self.region + '_gene'])

        assert len(self.insertion_probs[insertion]) > 0

        # print ' interpolate insertions'
        interpolate_bins(self.insertion_probs[insertion], self.n_max_to_interpolate, bin_eps=self.eps)  # , max_bin=len(self.germline_seq))  # NOTE that we normalize *after* this

        if 0 not in self.insertion_probs[insertion] or len(self.insertion_probs[insertion]) < 2:  # all hell breaks loose lower down if we haven't got shit in the way of information
            if self.args.debug:
                print '    WARNING adding pseudocount to 1-bin in insertion probs'
            self.insertion_probs[insertion][0] = 1
            self.insertion_probs[insertion][1] = 1
            if self.args.debug:
                print '      ', self.insertion_probs[insertion]

        assert 0 in self.insertion_probs[insertion] and len(self.insertion_probs[insertion]) >= 2  # all hell breaks loose lower down if we haven't got shit in the way of information

        # and finally, normalize
        total = 0.0
        for _, val in self.insertion_probs[insertion].iteritems():
            total += val
        test_total = 0.0
        for n_inserted in self.insertion_probs[insertion]:
            self.insertion_probs[insertion][n_inserted] /= total
            test_total += self.insertion_probs[insertion][n_inserted]
        assert utils.is_normed(test_total)

        if 0 not in self.insertion_probs[insertion] or self.insertion_probs[insertion][0] == 1.0:
            print 'ERROR cannot have all or none of the probability mass in the zero bin:', self.insertion_probs[insertion]
            assert False

        # self.insertion_content_probs = {}
        self.read_insertion_content(insertion)  # also read the base content of the insertions

    if len(genes_used) > 1:  # if length is 1, we will have just used the actual gene
        if self.args.debug:
            print '    insertions used:', ' '.join(genes_used)

def read_insertion_content(self, insertion):
    self.insertion_content_probs[insertion] = {}
    if insertion in utils.boundaries:  # i.e. if it's a real insertion
        with opener('r')(self.indir + '/' + insertion + '_insertion_content.csv') as icfile:
            reader = csv.DictReader(icfile)
            total = 0
            for line in reader:
                self.insertion_content_probs[insertion][line[insertion + '_insertion_content']] = int(line['count'])
                total += int(line['count'])

            if total == 0.:
                print '\n    WARNING zero insertion content probs read from %s, so setting to uniform distribution' % (self.indir + '/' + insertion + '_insertion_content.csv')
            for nuke in utils.nukes:
                if total == 0.:
                    self.insertion_content_probs[insertion][nuke] = 1. / len(utils.nukes)
                else:
                    if nuke not in self.insertion_content_probs[insertion]:
                        print '    %s not in insertion content probs, adding with zero' % nuke
                        self.insertion_content_probs[insertion][nuke] = 0
                    self.insertion_content_probs[insertion][nuke] /= float(total)
    else:  # just return uniform probs for effective (fv and jf) insertions
        self.insertion_content_probs[insertion] = {n: 0.25 for n in utils.nukes}

    assert utils.is_normed(self.insertion_content_probs[insertion])
    if self.args.debug:
        print '  insertion content for', insertion, self.insertion_content_probs[insertion]

def check_tree_lengths(self, treefname, ages):
    treestrs = []
    with opener('r')(treefname) as treefile:
        for line in treefile:
            treestrs.append(line.split(';')[0] + ';')  # ignore the info I added after the ';'
    if self.args.debug > 1:
        print '  checking branch lengths... '
    assert len(treestrs) == len(ages)
    total_length, total_leaves = 0.0, 0
    for itree in range(len(ages)):
        if self.args.debug > 1:
            print '    asked for', ages[itree],
        for name, depth in get_leaf_node_depths(treestrs[itree]).items():
            if self.args.debug > 1:
                print '%s:%f' % (name, depth),
            if not utils.is_normed(depth / ages[itree], this_eps=1e-6):
                raise Exception('asked for branch length %f but got %f\n   %s' % (ages[itree], depth, treestrs[itree]))  # ratio of <age> (requested length) and <length> (length in the tree file) should be 1 within float precision
        total_length += ages[itree]
        total_leaves += len(re.findall('t', treestrs[itree]))
        if self.args.debug > 1:
            print ''
    if self.args.debug:
        print '    mean branch length %.5f' % (total_length / len(ages))
        print '    mean n leaves %.2f' % (float(total_leaves) / len(ages))

def normalize(self, include_overflows=True, expect_empty=False, expect_overflows=False, overflow_eps_to_ignore=1e-15):
    sum_value = self.integral(include_overflows)
    imin, imax = self.get_bounds(include_overflows)
    if sum_value == 0.0:
        if not expect_empty:
            print 'WARNING sum zero in Hist::normalize()'
        return
    if not expect_overflows and not include_overflows and (self.bin_contents[0] / sum_value > overflow_eps_to_ignore or self.bin_contents[self.n_bins + 1] / sum_value > overflow_eps_to_ignore):
        print 'WARNING under/overflows in Hist::normalize()'
    for ib in range(imin, imax):
        self.bin_contents[ib] /= sum_value
        if self.sum_weights_squared is not None:
            self.sum_weights_squared[ib] /= sum_value * sum_value
        if self.errors is not None:
            self.errors[ib] /= sum_value
    check_sum = 0.0
    for ib in range(imin, imax):  # check it
        check_sum += self.bin_contents[ib]
    if not is_normed(check_sum, this_eps=1e-10):
        raise Exception('not normalized: %f' % check_sum)

def normalize(self, overflow_warn=True):  # since when you normalize hists you have to make the arbitrary decision whether you're going to include the under/overflow bins (we don't include them here), in general we prefer to avoid having under/overflow entries
    """ NOTE does not multiply/divide by bin widths """
    sum_value = 0.0
    for ib in range(1, self.n_bins + 1):  # don't include under/overflows
        sum_value += self.bin_contents[ib]
    if sum_value == 0.0:
        print 'WARNING sum zero in Hist::normalize(), returning without doing anything'
        return
    # make sure there's not too much stuff in the under/overflows
    if overflow_warn and (self.bin_contents[0] / sum_value > 1e-10 or self.bin_contents[self.n_bins + 1] / sum_value > 1e-10):
        print 'WARNING under/overflows in Hist::normalize()'
    for ib in range(1, self.n_bins + 1):
        self.bin_contents[ib] /= sum_value
        if self.sum_weights_squared is not None:
            self.sum_weights_squared[ib] /= sum_value * sum_value
        if self.errors is not None:
            self.errors[ib] /= sum_value
    check_sum = 0.0
    for ib in range(1, self.n_bins + 1):  # check it
        check_sum += self.bin_contents[ib]
    assert is_normed(check_sum, this_eps=1e-10)

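# The invariant every normalize() variant in this file enforces, in miniature (plain
# lists standing in for the Hist bin arrays -- nothing partis-specific):
bin_contents = [0.0, 3.0, 5.0, 2.0, 0.0]      # [underflow, bin 1 .. bin 3, overflow]
sum_value = sum(bin_contents[1:-1])           # under/overflow bins excluded from the sum
normed = [c / sum_value for c in bin_contents[1:-1]]
assert abs(sum(normed) - 1.0) < 1e-10         # the final check_sum test
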
def read_mute_freqs(self, mute_freq_dir):  # NOTE these are mute freqs, not branch lengths, but it's ok for now
    for mtype in ['all', ] + utils.regions:
        infname = mute_freq_dir + '/' + mtype + '-mean-mute-freqs.csv'
        self.branch_lengths[mtype] = {}
        self.branch_lengths[mtype]['lengths'], self.branch_lengths[mtype]['probs'] = [], []
        mutehist = Hist(fname=infname)
        self.branch_lengths[mtype]['mean'] = mutehist.get_mean()

        # if mutehist.GetBinContent(0) > 0.0 or mutehist.GetBinContent(mutehist.GetNbinsX()+1) > 0.0:
        #     print 'WARNING nonzero under/overflow bins read from %s' % infname

        mutehist.normalize(include_overflows=False, overflow_eps_to_ignore=1e-2)  # if it was written with overflows included, it'll need to be renormalized
        check_sum = 0.0
        for ibin in range(1, mutehist.n_bins + 1):  # ignore under/overflow bins
            freq = mutehist.get_bin_centers()[ibin]
            branch_length = self.convert_observed_changes_to_branch_length(float(freq))
            prob = mutehist.bin_contents[ibin]
            self.branch_lengths[mtype]['lengths'].append(branch_length)
            self.branch_lengths[mtype]['probs'].append(prob)
            check_sum += self.branch_lengths[mtype]['probs'][-1]
        if not utils.is_normed(check_sum):
            raise Exception('not normalized %f' % check_sum)

    if self.args.debug:
        print '  mean branch lengths'
        for mtype in ['all', ] + utils.regions:
            print '    %4s %7.3f (ratio %7.3f)' % (mtype, self.branch_lengths[mtype]['mean'], self.branch_lengths[mtype]['mean'] / self.branch_lengths['all']['mean'])

def read_insertion_content(self):
    self.insertion_content_probs = {}
    for bound in utils.boundaries:
        self.insertion_content_probs[bound] = {}
        if self.args.insertion_base_content:
            with opener('r')(self.args.parameter_dir + '/' + bound + '_insertion_content.csv') as icfile:
                reader = csv.DictReader(icfile)
                total = 0
                for line in reader:
                    self.insertion_content_probs[bound][line[bound + '_insertion_content']] = int(line['count'])
                    total += int(line['count'])
                for nuke in utils.nukes:
                    if nuke not in self.insertion_content_probs[bound]:
                        print '    %s not in insertion content probs, adding with zero' % nuke
                        self.insertion_content_probs[bound][nuke] = 0
                    self.insertion_content_probs[bound][nuke] /= float(total)
        else:
            self.insertion_content_probs[bound] = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}

        assert utils.is_normed(self.insertion_content_probs[bound])

def read_erosion_info(self, this_gene, approved_genes=None):
    # NOTE that d erosion lengths depend on each other... but I don't think that's modellable with an hmm. At least for the moment we integrate over the other erosion
    if approved_genes is None:
        approved_genes = [this_gene]
    genes_used = set()
    for erosion in utils.real_erosions + utils.effective_erosions:
        if erosion[0] != self.region:
            continue
        self.erosion_probs[erosion] = {}
        deps = utils.column_dependencies[erosion + '_del']
        with opener('r')(self.indir + '/' + utils.get_parameter_fname(column=erosion + '_del', deps=deps)) as infile:
            reader = csv.DictReader(infile)
            for line in reader:
                # first see if we want to use this line (if <region>_gene isn't in the line, this erosion doesn't depend on gene version)
                if self.region + '_gene' in line and line[self.region + '_gene'] not in approved_genes:  # NOTE you'll need to change this if you want it to depend on another region's genes
                    continue
                # then skip nonsense erosions that're too long for this gene, but were ok for another
                if int(line[erosion + '_del']) >= len(self.germline_seq):
                    continue
                # then add in this erosion's counts
                n_eroded = int(line[erosion + '_del'])
                if n_eroded not in self.erosion_probs[erosion]:
                    self.erosion_probs[erosion][n_eroded] = 0.0
                self.erosion_probs[erosion][n_eroded] += float(line['count'])
                if self.region + '_gene' in line:
                    genes_used.add(line[self.region + '_gene'])

        assert len(self.erosion_probs[erosion]) > 0

        # do some smoothingy things NOTE that we normalize *after* interpolating
        if erosion in utils.real_erosions:  # for real erosions, don't interpolate if we have lots of information about neighboring bins (i.e. we're pretty confident this bin should actually be zero)
            n_max = self.n_max_to_interpolate
        else:  # for fake erosions, always interpolate
            n_max = -1
        # print '   interpolate erosions'
        interpolate_bins(self.erosion_probs[erosion], n_max, bin_eps=self.eps, max_bin=len(self.germline_seq))
        self.add_pseudocounts(self.erosion_probs[erosion])

        # and finally, normalize
        total = 0.0
        for _, val in self.erosion_probs[erosion].iteritems():
            total += val
        test_total = 0.0
        for n_eroded in self.erosion_probs[erosion]:
            self.erosion_probs[erosion][n_eroded] /= total
            test_total += self.erosion_probs[erosion][n_eroded]
        assert utils.is_normed(test_total)

    if len(genes_used) > 1:  # if length is 1, we will have just used the actual gene
        if self.args.debug:
            print '    erosions used:', ' '.join(genes_used)

def read_vdj_version_freqs(self):
    """ Read the frequencies at which various VDJ combinations appeared in data """
    if self.args.rearrange_from_scratch:
        return None

    version_freq_table = {}
    fname = self.parameter_dir + '/' + utils.get_parameter_fname('all')
    with opener('r')(fname) as infile:
        in_data = csv.DictReader(infile)
        total = 0.0
        for line in in_data:  # NOTE do *not* assume the file is sorted
            skip = False
            for region in utils.regions:
                if line[region + '_gene'] not in self.glfo['seqs'][region]:
                    skip = True
                    break
            if skip:
                continue
            total += float(line['count'])
            index = self.freqtable_index(line)
            assert index not in version_freq_table
            version_freq_table[index] = float(line['count'])

    if len(version_freq_table) == 0:
        raise Exception('didn\'t find any gene combinations in %s' % fname)

    # then normalize
    test_total = 0.0
    for index in version_freq_table:
        version_freq_table[index] /= total
        test_total += version_freq_table[index]
    assert utils.is_normed(test_total, this_eps=1e-8)
    assert len(version_freq_table) < 1e8  # if it gets *too* large, choose_vdj_combo() below isn't going to work because of numerical underflow. Note there's nothing special about 1e8, it's just that I'm pretty sure we're fine *up* to that point, and once we get beyond it we should think about doing things differently
    return version_freq_table

def read_insertion_content(self, insertion):
    icontentprobs = {}  # NOTE this is only the probs for <insertion>, even though name is the same as in the previous function
    if insertion in utils.boundaries:  # i.e. if it's a real insertion
        with open(self.indir + '/' + insertion + '_insertion_content.csv', 'r') as icfile:
            reader = csv.DictReader(icfile)
            total = 0
            for line in reader:
                icontentprobs[line[insertion + '_insertion_content']] = int(line['count'])
                total += int(line['count'])

            if total == 0. and self.debug:
                print '\n    WARNING zero insertion content probs read from %s, so setting to uniform distribution' % (self.indir + '/' + insertion + '_insertion_content.csv')
            for nuke in utils.nukes:
                if total == 0.:
                    icontentprobs[nuke] = 1. / len(utils.nukes)
                else:
                    if nuke not in icontentprobs:
                        print '    %s not in insertion content probs, adding with zero' % nuke
                        icontentprobs[nuke] = 0
                    icontentprobs[nuke] /= float(total)
    else:  # just return uniform probs for effective (fv and jf) insertions
        icontentprobs = {n: 0.25 for n in utils.nukes}

    assert utils.is_normed(icontentprobs)
    return icontentprobs

def read_mute_freqs(self, mute_freq_dir):  # NOTE these are mute freqs, not branch lengths, but it's ok for now
    for mtype in ['all', ] + utils.regions:
        infname = mute_freq_dir + '/' + mtype + '-mean-mute-freqs.csv'
        self.branch_lengths[mtype] = {}
        self.branch_lengths[mtype]['lengths'], self.branch_lengths[mtype]['probs'] = [], []
        mutehist = plotting.make_hist_from_bin_entry_file(infname, mtype + '-mute-freqs')
        self.branch_lengths[mtype]['mean'] = mutehist.GetMean()

        if mutehist.GetBinContent(0) > 0.0 or mutehist.GetBinContent(mutehist.GetNbinsX() + 1) > 0.0:
            print 'WARNING nonzero under/overflow bins read from %s' % infname

        check_sum = 0.0
        for ibin in range(1, mutehist.GetNbinsX() + 1):  # ignore under/overflow bins
            freq = mutehist.GetBinCenter(ibin)
            branch_length = float(freq)
            prob = mutehist.GetBinContent(ibin)
            self.branch_lengths[mtype]['lengths'].append(branch_length)
            self.branch_lengths[mtype]['probs'].append(prob)
            check_sum += self.branch_lengths[mtype]['probs'][-1]
        assert utils.is_normed(check_sum)

    if self.args.debug:
        print '  mean branch lengths'
        for mtype in ['all', ] + utils.regions:
            print '    %4s %7.3f (ratio %7.3f)' % (mtype, self.branch_lengths[mtype]['mean'], self.branch_lengths[mtype]['mean'] / self.branch_lengths['all']['mean'])

def read_insertion_content(self):
    if self.args.rearrange_from_scratch:
        return {b: {n: 1. / len(utils.nukes) for n in utils.nukes} for b in utils.boundaries}

    insertion_content_probs = {}
    for bound in utils.boundaries:
        insertion_content_probs[bound] = {}
        with open(self.parameter_dir + '/' + bound + '_insertion_content.csv', 'r') as icfile:
            reader = csv.DictReader(icfile)
            total = 0
            for line in reader:
                insertion_content_probs[bound][line[bound + '_insertion_content']] = int(line['count'])
                total += int(line['count'])
            for nuke in utils.nukes:
                if nuke not in insertion_content_probs[bound]:
                    print '    %s not in insertion content probs, adding with zero' % nuke
                    insertion_content_probs[bound][nuke] = 0
                insertion_content_probs[bound][nuke] /= float(total)
        assert utils.is_normed(insertion_content_probs[bound])

    return insertion_content_probs

def write_mute_freqs(self, region, gene_or_insert_name, seq, reco_event, reco_seq_fname, is_insertion=False):
    """ Read position-by-position mute freqs from disk for <gene_or_insert_name>, renormalize, then write to a file for bppseqgen. """
    mute_freqs = self.get_mute_freqs(gene_or_insert_name)

    rates = []  # list with a relative mutation rate for each position in <seq>
    total = 0.0
    # assert len(mute_freqs) == len(seq)  # only equal length if no erosions NO oh right but mute_freqs only covers areas we could align to...
    left_erosion_length = dict(reco_event.erosions.items() + reco_event.effective_erosions.items())[region + '_5p']
    for inuke in range(len(seq)):  # append a freq for each nuke
        position = inuke + left_erosion_length
        freq = 0.0
        if position in mute_freqs:
            freq = mute_freqs[position]
        else:
            freq = mute_freqs['overall_mean']
        rates.append(freq)
        total += freq

    # normalize to the number of sites (i.e. so an average site is given value 1.0)
    assert total != 0.0  # I am not hip enough to divide by zero
    for inuke in range(len(seq)):
        rates[inuke] *= float(len(seq)) / total
    total = 0.0

    # and... double check it, just for shits and giggles
    for inuke in range(len(seq)):
        total += rates[inuke]
    assert utils.is_normed(total / float(len(seq)))
    assert len(rates) == len(seq)  # you just can't be too careful. what if gremlins ate a few while python wasn't looking?

    # write the input file for bppseqgen, one base per line
    with opener('w')(reco_seq_fname) as reco_seq_file:
        reco_seq_file.write('state\trate\n')
        for inuke in range(len(seq)):
            reco_seq_file.write('%s\t%.15f\n' % (seq[inuke], rates[inuke]))

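# The site-rate renormalization above, reduced to its arithmetic: scale per-position
# frequencies so the *average* position gets rate 1.0 (toy freqs, not real parameters):
freqs = [0.02, 0.10, 0.04]                         # per-position mutation freqs for a 3-base seq
rates = [f * len(freqs) / sum(freqs) for f in freqs]
assert abs(sum(rates) / len(rates) - 1.0) < 1e-10  # mean rate is 1.0, which is what the is_normed() check verifies
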
def check(self):
    total = 0.0
    for _, prob in self.transitions.iteritems():
        assert prob >= 0.0
        total += prob
    if not utils.is_normed(total):
        raise Exception('transition probs not normed in %s: %s' % (self.name, self.transitions))

    if self.name == 'init':  # no emissions for 'init' state
        return

    if self.emissions is not None:
        total = 0.0
        for _, prob in self.emissions['probs'].iteritems():
            assert prob >= 0.0
            total += prob
        assert utils.is_normed(total)

def add_region_entry_transitions(self, state, insertion):
    """ Add transitions *into* the v, d, or j regions. Called on either the 'init' state or the 'insert_left' state.
    For v, this is (mostly) the prob that the read doesn't extend all the way to the left side of the v gene.
    For d and j, this is (mostly) the prob to actually erode on the left side.
    The two <mostly>s are there because in both cases, we're starting from *approximate* smith-waterman alignments, so we need to add some fuzz in case the s-w is off.
    """
    assert 'jf' not in insertion  # need these to only be *left*-hand insertions
    assert state.name == 'init' or 'insert' in state.name

    # first add transitions to the insert state
    region_entry_prob = 0.0  # prob to go to an internal germline state (i.e. not to an insert state)
    if state.name == 'init':
        if insertion == '':
            region_entry_prob = 1.0  # if no insert state on this side (i.e. we're on left side of v), we have no choice but to enter the region (the internal states)
        else:
            region_entry_prob = self.get_zero_length_insertion_prob(insertion)  # prob of entering the region from 'init' is the prob of a zero-length insertion
    elif 'insert' in state.name:
        region_entry_prob = 1.0 - self.get_insert_self_transition_prob(insertion)  # the 'insert_left' state has to either go to itself, or else enter the region
    else:
        assert False

    # If this is an 'init' state, we add a transition to 'insert' with probability the observed probability of a non-zero insertion
    # Whereas if this is an 'insert' state, we add a *self*-transition with probability 1/<mean observed insert length>
    # update: now, we also multiply by the insertion content prob, since we now have four insert states (and can thus no longer use this prob in the emissions)
    if insertion != '' and region_entry_prob < 1.0:
        if insertion not in utils.boundaries:
            nukelist = ['N', ]
        else:
            nukelist = utils.nukes
        for nuke in nukelist:
            content_prob = 1. if nuke == 'N' else self.insertion_content_probs[insertion][nuke]
            state.add_transition('insert_left_' + nuke, (1.0 - region_entry_prob) * content_prob)

    # then add transitions to the region's internal states
    total = 0.0
    if self.region == 'v':  # only add a transition to the zeroth internal state
        state.add_transition('%s_%d' % (self.saniname, 0), region_entry_prob)
        total += region_entry_prob
        self.smallest_entry_index = 0
    else:
        erosion = self.region + '_5p'
        for inuke in range(len(self.germline_seq)):
            erosion_length = inuke
            if erosion_length in self.erosion_probs[erosion]:
                prob = self.erosion_probs[erosion][erosion_length]
                total += prob * region_entry_prob
                if region_entry_prob != 0.0:  # only add the line if there's a chance of entering the region from this state
                    state.add_transition('%s_%d' % (self.saniname, inuke), prob * region_entry_prob)
                    if self.smallest_entry_index == -1 or inuke < self.smallest_entry_index:  # tells us where we need to start adding internal states (the smallest internal state index we add is the first one that has nonzero transition probability here)
                        self.smallest_entry_index = inuke
                else:
                    assert state.name == 'init' or self.raw_name == glutils.dummy_d_genes[self.args.locus]  # if there's *no* chance of entering the region, this better *not* be the 'insert_left' state (UPDATE: or, it can be the dummy d)

    if region_entry_prob != 0.0 and not utils.is_normed(total / region_entry_prob):
        raise Exception('normalization problem in add_region_entry_transitions():\n  region_entry_prob: %f   total / region_entry_prob: %f' % (region_entry_prob, total / region_entry_prob))

def add_region_entry_transitions(self, state, insertion):
    """ Add transitions *into* the v, d, or j regions. Called from either the 'init' state or the 'insert_left' state.
    For v, this is (mostly) the prob that the read doesn't extend all the way to the left side of the v gene.
    For d and j, this is (mostly) the prob to actually erode on the left side.
    The two <mostly>s are there because in both cases, we're starting from *approximate* smith-waterman alignments, so we need to add some fuzz in case the s-w is off.
    """
    assert 'jf' not in insertion  # need these to only be *left*-hand insertions
    assert state.name == 'init' or 'insert' in state.name

    # first add transitions to the insert state
    region_entry_prob = 0.0  # prob to go to an internal germline state (i.e. not to an insert state)
    if state.name == 'init':
        if insertion == '':
            region_entry_prob = 1.0  # if no insert state on this side (i.e. we're on left side of v), we have no choice but to enter the region (the internal states)
        else:
            region_entry_prob = self.get_zero_length_insertion_prob(insertion)  # prob of entering the region from 'init' is the prob of a zero-length insertion
    elif 'insert' in state.name:
        region_entry_prob = 1.0 - self.get_insert_self_transition_prob(insertion)  # the 'insert_left' state has to either go to itself, or else enter the region
    else:
        assert False

    # If this is an 'init' state, we add a transition to 'insert' with probability the observed probability of a non-zero insertion
    # Whereas if this is an 'insert' state, we add a *self*-transition with probability 1/<mean observed insert length>
    # update: now, we also multiply by the insertion content prob, since we now have four insert states (and can thus no longer use this prob in the emissions)
    if insertion != '':
        if insertion not in utils.boundaries:
            nukelist = ['N', ]
        else:
            nukelist = utils.nukes
        for nuke in nukelist:
            content_prob = 1. if nuke == 'N' else self.insertion_content_probs[insertion][nuke]
            state.add_transition('insert_left_' + nuke, (1.0 - region_entry_prob) * content_prob)

    # then add transitions to the region's internal states
    total = 0.0
    if self.region == 'v':  # only add a transition to the zeroth internal state
        state.add_transition('%s_%d' % (self.saniname, 0), region_entry_prob)
        total += region_entry_prob
        self.smallest_entry_index = 0
    else:
        erosion = self.region + '_5p'
        for inuke in range(len(self.germline_seq)):
            erosion_length = inuke
            if erosion_length in self.erosion_probs[erosion]:
                prob = self.erosion_probs[erosion][erosion_length]
                total += prob * region_entry_prob
                if region_entry_prob != 0.0:  # only add the line if there's a chance of entering the region from this state
                    state.add_transition('%s_%d' % (self.saniname, inuke), prob * region_entry_prob)
                    if self.smallest_entry_index == -1 or inuke < self.smallest_entry_index:  # tells us where we need to start adding internal states (the smallest internal state index we add is the first one that has nonzero transition probability here)
                        self.smallest_entry_index = inuke
                else:
                    assert state.name == 'init'  # if there's *no* chance of entering the region, this better *not* be the 'insert_left' state

    if region_entry_prob != 0.0 and not utils.is_normed(total / region_entry_prob):
        raise Exception('normalization problem in add_region_entry_transitions():\n  region_entry_prob: %f   total / region_entry_prob: %f' % (region_entry_prob, total / region_entry_prob))

def choose_allele_prevalence_freqs(glfo, allele_prevalence_freqs, region, min_allele_prevalence_freq, debug=False):
    n_alleles = len(glfo['seqs'][region])
    prevalence_counts = numpy.random.randint(1, int(1. / min_allele_prevalence_freq), size=n_alleles)  # ensures that each pair of alleles has a prevalence ratio between <min_allele_prevalence_freq> and 1. NOTE it's inclusive
    prevalence_freqs = [float(c) / sum(prevalence_counts) for c in prevalence_counts]
    allele_prevalence_freqs[region] = {g: f for g, f in zip(glfo['seqs'][region].keys(), prevalence_freqs)}
    assert utils.is_normed(allele_prevalence_freqs[region])
    if debug:
        print '      counts %s' % ' '.join([('%5d' % c) for c in prevalence_counts])
        print '       freqs %s' % ' '.join([('%5.3f' % c) for c in prevalence_freqs])
        print '   min ratio %.3f' % (min(prevalence_freqs) / max(prevalence_freqs))

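# Why the randint bounds above bound the prevalence ratio: every count lands in
# [1, int(1/min_freq) - 1], so min(count)/max(count) >= 1/(int(1/min_freq) - 1), which
# is greater than min_freq. A quick numeric check with assumed values:
import numpy
min_freq = 0.1
counts = numpy.random.randint(1, int(1. / min_freq), size=4)  # each count is in [1, 9]
freqs = [float(c) / sum(counts) for c in counts]
assert min(freqs) / max(freqs) > min_freq
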
def add_region_entry_transitions(self, state, insertion):
    """ Add transitions *into* the v, d, or j regions. Called from either the 'init' state or the 'insert_left' state.
    For v, this is (mostly) the prob that the read doesn't extend all the way to the left side of the v gene.
    For d and j, this is (mostly) the prob to actually erode on the left side.
    The two <mostly>s are there because in both cases, we're starting from *approximate* smith-waterman alignments, so we need to add some fuzz in case the s-w is off.
    """
    assert 'jf' not in insertion  # need these to only be *left*-hand insertions
    assert state.name == 'init' or 'insert' in state.name

    region_entry_prob = 0.0  # Prob to go directly into the region (i.e. with no insertion)
    # The sum of the region entry probs must be (1 - non_zero_insertion_prob) for d and j
    # (i.e. such that [prob of transitions to insert] + [prob of transitions *not* to insert] is 1.0)

    # first add transitions to the insert state
    if state.name == 'init':
        if insertion == '':
            region_entry_prob = 1.0  # if no insert state on this side (i.e. we're on left side of v), we have no choice but to enter the region (the internal states)
        else:
            region_entry_prob = self.insertion_probs[insertion][0]  # prob of entering the region from 'init' is the prob of a zero-length insertion
    elif 'insert' in state.name:
        region_entry_prob = 1.0 - self.get_insert_self_transition_prob(insertion)  # the 'insert_left' state has to either go to itself, or else enter the region
    else:
        assert False

    # If this is an 'init' state, we add a transition to 'insert' with probability the observed probability of a non-zero insertion
    # Whereas if this is an 'insert' state, we add a *self*-transition with probability 1/<mean observed insert length>
    # update: now, we also multiply by the insertion content prob, since we now have four insert states (and can thus no longer use this prob in the emissions)
    if insertion != '':
        for nuke in utils.nukes:
            state.add_transition('insert_left_' + nuke, (1.0 - region_entry_prob) * self.insertion_content_probs[insertion][nuke])

    # then add transitions to the region's internal states
    erosion = self.region + '_5p'
    total = 0.0
    for inuke in range(len(self.germline_seq)):
        erosion_length = inuke
        if erosion_length in self.erosion_probs[erosion]:
            prob = self.erosion_probs[erosion][erosion_length]
            total += prob * region_entry_prob
            if region_entry_prob != 0.0:  # only add the line if there's a chance of entering the region from this state
                state.add_transition('%s_%d' % (self.saniname, inuke), prob * region_entry_prob)
                if self.smallest_entry_index == -1 or inuke < self.smallest_entry_index:
                    self.smallest_entry_index = inuke
            else:
                assert state.name == 'init'  # if there's *no* chance of entering the region, this better *not* be the 'insert_left' state

    assert region_entry_prob == 0.0 or utils.is_normed(total / region_entry_prob)

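# Bookkeeping check for the transitions added above: out of 'init', the insert transitions
# carry (1 - p_zero) split by base content, and the region-entry transitions carry p_zero
# split by 5p erosion length, so everything sums to 1. Toy numbers, not real parameters:
p_zero = 0.3                                        # prob of a zero-length insertion
content = {'A': 0.4, 'C': 0.2, 'G': 0.2, 'T': 0.2}  # insertion base content probs
erosion = {0: 0.5, 1: 0.3, 2: 0.2}                  # d_5p erosion length probs
transitions = dict(('insert_left_' + n, (1.0 - p_zero) * content[n]) for n in content)
for n_eroded, prob in erosion.items():
    transitions['d_%d' % n_eroded] = p_zero * prob
assert abs(sum(transitions.values()) - 1.0) < 1e-10
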
def set_branch_lengths(self, parameter_dir):
    self.branch_lengths = {}
    for mtype in ['all'] + utils.regions:
        hist = self.get_mute_hist(mtype, parameter_dir)
        hist.normalize(include_overflows=False, expect_overflows=True)  # if it was written with overflows included, it'll need to be renormalized
        lengths, probs = [], []
        for ibin in range(1, hist.n_bins + 1):  # ignore under/overflow bins
            freq = hist.get_bin_centers()[ibin]
            lengths.append(self.convert_observed_changes_to_branch_length(float(freq)))
            probs.append(hist.bin_contents[ibin])
        self.branch_lengths[mtype] = {'mean': hist.get_mean(), 'lengths': lengths, 'probs': probs}

        if not utils.is_normed(probs):
            raise Exception('not normalized %f' % sum(probs))

def write_mute_freqs(self, gene, seq, reco_event, reco_seq_fname):  # TODO unsurprisingly, this function profiles out to be kind of a dumb way to do it, in terms of run time
    """ Read position-by-position mute freqs from disk for <gene>, renormalize, then write to a file for bppseqgen. """
    mute_freqs = self.get_mute_freqs(gene)

    rates = []  # list with a relative mutation rate for each position in <seq>
    total = 0.0
    # assert len(mute_freqs) == len(seq)  # only equal length if no erosions NO oh right but mute_freqs only covers areas we could align to...
    left_erosion_length = dict(reco_event.erosions.items() + reco_event.effective_erosions.items())[utils.get_region(gene) + '_5p']
    for inuke in range(len(seq)):  # append a freq for each nuke
        position = inuke + left_erosion_length
        freq = 0.0
        if position in mute_freqs:
            freq = mute_freqs[position]
        else:
            freq = mute_freqs['overall_mean']
        rates.append(freq)
        total += freq

    # normalize to the number of sites (i.e. so an average site is given value 1.0)
    assert total != 0.0  # I am not hip enough to divide by zero
    for inuke in range(len(seq)):
        rates[inuke] *= float(len(seq)) / total
    total = 0.0

    # and... double check it, just for shits and giggles
    for inuke in range(len(seq)):
        total += rates[inuke]
    assert utils.is_normed(total / float(len(seq)))
    assert len(rates) == len(seq)  # you just can't be too careful. what if gremlins ate a few while python wasn't looking?

    # write the input file for bppseqgen, one base per line
    with open(reco_seq_fname, 'w') as reco_seq_file:  # NOTE really not sure why this doesn't really [seems to require an "extra" column] work with csv.DictWriter, but it doesn't -- bppseqgen barfs (I think maybe it expects a different newline character? don't feel like working it out)
        headstr = 'state'
        if not self.args.mutate_from_scratch:
            headstr += '\trate'
        reco_seq_file.write(headstr + '\n')
        for inuke in range(len(seq)):
            linestr = seq[inuke]
            if not self.args.mutate_from_scratch:
                linestr += '\t%f' % rates[inuke]
            reco_seq_file.write(linestr + '\n')

def simulate(args):
    if utils.output_exists(args, args.simfname):
        return

    cmd_str = args.partis_path + ' simulate --n-sim-events ' + str(args.n_sim_events) + ' --outfname ' + args.simfname + ' --n-leaves ' + str(args.n_leaves) + ' --rearrange-from-scratch --shm-parameter-dir ' + partis_dir + '/data/recombinator/scratch-parameters'
    if args.n_leaf_distribution is None:
        cmd_str += ' --constant-number-of-leaves'
    else:
        cmd_str += ' --n-leaf-distribution ' + args.n_leaf_distribution
    if args.mut_mult is not None:
        cmd_str += ' --mutation-multiplier ' + str(args.mut_mult)
    if args.root_mrca_weibull_parameter is not None:
        cmd_str += ' --root-mrca-weibull-parameter ' + str(args.root_mrca_weibull_parameter)

    cmd_str += ' --n-procs ' + str(args.n_procs)
    if args.slurm:
        cmd_str += ' --batch-system slurm --subsimproc'

    allele_prevalence_fname = args.workdir + '/allele-prevalence-freqs.csv'

    # figure what genes we're using
    if args.gls_gen:
        assert args.sim_v_genes is None and args.allele_prevalence_freqs is None
        sglfo = glutils.read_glfo(args.default_germline_dir, locus=args.locus)
        glutils.remove_v_genes_with_bad_cysteines(sglfo)
        glutils.generate_germline_set(sglfo, args.n_genes_per_region, args.n_sim_alleles_per_gene, args.min_allele_prevalence_freq, allele_prevalence_fname, new_allele_info=args.new_allele_info, dont_remove_template_genes=args.dont_remove_template_genes, debug=True)
        cmd_str += ' --allele-prevalence-fname ' + allele_prevalence_fname
    else:
        sglfo = glutils.read_glfo(args.default_germline_dir, locus=args.locus, only_genes=(args.sim_v_genes + args.dj_genes))
        added_snp_names = glutils.generate_new_alleles(sglfo, args.new_allele_info, debug=True, remove_template_genes=(not args.dont_remove_template_genes))  # NOTE template gene removal is the default for glutils.generate_germline_set
        if args.allele_prevalence_freqs is not None:
            if not utils.is_normed(args.allele_prevalence_freqs):
                raise Exception('--allele-prevalence-freqs %s not normalized' % args.allele_prevalence_freqs)
            if len(args.allele_prevalence_freqs) != len(sglfo['seqs']['v']):  # already checked when parsing args, but, you know...
                raise Exception('--allele-prevalence-freqs %d not the same length as sglfo %d' % (len(args.allele_prevalence_freqs), len(sglfo['seqs']['v'])))
            gene_list = sorted(sglfo['seqs']['v']) if len(added_snp_names) == 0 else list(set(args.sim_v_genes)) + added_snp_names
            prevalence_freqs = {'v': {g: f for g, f in zip(gene_list, args.allele_prevalence_freqs)}, 'd': {}, 'j': {}}
            glutils.write_allele_prevalence_freqs(prevalence_freqs, allele_prevalence_fname)
            cmd_str += ' --allele-prevalence-fname ' + allele_prevalence_fname

    glutils.write_glfo(args.outdir + '/germlines/simulation', sglfo)
    cmd_str += ' --initial-germline-dir ' + args.outdir + '/germlines/simulation'
    # glutils.print_glfo(sglfo)

    # run simulation
    if args.seed is not None:
        cmd_str += ' --seed ' + str(args.seed)
    utils.simplerun(cmd_str, dryrun=args.dry_run)

def read_allele_prevalence_freqs(fname, debug=False):
    # NOTE kinda weird to mash all the regions into one file here (as compared to parametercounter), but it seems to make more sense
    allele_prevalence_freqs = {r: {} for r in utils.regions}
    with open(fname) as pfile:
        reader = csv.DictReader(pfile)
        for line in reader:
            allele_prevalence_freqs[utils.get_region(line['gene'])][line['gene']] = float(line['freq'])
    for region in utils.regions:
        if len(allele_prevalence_freqs[region]) == 0:
            continue
        if debug:
            for gene, freq in allele_prevalence_freqs[region].items():
                print '%14.8f   %s' % (freq, utils.color_gene(gene))
        assert utils.is_normed(allele_prevalence_freqs[region])
    return allele_prevalence_freqs

def rescale_tree(treestr, new_height, debug=False):
    """ rescale the branch lengths in <treestr> (newick-formatted) so that the mean leaf height is <new_height> """
    tree = get_btree(treestr)
    mean_height = get_mean_height(treestr)
    for ln in tree.Objects:
        old_length = ln.length
        ln.length *= new_height / mean_height  # rescale every branch length in the tree by the ratio of desired to existing height (everybody's heights should be the same... but they never quite were when I was using Bio.Phylo, so, uh. yeah, uh. not sure what to do, but this is fine. It's checked below, anyway)
        if debug:
            print '  %5s  %7e  -->  %7e' % (ln.numName if ln.branchType == 'leaf' else ln.branchType, old_length, ln.length)
    tree.traverse_tree()
    treestr = tree.toString(numName=True)
    for leaf in get_btree(treestr).leaves:  # make sure string conversion (and rescaling) went ok
        if not utils.is_normed(leaf.height / new_height, this_eps=1e-8):
            raise Exception('tree not rescaled properly:   %.10f   %.10f    %e' % (leaf.height, new_height, (leaf.height - new_height) / new_height))
    return treestr

def read_insertion_content(self):
    self.insertion_content_probs = {}
    for bound in utils.boundaries:
        self.insertion_content_probs[bound] = {}
        with opener('r')(self.args.parameter_dir + '/' + bound + '_insertion_content.csv') as icfile:
            reader = csv.DictReader(icfile)
            total = 0
            for line in reader:
                self.insertion_content_probs[bound][line[bound + '_insertion_content']] = int(line['count'])
                total += int(line['count'])
            for nuke in utils.nukes:
                if nuke not in self.insertion_content_probs[bound]:
                    print ' %s not in insertion content probs, adding with zero' % nuke
                    self.insertion_content_probs[bound][nuke] = 0
                self.insertion_content_probs[bound][nuke] /= float(total)
        assert utils.is_normed(self.insertion_content_probs[bound])
def normalize(self):
    sum_value = 0.0
    for ib in range(1, self.n_bins + 1):  # don't include under/overflows in sum_value
        sum_value += self.bin_contents[ib]
    if sum_value == 0.0:
        print 'WARNING sum zero in Hist::normalize, returning without doing anything'
        return
    # make sure there's not too much stuff in the under/overflows
    if self.bin_contents[0] / sum_value > 1e-10 or self.bin_contents[self.n_bins + 1] / sum_value > 1e-10:
        print 'WARNING under/overflows'
    for ib in range(1, self.n_bins + 1):
        self.bin_contents[ib] /= sum_value
        if self.sum_weights_squared is not None:
            self.sum_weights_squared[ib] /= sum_value * sum_value
    check_sum = 0.0
    for ib in range(1, self.n_bins + 1):  # check it
        check_sum += self.bin_contents[ib]
    assert is_normed(check_sum, this_eps=1e-10)
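# The is_normed() checks used throughout are presumably just a tolerance comparison
# to 1; a minimal sketch consistent with the call sites here (the dict handling and
# the default eps value are assumptions, not the actual utils implementation):
def is_normed(prob, this_eps=1e-8):
    if hasattr(prob, 'values'):  # some call sites pass a dict of probs, e.g. read_allele_prevalence_freqs()
        prob = sum(prob.values())
    return abs(prob - 1.0) < this_eps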
def get_rescaled_trees(self, treestr, branch_length_ratios):
    """ Trees are generated with the mean branch length observed in data over the whole sequence, because we want to use
    topologically the same tree for the whole sequence. But we observe different branch lengths for each region, so we
    need to rescale the tree for v, d, and j.
    """
    rescaled_trees = {}
    for region in utils.regions:
        # rescale the tree
        rescaled_trees[region] = treegenerator.rescale_tree(treestr, branch_length_ratios[region])
        # print 'rescaled %s by %f: %s -> %s' % (region, branch_length_ratios[region], treestr, rescaled_trees[region])
        # and then check it NOTE can remove this eventually
        initial_depths = {}
        for node, depth in treegenerator.get_leaf_node_depths(treestr).items():
            initial_depths[node] = depth
        for node, depth in treegenerator.get_leaf_node_depths(rescaled_trees[region]).items():
            depth_ratio = depth / initial_depths[node]
            assert utils.is_normed(depth_ratio / branch_length_ratios[region], this_eps=1e-6)
    return rescaled_trees
def write_mute_freqs(self, region, gene_name, seq, reco_event, reco_seq_fname, is_insertion=False):
    """ Read position-by-position mute freqs from disk for <gene_name>, renormalize, then write to a file for bppseqgen. """
    replacement_genes = None
    if is_insertion:
        replacement_genes = utils.find_replacement_genes(self.args.parameter_dir, min_counts=-1, all_from_region='v')
    else:
        n_occurences = utils.read_overall_gene_probs(self.args.parameter_dir, only_gene=gene_name, normalize=False)  # how many times did we observe this gene in data?
        if n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            # print ' only saw %s %d times, use info from other genes' % (utils.color_gene(gene_name), n_occurences)
            replacement_genes = utils.find_replacement_genes(self.args.parameter_dir, min_counts=self.args.min_observations_to_write, gene_name=gene_name, single_gene=False)

    mute_freqs, mute_counts = paramutils.read_mute_info(self.args.parameter_dir, this_gene=gene_name, approved_genes=replacement_genes)
    rates = []  # list with a relative mutation rate for each position in <seq>
    total = 0.0
    # assert len(mute_freqs) == len(seq)  # only equal length if no erosions NO oh right but mute_freqs only covers areas we could align to...
    for inuke in range(len(seq)):  # append a freq for each nuke
        position = inuke + dict(reco_event.erosions.items() + reco_event.effective_erosions.items())[region + '_5p']
        freq = 0.0
        if position in mute_freqs:
            freq = mute_freqs[position]
        else:
            freq = mute_freqs['overall_mean']
        rates.append(freq)
        total += freq

    # normalize to the number of sites (i.e. so an average site is given value 1.0)
    assert total != 0.0  # I am not hip enough to divide by zero
    for inuke in range(len(seq)):
        rates[inuke] *= float(len(seq)) / total
    total = 0.0

    # and... double check it, just for shits and giggles
    for inuke in range(len(seq)):
        total += rates[inuke]
    assert utils.is_normed(total / float(len(seq)))
    assert len(rates) == len(seq)  # you just can't be too careful. what if gremlins ate a few while python wasn't looking?

    # write the input file for bppseqgen, one base per line
    with opener('w')(reco_seq_fname) as reco_seq_file:
        reco_seq_file.write('state\trate\n')
        for inuke in range(len(seq)):
            reco_seq_file.write('%s\t%.15f\n' % (seq[inuke], rates[inuke]))
def write_mute_freqs(self, gene, seq, reco_event, reco_seq_fname):
    """ Read position-by-position mute freqs from disk for <gene>, renormalize, then write to a file for bppseqgen. """
    mute_freqs = self.get_mute_freqs(gene)

    rates = []  # list with a relative mutation rate for each position in <seq>
    total = 0.0
    # assert len(mute_freqs) == len(seq)  # only equal length if no erosions NO oh right but mute_freqs only covers areas we could align to...
    left_erosion_length = dict(reco_event.erosions.items() + reco_event.effective_erosions.items())[utils.get_region(gene) + '_5p']
    for inuke in range(len(seq)):  # append a freq for each nuke
        position = inuke + left_erosion_length
        freq = 0.0
        if position in mute_freqs:
            freq = mute_freqs[position]
        else:
            freq = mute_freqs['overall_mean']
        rates.append(freq)
        total += freq

    # normalize to the number of sites (i.e. so an average site is given value 1.0)
    assert total != 0.0  # I am not hip enough to divide by zero
    for inuke in range(len(seq)):
        rates[inuke] *= float(len(seq)) / total
    total = 0.0

    # and... double check it, just for shits and giggles
    for inuke in range(len(seq)):
        total += rates[inuke]
    assert utils.is_normed(total / float(len(seq)))
    assert len(rates) == len(seq)  # you just can't be too careful. what if gremlins ate a few while python wasn't looking?

    # write the input file for bppseqgen, one base per line
    # NOTE really not sure why this doesn't work with csv.DictWriter [seems to require an "extra" column], but it doesn't -- bppseqgen barfs (I think maybe it expects a different newline character? don't feel like working it out)
    with opener('w')(reco_seq_fname) as reco_seq_file:
        headstr = 'state'
        if not self.args.mutate_from_scratch:
            headstr += '\trate'
        reco_seq_file.write(headstr + '\n')
        for inuke in range(len(seq)):
            linestr = seq[inuke]
            if not self.args.mutate_from_scratch:
                linestr += '\t%f' % rates[inuke]
            reco_seq_file.write(linestr + '\n')
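# For reference, the bppseqgen input file that both write_mute_freqs() versions
# produce looks something like this (tab-separated; the 'state'/'rate' headers come
# from the write calls above, while the sequence and rate values are hypothetical;
# with --mutate-from-scratch the 'rate' column is dropped entirely):
#
#   state	rate
#   A	1.210000
#   C	0.830000
#   G	0.960000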
def read_insertion_content(self, insertion):
    self.insertion_content_probs[insertion] = {}
    if insertion in utils.boundaries:
        with opener("r")(self.indir + "/" + insertion + "_insertion_content.csv") as icfile:
            reader = csv.DictReader(icfile)
            total = 0
            for line in reader:
                self.insertion_content_probs[insertion][line[insertion + "_insertion_content"]] = int(line["count"])
                total += int(line["count"])
            for nuke in utils.nukes:
                if nuke not in self.insertion_content_probs[insertion]:
                    print " %s not in insertion content probs, adding with zero" % nuke
                    self.insertion_content_probs[insertion][nuke] = 0
                self.insertion_content_probs[insertion][nuke] /= float(total)
    else:  # just return uniform probs for fv and jf insertions
        self.insertion_content_probs[insertion] = {n: 0.25 for n in utils.nukes}

    assert utils.is_normed(self.insertion_content_probs[insertion])
    if self.args.debug:
        print " insertion content for", insertion, self.insertion_content_probs[insertion]
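# Minimal sketch of the <bound>_insertion_content.csv files these readers expect,
# e.g. for the vd boundary (the '<bound>_insertion_content' and 'count' headers come
# from the reader calls above; the counts are hypothetical). Each count is divided
# by the total, so the resulting probs are normalized by construction:
#
#   vd_insertion_content,count
#   A,1004
#   C,608
#   G,710
#   T,921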
def check_tree_lengths(self, treefname, ages):
    treestrs = []
    with opener('r')(treefname) as treefile:
        for line in treefile:
            treestrs.append(line.split(';')[0] + ';')  # ignore the info I added after the ';'
    if self.args.debug > 1:
        print ' checking branch lengths... '
    assert len(treestrs) == len(ages)
    total = 0.0
    for itree in range(len(ages)):
        if self.args.debug > 1:
            print ' asked for', ages[itree],
        for name, depth in get_leaf_node_depths(treestrs[itree]).items():
            if self.args.debug > 1:
                print '%s:%f' % (name, depth),
            assert utils.is_normed(depth / ages[itree], this_eps=1e-6)  # ratio of <age> (requested length) and <length> (length in the tree file) should be 1 within float precision
        total += ages[itree]
        if self.args.debug > 1:
            print ''
    if self.args.debug:
        print ' branch lengths ok (mean %f)' % (total / len(ages))
def normalize(self, overflow_warn=True):  # since when you normalize hists you have to make the arbitrary decision whether you're going to include the under/overflow bins (we don't include them here), in general we prefer to avoid having under/overflow entries
    """ NOTE does not multiply/divide by bin widths """
    sum_value = 0.0
    for ib in range(1, self.n_bins + 1):  # don't include under/overflows
        sum_value += self.bin_contents[ib]
    if sum_value == 0.0:
        print 'WARNING sum zero in Hist::normalize(), returning without doing anything'
        return
    # make sure there's not too much stuff in the under/overflows
    if overflow_warn and (self.bin_contents[0] / sum_value > 1e-10 or self.bin_contents[self.n_bins + 1] / sum_value > 1e-10):
        print 'WARNING under/overflows in Hist::normalize()'
    for ib in range(1, self.n_bins + 1):
        self.bin_contents[ib] /= sum_value
        if self.sum_weights_squared is not None:
            self.sum_weights_squared[ib] /= sum_value * sum_value
        if self.errors is not None:
            self.errors[ib] /= sum_value
    check_sum = 0.0
    for ib in range(1, self.n_bins + 1):  # check it
        check_sum += self.bin_contents[ib]
    assert is_normed(check_sum, this_eps=1e-10)
def read_insertion_content(self):
    if self.args.rearrange_from_scratch:
        return {b : {n : 1. / len(utils.nukes) for n in utils.nukes} for b in utils.boundaries}

    insertion_content_probs = {}
    for bound in utils.boundaries:
        insertion_content_probs[bound] = {}
        with opener('r')(self.parameter_dir + '/' + bound + '_insertion_content.csv') as icfile:
            reader = csv.DictReader(icfile)
            total = 0
            for line in reader:
                insertion_content_probs[bound][line[bound + '_insertion_content']] = int(line['count'])
                total += int(line['count'])
            for nuke in utils.nukes:
                if nuke not in insertion_content_probs[bound]:
                    print ' %s not in insertion content probs, adding with zero' % nuke
                    insertion_content_probs[bound][nuke] = 0
                insertion_content_probs[bound][nuke] /= float(total)
        assert utils.is_normed(insertion_content_probs[bound])
    return insertion_content_probs
def normalize(self, include_overflows=True, expect_empty=False, expect_overflows=False, overflow_eps_to_ignore=1e-15):
    sum_value = self.integral(include_overflows)
    imin, imax = self.get_bounds(include_overflows)
    if sum_value == 0.0:
        if not expect_empty:
            print 'WARNING sum zero in Hist::normalize()'
        return
    if not expect_overflows and not include_overflows and (self.bin_contents[0] / sum_value > overflow_eps_to_ignore or self.bin_contents[self.n_bins + 1] / sum_value > overflow_eps_to_ignore):
        print 'WARNING under/overflows in Hist::normalize()'
    for ib in range(imin, imax):
        self.bin_contents[ib] /= sum_value
        if self.sum_weights_squared is not None:
            self.sum_weights_squared[ib] /= sum_value * sum_value
        if self.errors is not None:
            self.errors[ib] /= sum_value
    check_sum = 0.0
    for ib in range(imin, imax):  # check it
        check_sum += self.bin_contents[ib]
    if not is_normed(check_sum, this_eps=1e-10):
        raise Exception('not normalized: %f' % check_sum)
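# Usage sketch for this version of normalize() (<mutehist> is a hypothetical Hist
# instance): a hist written to disk with its overflow entries included can be
# renormalized over just its real bins without triggering the under/overflow warning,
# and expect_empty makes an all-zero hist a quiet no-op instead of a warning.
# mutehist.normalize(include_overflows=False, expect_overflows=True)
# mutehist.normalize(expect_empty=True)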
def read_mute_freqs(self, mute_freq_dir):  # NOTE these are mute freqs, not branch lengths, but it's ok for now
    for mtype in ['all'] + utils.regions:
        infname = mute_freq_dir + '/' + mtype + '-mean-mute-freqs.csv'
        self.branch_lengths[mtype] = {}
        self.branch_lengths[mtype]['lengths'], self.branch_lengths[mtype]['probs'] = [], []
        mutehist = plotting.make_hist_from_bin_entry_file(infname, mtype + '-mute-freqs')
        self.branch_lengths[mtype]['mean'] = mutehist.GetMean()

        if mutehist.GetBinContent(0) > 0.0 or mutehist.GetBinContent(mutehist.GetNbinsX() + 1) > 0.0:
            print 'WARNING nonzero under/overflow bins read from %s' % infname

        check_sum = 0.0
        for ibin in range(1, mutehist.GetNbinsX() + 1):  # ignore under/overflow bins
            freq = mutehist.GetBinCenter(ibin)
            branch_length = float(freq)
            prob = mutehist.GetBinContent(ibin)
            self.branch_lengths[mtype]['lengths'].append(branch_length)
            self.branch_lengths[mtype]['probs'].append(prob)
            check_sum += self.branch_lengths[mtype]['probs'][-1]
        assert utils.is_normed(check_sum)

    if self.args.debug:
        print ' mean branch lengths'
        for mtype in ['all'] + utils.regions:
            print '   %4s %7.3f (ratio %7.3f)' % (mtype, self.branch_lengths[mtype]['mean'], self.branch_lengths[mtype]['mean'] / self.branch_lengths['all']['mean'])
def check_tree_lengths(self, treefname, ages):
    treestrs = []
    with opener('r')(treefname) as treefile:
        for line in treefile:
            treestrs.append(line.split(';')[0] + ';')  # ignore the info I added after the ';'
    if self.args.debug > 1:
        print ' checking branch lengths... '
    assert len(treestrs) == len(ages)
    total_length, total_leaves = 0.0, 0
    for itree in range(len(ages)):
        if self.args.debug > 1:
            print ' asked for', ages[itree],
        for name, depth in get_leaf_node_depths(treestrs[itree]).items():
            if self.args.debug > 1:
                print '%s:%.8f' % (name, depth),
            if not utils.is_normed(depth / ages[itree], this_eps=1e-4):  # ratio of <age> (requested length) and <length> (length in the tree file) should be 1 within float precision
                raise Exception('asked for branch length %.8f but got %.8f\n   %s' % (ages[itree], depth, treestrs[itree]))
        total_length += ages[itree]
        total_leaves += len(re.findall('t', treestrs[itree]))
        if self.args.debug > 1:
            print ''
    if self.args.debug:
        print ' mean branch length %.5f' % (total_length / len(ages))
        print ' mean n leaves %.2f' % (float(total_leaves) / len(ages))
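# Toy check of the treefile parsing in both check_tree_lengths() versions: everything
# after the first ';' on a line is extra info that gets stripped before the newick
# string is measured (the tree and the appended suffix here are hypothetical):
line = '(t1:0.05,t2:0.05):0.0;extra info appended at write time'
treestr = line.split(';')[0] + ';'
assert treestr == '(t1:0.05,t2:0.05):0.0;'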
n_new_alleles = None
for mtype in mtypes:
    if positions[mtype] is None:
        continue
    if n_new_alleles is None:
        n_new_alleles = len(positions[mtype])
    if len(positions[mtype]) != n_new_alleles:
        raise Exception('mismatched number of new alleles for %s' % ' vs '.join(mtypes))
if n_new_alleles is None:
    n_new_alleles = 0
for mtype in mtypes:
    if positions[mtype] is None:  # if it wasn't specified at all, i.e. we don't want to generate any new alleles
        positions[mtype] = [[] for _ in range(n_new_alleles)]
args.new_allele_info = [{'gene' : args.sim_v_genes[igene] if not args.gls_gen else None,
                         'snp-positions' : positions['snp'][igene],
                         'indel-positions' : positions['indel'][igene]}
                        for igene in range(n_new_alleles)]

if args.allele_prevalence_freqs is not None:  # easier to check the length after we've generated snpd genes (above)
    if not utils.is_normed(args.allele_prevalence_freqs):
        raise Exception('--allele-prevalence-freqs %s not normalized' % args.allele_prevalence_freqs)
if args.inf_glfo_dir is None:
    args.inf_glfo_dir = args.outdir + '/germlines/inference'
if args.simfname is None:
    args.simfname = args.outdir + '/simu.csv'

if args.seed is not None:
    random.seed(args.seed)
    numpy.random.seed(args.seed)

if args.n_tests is not None:
    multiple_tests(args)
else:
    run_tests(args)
def process(args):
    if args.action == 'run-viterbi':
        print ' note: replacing deprecated action name \'run-viterbi\' with current name \'annotate\' (you don\'t need to change anything unless you want this warning message to go away)'
        args.action = 'annotate'
    if args.action == 'view-alternative-naive-seqs':
        print ' note: replacing deprecated action name \'view-alternative-naive-seqs\' with current name \'view-alternative-annotations\' (you don\'t need to change anything unless you want this warning message to go away)'
        args.action = 'view-alternative-annotations'

    args.light_chain_fractions = utils.get_arg_list(args.light_chain_fractions, key_val_pairs=True, floatify=True)
    if args.light_chain_fractions is not None and not utils.is_normed(args.light_chain_fractions.values()):
        raise Exception('--light-chain-fractions %s don\'t add to 1: %f' % (args.light_chain_fractions, sum(args.light_chain_fractions.values())))
    if args.action == 'merge-paired-partitions':
        assert args.paired_loci
    if args.paired_loci:
        args.locus = None
        if [args.infname, args.paired_indir].count(None) == 0:
            raise Exception('can\'t specify both --infname and --paired-indir')
        if args.outfname is not None:
            raise Exception('can\'t set --outfname if --paired-loci is set (use --paired-outdir)')
        if args.plotdir == 'paired-outdir':
            args.plotdir = args.paired_outdir
        if args.plotdir is None and args.action == 'plot-partitions':
            args.plotdir = args.paired_outdir
    else:
        assert args.paired_indir is None
    if not args.paired_loci and (args.paired_indir is not None or args.paired_outdir is not None):
        raise Exception('--paired-loci must be set if either --paired-indir or --paired-outdir is set')
    if args.reverse_negative_strands and not args.paired_loci:
        raise Exception('--reverse-negative-strands has no effect unless --paired-loci is set (maybe need to run bin/split-loci.py separately?)')

    args.only_genes = utils.get_arg_list(args.only_genes)
    args.queries = utils.get_arg_list(args.queries)
    args.queries_to_include = utils.get_arg_list(args.queries_to_include)
    args.reco_ids = utils.get_arg_list(args.reco_ids)
    args.istartstop = utils.get_arg_list(args.istartstop, intify=True)
    if args.istartstop is not None:
        if args.istartstop[0] >= args.istartstop[1] or args.istartstop[0] < 0:
            raise Exception('invalid --istartstop specification: %d %d' % (args.istartstop[0], args.istartstop[1]))
    args.n_max_per_region = utils.get_arg_list(args.n_max_per_region, intify=True)
    if len(args.n_max_per_region) != 3:
        raise Exception('n-max-per-region should be of the form \'x:y:z\', but I got ' + str(args.n_max_per_region))
    args.write_additional_cluster_annotations = utils.get_arg_list(args.write_additional_cluster_annotations, intify=True)
    if args.write_additional_cluster_annotations is not None and len(args.write_additional_cluster_annotations) != 2:
        raise Exception('--write-additional-cluster-annotations must be specified as two numbers \'m:n\', but I got %s' % args.write_additional_cluster_annotations)
    args.extra_annotation_columns = utils.get_arg_list(args.extra_annotation_columns, choices=utils.extra_annotation_headers)
    args.cluster_indices = utils.get_arg_list(args.cluster_indices, intify_with_ranges=True)
    args.allowed_cdr3_lengths = utils.get_arg_list(args.allowed_cdr3_lengths, intify=True)

    args.region_end_exclusions = {r : [args.region_end_exclusion_length if ('%s_%s' % (r, e)) in utils.real_erosions else 0 for e in ['5p', '3p']] for r in utils.regions}
    args.region_end_exclusion_length = None  # there isn't really a big reason to set it to None, but this makes clear that I should only be using the dict version

    args.typical_genes_per_region_per_subject = utils.get_arg_list(args.typical_genes_per_region_per_subject, intify=True)
    if len(args.typical_genes_per_region_per_subject) != len(utils.regions):
        raise Exception('wrong length for --typical-genes-per-region-per-subject, has to be three')
    tmpfrac, ntmp = args.min_allele_prevalence_fraction, args.typical_genes_per_region_per_subject
    args.min_allele_prevalence_fractions = {r : tmpfrac * ntmp[utils.regions.index('v')] / ntmp[utils.regions.index(r)] for r in utils.regions}
    delattr(args, 'min_allele_prevalence_fraction')  # delete the non-plural version
    delattr(args, 'typical_genes_per_region_per_subject')  # and we don't need this any more either

    args.annotation_clustering_thresholds = utils.get_arg_list(args.annotation_clustering_thresholds, floatify=True)
    args.naive_hamming_bounds = utils.get_arg_list(args.naive_hamming_bounds, floatify=True)
    if args.small_clusters_to_ignore is not None:
        if '-' in args.small_clusters_to_ignore:
            lo, hi = [int(cluster_size) for cluster_size in args.small_clusters_to_ignore.split('-')]
            args.small_clusters_to_ignore = range(lo, hi + 1)
        else:
            args.small_clusters_to_ignore = utils.get_arg_list(args.small_clusters_to_ignore, intify=True)
    if args.seed_unique_id is not None:
        args.seed_unique_id = args.seed_unique_id.strip()  # protect against the space you may put in front of it if it's got an initial minus sign (better way is to use an equals sign)
        if args.queries is not None and args.seed_unique_id not in args.queries:
            raise Exception('seed uid %s not in --queries %s' % (args.seed_unique_id, ' '.join(args.queries)))
        if args.random_seed_seq:
            raise Exception('can\'t specify both --seed-unique-id and --random-seed-seq')
        if args.queries_to_include is None:  # make sure the seed is in --queries-to-include
            args.queries_to_include = [args.seed_unique_id]
        elif args.seed_unique_id not in args.queries_to_include:
            args.queries_to_include = [args.seed_unique_id] + args.queries_to_include  # may as well put it first, I guess (?)
    elif args.seed_seq is not None:
        args.seed_unique_id = 'seed-seq'

    if args.sw_debug is None:  # if not explicitly set, set equal to regular debug
        args.sw_debug = args.debug

    if args.only_genes is not None:
        for gene in args.only_genes:  # make sure they're all at least valid ig genes
            utils.split_gene(gene)

    if args.print_git_commit or args.action == 'version':
        utils.get_version_info(debug=True)
        if args.action == 'version':
            sys.exit(0)

    args.is_data = not args.is_simu  # whole code base uses is_data, this is better than changing all of that

    if args.collapse_duplicate_sequences and not args.is_data:
        print ' %s collapsing duplicates on simulation, which is often not a good idea since it makes keeping track of performance harder (e.g. purity/completeness of partitions is harder to calculate)' % utils.color('red', 'warning')

    if args.simultaneous_true_clonal_seqs:
        if args.is_data:
            raise Exception('can only pass true clonal families to multi-hmm together on simulation and with --is-simu set')
        if args.n_simultaneous_seqs is not None:
            raise Exception('can\'t specify both --n-simultaneous-seqs and --simultaneous-true-clonal-seqs')
        if args.all_seqs_simultaneous:
            raise Exception('can\'t specify both --all-seqs-simultaneous and --simultaneous-true-clonal-seqs')
        if args.action == 'partition':
            raise Exception('can\'t set --simultaneous-true-clonal-seqs when partitioning')
    if args.n_simultaneous_seqs is not None and args.all_seqs_simultaneous:
        raise Exception('doesn\'t make sense to set both --n-simultaneous-seqs and --all-seqs-simultaneous.')

    if args.no_indels:
        print 'forcing --gap-open-penalty to %d to prevent indels, since --no-indels was specified (you can also adjust this penalty directly)' % args.no_indel_gap_open_penalty
        args.gap_open_penalty = args.no_indel_gap_open_penalty

    if args.indel_frequency > 0.:
        if args.indel_frequency < 0. or args.indel_frequency > 1.:
            raise Exception('--indel-frequency must be in [0., 1.] (got %f)' % args.indel_frequency)
    args.n_indels_per_indeld_seq = utils.get_arg_list(args.n_indels_per_indeld_seq, intify=True)
    if args.indel_location not in [None, 'v', 'cdr3']:
        if int(args.indel_location) in range(500):
            args.indel_location = int(args.indel_location)
            if any(n > 1 for n in args.n_indels_per_indeld_seq):
                print ' note: removing entries from --n-indels-per-indeld-seq (%s), since --indel-location was set to a single position.' % [n for n in args.n_indels_per_indeld_seq if n > 1]
                args.n_indels_per_indeld_seq = [n for n in args.n_indels_per_indeld_seq if n <= 1]
        else:
            raise Exception('--indel-location \'%s\' neither one of None, \'v\' or \'cdr3\', nor an integer less than 500' % args.indel_location)

    if args.locus is not None and 'tr' in args.locus and args.mutation_multiplier is None:
        args.mutation_multiplier = 0.

    if args.workdir is None:  # set default here so we know whether it was set by hand or not
        args.workdir = get_workdir(args.batch_system)
    else:
        args.workdir = args.workdir.rstrip('/')
    if os.path.exists(args.workdir):
        raise Exception('workdir %s already exists' % args.workdir)

    if args.batch_system == 'sge' and args.batch_options is not None:
        if '-e' in args.batch_options or '-o' in args.batch_options:
            print '%s --batch-options contains \'-e\' or \'-o\', but we add these automatically since we need to be able to parse each job\'s stdout and stderr. You can control the directory under which they\'re written with --workdir (which is currently %s).' % (utils.color('red', 'warning'), args.workdir)

    if args.outfname is not None and not args.presto_output and not args.airr_output and not args.generate_trees:
        if utils.getsuffix(args.outfname) not in ['.csv', '.yaml']:
            raise Exception('unhandled --outfname suffix %s' % utils.getsuffix(args.outfname))
        if utils.getsuffix(args.outfname) != '.yaml':
            print ' %s --outfname uses deprecated file format %s. This will still mostly work ok, but the new default .yaml format doesn\'t have to do all the string conversions by hand (so is less buggy), and includes annotations, partitions, and germline info in the same file (so you don\'t get crashes or inconsistent results if you don\'t keep track of what germline info goes with what output file).' % (utils.color('yellow', 'note:'), utils.getsuffix(args.outfname))
        if args.action in ['view-annotations', 'view-partitions'] and utils.getsuffix(args.outfname) == '.yaml':
            raise Exception('have to use \'view-output\' action to view .yaml output files')

    if args.presto_output:
        if args.outfname is None:
            raise Exception('have to set --outfname if --presto-output is set')
        if args.action == 'annotate' and utils.getsuffix(args.outfname) != '.tsv':
            raise Exception('--outfname suffix has to be .tsv for annotation with --presto-output (got %s)' % utils.getsuffix(args.outfname))
        if args.action == 'partition' and utils.getsuffix(args.outfname) not in ['.fa', '.fasta']:
            raise Exception('--outfname suffix has to be .fa or .fasta for partitioning with --presto-output (got %s)' % utils.getsuffix(args.outfname))
        if args.aligned_germline_fname is None:
            assert args.locus is not None
            args.aligned_germline_fname = '%s/%s/imgt-alignments/%s.fa' % (args.default_initial_germline_dir, args.species, args.locus)
        if not os.path.exists(args.aligned_germline_fname):
            raise Exception('--aligned-germline-fname %s doesn\'t exist, but we need it in order to write presto output' % args.aligned_germline_fname)
    if args.airr_output:
        if args.outfname is None:
            raise Exception('have to set --outfname if --airr-output is set')
        if utils.getsuffix(args.outfname) == '.tsv':
            print ' note: writing only airr .tsv to %s' % args.outfname
        elif utils.getsuffix(args.outfname) in ['.yaml', '.csv']:
            print ' note: writing both partis %s to %s and airr .tsv to %s' % (utils.getsuffix(args.outfname), args.outfname, utils.replace_suffix(args.outfname, '.tsv'))
        else:
            raise Exception('--outfname suffix has to be either .tsv or .yaml if --airr-output is set (got %s)' % utils.getsuffix(args.outfname))
    if args.airr_input:
        args.seq_column = 'sequence'
        args.name_column = 'sequence_id'

    if args.cluster_annotation_fname is None and args.outfname is not None and utils.getsuffix(args.outfname) == '.csv':  # if it wasn't set on the command line (<outfname> _was_ set), _and_ if we were asked for a csv, then use the old file name format
        args.cluster_annotation_fname = utils.insert_before_suffix('-cluster-annotations', args.outfname)

    if args.calculate_alternative_annotations and args.outfname is None and args.paired_outdir is None:
        raise Exception('have to specify --outfname in order to calculate alternative annotations')
    if args.subcluster_annotation_size == 'None':  # i want it turned on by default, but also to be able to turn it off on the command line
        args.subcluster_annotation_size = None
    else:
        args.subcluster_annotation_size = int(args.subcluster_annotation_size)  # can't set it in add_argument(), sigh
    if args.subcluster_annotation_size is not None:
        if args.calculate_alternative_annotations or args.write_additional_cluster_annotations is not None:
            raise Exception('can\'t set either --calculate-alternative-annotations or --write-additional-cluster-annotations if --subcluster-annotation-size is also set (you get duplicate annotations, which confuses and crashes things, plus it doesn\'t really make sense -- alternative annotations should be calculated on the subcluster annotations now)')
    if args.action == 'view-alternative-annotations' and args.persistent_cachefname is None:  # handle existing old-style output
        assert args.outfname is not None
        if os.path.exists(utils.getprefix(args.outfname) + '-hmm-cache.csv'):
            args.persistent_cachefname = utils.getprefix(args.outfname) + '-hmm-cache.csv'  # written by bcrham, so has to be csv, not yaml

    if args.min_largest_cluster_size is not None and args.n_final_clusters is not None:
        print ' note: both --min-largest-cluster-size and --n-final-clusters are set, which means we\'ll stop clustering when *either* of their criteria are satisfied (not both)'  # maybe it should be both, but whatever

    if not args.paired_loci and (args.action == 'get-selection-metrics' or args.get_selection_metrics):
        if args.outfname is None and args.selection_metric_fname is None:
            print ' %s calculating selection metrics, but neither --outfname nor --selection-metric-fname were set, which means nothing will be written to disk' % utils.color('yellow', 'warning')
        elif args.selection_metric_fname is None and args.action == 'get-selection-metrics' and not args.add_selection_metrics_to_outfname:
            args.selection_metric_fname = utils.insert_before_suffix('-selection-metrics', args.outfname)
    if args.plot_annotation_performance:
        if args.plotdir is None and args.print_n_worst_annotations is None:
            raise Exception('doesn\'t make sense to set --plot-annotation-performance but not either of --plotdir or --print-n-worst-annotations (we\'ll spend all the cycles counting things up but then they\'ll just disappear from memory without being recorded).')
        if not args.is_simu:
            raise Exception('can\'t plot performance unless --is-simu is set (and this is simulation)')
    if args.print_n_worst_annotations is not None and not args.plot_annotation_performance:
        raise Exception('--plot-annotation-performance must be set if you\'re setting --print-worst-annotations')
    if not args.paired_loci and (args.action == 'plot-partitions' or args.action == 'annotate' and args.plot_partitions) and args.plotdir is None:
        raise Exception('--plotdir must be specified if plotting partitions')
    if args.action == 'annotate' and args.plot_partitions and args.input_partition_fname is None:  # could set this up to use e.g. --simultaneous-true-clonal-seqs as well, but it can't atm
        print ' %s running annotate with --plot-partitions, but --input-partition-fname is not set, which likely means the partitions will be trivial/singleton partitions' % utils.color('yellow', 'warning')

    if args.make_per_gene_per_base_plots and not args.make_per_gene_plots:  # the former doesn't do anything unless the latter is turned on
        args.make_per_gene_plots = True

    if args.action == 'simulate':
        if args.n_trees is None and not args.paired_loci:
            args.n_trees = max(1, int(float(args.n_sim_events) / args.n_procs))
        if args.n_procs > args.n_sim_events:
            print ' note: reducing --n-procs to %d (was %d) so it isn\'t bigger than --n-sim-events' % (args.n_sim_events, args.n_procs)
            args.n_procs = args.n_sim_events
        if args.n_max_queries != -1:
            print ' note: --n-max-queries is not used when simulating (use --n-sim-events to set the simulated number of rearrangement events)'

        if args.outfname is None and args.paired_outdir is None:
            print ' note: no %s specified, so nothing will be written to disk' % ('--paired-outdir' if args.paired_loci else '--outfname')
            args.outfname = get_dummy_outfname(args.workdir)  # hackey, but otherwise I have to rewrite the whole run_simulation() in bin/partis to handle None type outfname

        if args.simulate_from_scratch:
            args.rearrange_from_scratch = True
            args.mutate_from_scratch = True
        if args.rearrange_from_scratch and not args.force_dont_generate_germline_set:  # i would probably just default to always generating germline sets when rearranging from scratch, but bin/test-germline-inference.py (and any other case where you want to dramatically restrict the germline set) really argues for a way to force just using the genes in the germline dir
            args.generate_germline_set = True
        if args.flat_mute_freq or args.same_mute_freq_for_all_seqs:
            assert args.mutate_from_scratch
        if args.mutate_from_scratch and not args.no_per_base_mutation:
            print ' note: setting --no-per-base-mutation since --mutate-from-scratch was set'
            args.no_per_base_mutation = True

        # end result of this block: shm/reco parameter dirs are set (unless we're doing their bit from scratch), --parameter-dir is set to None (and if --parameter-dir was set but shm/reco were _not_ set, we've just used --parameter-dir for either/both as needed)
        if args.parameter_dir is not None:
            if args.rearrange_from_scratch or args.mutate_from_scratch:
                raise Exception('can\'t set --parameter-dir if rearranging or mutating from scratch (use --reco-parameter-dir and/or --shm-parameter-dir)')
            if args.reco_parameter_dir is not None or args.shm_parameter_dir is not None:
                raise Exception('can\'t set --parameter-dir if either --reco-parameter-dir or --shm-parameter-dir are also set')
            args.reco_parameter_dir = args.parameter_dir
            args.shm_parameter_dir = args.parameter_dir
            args.parameter_dir = None
        if args.rearrange_from_scratch and args.reco_parameter_dir is not None:
            raise Exception('doesn\'t make sense to set both --rearrange-from-scratch and --reco-parameter-dir')
        if args.mutate_from_scratch and args.shm_parameter_dir is not None:
            raise Exception('doesn\'t make sense to set both --mutate-from-scratch and --shm-parameter-dir')
        if args.reco_parameter_dir is None and not args.rearrange_from_scratch:
            raise Exception('have to either set --rearrange-from-scratch or --reco-parameter-dir (or --simulate-from-scratch)')
        if args.shm_parameter_dir is None and not args.mutate_from_scratch:
            raise Exception('have to either set --mutate-from-scratch or --shm-parameter-dir (or --simulate-from-scratch)')

        if args.generate_germline_set and not args.rearrange_from_scratch:
            raise Exception('can only --generate-germline-set if also rearranging from scratch (set --rearrange-from-scratch)')

        if args.generate_germline_set:
            args.snp_positions = None  # if you want to control the exact positions, you have to use bin/test-germline-inference.py
            args.indel_positions = None
            process_gls_gen_args(args)

        if args.generate_trees:
            assert args.n_procs == 1  # not set up to handle output, and also no need

        if args.treefname is not None:
            raise Exception('--treefname was set for simulation action (probably meant to use --input-simulation-treefname)')

    if args.parameter_dir is not None and not args.paired_loci:  # if we're splitting loci, this isn't the normal parameter dir, it's a parent of that
        args.parameter_dir = args.parameter_dir.rstrip('/')
        if os.path.exists(args.parameter_dir):
            pdirs = [d for d in os.listdir(args.parameter_dir) if os.path.isdir(d)]
            if len(pdirs) > 0 and len(set(pdirs) & set(utils.parameter_type_choices)) == 0:
                raise Exception('couldn\'t find any expected parameter types (i.e. subdirs) in --parameter-dir \'%s\'. Allowed types: %s, found: %s. Maybe you added the parameter type to the parameter dir path?' % (args.parameter_dir, ' '.join(utils.parameter_type_choices), ' '.join(os.listdir(args.parameter_dir))))

    if os.path.exists(args.default_initial_germline_dir + '/' + args.species):  # ick that is hackey
        args.default_initial_germline_dir += '/' + args.species
    if args.species != 'human' and not args.allele_cluster:
        print ' non-human species \'%s\', turning on allele clustering' % args.species
        args.allele_cluster = True

    if args.n_max_snps is not None and args.n_max_mutations_per_segment is not None:
        if args.n_max_snps > args.n_max_mutations_per_segment - 10:
            raise Exception('--n-max-snps should be at least ten less than --n-max-mutations-per-segment, but I got %d and %d' % (args.n_max_snps, args.n_max_mutations_per_segment))

    if args.leave_default_germline:
        args.dont_remove_unlikely_alleles = True
        args.allele_cluster = False
        args.dont_find_new_alleles = True

    if args.action not in actions_not_requiring_input and [args.infname, args.paired_indir].count(None) == 2:
        if args.paired_loci:
            raise Exception('--infname or --paired-indir is required for action \'%s\' with --paired-loci' % args.action)
        else:
            raise Exception('--infname is required for action \'%s\'' % args.action)

    if args.action == 'get-linearham-info':
        if args.linearham_info_fname is None:  # for some reason setting required=True isn't working
            raise Exception('have to specify --linearham-info-fname')
        if args.sw_cachefname is None and args.parameter_dir is None:
            raise Exception('have to specify --sw-cachefname or --parameter-dir, since we need sw info to calculate linearham inputs')
        if args.extra_annotation_columns is None or 'linearham-info' not in args.extra_annotation_columns:
            args.extra_annotation_columns = utils.add_lists(args.extra_annotation_columns, ['linearham-info'])

    if args.ete_path is not None and args.ete_path == 'None':  # it's nice to be able to unset this from the command line (so we don't make the slow tree plots)
        args.ete_path = None
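# Toy illustration of the --light-chain-fractions check near the top of process()
# (the key/value parsing itself is done by utils.get_arg_list and isn't shown; the
# parsed dict below is hypothetical):
fractions = {'igk' : 0.67, 'igl' : 0.33}  # hypothetical parsed value
assert abs(sum(fractions.values()) - 1.0) < 1e-8  # this is what the utils.is_normed() call enforces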
def read_insertion_info(self, approved_genes):
    iprobs, icontentprobs = {}, {}
    genes_used = set()
    for insertion in self.insertions:
        iprobs[insertion] = {}
        if approved_genes[0] == glutils.dummy_d_genes[self.args.locus]:
            iprobs[insertion][0] = 1.  # always insert zero bases
            icontentprobs[insertion] = {n : 0.25 for n in utils.nukes}
            continue
        deps = utils.column_dependencies[insertion + '_insertion']
        with open(self.indir + '/' + utils.get_parameter_fname(column=insertion + '_insertion', deps=deps), 'r') as infile:
            reader = csv.DictReader(infile)
            for line in reader:
                # first see if we want to use this line (if <region>_gene isn't in the line, this erosion doesn't depend on gene version)
                if self.region + '_gene' in line and line[self.region + '_gene'] not in approved_genes:  # NOTE you'll need to change this if you want it to depend on another region's genes
                    continue
                # then add in this insertion's counts
                n_inserted = int(line[insertion + '_insertion'])
                if n_inserted not in iprobs[insertion]:
                    iprobs[insertion][n_inserted] = 0.0
                iprobs[insertion][n_inserted] += float(line['count'])
                if self.region + '_gene' in line:
                    genes_used.add(line[self.region + '_gene'])

        if len(iprobs[insertion]) == 0:
            raise Exception('didn\'t read any %s insertion probs from %s' % (insertion, self.indir + '/' + utils.get_parameter_fname(column=insertion + '_insertion', deps=deps)))

        # print ' interpolate insertions'
        interpolate_bins(iprobs[insertion], self.n_max_to_interpolate, bin_eps=self.eps)  #, max_bin=len(self.germline_seq))  # NOTE that we normalize *after* this

        if 0 not in iprobs[insertion] or len(iprobs[insertion]) < 2:  # all hell breaks loose lower down if we haven't got shit in the way of information
            if self.debug:
                print ' WARNING adding pseudocount to 1-bin in insertion probs'
            iprobs[insertion][0] = 1
            iprobs[insertion][1] = 1
            if self.debug:
                print ' ', iprobs[insertion]

        assert 0 in iprobs[insertion] and len(iprobs[insertion]) >= 2  # all hell breaks loose lower down if we haven't got shit in the way of information

        # and finally, normalize
        total = 0.0
        for _, val in iprobs[insertion].iteritems():
            total += val
        test_total = 0.0
        for n_inserted in iprobs[insertion]:
            iprobs[insertion][n_inserted] /= total
            test_total += iprobs[insertion][n_inserted]
        assert utils.is_normed(test_total)

        if 0 not in iprobs[insertion] or iprobs[insertion][0] == 1.0:
            print 'ERROR cannot have all or none of the probability mass in the zero bin:', iprobs[insertion]
            assert False

        icontentprobs[insertion] = self.read_insertion_content(insertion)  # also read the base content of the insertions

    if len(genes_used) > 1:  # if length is 1, we will have just used the actual gene
        if self.debug:
            print ' insertions used:', ' '.join(genes_used)

    return iprobs, icontentprobs
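# Shape of what read_insertion_info() returns, with hypothetical numbers for a 'vd'
# insertion: iprobs maps insertion length -> prob (normalized over lengths), and
# icontentprobs maps nucleotide -> prob (normalized over A/C/G/T):
iprobs = {'vd' : {0 : 0.35, 1 : 0.25, 2 : 0.4}}                          # hypothetical
icontentprobs = {'vd' : {'A' : 0.3, 'C' : 0.2, 'G' : 0.25, 'T' : 0.25}}  # hypothetical
assert abs(sum(iprobs['vd'].values()) - 1.0) < 1e-8
assert abs(sum(icontentprobs['vd'].values()) - 1.0) < 1e-8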