def add_shm_indels(self, reco_event):
    """Add simulated SHM indels to the sequences in <reco_event.final_seqs> (modified in place).

    <reco_event.indelfos> is first reset to one empty indel record per final sequence. Then each sequence is,
    with probability <self.args.indel_frequency>, given a number of indels drawn from numpy.random.geometric()
    with p = 1 / <self.args.mean_indels_per_indeld_seq>, applied one at a time by indelutils.add_single_indel().
    Returns nothing.
    """
    # NOTE that it will eventually make sense to add shared indel mutation according to the chosen tree -- i.e., probably, with some probability apply an indel instead of a point mutation
    args = self.args
    if args.debug and args.indel_frequency > 0.:
        print(' indels')

    reco_event.indelfos = [indelutils.get_empty_indel() for _ in reco_event.final_seqs]

    if args.indel_frequency == 0.:  # no indels at all, so there's nothing to do for any of the sequences
        return

    for iseq in range(len(reco_event.final_seqs)):
        if numpy.random.uniform(0, 1) > args.indel_frequency:  # no indels for this sequence
            if args.debug:
                print(' 0')
            continue

        indelfo = reco_event.indelfos[iseq]
        indelfo['reversed_seq'] = reco_event.final_seqs[iseq]  # set the original sequence (i.e. with all the indels reversed)

        n_indels = numpy.random.geometric(1. / args.mean_indels_per_indeld_seq)
        if args.debug:
            print(' %d' % n_indels)

        codon_positions = reco_event.final_codon_positions[iseq]
        for _ in range(n_indels):
            # NOTE modifies <indelfo> and <codon_positions>
            reco_event.final_seqs[iseq] = indelutils.add_single_indel(
                reco_event.final_seqs[iseq],
                indelfo,
                args.mean_indel_length,
                codon_positions,
                indel_location=args.indel_location,
                debug=args.debug)
def add_shm_indels(self, reco_event):
    """Add simulated SHM indels to the sequences in <reco_event.final_seqs> (modified in place).

    <reco_event.indelfos> is first reset to one empty indel record per final sequence. Then each sequence is,
    with probability <self.args.indel_frequency>, passed through indelutils.add_indels() with a number of indels
    chosen by numpy.random.choice() from <self.args.n_indels_per_indeld_seq>. The returned sequence replaces the
    final sequence, and the returned indel record (with a 'genes' entry added) replaces the empty one.
    Returns nothing.
    """
    # NOTE that it will eventually make sense to add shared indel mutation according to the chosen tree -- i.e., probably, with some probability apply an indel instead of a point mutation
    args = self.args
    if args.debug and args.indel_frequency > 0.:
        print(' indels')

    reco_event.indelfos = [indelutils.get_empty_indel() for _ in reco_event.final_seqs]

    if args.indel_frequency == 0.:  # no indels at all, so there's nothing to do for any of the sequences
        return

    for iseq, unindeld_seq in enumerate(reco_event.final_seqs):
        if numpy.random.uniform(0, 1) > args.indel_frequency:  # no indels for this sequence
            if args.debug:
                print(' 0')
            continue

        n_indels = numpy.random.choice(args.n_indels_per_indeld_seq)
        # NOTE the original comment here says this call modifies <indelfo> and <codon_positions>
        input_seq, indelfo = indelutils.add_indels(
            n_indels,
            unindeld_seq,
            reco_event.recombined_seq,
            args.mean_indel_length,
            reco_event.final_codon_positions[iseq],
            indel_location=args.indel_location,
            dbg_pad=8,
            debug=args.debug)

        reco_event.final_seqs[iseq] = input_seq
        indelfo['genes'] = dict((region, reco_event.genes[region]) for region in utils.regions)
        reco_event.indelfos[iseq] = indelfo
def try_scratch_erode_insert(self, tmpline, debug=False):
    """Draw a fresh set of erosions and insertions for <tmpline> (modified in place) and rebuild its implicit info.

    Steps:
      - strip all implicit info from <tmpline>
      - for each erosion in utils.real_erosions, set <erosion>_del to a geometrically-distributed length, capped so
        that we neither erode (roughly) more than half the gene nor run into a conserved codon
      - for each boundary in utils.boundaries, set <bound>_insertion to a random string of nucleotides drawn with
        the probabilities in self.insertion_content_probs
      - fill in 'seqs', 'unique_ids', 'input_seqs', and 'indelfos' by hand, then call utils.add_implicit_info()

    Arguments:
      tmpline: annotation dict; must already have the '<region>_gene' keys set
      debug: if set, print the chosen erosions and insertions

    Returns nothing.
    """
    utils.remove_all_implicit_info(tmpline)

    for erosion in utils.real_erosions:  # includes various contortions to avoid eroding the entire gene
        region = erosion[0]
        gene_length = len(self.glfo['seqs'][region][tmpline[region + '_gene']])
        if region == 'd' and not utils.has_d_gene(self.args.locus):  # dummy d genes: always erode the whole thing from the left
            assert gene_length == 1 and tmpline['d_gene'] == glutils.dummy_d_genes[self.args.locus]
            tmpline[erosion + '_del'] = 1 if '5p' in erosion else 0
        else:
            # heuristic; use explicit floor division so the erosion length stays an integer even under true division (it's used as a slice index below)
            max_erosion = max(0, gene_length // 2 - 2)
            if region in utils.conserved_codons[self.args.locus]:  # make sure not to erode a conserved codon
                codon_pos = utils.cdn_pos(self.glfo, region, tmpline[region + '_gene'])
                if '3p' in erosion:
                    n_bases_to_codon = gene_length - codon_pos - 3  # number of bases to the right of the (three-base) codon
                elif '5p' in erosion:
                    n_bases_to_codon = codon_pos  # number of bases to the left of the codon
                max_erosion = min(max_erosion, n_bases_to_codon)
            # numpy's geometric distribution starts at 1, so subtract 1 to allow zero-length erosions
            tmpline[erosion + '_del'] = min(max_erosion, numpy.random.geometric(1. / utils.scratch_mean_erosion_lengths[erosion]) - 1)

    for bound in utils.boundaries:
        mean_length = utils.scratch_mean_insertion_lengths[self.args.locus][bound]
        length = 0 if mean_length == 0 else numpy.random.geometric(1. / mean_length) - 1  # the zero check avoids dividing by zero
        probs = [self.insertion_content_probs[bound][n] for n in utils.nukes]
        tmpline[bound + '_insertion'] = ''.join(numpy.random.choice(utils.nukes, size=length, p=probs))

    if debug:
        print(' erosions: %s' % (' '.join([('%s %d' % (e, tmpline[e + '_del'])) for e in utils.real_erosions])))
        print(' insertions: %s' % (' '.join([('%s %s' % (b, tmpline[b + '_insertion'])) for b in utils.boundaries])))

    # have to add some things by hand so utils.add_implicit_info() doesn't barf (this duplicates code later on in recombinator)
    gl_seqs = {r: self.glfo['seqs'][r][tmpline[r + '_gene']] for r in utils.regions}
    for erosion in utils.real_erosions:
        region = erosion[0]
        e_length = tmpline[erosion + '_del']
        if '5p' in erosion:
            gl_seqs[region] = gl_seqs[region][e_length:]
        elif '3p' in erosion:
            gl_seqs[region] = gl_seqs[region][:len(gl_seqs[region]) - e_length]  # use the explicit length since [:-0] would give an empty string
    tmpline['seqs'] = [gl_seqs['v'] + tmpline['vd_insertion'] + gl_seqs['d'] + tmpline['dj_insertion'] + gl_seqs['j'], ]
    tmpline['unique_ids'] = [None]  # this is kind of hackey, but some things in the implicit info adder use it to get the number of sequences
    tmpline['input_seqs'] = copy.deepcopy(tmpline['seqs'])  # NOTE has to be updated _immediately_ so seqs and input_seqs don't get out of sync
    tmpline['indelfos'] = [indelutils.get_empty_indel(), ]
    utils.add_implicit_info(self.glfo, tmpline)
    assert len(tmpline['in_frames']) == 1
def add_mutants(self, reco_event, irandom):
    """Simulate the mutated leaf sequences for <reco_event> and append them to <reco_event.final_seqs>.

    Picks a random tree from <self.treeinfo>, rescales it separately for each region, runs the per-region commands
    prepared by self.prepare_bppseqgen(), glues the regional results back together into one sequence per leaf,
    then adds shm indels, sets the event's line, and checks the tree simulation.

    Arguments:
      reco_event: the recombination event to mutate (modified in place)
      irandom: passed as the seed to self.prepare_bppseqgen() and as the argument to reco_event.setline()

    Returns nothing.
    """
    multiplier = self.args.mutation_multiplier
    if multiplier is not None and multiplier == 0.:  # some of the stuff below fails if mut mult is actually 0.
        # NOTE(review): unlike the normal path below, this returns without appending to <final_codon_positions> or calling setline() -- confirm that callers handle that
        reco_event.final_seqs.append(reco_event.recombined_seq)  # set final sequence in reco_event
        reco_event.indelfos = [indelutils.get_empty_indel() for _ in reco_event.final_seqs]
        return

    # When generating trees, each tree's number of leaves and total depth are chosen from the specified distributions (a.t.m., by default n-leaves is from a geometric/zipf, and depth is from data)
    # This chosen depth corresponds to the sequence-wide mutation frequency.
    # In order to account for varying mutation rates in v, d, and j we simulate these regions separately, by appropriately rescaling the tree for each region.
    # i.e.: here we get the sequence-wide mute freq from the tree, and rescale it by the repertoire-wide ratios from data (which are stored in the tree file).
    # looks like e.g.: (t2:0.003751736951,t1:0.003751736951):0.001248262937;v:0.98,d:1.8,j:0.87, where the newick trees has branch lengths corresponding to the whole sequence (i.e. the weighted mean of v, d, and j)
    # NOTE a.t.m (and probably permanently) the mean branch lengths for each region are the same for all the trees in the file, I just don't have a better place to put them while I'm passing from TreeGenerator to here than at the end of each line in the file
    treefostr = self.treeinfo[random.randint(0, len(self.treeinfo) - 1)]  # per-region mutation info is tacked on after the tree... sigh. kind of hackey but works ok.
    assert treefostr.count(';') == 1
    tree_end = treefostr.find(';') + 1
    chosen_tree = treefostr[:tree_end]  # includes semi-colon
    mutefo = treefostr[tree_end:].split(',')  # one '<region>:<ratio>' string for each region
    mean_total_height = treegenerator.get_mean_height(chosen_tree)

    regional_heights = {}  # per-region height, including <self.args.mutation_multiplier>
    for region_ratio_str in mutefo:
        region, ratio_str = region_ratio_str.split(':')
        assert region in utils.regions
        ratio = float(ratio_str)
        if multiplier is not None:  # multiply the branch lengths by some factor
            ratio *= multiplier
        regional_heights[region] = mean_total_height * ratio

    scaled_trees = {}
    for region in utils.regions:
        scaled_trees[region] = treegenerator.rescale_tree(chosen_tree, regional_heights[region])

    if self.args.debug:
        print(' chose tree with total height %f' % treegenerator.get_mean_height(chosen_tree))
        height_strs = ['%s %.3f (expected %.3f)' % (region, treegenerator.get_mean_height(scaled_trees[region]), regional_heights[region])
                       for region in utils.regions]
        print(' regional trees rescaled to heights: %s' % ' '.join(height_strs))
        print(treegenerator.get_ascii_tree(chosen_tree, extra_str=' '))

    n_leaves = treegenerator.get_n_leaves(chosen_tree)

    # prepare one command for each region (the insertions get simulated together with d)
    cmdfos = []
    for region in utils.regions:
        simstr = reco_event.eroded_seqs[region]
        if region == 'd':
            simstr = reco_event.insertions['vd'] + simstr + reco_event.insertions['dj']
        cmdfos.append(self.prepare_bppseqgen(simstr, scaled_trees[region], n_leaves, reco_event.genes[region], reco_event, seed=irandom))

    utils.run_cmds([cfo for cfo in cmdfos if cfo is not None], sleep=False)  # shenanigan is to handle zero-length regional seqs

    mseqs = {}
    for region, cmdfo in zip(utils.regions, cmdfos):  # <cmdfos> has one entry for each region, in the same order, from the loop immediately above
        if cmdfo is None:
            mseqs[region] = [''] * n_leaves  # return an empty string for each leaf node
        else:
            mseqs[region] = self.read_bppseqgen_output(cmdfo, n_leaves)

    assert len(reco_event.final_seqs) == 0
    for iseq in range(n_leaves):
        seq = mseqs['v'][iseq] + mseqs['d'][iseq] + mseqs['j'][iseq]
        seq = reco_event.revert_conserved_codons(seq, debug=self.args.debug)  # if mutation screwed up the conserved codons, just switch 'em back to what they were to start with
        reco_event.final_seqs.append(seq)  # set final sequence in reco_event
        reco_event.final_codon_positions.append(copy.deepcopy(reco_event.post_erosion_codon_positions))  # separate codon positions for each sequence, because of shm indels

    self.add_shm_indels(reco_event)

    reco_event.setline(irandom)  # set the line here because we use it when checking tree simulation, and want to make sure the uids are always set at the same point in the workflow

    self.check_tree_simulation(mean_total_height, regional_heights, scaled_trees, mseqs, reco_event)

    if self.args.debug:
        utils.print_reco_event(reco_event.line, extra_str=' ')