def finalize():
    """Evolve sequences along every pruned tree with Pyvolve and store the results.

    For each ``(root, treestr)`` pair in ``GC.pruned_newick_trees`` a Pyvolve
    simulation is started from the root's sequence. The rate, info and
    sequence files are written to ``pyvolve_output/<label>_*`` and every
    sequence returned by the evolver is appended to
    ``GC.final_sequences[cn_label][sample_time]`` as a ``(leaf, seq)`` tuple.

    Raises:
        AssertionError: if Pyvolve fails an assertion while the tree,
            partition or evolver is being set up.
    """
    if GC.random_number_seed is not None:
        from warnings import warn
        warn("random_number_seed specified, but Pyvolve does not support seeding its random generator")
    makedirs("pyvolve_output", exist_ok=True)
    # The returned mapping is not used below; the call itself is kept in case
    # it has side effects -- TODO confirm and drop it if it does not.
    MF.modules['TreeNode'].label_to_node()
    for root, treestr in GC.pruned_newick_trees:
        # Imported at the point of first use so that Pyvolve is only required
        # when there is at least one tree to simulate (repeat imports are
        # served from the module cache).
        import pyvolve

        # run Pyvolve
        treestr = treestr.strip()
        label = root.get_label()
        rootseq = root.get_seq()
        if GC.VERBOSE:
            print('[%s] Pyvolve evolving sequences on tree: %s' % (datetime.now(), treestr), file=stderr)
            print('[%s] Pyvolve root sequence: %s' % (datetime.now(), rootseq), file=stderr)
        # A tree that does not open with a parenthesis is wrapped in one
        # (its last character is dropped and ');' appended). Only the first
        # character is inspected: comparing the whole string against '('
        # would wrap every tree.
        if not treestr.startswith('('):
            treestr = '(%s);' % treestr[:-1]
        try:
            tree = pyvolve.read_tree(tree=treestr)
            partition = pyvolve.Partition(models=GC.pyvolve_model, root_sequence=rootseq)
            evolver = pyvolve.Evolver(partitions=partition, tree=tree)
        except AssertionError as err:
            # Raised explicitly (not via ``assert False``) so the message
            # survives ``python -O``; the exception type is unchanged.
            raise AssertionError("Error setting up Pyvolve. Tree: %s" % treestr) from err

        # set each to None to not generate these files
        ratefile = "pyvolve_output/%s_ratefile.txt" % label
        infofile = "pyvolve_output/%s_infofile.txt" % label
        seqfile = "pyvolve_output/%s_seqfile.fasta" % label
        evolver(ratefile=ratefile, infofile=infofile, seqfile=seqfile)
        seqs = evolver.get_sequences()  # use anc=True to get internal sequences as well

        # store leaf sequences in GlobalContext
        # GC.final_sequences[cn_node][t] = list of (label, seq) tuples
        if not hasattr(GC, 'final_sequences'):
            GC.final_sequences = {}
        for leaf, seq in seqs.items():
            # Labels are expected to have exactly three '|'-separated fields.
            virus_label, cn_label, sample_time = leaf.split('|')
            sample_time = float(sample_time)
            by_time = GC.final_sequences.setdefault(cn_label, {})
            by_time.setdefault(sample_time, []).append((leaf, seq))
def run_u(self, tree_file, sequences_folder):
    """Simulate sequences along one gene tree after applying rate multipliers.

    Only the first line of ``tree_file`` is read. Every branch length is
    multiplied by the gene-family multiplier (looked up from the file name)
    and by the species-tree multiplier of the branch (looked up from the part
    of the node name before the first ``_``). The alignment, including
    ancestral sequences, is written into ``sequences_folder`` and its names
    are then corrected with ``self.correct_names``.

    Returns:
        None. Returns early, without writing anything, when the first line
        contains no ``(`` or is just ``;``.
    """
    with open(tree_file) as handle:
        newick = handle.readline().strip()

    # Nothing to simulate on a tree without any parenthesised clade.
    if newick == ";" or "(" not in newick:
        return None

    my_tree = ete3.Tree(newick, format=1)
    my_tree.get_tree_root().name = "Root"

    # First the multiplier per family, then the one per species-tree branch.
    family = tree_file.split("_")[-2].split("/")[-1]
    gf_multiplier = self.gf_multipliers[family]
    for node in my_tree.traverse():
        st_multiplier = self.st_multipliers[node.name.split("_")[0]]
        node.dist = node.dist * gf_multiplier * st_multiplier

    tree = pyvolve.read_tree(tree=my_tree.write(format=5),
                             scale_tree=self.parameters["SCALING"])
    name_mapping = self.get_mapping_internal_names(tree, my_tree)

    evolver = pyvolve.Evolver(
        tree=tree,
        partitions=pyvolve.Partition(models=self.model, size=self.size),
    )

    fasta_file = tree_file.split("/")[-1].replace("_completetree.nwk", "_") + "complete.fasta"
    fasta_path = os.path.join(sequences_folder, fasta_file)
    evolver(seqfile=fasta_path, ratefile=None, infofile=None, write_anc=True)

    # Correct the names
    self.correct_names(fasta_path, name_mapping)
def get_random_tree(filename, tree_string, L, kappa):
    """Simulate sequences on ``tree_string`` and summarise their diversity.

    A nucleotide model with the given ``kappa`` and equal state frequencies
    is evolved from an ancestor produced by ``generate_ancestor(L)``; the
    ancestor is printed. ``filename`` is not used by this function.

    Returns:
        dict with keys ``'pi'`` and ``'theta'`` holding the results of
        ``pi_value`` and ``theta_value`` on the simulated sequences.
    """
    phylogeny = pyvolve.read_tree(tree=tree_string)
    nuc_model = pyvolve.Model(
        'nucleotide',
        {'kappa': kappa, 'state_freqs': [0.25, 0.25, 0.25, 0.25]},
    )

    ancestor = generate_ancestor(L)
    print(ancestor)

    my_evolver = pyvolve.Evolver(
        partitions=pyvolve.Partition(models=nuc_model, root_sequence=ancestor),
        tree=phylogeny,
    )
    my_evolver()
    simulated_strains = my_evolver.get_sequences()

    result = {}
    result['pi'] = pi_value(simulated_strains)
    result['theta'] = theta_value(simulated_strains)
    return result
def test_OnSimulatedData(self):
    """Simulate with five diversifying-pressure sites and check ``phydms`` finds them.

    Five sites are sampled (seeded) and passed to ``pyvolvePartitions``
    together with the value 200.0. ``phydms --omegabysite`` is then run on
    the simulated alignment; every sampled site must have omega > 2 with
    P < 0.08, and at most one other site may have omega > 2 with P < 0.05.
    """
    random.seed(1)
    divpressuresites = random.sample(range(self.nsites), 5)

    partitions = phydmslib.simulate.pyvolvePartitions(
        self.model, (200.0, divpressuresites))
    pyvolve_tree = pyvolve.read_tree(file=self.tree)
    evolver = pyvolve.Evolver(partitions=partitions, tree=pyvolve_tree)

    simulateprefix = os.path.join(self.outdir, self.modelname)
    simulatedalignment = simulateprefix + '_simulatedalignment.fasta'
    evolver(
        seqfile=simulatedalignment,
        infofile=simulateprefix + '_temp_info.txt',
        ratefile=simulateprefix + '_temp_ratefile.txt',
    )

    phydms_cmd = ['phydms', simulatedalignment, self.tree, self.modelarg,
                  simulateprefix, '--omegabysite', '--brlen', 'scale']
    subprocess.check_call(phydms_cmd)

    omegas = pandas.read_csv(simulateprefix + '_omegabysite.txt',
                             sep='\t', comment='#')
    is_divpressure = omegas['site'].isin(divpressuresites)
    divpressureomegas = omegas[is_divpressure]

    self.assertTrue(len(divpressureomegas) == len(divpressuresites))
    self.assertTrue(
        (divpressureomegas['omega'].values > 2).all(),
        "Not all divpressure sites have omega > 2:\n{0}".format(divpressureomegas))
    self.assertTrue(
        (divpressureomegas['P'].values < 0.08).all(),
        "Not all divpressure sites have P < 0.08:\n{0}".format(divpressureomegas))

    spurious = omegas[(omegas['omega'] > 2) & (omegas['P'] < 0.05)
                      & (~omegas['site'].isin(divpressuresites))]
    nspurious = len(spurious)
    self.assertTrue(nspurious <= 1, "{0} spurious sites".format(nspurious))

    # Remove this file from the working directory if the run left it behind.
    leftover = "custom_matrix_frequencies.txt"
    if os.path.isfile(leftover):
        os.remove(leftover)
def evolve(newicks, sequence_size, scale_tree):
    """Simulate one nucleotide alignment per tree and concatenate them as PHYLIP.

    Each Newick string in ``newicks`` is evolved with a default nucleotide
    model over ``sequence_size`` positions. The intermediate FASTA file is
    converted with ``fasta_to_phyl`` and removed; the per-tree PHYLIP files
    are then appended, each followed by a newline, to ``temp_seq.phyl`` and
    removed as well.

    Returns:
        The name of the combined file, ``"temp_seq.phyl"``.
    """
    nucleotide_model = pyvolve.Model("nucleotide")
    partition = pyvolve.Partition(models=nucleotide_model, size=sequence_size)

    phy_files = []
    for index, newick in enumerate(newicks):
        fasta_seqfile = "temp" + str(index) + ".fasta"
        phylip_seqfile = "temp" + str(index) + ".phyl"
        phy_files.append(phylip_seqfile)

        tree = pyvolve.read_tree(tree=newick, scale_tree=scale_tree)
        simulator = pyvolve.Evolver(tree=tree, partitions=partition)
        simulator(seqfile=fasta_seqfile, seqfmt="fasta",
                  ratefile=None, infofile=None)

        fasta_to_phyl(fasta_seqfile, phylip_seqfile)
        os.remove(fasta_seqfile)

    phyl_output = "temp_seq.phyl"
    with open(phyl_output, 'w') as combined:
        for part_name in phy_files:
            with open(part_name) as part:
                combined.write(part.read())
            combined.write("\n")
            os.remove(part_name)
    return phyl_output
def simulate_genomes(model, tree, asize, outdir, number):
    """Evolve ``asize`` positions of ``model`` along ``tree`` and return the sequences.

    ``mkdir`` is called on ``<outdir>/<number>``, but no sequence or info
    file is written by the evolver.

    Returns:
        Whatever ``evolver.get_sequences()`` returns.
    """
    # The result of mkdir is not needed while no rate file is written there.
    mkdir(os.path.join(outdir, str(number)))

    evolver = pyvolve.Evolver(
        tree=tree,
        partitions=pyvolve.Partition(models=model, size=asize),
    )
    evolver(seqfile=None, infofile=None)
    return evolver.get_sequences()
def setUp(self):
    """Set up parameters for test.

    Seeds the random generators, builds a three-tip tree, draws random
    amino-acid preferences for ``self.nsites`` sites, simulates a codon
    alignment under an ``ExpCM`` built from those preferences, and finally
    creates ``self.model`` from ``self.MODEL`` / ``self.DISTRIBUTIONMODEL``.

    Raises:
        ValueError: if ``self.MODEL`` or ``self.DISTRIBUTIONMODEL`` is not
            one of the values handled below.
    """
    # Seed first: the statements below draw random values in a fixed order.
    random.seed(1)
    scipy.random.seed(1)
    self.underflowfreq = 1
    # define tree
    self.newick = ('((node1:0.2,node2:0.3)node4:0.3,node3:0.5)node5:0.04;')
    # The Newick string goes through a scratch file because the tree is read
    # from a file name here; the file is removed right after.
    tempfile = '_temp.tree'
    with open(tempfile, 'w') as f:
        f.write(self.newick)
    self.tree = Bio.Phylo.read(tempfile, 'newick')
    os.remove(tempfile)
    # amino-acid preferences
    self.nsites = 50
    prefs = []
    minpref = 0.02
    # NOTE(review): g is not used again in this method; removing the draw
    # would presumably shift the random values drawn below -- confirm before
    # deleting.
    g = scipy.random.dirichlet([5] * N_NT)
    for r in range(self.nsites):
        rprefs = scipy.random.dirichlet([0.5] * N_AA)
        # Floor each preference at minpref, then renormalise to sum to one.
        rprefs[rprefs < minpref] = minpref
        rprefs /= rprefs.sum()
        prefs.append(dict(zip(sorted(AA_TO_INDEX.keys()), rprefs)))
    # simulate alignment with pyvolve
    pyvolvetree = pyvolve.read_tree(tree=self.newick)
    self.nseqs = self.tree.count_terminals()
    expcm = phydmslib.models.ExpCM(prefs)
    partitions = phydmslib.simulate.pyvolvePartitions(expcm)
    alignment = '_temp_simulatedalignment.fasta'
    info = '_temp_info.txt'
    rates = '_temp_ratefile.txt'
    evolver = pyvolve.Evolver(partitions=partitions, tree=pyvolvetree)
    evolver(seqfile=alignment, infofile=info, ratefile=rates)
    # Keep the alignment in memory as (header, sequence) pairs and delete
    # the files the evolver wrote.
    self.alignment = [(s.description, str(s.seq)) for s in Bio.SeqIO.parse(alignment, 'fasta')]
    for f in [alignment, info, rates]:
        os.remove(f)
    # Sanity checks: three nucleotides per site, one sequence per tip.
    assert len(self.alignment[0][1]) == self.nsites * 3
    assert len(self.alignment) == self.nseqs
    # define model
    if self.MODEL == phydmslib.models.ExpCM:
        self.model = phydmslib.models.ExpCM(prefs)
    else:
        raise ValueError("Invalid MODEL: {0}".format(self.MODEL))
    if self.DISTRIBUTIONMODEL is None:
        pass
    elif (self.DISTRIBUTIONMODEL == phydmslib.models.GammaDistributedOmegaModel):
        # Wrap the base model in the distribution model with four categories.
        self.model = self.DISTRIBUTIONMODEL(self.model, ncats=4)
    else:
        raise ValueError("Invalid DISTRIBUTIONMODEL: {0}".format(self.DISTRIBUTIONMODEL))
def test_OnSimulatedData(self):
    """Run ``phydms`` on the simulated data.

    An alignment is simulated under ``self.model`` and ``phydms
    --diffprefsbysite`` is run once per ``--fitprefsmethod`` (``'1'`` and
    ``'2'``). For each run every ``(site, aa)`` in ``self.targetaas`` must
    have a positive ``dpi_<aa>``; afterwards the per-site totals of the
    methods are compared pairwise, a scatter plot is saved, and the Pearson
    correlation must exceed 0.98.
    """
    random.seed(1)
    numpy.random.seed(1)

    partitions = phydmslib.simulate.pyvolvePartitions(self.model)
    pyvolve_tree = pyvolve.read_tree(file=self.tree)
    evolver = pyvolve.Evolver(partitions=partitions, tree=pyvolve_tree)

    simulateprefix = os.path.join(self.outdir, self.modelname)
    simulatedalignment = simulateprefix + '_simulatedalignment.fasta'
    evolver(
        seqfile=simulatedalignment,
        infofile=simulateprefix + '_temp_info.txt',
        ratefile=simulateprefix + '_temp_ratefile.txt',
    )

    # Column names of the per-amino-acid differential preferences.
    aas = ['dpi_{0}'.format(INDEX_TO_AA[a]) for a in range(N_AA)]

    prefsbymethod = {}
    for fitprefsmethod in ['1', '2']:
        outprefix = simulateprefix + '_fitprefsmethod{0}'.format(fitprefsmethod)
        command = ['phydms', simulatedalignment, self.tree, self.modelarg,
                   outprefix, '--diffprefsbysite', '--brlen', 'scale',
                   '--ncpus', '-1', '--diffprefsprior',
                   'invquadratic,150,0.5']
        command = command + self.gammaomega_arg + ['--fitprefsmethod', fitprefsmethod]
        subprocess.check_call(command)

        diffprefs = pandas.read_csv(outprefix + '_diffprefsbysite.txt',
                                    sep='\t', comment='#')
        diffprefs['total'] = diffprefs[aas].abs().sum(axis=1)
        for site, aa in self.targetaas.items():
            siteentry = diffprefs[diffprefs['site'] == site]
            self.assertTrue(len(siteentry) == 1, str(len(siteentry)))
            self.assertTrue((siteentry['dpi_{0}'.format(aa)] > 0).all())
        prefsbymethod[fitprefsmethod] = diffprefs

    # Compare every unordered pair of methods exactly once.
    ordered = sorted(prefsbymethod.items())
    for i, (method1, prefs1) in enumerate(ordered):
        total1 = prefs1['total'].values
        for method2, prefs2 in ordered[i + 1:]:
            total2 = prefs2['total'].values
            correlation, _pvalue = scipy.stats.pearsonr(total1, total2)
            plt.scatter(total1, total2)
            plt.xlabel('fitprefsmethod{0}'.format(method1))
            plt.ylabel('fitprefsmethod{0}'.format(method2))
            plotfile = os.path.join(
                self.outdir, '{0}_vs_{1}.pdf'.format(method1, method2))
            plt.savefig(plotfile)
            self.assertTrue(
                correlation > 0.98,
                "Low correlation between "
                "fitprefsmethods: {0}\nSee {1}".format(correlation, plotfile))

    # Remove this file from the working directory if the run left it behind.
    leftover = "custom_matrix_frequencies.txt"
    if os.path.isfile(leftover):
        os.remove(leftover)
def get_random_tree(L, species, scaled_tree_string, kappa, iteration):
    """Simulate sequences on an already scaled tree and return their diversity.

    A nucleotide model with the given ``kappa`` and equal state frequencies
    is evolved from ``generate_ancestor(L)``. The alignment is written to
    ``simulated_alignment_<species[:-1]>_universal_<iteration + 1>.fasta``;
    no rate or info file is produced. Progress messages and the tree are
    printed along the way.

    Returns:
        tuple ``(pi, theta)`` from ``pi_value`` and ``theta_value`` applied
        to the simulated sequences.
    """
    phylogeny = pyvolve.read_tree(tree=scaled_tree_string)
    print('read in the tree')
    pyvolve.print_tree(phylogeny)

    model_parameters = {
        'kappa': kappa,
        'state_freqs': [0.25, 0.25, 0.25, 0.25],
    }
    nuc_model = pyvolve.Model('nucleotide', model_parameters)

    ancestor = generate_ancestor(L)
    print('generated an ancestor')

    my_evolver = pyvolve.Evolver(
        partitions=pyvolve.Partition(models=nuc_model, root_sequence=ancestor),
        tree=phylogeny,
    )
    alignment_name = ("simulated_alignment_" + str(species[:-1]) + "_universal_"
                      + str(iteration + 1) + ".fasta")
    my_evolver(ratefile=None, infofile=None, seqfile=alignment_name)
    print('evolved the sequences')

    simulated_strains = my_evolver.get_sequences()
    pi = pi_value(simulated_strains)
    theta = theta_value(simulated_strains)
    return pi, theta
def simulateAlignment(model, treeFile, alignmentPrefix, randomSeed=False):
    """
    Simulate an alignment given a model and tree (units = subs/site).

    Simulations done using `pyvolve`.

    Args:
        `model` (`phydmslib.models.Models` object)
            The model used for the simulations. Only
            models that can be passed to `pyvolve.Partitions`
            are supported.
        `treeFile` (str)
            Name of newick file used to simulate the sequences.
            The branch lengths should be in substitutions per site,
            which is the default units for all `phydms` outputs.
        `alignmentPrefix`
            Prefix for the files created by `pyvolve`.
        `randomSeed` (`False` or a seed value)
            If `False` (the default) the `random` module is left
            untouched. Any other value, including `0`, is passed
            to `random.seed`.

    The result of this function is a simulated FASTA alignment
    file with the name having the prefix giving by `alignmentPrefix`
    and the suffix `'_simulatedalignment.fasta'`.
    """
    # Identity test, not `== False`: `0 == False` is true in Python, so a
    # seed of 0 used to be silently ignored.
    if randomSeed is not False:
        random.seed(randomSeed)

    # Transform the branch lengths by dividing by the model `branchScale`
    tree = Bio.Phylo.read(treeFile, 'newick')
    for node in tree.get_terminals() + tree.get_nonterminals():
        if node.branch_length is None and node == tree.root:
            # A root without a branch length gets a tiny placeholder length.
            node.branch_length = 1e-06
        else:
            node.branch_length /= model.branchScale

    # Hand the rescaled tree to `pyvolve` through a scratch file, which is
    # removed even when writing or parsing fails.
    fd, temp_path = mkstemp()
    os.close(fd)
    try:
        Bio.Phylo.write(tree, temp_path, 'newick')
        pyvolve_tree = pyvolve.read_tree(file=temp_path)
    finally:
        os.remove(temp_path)

    # Make the `pyvolve` partition
    partitions = pyvolvePartitions(model)

    # Simulate the alignment
    alignment = '{0}_simulatedalignment.fasta'.format(alignmentPrefix)
    info = '_temp_{0}info.txt'.format(alignmentPrefix)
    rates = '_temp_{0}_ratefile.txt'.format(alignmentPrefix)
    evolver = pyvolve.Evolver(partitions=partitions, tree=pyvolve_tree)
    evolver(seqfile=alignment, infofile=info, ratefile=rates)

    # Only the alignment is kept; the side files are removed if present.
    for f in [rates, info, "custom_matrix_frequencies.txt"]:
        if os.path.isfile(f):
            os.remove(f)
    assert os.path.isfile(alignment)
def simulate(f, seqfile, tree, mu_dict, length):
    '''
        Simulate single partition according homogeneous mutation-selection model.

        ``tree`` is first tried as the name of a tree file; if that fails it
        is handed to pyvolve as a Newick string instead. ``f`` is passed as
        the model's ``state_freqs`` and ``mu_dict`` as its ``mu``. ``length``
        positions are evolved and written to ``seqfile``; no rate or info
        file is produced.
    '''
    try:
        my_tree = pyvolve.read_tree(file=tree)
    except Exception:
        # Not a bare ``except``: KeyboardInterrupt and SystemExit must still
        # propagate instead of triggering the string fallback.
        my_tree = pyvolve.read_tree(tree=tree)

    model = pyvolve.Model("MutSel", {'state_freqs': f, 'mu': mu_dict})
    part = pyvolve.Partition(size=length, models=model)
    e = pyvolve.Evolver(partitions=part, tree=my_tree)
    e(seqfile=seqfile, ratefile=None, infofile=None)
def get_accurate_c(L, kappa):
    """Simulate sequences with ancestors on a fixed six-tip tree and set up site tables.

    An ancestor from ``generate_ancestor(L)`` is evolved under a nucleotide
    model along a hard-coded tree; the ancestor, the tree and the sequence
    names (tips and internal nodes) are printed, and empty per-strain and
    per-site bookkeeping structures are created.

    NOTE(review): ``kappa`` is not used -- the model's kappa is the literal
    1.86836732388 -- and nothing is returned; the function looks unfinished.
    Confirm against the complete source.
    """
    ancestor = generate_ancestor(L)
    print(ancestor)

    newick = ('( ((t7:0.5,t8:0.5)i4:0.5,(t5:0.5,t6:0.5)i3:0.5)i1:0.5, '
              '(t3:0.5,t4:0.5)i2:0.5 ) root;')
    phylogeny = pyvolve.read_tree(tree=newick)
    pyvolve.print_tree(phylogeny)

    nuc_model = pyvolve.Model('nucleotide', {
        'kappa': 1.86836732388,
        'state_freqs': [0.25, 0.25, 0.25, 0.25],
    })
    my_evolver = pyvolve.Evolver(
        partitions=pyvolve.Partition(models=nuc_model, root_sequence=ancestor),
        tree=phylogeny,
    )
    my_evolver(write_anc=True)

    # anc=True also returns the internal-node sequences
    strains = my_evolver.get_sequences(anc=True)
    strain_names = list(strains.keys())
    n = len(strain_names)
    print(strain_names)

    # one (initially empty) list of sites per strain
    c_sites = {name: [] for name in strain_names}
    # per site: how many strains carry each nucleotide
    site_counts = [{'A': 0, 'T': 0, 'G': 0, 'C': 0} for _ in range(L)]
    # per site: the strains that have a convergent mutation there
    strains_with_site = [[] for _ in range(L)]
    c = 0
def execute(tree, model, length, out, numSim):
    """Simulate ``numSim`` alignments of ``length`` positions along a tree.

    Args:
        tree: name of the Newick tree file.
        model: model type name handed to ``pyvolve.Model``; it is also part
            of every output file name.
        length: number of positions (converted with ``int``).
        out: output prefix.
        numSim: number of alignments to write (converted with ``int``).

    Alignment ``i`` is written to ``<out>.<model>-<i>.fa`` and that name is
    printed before the simulation is run.
    """
    # read in model, tree, and define partition
    pyvolveModel = pyvolve.Model(model)
    pyvolveTree = pyvolve.read_tree(file=tree)
    pyvolvePartition = pyvolve.Partition(models=pyvolveModel, size=int(length))

    # create evolver
    my_evolver = pyvolve.Evolver(tree=pyvolveTree, partitions=pyvolvePartition)
    # NOTE(review): this first call runs one extra simulation with pyvolve's
    # default output arguments; kept so existing behaviour is unchanged.
    my_evolver()

    print("Simulating sequences...")
    # create simulated sequences
    for i in range(int(numSim)):
        seqfile = str(out) + "." + str(model) + "-" + str(i) + ".fa"
        # Report the file that is actually written; the message used to
        # leave out the model part of the name.
        print(seqfile)
        my_evolver(seqfile=seqfile)
def run(self, tree_file, sequences_folder):
    """Simulate sequences along the gene tree stored in ``tree_file``.

    Only the first line of the file is read. The alignment, including
    ancestral sequences, is written into ``sequences_folder`` under the tree
    file's base name with ``_completetree.nwk`` replaced by
    ``_complete.fasta``; its names are then corrected with
    ``self.correct_names``.

    Returns:
        None. Returns early, without writing anything, when the first line
        contains no ``(`` or is just ``;``.
    """
    with open(tree_file) as handle:
        newick = handle.readline().strip()

    # Nothing to simulate on a tree without any parenthesised clade.
    if newick == ";" or "(" not in newick:
        return None

    my_tree = ete3.Tree(newick, format=1)
    tree = pyvolve.read_tree(tree=my_tree.write(format=5),
                             scale_tree=self.parameters["SCALING"])
    name_mapping = self.get_mapping_internal_names(tree, my_tree)

    evolver = pyvolve.Evolver(
        tree=tree,
        partitions=pyvolve.Partition(models=self.model, size=self.size),
    )

    fasta_file = tree_file.split("/")[-1].replace("_completetree.nwk", "_complete") + ".fasta"
    fasta_path = os.path.join(sequences_folder, fasta_file)
    evolver(seqfile=fasta_path, ratefile=None, infofile=None, write_anc=True)

    # Correct the names
    self.correct_names(fasta_path, name_mapping)
def exampleFastaGenerator(nwkFile, fastaOutputLocation, seqLength, rate=1):
    """Simulate a nucleotide alignment along the tree in ``nwkFile``.

    Args:
        nwkFile: name of the Newick tree file.
        fastaOutputLocation: file the simulated sequences are written to.
        seqLength: number of positions to simulate.
        rate: value used for each of the six mutation rates
            (AC, AG, AT, CG, CT, GT).

    No rate or info file is written.
    """
    # Tree.
    # The tree name that used to be derived here via ``rindex('/')`` was
    # never used, and it raised ValueError for any path without a slash.
    phylogony = pyvolve.read_tree(file=nwkFile)

    # Rates.
    mutationRates = {
        "AC": rate,
        "AG": rate,
        "AT": rate,
        "CG": rate,
        "CT": rate,
        "GT": rate,
    }

    # Model.
    model = pyvolve.Model("nucleotide", {"mu": mutationRates})
    partition = pyvolve.Partition(models=model, size=seqLength)

    # Evolver.
    evolver = pyvolve.Evolver(partitions=[partition], tree=phylogony)
    evolver(seqfile=fastaOutputLocation, ratefile=None, infofile=None)
def evolve_nonconvergent_partition(g):
    """Simulate the non-convergent codon sites along ``g['background_tree']``.

    A custom model built from a shallow copy of ``g['background_Q']`` is
    evolved over ``num_simulated_site - num_convergent_site`` positions.
    Sequences, rates and info are written to the
    ``tmp.csubst.simulate_nonconvergent*`` files, without ancestral
    sequences.
    """
    num_simulated = g['num_simulated_site']
    num_convergent = g['num_convergent_site']

    # NOTE(review): for num_convergent > 0 the printed range covers
    # num_convergent codons, while the partition below has
    # num_simulated - num_convergent codons -- confirm the intended range.
    site_start = 1 if num_convergent == 0 else num_simulated - num_convergent + 1
    site_end = num_simulated
    print('Codon site {}-{}; Non-convergent codons'.format(site_start, site_end))

    num_nonconvergent_site = num_simulated - num_convergent
    q_matrix = copy.copy(g['background_Q'])
    # Output is silenced only while the custom model is being built.
    with suppress_stdout_stderr():
        model = pyvolve.Model(model_type='custom', name='root',
                              parameters={'matrix': q_matrix})

    evolver = pyvolve.Evolver(
        partitions=pyvolve.Partition(models=model, size=num_nonconvergent_site),
        tree=g['background_tree'],
    )
    evolver(
        ratefile='tmp.csubst.simulate_nonconvergent_ratefile.txt',
        infofile='tmp.csubst.simulate_nonconvergent_infofile.txt',
        seqfile='tmp.csubst.simulate_nonconvergent.fa',
        write_anc=False,
    )
def simulate_single_sequence(self, name, gene_length, tree_file, sequences_folder):
    """Simulate on a two-tip tree and keep only the sequence called ``name``.

    The tree is the template ``(A:1,B:1);`` with every ``A`` replaced by
    ``name``. ``gene_length`` positions are evolved with ``self.model`` and
    written, including ancestral sequences, to the FASTA file derived from
    ``tree_file``. That file is then rewritten so it holds only the entries
    whose header, minus its first character, equals ``name``.
    """
    newick = "(A:1,B:1);".replace("A", name)
    tree = pyvolve.read_tree(tree=newick)

    evolver = pyvolve.Evolver(
        tree=tree,
        partitions=pyvolve.Partition(models=self.model, size=gene_length),
    )

    fasta_file = tree_file.split("/")[-1].replace("_completetree.nwk", "_complete") + ".fasta"
    fasta_path = os.path.join(sequences_folder, fasta_file)
    evolver(seqfile=fasta_path, ratefile=None, infofile=None, write_anc=True)

    # Select single sequence
    entries = [(header, seq)
               for header, seq in af.fasta_reader(fasta_path)
               if header[1:] == name]
    af.fasta_writer(fasta_path, entries)
def generateTree(tns, ntaxa, seqlen):
    """Simulate a birth-death tree and evolve nucleotide sequences along it.

    Args:
        tns: dendropy taxon namespace used for the tree and the sequences.
        ntaxa: number of extant tips to simulate.
        seqlen: number of nucleotide positions to evolve.

    The simulated sequences are written to ``evolvedsequences.fasta`` in the
    current working directory.

    Returns:
        The simulated dendropy tree.
    """
    import os
    import tempfile

    # Construct the tree
    t = dendropy.simulate.treesim.birth_death_tree(birth_rate=1.0,
                                                   death_rate=0,
                                                   taxon_namespace=tns,
                                                   num_extant_tips=ntaxa)

    # Hand the tree to pyvolve through a private scratch file. A fixed
    # '/tmp/pyvt' is not portable, is shared between concurrent runs and was
    # never cleaned up.
    fd, tree_path = tempfile.mkstemp(prefix='pyvt', suffix='.nwk')
    os.close(fd)
    try:
        t.write(path=tree_path,
                schema='newick',
                suppress_rooting=True,
                suppress_internal_node_labels=True)
        # Read tree from dendropy
        pot = pyvolve.read_tree(file=tree_path)
    finally:
        os.remove(tree_path)

    # Set pyvolve data type
    m1 = pyvolve.Model("nucleotide")
    p1 = pyvolve.Partition(models=m1, size=seqlen)

    # Simulate evolution with no save file
    e1 = pyvolve.Evolver(tree=pot, partitions=p1)
    e1(seqfile=None)
    seqs = e1.get_sequences()

    ds = dendropy.DnaCharacterMatrix.from_dict(seqs, taxon_namespace=tns)
    ds.write(path="evolvedsequences.fasta", schema="fasta")
    return t
def simulate(tree_index, length):
    """Evolve ``length`` positions of an MG model along one of three stored trees.

    Inputs:
        tree_index: index into ["alpha", "beta", "charlie"]; the tree is read
            from ``trees/<name>.tre``.
        length: number of positions passed to the partition.

    Outputs:
        list of the simulated sequences (the values returned by
        ``get_sequences()``).
    """
    tree_names = ["alpha", "beta", "charlie"]
    my_tree = pyvolve.read_tree(file="trees/" + tree_names[tree_index] + ".tre")

    # Corresponds to dN/dS = 0.65 / 0.98
    my_model = pyvolve.Model("MG", {"beta": 0.65, "alpha": 0.98})

    # The size argument is the number of positions to evolve.
    my_partition = pyvolve.Partition(models=my_model, size=length)

    # Evolve without rate and info files.
    my_evolver = pyvolve.Evolver(partitions=my_partition, tree=my_tree,
                                 ratefile=None, infofile=None)
    my_evolver(ratefile=None, infofile=None)

    # Extract the sequences
    return list(my_evolver.get_sequences().values())
def setUp(self):
    """Set up for tests.

    Draws random preferences for a single site, builds ``self.model`` (from
    ``self.MODEL``) and ``self.realmodel`` (an ``ExpCM`` with shuffled
    preferences), simulates an alignment under ``self.realmodel`` on the NP
    tree, tallies the simulated codons and amino acids, and creates
    ``self.tl``.
    """
    # Seed first: the statements below draw random values in a fixed order.
    scipy.random.seed(1)
    random.seed(1)
    nsites = 1
    minpref = 0.001
    self.prefs = []
    self.realprefs = []
    for r in range(nsites):
        rprefs = scipy.random.dirichlet([0.5] * N_AA)
        # Floor each preference at minpref, then renormalise to sum to one.
        rprefs[rprefs < minpref] = minpref
        rprefs /= rprefs.sum()
        self.prefs.append(dict(zip(sorted(AA_TO_INDEX.keys()), rprefs)))
        # The "real" preferences hold the same values, shuffled in place
        # after self.prefs has taken its copy.
        scipy.random.shuffle(rprefs)
        self.realprefs.append(dict(zip(sorted(AA_TO_INDEX.keys()), rprefs)))
    self.kappa = 3.0
    self.omega = 3.0
    self.phi = scipy.random.dirichlet([5] * N_NT)
    self.model = self.MODEL(self.prefs, prior=None, kappa=self.kappa, omega=self.omega, phi=self.phi)
    self.realmodel = phydmslib.models.ExpCM(self.realprefs, kappa=self.kappa, omega=self.omega, mu=10.0, phi=self.phi)
    # The tree file is located relative to this test module.
    treefile = os.path.abspath(os.path.join(os.path.dirname(__file__), './NP_data/NP_tree.newick'))
    self.tree = Bio.Phylo.read(treefile, 'newick')
    self.tree.root_at_midpoint()
    # simulate alignment using realmodel
    evolver = pyvolve.Evolver(partitions=phydmslib.simulate.pyvolvePartitions(self.realmodel), tree=pyvolve.read_tree(file=treefile))
    alignmentfile = '_temp_fitprefs_simulatedalignment.fasta'
    info = '_temp_info.txt'
    rates = '_temp_ratefile.txt'
    evolver(seqfile=alignmentfile, infofile=info, ratefile=rates)
    self.alignment = phydmslib.file_io.ReadCodonAlignment(alignmentfile, True)
    # Sanity check: three nucleotides per simulated site.
    assert len(self.alignment[0][1]) == nsites * 3
    for f in [alignmentfile, info, rates]:
        os.remove(f)
    # Zero-initialised tallies, keyed by site and then by codon / amino acid.
    self.codoncounts = dict([(r, dict([(INDEX_TO_CODON[c], 0) for c in range(N_CODON)])) for r in range(nsites)])
    self.aacounts = dict([(r, dict([(a, 0) for a in range(N_AA)])) for r in range(nsites)])
    for (head, seq) in self.alignment:
        # NOTE(review): `r` here is the value left over from the
        # `for r in range(nsites)` loop above, and the whole sequence is used
        # as the codon key; both only work because nsites == 1.
        self.codoncounts[r][seq] += 1
        self.aacounts[r][CODON_TO_AA[CODON_TO_INDEX[seq]]] += 1
    self.tl = phydmslib.treelikelihood.TreeLikelihood(self.tree, self.alignment, self.model)
def main():
    """Main body of script.

    Reads the command-line arguments, the model parameters, the amino-acid
    preferences and the tree; builds one custom codon model (one partition of
    size 1) per site; simulates an alignment with pyvolve; and finally
    rewrites the alignment file so that each distinct sequence appears once.
    """
    # A single Genetics instance provides all the lookup tables needed.
    genetics = pyvolve.genetics.Genetics()
    codons = genetics.codons
    codon_dict = genetics.codon_dict
    purines = genetics.purines

    args = vars(ParseArguments().parse_args())
    print("Read the following command line arguments:")
    print("\n\t{0}".format("\n\t".join(
        ["{0} = {1}".format(key, value) for (key, value) in args.items()])))

    print("\nPerforming simulation with pyvolve version {0}".format(
        pyvolve.__version__))

    print("\nReading model params from {0}".format(args['modelparams']))
    params = ReadParams(args['modelparams'])
    for (param, paramvalue) in params.items():
        print("The value of {0} is {1}".format(param, paramvalue))

    print("\nReading preferences from {0}".format(args['prefs']))
    tup = dms_tools.file_io.ReadPreferences(args['prefs'])
    (sites, pis) = (tup[0], tup[2])
    print("\nRead amino-acid preferences for {0} sites".format(len(pis)))

    tree = pyvolve.read_tree(file=args['tree'])

    # create models for simulation
    partitions = []
    for r in sites:
        # Sites listed in one of the diversifying sets get that set's omega;
        # the chosen value is reported. A formatted single-argument print is
        # used so the line is valid on Python 3 and prints the same text
        # ("<site> <omega>") as the former Python 2 print statement.
        if params['diversifyingsitesA'] and (int(r) in params['diversifyingsitesA']):
            omega = params['diversifyingomegaA']
            print("{0} {1}".format(r, omega))
        elif params['diversifyingsitesB'] and (int(r) in params['diversifyingsitesB']):
            omega = params['diversifyingomegaB']
            print("{0} {1}".format(r, omega))
        else:
            omega = 1.0

        matrix = []  # matrix[x][y] is rate of substitution from x to y
        for (xi, x) in enumerate(codons):
            row = []
            for y in codons:
                ntdiffs = [(x[j], y[j]) for j in range(3) if x[j] != y[j]]
                if len(ntdiffs) == 0:
                    assert x == y
                    row.append(0)  # will later be adjusted to make row sum to zero
                elif len(ntdiffs) > 1:
                    # multi-nucleotide codon change
                    row.append(0)
                else:
                    # single nucleotide change
                    (xnt, ynt) = ntdiffs[0]
                    if (xnt in purines) == (ynt in purines):
                        # transition
                        qxy = params['kappa'] * params['phi{0}'.format(ynt)]
                    else:
                        # transversion
                        qxy = params['phi{0}'.format(ynt)]
                    (xaa, yaa) = (codon_dict[x], codon_dict[y])
                    if xaa == yaa:
                        fxy = 1.0
                    else:
                        pix = pis[r][xaa]**params['stringencyparameter']
                        piy = pis[r][yaa]**params['stringencyparameter']
                        if abs(pix - piy) < 1e-6:
                            # Nearly equal preferences: use omega directly so
                            # the expression below is never evaluated with a
                            # denominator close to zero.
                            fxy = omega
                        else:
                            fxy = omega * math.log(piy / pix) / (1.0 - pix / piy)
                    row.append(qxy * fxy * params['scalerate'])
            assert len(row) == len(codons)
            # Diagonal entry makes the row sum to zero.
            row[xi] = -sum(row)
            matrix.append(row)
        model = pyvolve.Model("custom", {"matrix": matrix})
        partitions.append(pyvolve.Partition(models=model, size=1))

    print("\nSimulating evolution, writing to {0}...".format(
        args['simulatedalignment']))
    basename = os.path.splitext(args['simulatedalignment'])[0]
    evolver = pyvolve.Evolver(partitions=partitions, tree=tree)
    evolver(
        seqfile=args['simulatedalignment'],
        infofile='{0}_infofile.txt'.format(basename),
        ratefile='{0}_ratefile.txt'.format(basename),
    )
    print("Finished simulation")

    # Keep only the first record of every distinct sequence.
    uniqueseqs = set()
    uniquealignment = []
    ninitial = 0
    for seq in Bio.SeqIO.parse(args['simulatedalignment'], 'fasta'):
        ninitial += 1
        seqstr = str(seq.seq)
        if seqstr not in uniqueseqs:
            uniqueseqs.add(seqstr)
            uniquealignment.append(seq)
    print(
        "\nAfter removing redundant sequences, we have shrunk {0} from {1} to {2} sequences"
        .format(args['simulatedalignment'], ninitial, len(uniquealignment)))
    Bio.SeqIO.write(uniquealignment, args['simulatedalignment'], 'fasta')
# This example script demonstrates how to evolve according to a nucleotide model with several partitions.
# In this example, the first partition has gamma-distributed sitewise rate heterogeneity, the second partition is homogeneous, and the third partition has custom sitewise rate heterogeneity.
# All models use default mutation-rate parameters

import pyvolve

# Define a phylogeny, from a file containing a newick tree
my_tree = pyvolve.read_tree(file="file_with_tree.tre")

# Define the three models first, then the partition each one belongs to.

# First model: gamma-distributed rate heterogeneity with four categories
model1 = pyvolve.Model("nucleotide", alpha=0.7, num_categories=4)
# Second model: homogeneous rates
model2 = pyvolve.Model("nucleotide")
# Third model: custom rate factors and the probability of each factor
model3 = pyvolve.Model(
    "nucleotide",
    rate_factors=[0.5, 1.6, 4.1],
    rate_probs=[0.75, 0.2, 0.05],
)

# The partitions have lengths of 50, 20 and 100 positions, respectively
part1 = pyvolve.Partition(models=model1, size=50)
part2 = pyvolve.Partition(models=model2, size=20)
part3 = pyvolve.Partition(models=model3, size=100)

# Provide all partitions *in the order in which they should be evolved* to Evolver and evolve
my_evolver = pyvolve.Evolver(partitions=[part1, part2, part3], tree=my_tree)
my_evolver()
# Command-line script: simulate nucleotide sequences along a tree with
# pyvolve, then report tree distances and run ANI on the result.
usage = ''' python pyvolve-genseq.py <tree.nwk> <seq-size> [<scale> default=1 (no scale)] '''

if len(sys.argv) < 3:
    sys.exit(usage)

tree_f = sys.argv[1]
size = sys.argv[2]
# Output files are named after the tree file.
outfiles = tree_f

# NOTE(review): the usage text advertises a default of 1, but None is what
# is passed on when no scale is given -- confirm which one is intended.
if len(sys.argv) > 3:
    scale = float(sys.argv[3])
else:
    scale = None

print("Reading tree..")
my_tree = pyvolve.read_tree(file=tree_f, scale_tree=scale)
my_model = pyvolve.Model("nucleotide")
my_partition = pyvolve.Partition(models=my_model, size=int(size))

print("Simulating sequences..")
my_evolver = pyvolve.Evolver(tree=my_tree, partitions=my_partition)
my_evolver(
    ratefile="%s.%s.ratefile.txt" % (outfiles, size),
    infofile="%s.%s.infofile.txt" % (outfiles, size),
    seqfile="%s.%s.seqfile.fasta" % (outfiles, size),
)

print("Tree info..")
tree_distances_info(tree_f, scale, int(size))

print("Running ANI on sequences..")
pyani_seq("%s.%s.seqfile.fasta" % (outfiles, size))
def setUp(self):
    """Set up parameters for test.

    Seeds the random generators, builds a three-tip tree, simulates a codon
    alignment under a ``YNGKP_M0`` model with uniform ``e_pw``, draws random
    amino-acid preferences, and finally creates ``self.model`` from
    ``self.MODEL`` / ``self.DISTRIBUTIONMODEL``.

    Raises:
        ValueError: if ``self.MODEL`` or ``self.DISTRIBUTIONMODEL`` is not
            one of the values handled below.
    """
    # Seed first: the statements below draw random values in a fixed order.
    random.seed(1)
    scipy.random.seed(1)
    self.underflowfreq = 1
    # define tree
    self.newick = ('((node1:0.2,node2:0.3)node4:0.3,node3:0.5)node5:0.04;')
    # The Newick string goes through a scratch file because the tree is read
    # from a file name here; the file is removed right after.
    tempfile = '_temp.tree'
    with open(tempfile, 'w') as f:
        f.write(self.newick)
    self.tree = Bio.Phylo.read(tempfile, 'newick')
    os.remove(tempfile)
    # simulate alignment with pyvolve
    pyvolvetree = pyvolve.read_tree(tree=self.newick)
    self.nsites = 50
    self.nseqs = self.tree.count_terminals()
    # 3 x N_NT array filled with 0.25 in every entry.
    e_pw = scipy.ndarray((3, N_NT), dtype='float')
    e_pw.fill(0.25)
    yngkp_m0 = phydmslib.models.YNGKP_M0(e_pw, self.nsites)
    partitions = phydmslib.simulate.pyvolvePartitions(yngkp_m0)
    alignment = '_temp_simulatedalignment.fasta'
    info = '_temp_info.txt'
    rates = '_temp_ratefile.txt'
    evolver = pyvolve.Evolver(partitions=partitions, tree=pyvolvetree)
    evolver(seqfile=alignment, infofile=info, ratefile=rates)
    # Keep the alignment in memory as (header, sequence) pairs and delete
    # the files the evolver wrote.
    self.alignment = [(s.description, str(s.seq)) for s in Bio.SeqIO.parse(alignment, 'fasta')]
    for f in [alignment, info, rates]:
        os.remove(f)
    # Sanity checks: three nucleotides per site, one sequence per tip.
    assert len(self.alignment[0][1]) == self.nsites * 3
    assert len(self.alignment) == self.nseqs
    # define model
    prefs = []
    minpref = 0.02
    # g is only passed on to the empirical-phi models below.
    g = scipy.random.dirichlet([5] * N_NT)
    for r in range(self.nsites):
        rprefs = scipy.random.dirichlet([0.5] * N_AA)
        # Floor each preference at minpref, then renormalise to sum to one.
        rprefs[rprefs < minpref] = minpref
        rprefs /= rprefs.sum()
        prefs.append(dict(zip(sorted(AA_TO_INDEX.keys()), rprefs)))
    if self.MODEL == phydmslib.models.ExpCM:
        self.model = phydmslib.models.ExpCM(prefs)
    elif self.MODEL == phydmslib.models.ExpCM_empirical_phi:
        self.model = phydmslib.models.ExpCM_empirical_phi(prefs, g)
    elif self.MODEL == phydmslib.models.ExpCM_empirical_phi_divpressure:
        # One random value per site, scaled so the largest magnitude is 1.
        divpressure = scipy.random.uniform(-1, 5, self.nsites)
        divpressure /= max(abs(divpressure))
        self.model = phydmslib.models.ExpCM_empirical_phi_divpressure(prefs, g, divpressure)
    elif self.MODEL == phydmslib.models.YNGKP_M0:
        # Random e_pw with every row normalised to sum to one.
        e_pw = scipy.random.uniform(0.2, 0.8, size=(3, N_NT))
        e_pw = e_pw / e_pw.sum(axis=1, keepdims=True)
        self.model = phydmslib.models.YNGKP_M0(e_pw, self.nsites)
    else:
        raise ValueError("Invalid MODEL: {0}".format(self.MODEL))
    if self.DISTRIBUTIONMODEL is None:
        pass
    elif (self.DISTRIBUTIONMODEL == phydmslib.models.GammaDistributedOmegaModel):
        # Wrap the base model in the distribution model with four categories.
        self.model = self.DISTRIBUTIONMODEL(self.model, ncats=4)
    else:
        raise ValueError("Invalid DISTRIBUTIONMODEL: {0}".format(self.DISTRIBUTIONMODEL))
def setUp(self):
    """Prepare the tree, branch lengths, simulated codons and the model.

    Parses a fixed five-node newick tree, records the branch length of
    every non-root node in ``self.brlen`` (keyed by node number),
    simulates a codon alignment with pyvolve, stores the codon index of
    every tip and site in ``self.codons``, and finally builds
    ``self.model`` from the class attributes ``MODEL`` and
    ``DISTRIBUTIONMODEL``.

    Raises:
        ValueError: if ``MODEL`` or ``DISTRIBUTIONMODEL`` is not one of
            the classes handled below.
    """
    # Seed both generators so the simulation and the preferences repeat.
    random.seed(1)
    scipy.random.seed(1)

    # --- tree -------------------------------------------------------
    self.newick = '((node1:0.2,node2:0.3)node4:0.3,node3:0.5)node5:0.04;'
    tree_path = '_temp.tree'
    with open(tree_path, 'w') as handle:
        handle.write(self.newick)
    self.tree = Bio.Phylo.read(tree_path, 'newick')
    os.remove(tree_path)

    # Branch lengths keyed by the trailing digit of the node name; the
    # root's own length is skipped.
    root_name = self.tree.root.name
    labelled_lengths = re.findall(
        r'(?P<name>node\d):(?P<brlen>\d+\.\d+)', self.newick)
    self.brlen = {
        int(node_name[-1]): float(length)
        for node_name, length in labelled_lengths
        if node_name != root_name
    }

    # --- simulated alignment ---------------------------------------
    pyvolvetree = pyvolve.read_tree(tree=self.newick)
    self.nsites = 60
    self.nseqs = self.tree.count_terminals()
    flat_e_pw = scipy.ndarray((3, N_NT), dtype='float')
    flat_e_pw.fill(0.25)
    simulation_model = phydmslib.models.YNGKP_M0(flat_e_pw, self.nsites)
    partitions = phydmslib.simulate.pyvolvePartitions(simulation_model)
    aln_path = '_temp_simulatedalignment.fasta'
    info_path = '_temp_info.txt'
    rates_path = '_temp_ratefile.txt'
    evolver = pyvolve.Evolver(partitions=partitions, tree=pyvolvetree)
    evolver(seqfile=aln_path, infofile=info_path, ratefile=rates_path)
    self.alignment = [
        (record.description, str(record.seq))
        for record in Bio.SeqIO.parse(aln_path, 'fasta')
    ]
    for scratch in (aln_path, info_path, rates_path):
        os.remove(scratch)
    # Each site is a codon, hence three characters per site.
    assert len(self.alignment[0][1]) == self.nsites * 3
    assert len(self.alignment) == self.nseqs

    # self.codons[node number][site] -> codon index
    self.codons = {}
    for tip in self.tree.get_terminals():
        tip_name = tip.name
        tip_number = int(tip_name[-1])
        tip_seq = [s for (header, s) in self.alignment
                   if tip_name == header][0]
        self.codons[tip_number] = {
            r: CODON_TO_INDEX[tip_seq[3 * r:3 * r + 3]]
            for r in range(self.nsites)
        }

    # --- model ------------------------------------------------------
    minpref = 0.02
    g = scipy.random.dirichlet([5] * N_NT)
    # Floor the nucleotide frequencies at 0.1, then renormalise.
    g[g < 0.1] = 0.1
    g /= g.sum()
    amino_acids = sorted(AA_TO_INDEX.keys())
    prefs = []
    for _ in range(self.nsites):
        site_prefs = scipy.random.dirichlet([0.5] * N_AA)
        site_prefs[site_prefs < minpref] = minpref
        site_prefs /= site_prefs.sum()
        prefs.append(dict(zip(amino_acids, site_prefs)))

    model_cls = self.MODEL
    if model_cls == phydmslib.models.ExpCM:
        model = phydmslib.models.ExpCM(prefs)
    elif model_cls == phydmslib.models.ExpCM_empirical_phi:
        model = phydmslib.models.ExpCM_empirical_phi(prefs, g)
    elif model_cls == phydmslib.models.ExpCM_empirical_phi_divpressure:
        divpressure = scipy.random.uniform(-1, 5, self.nsites)
        # Rescale so the largest absolute value is 1.
        divpressure /= max(abs(divpressure))
        model = phydmslib.models.ExpCM_empirical_phi_divpressure(
            prefs, g, divpressure)
    elif model_cls == phydmslib.models.YNGKP_M0:
        e_pw = scipy.random.uniform(0.2, 0.8, size=(3, N_NT))
        e_pw = e_pw / e_pw.sum(axis=1, keepdims=True)
        model = phydmslib.models.YNGKP_M0(e_pw, self.nsites)
    else:
        raise ValueError("Invalid MODEL: {0}".format(self.MODEL))
    self.model = model

    # Optionally wrap the base model; both supported wrappers take the
    # same arguments.
    dist_cls = self.DISTRIBUTIONMODEL
    if dist_cls is not None:
        supported = (
            phydmslib.models.GammaDistributedOmegaModel,
            phydmslib.models.GammaDistributedBetaModel,
        )
        if dist_cls in supported:
            self.model = dist_cls(self.model, ncats=4)
        else:
            raise ValueError("Invalid DISTRIBUTIONMODEL: {0}".format(
                self.DISTRIBUTIONMODEL))
import sys, os
import pyvolve
import glob
from mungo.fasta import FastaReader
from collections import defaultdict

# Usage: <script> INPUT_FASTA INPUT_TREE OUTPUT_FASTA
input_fasta = sys.argv[1]
input_tree_txt = sys.argv[2]
output_seqs = sys.argv[3]

# Amino-acid state frequencies are estimated from the input FASTA file.
f = pyvolve.ReadFrequencies("amino_acid", file=input_fasta)
frequencies = f.compute_frequencies()

# Evolve 200 positions under MTMAM along the input tree (scale_tree=0.5)
# and write only the sequence file (no rate or info files).
my_tree_1 = pyvolve.read_tree(file=input_tree_txt, scale_tree=0.5)
my_model_1 = pyvolve.Model("MTMAM", {"state_freqs": frequencies})
my_partition_1 = pyvolve.Partition(models=my_model_1, size=200)
my_evolver_1 = pyvolve.Evolver(partitions=my_partition_1, tree=my_tree_1)
my_evolver_1(ratefile=None, infofile=None, seqfile=output_seqs)

# Read the simulated sequences back and give them sequential IDs
# (seq0, seq1, ...) in file order.
seqs = {}
seq_list = []
for index, (h, s) in enumerate(FastaReader(output_seqs)):
    new_id = "seq{}".format(index)
    seqs[new_id] = s
    seq_list.append(new_id)
count = len(seq_list)

# Overwrite the output file with the renamed records.
with open(output_seqs, 'w') as outfile:
    outfile.writelines(
        ">" + s + "\n" + seqs[s] + "\n" for s in seq_list)
# Read the phylogeny from a file containing a newick tree.
my_tree = pyvolve.read_tree(file="file_with_tree.tre")

# Build the substitution model as a pyvolve.Model object. This example uses
# the WAG amino-acid model with default parameters; see the example script
# custom_aminoacid.py for other options.
#
# Rate heterogeneity can be requested in either of two ways:
#   1) Custom rates: pass a list of `rate_factors` to Model. By default each
#      factor is assigned to sites with equal probability; pass `rate_probs`
#      to change the probabilities.
#   2) Gamma rates: pass `num_categories` and `alpha` to Model.
#      <num_categories> rates are drawn from a gamma distribution
#      parameterised by <alpha>. The rates are equiprobable unless
#      `rate_probs` overrides that.
#
# Several model definitions follow (the first argument can be a different
# model, as desired).

# Custom rates: each of the four factors applies to 25% of sites.
my_model1 = pyvolve.Model("WAG", rate_factors=[0.3, 0.8, 1.5, 2.45])

# Custom rates with explicit probabilities: 70% of sites evolve with rate
# 0.3, 20% with rate 0.8, 5% with rate 1.5 and 5% with rate 2.45.
my_model2 = pyvolve.Model(
    "WAG",
    rate_factors=[0.3, 0.8, 1.5, 2.45],
    rate_probs=[0.7, 0.2, 0.05, 0.05],
)

# Gamma rates.
my_model3 = pyvolve.Model("WAG", alpha=0.6, num_categories=5)

# Attach the chosen model to a pyvolve.Partition; `size` asks for 250
# positions to be evolved.
my_partition = pyvolve.Partition(models=my_model2, size=250)

# Evolve!
my_evolver = pyvolve.Evolver(partitions=my_partition, tree=my_tree)
my_evolver()
def get_c(L, kappa):
    """Simulate four strains from a random ancestor and count shared changes.

    An ancestral sequence of length ``L`` is generated, evolved with pyvolve
    along a fixed, balanced four-tip tree under a nucleotide model with
    equal base frequencies, and the tip sequences are compared with the
    ancestor. A (site, nucleotide) pair is counted once when at least two
    strains carry that nucleotide at that site and it differs from the
    ancestral base.

    Strains are tallied individually (one tally per strain name). The
    earlier version de-duplicated on the sequence string itself, so two
    strains with identical sequences were counted as one and their shared
    changes were missed.

    Args:
        L: length of the ancestral sequence; every simulated sequence is
            indexed over ``range(L)``.
        kappa: currently unused, see the review note below.

    Returns:
        int: the number of (site, nucleotide) pairs shared by two or more
        strains and different from the ancestor. The value is also printed.
    """
    ancestor = generate_ancestor(L)
    print(ancestor)

    phylogeny = pyvolve.read_tree(
        tree='((t1:0.5,t2:0.5)i1:0.5,(t3:0.5,t4:0.5)i2:0.5)root;')
    pyvolve.print_tree(phylogeny)

    freqs = [0.25, 0.25, 0.25, 0.25]
    # NOTE(review): the `kappa` argument is not used; the model is built
    # with this hard-coded value. Confirm whether `kappa` should be passed
    # through before changing it, since that alters simulation results.
    nuc_model = pyvolve.Model('nucleotide', {
        'kappa': 1.86836732388,
        'state_freqs': freqs
    })
    my_partition = pyvolve.Partition(models=nuc_model, root_sequence=ancestor)
    my_evolver = pyvolve.Evolver(partitions=my_partition, tree=phylogeny)
    # Called without arguments, so pyvolve uses its default output settings.
    my_evolver()
    strains = my_evolver.get_sequences()
    n = len(strains)

    nucleotides = ['A', 'T', 'G', 'C']
    # site_counts[site][base] = number of strains whose base at `site` is
    # `base` and differs from the ancestral base.
    site_counts = [dict.fromkeys(nucleotides, 0) for _ in range(L)]
    for sequence in strains.values():
        for site in range(L):
            base = sequence[site]
            if base != ancestor[site]:
                site_counts[site][base] += 1

    # c_q[q - 2] = number of (site, base) pairs shared by exactly q strains,
    # for q = 2 .. n.
    c_q = [0] * (n - 1)
    for counts in site_counts:
        for base in nucleotides:
            q = counts[base]
            if 2 <= q <= n:
                c_q[q - 2] += 1

    c = sum(c_q)
    print(c)
    return c
def test_branchScale(self):
    """Simulate evolution, ensure scaled branches match number of subs."""
    scipy.random.seed(1)
    random.seed(1)

    # Build the model; `mu` is the only free parameter for these
    # simulations.
    nsites = 50
    minpref = 0.01
    amino_acids = sorted(AA_TO_INDEX.keys())
    prefs = []
    for _ in range(nsites):
        site_prefs = scipy.random.dirichlet([1] * N_AA)
        # Floor tiny preferences at minpref, then renormalise.
        site_prefs[site_prefs < minpref] = minpref
        site_prefs /= site_prefs.sum()
        prefs.append(dict(zip(amino_acids, site_prefs)))

    kappa = 4.2
    omega = 0.4
    beta = 1.5
    mu = 0.3
    model_cls = self.MODEL
    if model_cls == phydmslib.models.ExpCM:
        phi = scipy.random.dirichlet([7] * N_NT)
        model = phydmslib.models.ExpCM(
            prefs, kappa=kappa, omega=omega, beta=beta, mu=mu, phi=phi,
            freeparams=['mu'])
    elif model_cls == phydmslib.models.ExpCM_empirical_phi:
        g = scipy.random.dirichlet([7] * N_NT)
        model = phydmslib.models.ExpCM_empirical_phi(
            prefs, g, kappa=kappa, omega=omega, beta=beta, mu=mu,
            freeparams=['mu'])
    elif model_cls == phydmslib.models.YNGKP_M0:
        e_pw = scipy.asarray(
            [scipy.random.dirichlet([7] * N_NT) for i in range(3)])
        model = phydmslib.models.YNGKP_M0(e_pw, nsites)
    else:
        raise ValueError("Invalid MODEL: {0}".format(type(self.MODEL)))
    partitions = phydmslib.simulate.pyvolvePartitions(model)

    # The tree is two tips joined through the root; the total path
    # length between them is t.
    t = 0.04 / model.branchScale
    newicktree = '(tip1:{0},tip2:{0});'.format(t / 2.0)
    pyvolvetree = pyvolve.read_tree(tree=newicktree)
    tree_path = '_temp.tree'
    with open(tree_path, 'w') as handle:
        handle.write(newicktree)
    biotree = Bio.Phylo.read(tree_path, 'newick')
    os.remove(tree_path)

    # Repeatedly simulate the two sequences, estimate substitutions per
    # site from the per-nucleotide Hamming distance (roughly correct for
    # short branches), and average over replicates for better accuracy.
    aln_path = '_temp_branchScale_simulatedalignment.fasta'
    info_path = '_temp_info.txt'
    rates_path = '_temp_ratefile.txt'
    scratch_files = [aln_path, info_path, rates_path]
    evolver = pyvolve.Evolver(partitions=partitions, tree=pyvolvetree)
    nsubs = 0  # subs in simulated seqs (estimate from Hamming distance)
    treedist = 0.0  # distance inferred by `TreeLikelihood`
    nreplicates = 100
    for _ in range(nreplicates):
        evolver(seqfile=aln_path, infofile=info_path, ratefile=rates_path)
        a = [(record.description, str(record.seq))
             for record in Bio.SeqIO.parse(aln_path, 'fasta')]
        seq1 = a[0][1]
        seq2 = a[1][1]
        assert len(seq1) == len(seq2) == nsites * 3
        for path in scratch_files:
            if os.path.isfile(path):
                os.remove(path)
        for r in range(nsites):
            codon1 = seq1[3 * r:3 * r + 3]
            codon2 = seq2[3 * r:3 * r + 3]
            nsubs += sum(1 for x, y in zip(codon1, codon2) if x != y)
        tl = phydmslib.treelikelihood.TreeLikelihood(biotree, a, model)
        tl.maximizeLikelihood()
        treedist += sum(
            tip.branch_length for tip in tl.tree.get_terminals())
    nsubs /= float(nsites * nreplicates)
    treedist /= float(nreplicates)

    # We expect nsubs = branchScale * t; rtol allows for the noise of
    # simulating a finite number of sites.
    expected = model.branchScale * t
    self.assertTrue(
        scipy.allclose(nsubs, expected, rtol=0.2),
        ("Simulated subs per site of {0} is not close "
         "to expected value of {1} (branchScale = {2}, t = {3})").format(
             nsubs, t * model.branchScale, model.branchScale, t))
    self.assertTrue(
        scipy.allclose(treedist, nsubs, rtol=0.2),
        ("Simulated subs per site of {0} is not close to inferred "
         "branch length of {1}").format(nsubs, treedist))
def main(strain, seedFilepath, gffFilepath):
    """Simulate genomes from a seed sequence and write them as FASTA files.

    The first record of ``seedFilepath`` is split into alternating
    non-coding and coding regions using the rows of the GFF table. Coding
    regions are evolved with pyvolve along the tree in ``tmp.tree`` (start
    and stop codons are held fixed); non-coding regions are copied
    unchanged. The regions are concatenated back in genome order and one
    file per simulated genome is written to
    ``../data/dnaseq/<strain>_sim<i>.dnaseq``.

    Args:
        strain: identifier used to name the output genomes.
        seedFilepath: path to a FASTA file; only its first record is used.
        gffFilepath: path passed to ``read_gff``; rows must provide
            ``start``, ``end`` and ``strand``.
            NOTE(review): rows are presumably sorted by position and
            non-overlapping; confirm against ``read_gff``.

    Raises:
        ValueError: if ``seedFilepath`` contains no FASTA record.
    """
    # Take only the first record; fail clearly on an empty file instead of
    # hitting an unbound name further down.
    seedRec = next(SeqIO.parse(seedFilepath, "fasta"), None)
    if seedRec is None:
        raise ValueError(
            "no FASTA record found in {}".format(seedFilepath))

    gff_df = read_gff(gffFilepath)

    # Build the ordered list of regions: (category, start, end, strand).
    # `start` is converted to 0-based so (start, end) is a Python slice.
    prv = 0
    pos_lst = []
    for _, row in gff_df.iterrows():
        pos_lst.append(("nc", prv, row["start"] - 1, "+"))
        pos_lst.append(("c", row["start"] - 1, row["end"], row["strand"]))
        prv = row["end"]
    pos_lst.append(("nc", prv, len(seedRec), "+"))

    # Configuration for evolution.
    treeFilepath = "tmp.tree"
    mytree = pyvolve.read_tree(file=treeFilepath)
    cm = pyvolve.Model("ECMrest")  # coding model

    # Assumes the simulation yields 4 sequences per region; this is checked
    # against every region's result below.
    n_sim = 4
    outputSeq_lst = [Seq("") for _ in range(n_sim)]

    for category, start, end, strand in pos_lst:
        # Extract the region, reading it on its own strand.
        rootSeq = seedRec.seq[start:end]
        if strand == "-":
            rootSeq = rootSeq.reverse_complement()
        rootSeq = str(rootSeq)

        if category == "nc":
            # Non-coding regions are not evolved: every simulated genome
            # receives an unchanged copy.
            rec_lst = [SeqRecord(Seq(rootSeq)) for _ in range(n_sim)]
        elif category == "c":
            # Evolve the region without its first and last three bases
            # (start and stop codon), then put them back unchanged.
            partition = pyvolve.Partition(
                models=cm, root_sequence=rootSeq[3:-3])
            # Evolver reads the `partitions` keyword; the previous
            # `partition=` spelling left it without any partition.
            evolver = pyvolve.Evolver(partitions=partition, tree=mytree)
            rec_lst = get_evolved(evolver)
            for rec in rec_lst:
                rec.seq = rootSeq[:3] + rec.seq + rootSeq[-3:]

        assert len(rec_lst) == len(outputSeq_lst)

        # Append the region to each simulated genome, flipping minus-strand
        # regions back to the forward orientation.
        for i, rec in enumerate(rec_lst):
            simSeq = rec.seq
            if strand == "-":
                simSeq = simSeq.reverse_complement()
            outputSeq_lst[i] += simSeq

    for i, outputSeq in enumerate(outputSeq_lst):
        genomeId = "{}_sim{}".format(strain, i + 1)
        outFilepath = "../data/dnaseq/{}.dnaseq".format(genomeId)
        with open(outFilepath, "w") as f:
            seqname = "{}:seq".format(genomeId)
            rec = SeqRecord(outputSeq, id=seqname, description="")
            SeqIO.write(rec, f, "fasta")
        print("DONE: output {}".format(outFilepath))