def finalize():
    """Evolve sequences with Pyvolve along every pruned tree and store the leaves.

    For each ``(root, treestr)`` pair in ``GC.pruned_newick_trees`` the tree is
    parsed by Pyvolve, sequences are evolved from ``root.get_seq()`` using
    ``GC.pyvolve_model``, and the rate/info/sequence files are written into
    ``pyvolve_output/`` using the root's label as the file prefix.

    Resulting leaf sequences are accumulated in ``GC.final_sequences`` such
    that ``GC.final_sequences[cn_label][sample_time]`` is a list of
    ``(leaf_label, sequence)`` tuples.  Leaf labels are expected to have the
    form ``virus_label|cn_label|sample_time``.

    Raises:
        AssertionError: if Pyvolve rejects the tree/partition/evolver setup.
    """
    if GC.random_number_seed is not None:
        from warnings import warn
        warn(
            "random_number_seed specified, but Pyvolve does not support seeding its random generator"
        )
    makedirs("pyvolve_output", exist_ok=True)
    label_to_node = MF.modules['TreeNode'].label_to_node()
    for root, treestr in GC.pruned_newick_trees:
        # Pyvolve is imported lazily (only once there is a tree to evolve),
        # replacing the previous duplicated try/except-NameError fallback.
        import pyvolve

        # run Pyvolve
        treestr = treestr.strip()
        label = root.get_label()
        rootseq = root.get_seq()
        if GC.VERBOSE:
            print('[%s] Pyvolve evolving sequences on tree: %s' % (datetime.now(), treestr), file=stderr)
            print('[%s] Pyvolve root sequence: %s' % (datetime.now(), rootseq), file=stderr)
        # Only wrap trees that are not already parenthesised (e.g. a single
        # leaf "name:length;").  The previous test compared the whole string
        # to '(' and therefore re-wrapped every tree.
        if not treestr.startswith('('):
            treestr = '(%s);' % treestr[:-1]
        try:
            tree = pyvolve.read_tree(tree=treestr)
            partition = pyvolve.Partition(models=GC.pyvolve_model, root_sequence=rootseq)
            evolver = pyvolve.Evolver(partitions=partition, tree=tree)
        except AssertionError as err:
            # Raise explicitly so the error survives ``python -O``.
            raise AssertionError("Error setting up Pyvolve. Tree: %s" % treestr) from err

        # set each to None to not generate these files
        ratefile = "pyvolve_output/%s_ratefile.txt" % label
        infofile = "pyvolve_output/%s_infofile.txt" % label
        seqfile = "pyvolve_output/%s_seqfile.fasta" % label
        evolver(ratefile=ratefile, infofile=infofile, seqfile=seqfile)
        seqs = evolver.get_sequences()  # use anc=True to get internal sequences as well

        # store leaf sequences in GlobalContext
        if not hasattr(GC, 'final_sequences'):
            # GC.final_sequences[cn_node][t] = list of (label, seq) tuples
            GC.final_sequences = {}
        for leaf, seq in seqs.items():
            virus_label, cn_label, sample_time = leaf.split('|')
            sample_time = float(sample_time)
            by_time = GC.final_sequences.setdefault(cn_label, {})
            by_time.setdefault(sample_time, []).append((leaf, seq))
def simulate(f, seqfile, tree, mu_dict, length):
    """Simulate a single partition under a homogeneous mutation-selection model.

    Args:
        f: State frequencies, passed to the ``MutSel`` model as ``state_freqs``.
        seqfile: Path given to the evolver as ``seqfile`` (alignment output).
        tree: Either a path to a tree file or a newick string.  It is first
            tried as a file; if that fails it is parsed as a newick string.
        mu_dict: Mutation rates, passed to the model as ``mu``.
        length: Partition size (number of positions to evolve).
    """
    try:
        my_tree = pyvolve.read_tree(file = tree)
    except Exception:
        # Not a readable tree file: fall back to treating `tree` as a newick
        # string.  `Exception` (rather than a bare except) keeps
        # KeyboardInterrupt/SystemExit from being swallowed here.
        my_tree = pyvolve.read_tree(tree = tree)

    model = pyvolve.Model("MutSel", {'state_freqs':f, 'mu': mu_dict})
    part = pyvolve.Partition(size = length, models = model)
    e = pyvolve.Evolver(partitions = part, tree = my_tree)
    # Only the alignment is written; rate and info files are suppressed.
    e(seqfile = seqfile, ratefile = None, infofile = None)
def simulate(f, seqfile, tree, mu_dict, length):
    """Simulate a single partition under a homogeneous mutation-selection model.

    Args:
        f: State frequencies, passed to the ``MutSel`` model as ``state_freqs``.
        seqfile: Path given to the evolver as ``seqfile`` (alignment output).
        tree: Either a path to a tree file or a newick string.  It is first
            tried as a file; if that fails it is parsed as a newick string.
        mu_dict: Mutation rates, passed to the model as ``mu``.
        length: Partition size (number of positions to evolve).
    """
    try:
        my_tree = pyvolve.read_tree(file=tree)
    except Exception:
        # Fall back to a newick string.  Catching `Exception` instead of using
        # a bare except lets KeyboardInterrupt/SystemExit propagate.
        my_tree = pyvolve.read_tree(tree=tree)

    model = pyvolve.Model("MutSel", {'state_freqs': f, 'mu': mu_dict})
    part = pyvolve.Partition(size=length, models=model)
    e = pyvolve.Evolver(partitions=part, tree=my_tree)
    # Only the alignment is written; rate and info files are suppressed.
    e(seqfile=seqfile, ratefile=None, infofile=None)
def get_random_tree(filename, tree_string, L, kappa):
    """Evolve a random ancestor along `tree_string` and summarise the result.

    Args:
        filename: Not used by this function.
        tree_string: Newick string parsed with ``pyvolve.read_tree``.
        L: Passed to ``generate_ancestor`` to build the root sequence.
        kappa: ``kappa`` parameter of the nucleotide model (state frequencies
            are fixed at 0.25 each).

    Returns:
        dict with keys ``'pi'`` and ``'theta'`` holding ``pi_value`` and
        ``theta_value`` of the simulated sequences.
    """
    tree = pyvolve.read_tree(tree=tree_string)

    model_params = {'kappa': kappa, 'state_freqs': [0.25, 0.25, 0.25, 0.25]}
    model = pyvolve.Model('nucleotide', model_params)

    root_sequence = generate_ancestor(L)
    print(root_sequence)

    evolver = pyvolve.Evolver(
        partitions=pyvolve.Partition(models=model, root_sequence=root_sequence),
        tree=tree,
    )
    evolver()  # called with pyvolve's default output arguments

    tip_sequences = evolver.get_sequences()
    return {
        'pi': pi_value(tip_sequences),
        'theta': theta_value(tip_sequences),
    }
def test_OnSimulatedData(self):
    """Simulate an alignment with pyvolve and run ``phydms --omegabysite`` on it.

    Five sites are sampled (with a fixed seed) and handed to
    ``pyvolvePartitions`` together with the value 200.0 (presumably the
    diversifying-pressure setting -- confirm against ``pyvolvePartitions``).
    The test then asserts that each sampled site has omega > 2 and P < 0.08,
    and that at most one other site has omega > 2 with P < 0.05.
    """
    random.seed(1)
    divpressuresites = random.sample(range(self.nsites), 5)
    partitions = phydmslib.simulate.pyvolvePartitions(
        self.model, (200.0, divpressuresites))
    evolver = pyvolve.Evolver(
        partitions=partitions,
        tree=pyvolve.read_tree(file=self.tree),
    )

    # All output files share the same prefix inside the test's output dir.
    prefix = os.path.join(self.outdir, self.modelname)
    alignment_path = prefix + '_simulatedalignment.fasta'
    evolver(
        seqfile=alignment_path,
        infofile=prefix + '_temp_info.txt',
        ratefile=prefix + '_temp_ratefile.txt',
    )

    phydms_cmd = [
        'phydms', alignment_path, self.tree, self.modelarg, prefix,
        '--omegabysite', '--brlen', 'scale',
    ]
    subprocess.check_call(phydms_cmd)

    omegas = pandas.read_csv(prefix + '_omegabysite.txt', sep='\t', comment='#')
    is_target = omegas['site'].isin(divpressuresites)
    divpressureomegas = omegas[is_target]

    self.assertTrue(len(divpressureomegas) == len(divpressuresites))
    self.assertTrue(
        (divpressureomegas['omega'].values > 2).all(),
        "Not all divpressure sites have omega > 2:\n{0}".format(divpressureomegas))
    self.assertTrue(
        (divpressureomegas['P'].values < 0.08).all(),
        "Not all divpressure sites have P < 0.08:\n{0}".format(divpressureomegas))

    # Sites that look significant without having been selected above.
    spurious = omegas[(omegas['omega'] > 2) & (omegas['P'] < 0.05)
                      & (~omegas['site'].isin(divpressuresites))]
    nspurious = len(spurious)
    self.assertTrue(nspurious <= 1, "{0} spurious sites".format(nspurious))

    # Remove the scratch file if it was left in the working directory.
    leftover = "custom_matrix_frequencies.txt"
    if os.path.isfile(leftover):
        os.remove(leftover)
def run_u(self, tree_file, sequences_folder):
    """Evolve sequences along the tree in `tree_file` with rescaled branches.

    Every branch length is multiplied by a per-family multiplier (looked up
    in ``self.gf_multipliers`` from the tree file name) and a per-branch
    multiplier (looked up in ``self.st_multipliers`` from the part of the
    node name before the first underscore).  The FASTA output, including
    ancestral sequences, is written into `sequences_folder` and then passed
    to ``self.correct_names``.

    Returns:
        None.  Returns early without writing anything when the first line of
        the file contains no ``(`` or is just ``;``.
    """
    with open(tree_file) as handle:
        first_line = handle.readline().strip()

    if "(" not in first_line or first_line == ";":
        return None

    my_tree = ete3.Tree(first_line, format=1)
    my_tree.get_tree_root().name = "Root"

    # First the multiplier per family, then the multiplier per species-tree branch.
    family_key = tree_file.split("_")[-2].split("/")[-1]
    gf_multiplier = self.gf_multipliers[family_key]
    for node in my_tree.traverse():
        branch_key = node.name.split("_")[0]
        node.dist = node.dist * gf_multiplier * self.st_multipliers[branch_key]

    tree = pyvolve.read_tree(tree=my_tree.write(format=5), scale_tree = self.parameters["SCALING"])
    name_mapping = self.get_mapping_internal_names(tree, my_tree)

    evolver = pyvolve.Evolver(
        tree=tree,
        partitions=pyvolve.Partition(models=self.model, size=self.size),
    )

    fasta_file = tree_file.split("/")[-1].replace("_completetree.nwk", "_") + "complete.fasta"
    fasta_path = os.path.join(sequences_folder, fasta_file)
    evolver(seqfile=fasta_path, ratefile=None, infofile=None, write_anc=True)

    # Correct the names
    self.correct_names(os.path.join(sequences_folder, fasta_file), name_mapping)
def evolve(newicks, sequence_size, scale_tree):
    """Evolve nucleotide sequences on each newick tree and merge them.

    For every tree a FASTA alignment is simulated, converted to PHYLIP with
    ``fasta_to_phyl`` and the FASTA file is deleted.  The PHYLIP files are
    then concatenated (each followed by a newline) into ``temp_seq.phyl`` and
    deleted.

    Args:
        newicks: Sequence of newick strings.
        sequence_size: Partition size used for every tree.
        scale_tree: Forwarded to ``pyvolve.read_tree`` as ``scale_tree``.

    Returns:
        The name of the combined file, ``"temp_seq.phyl"``.
    """
    # One default nucleotide model/partition is shared across all trees.
    partition = pyvolve.Partition(models = pyvolve.Model("nucleotide"), size = sequence_size)

    phy_files = []
    for index, newick in enumerate(newicks):
        fasta_seqfile = "temp" + str(index) + ".fasta"
        phylip_seqfile = "temp" + str(index) + ".phyl"

        tree = pyvolve.read_tree(tree = newick, scale_tree = scale_tree)
        simulator = pyvolve.Evolver(tree = tree, partitions = partition)
        phy_files.append(phylip_seqfile)

        simulator(seqfile=fasta_seqfile, seqfmt = "fasta", ratefile = None, infofile = None)
        fasta_to_phyl(fasta_seqfile, phylip_seqfile)
        os.remove(fasta_seqfile)

    phyl_output = "temp_seq.phyl"
    with open(phyl_output, 'w') as combined:
        for phy_name in phy_files:
            with open(phy_name) as part:
                combined.write(part.read())
            combined.write("\n")
            os.remove(phy_name)

    return phyl_output
def get_pyvolve_phylogeny_from_nxgraph(G, root_seq):
    """Convert the clonal graph `G` into a pyvolve tree.

    The graph is serialised with ``networkx_to_newick`` and every ``a`` in
    that string is followed by ``:<1/len(root_seq)>`` before parsing
    (presumably node labels end in ``a`` and this attaches a branch length --
    confirm against ``networkx_to_newick``).

    Returns:
        The tree object produced by ``pyvolve.read_tree``.
    """
    newick = networkx_to_newick(G)
    branch_length = 1 / len(root_seq)
    annotated = newick.replace('a', 'a:' + str(branch_length))
    return pyvolve.read_tree(tree=annotated)
def setUp(self):
    """Build the tree, preferences, simulated alignment and model for the tests."""
    random.seed(1)
    scipy.random.seed(1)
    self.underflowfreq = 1

    # Tree: written to a scratch file so Bio.Phylo can read it back.
    self.newick = ('((node1:0.2,node2:0.3)node4:0.3,node3:0.5)node5:0.04;')
    treefile = '_temp.tree'
    with open(treefile, 'w') as handle:
        handle.write(self.newick)
    self.tree = Bio.Phylo.read(treefile, 'newick')
    os.remove(treefile)

    # Amino-acid preferences, floored at `minpref` and renormalised per site.
    self.nsites = 50
    minpref = 0.02
    # `g` is not used afterwards, but the draw advances the random state, so
    # it must stay to keep the preferences below unchanged.
    g = scipy.random.dirichlet([5] * N_NT)
    aa_order = sorted(AA_TO_INDEX.keys())
    prefs = []
    for _site in range(self.nsites):
        rprefs = scipy.random.dirichlet([0.5] * N_AA)
        rprefs[rprefs < minpref] = minpref
        rprefs /= rprefs.sum()
        prefs.append(dict(zip(aa_order, rprefs)))

    # Simulate an alignment with pyvolve.
    pyvolvetree = pyvolve.read_tree(tree=self.newick)
    self.nseqs = self.tree.count_terminals()
    partitions = phydmslib.simulate.pyvolvePartitions(phydmslib.models.ExpCM(prefs))
    alignment = '_temp_simulatedalignment.fasta'
    info = '_temp_info.txt'
    rates = '_temp_ratefile.txt'
    evolver = pyvolve.Evolver(partitions=partitions, tree=pyvolvetree)
    evolver(seqfile=alignment, infofile=info, ratefile=rates)
    self.alignment = [(record.description, str(record.seq))
                      for record in Bio.SeqIO.parse(alignment, 'fasta')]
    for scratch in (alignment, info, rates):
        os.remove(scratch)
    assert len(self.alignment[0][1]) == self.nsites * 3
    assert len(self.alignment) == self.nseqs

    # Model under test.
    if self.MODEL != phydmslib.models.ExpCM:
        raise ValueError("Invalid MODEL: {0}".format(self.MODEL))
    self.model = phydmslib.models.ExpCM(prefs)

    # Optionally wrap the model in a distribution model.
    if self.DISTRIBUTIONMODEL is not None:
        if self.DISTRIBUTIONMODEL == phydmslib.models.GammaDistributedOmegaModel:
            self.model = self.DISTRIBUTIONMODEL(self.model, ncats=4)
        else:
            raise ValueError("Invalid DISTRIBUTIONMODEL: {0}".format(
                self.DISTRIBUTIONMODEL))
def get_pyvolve_tree(tree, foreground_scaling_factor=1):
    """Return a pyvolve tree built from `tree`, optionally rescaling branches.

    Args:
        tree: Tree whose nodes expose ``is_foreground`` and ``dist``; it is
            modified in place when rescaling is requested.
        foreground_scaling_factor: Factor applied to ``dist`` of every node
            with ``is_foreground`` set.  A value of 1 leaves the tree as is.

    Returns:
        The tree parsed by ``pyvolve.read_tree`` from ``get_pyvolve_newick``.
    """
    rescale = foreground_scaling_factor != 1
    if rescale:
        print('Foreground branches are rescaled by {}.'.format(
            foreground_scaling_factor))
        foreground_nodes = (n for n in tree.traverse() if n.is_foreground)
        for fg_node in foreground_nodes:
            fg_node.dist *= foreground_scaling_factor

    return pyvolve.read_tree(tree=get_pyvolve_newick(tree=tree))
def test_OnSimulatedData(self):
    """Run ``phydms --diffprefsbysite`` on simulated data with both fit methods.

    An alignment is simulated from ``self.model``; ``phydms`` is run once per
    ``--fitprefsmethod`` (``'1'`` and ``'2'``).  For every run the target
    amino acid at each site in ``self.targetaas`` must have a positive
    differential preference, and the per-site totals of the two methods must
    correlate with r > 0.98 (a scatter plot is saved for inspection).
    """
    random.seed(1)
    numpy.random.seed(1)

    partitions = phydmslib.simulate.pyvolvePartitions(self.model)
    evolver = pyvolve.Evolver(
        partitions=partitions,
        tree=pyvolve.read_tree(file=self.tree),
    )
    prefix = os.path.join(self.outdir, self.modelname)
    alignment_path = prefix + '_simulatedalignment.fasta'
    evolver(
        seqfile=alignment_path,
        infofile=prefix + '_temp_info.txt',
        ratefile=prefix + '_temp_ratefile.txt',
    )

    prefsbymethod = {}
    for fitprefsmethod in ['1', '2']:
        outprefix = prefix + '_fitprefsmethod{0}'.format(fitprefsmethod)
        command = (
            ['phydms', alignment_path, self.tree, self.modelarg, outprefix,
             '--diffprefsbysite', '--brlen', 'scale', '--ncpus', '-1',
             '--diffprefsprior', 'invquadratic,150,0.5']
            + self.gammaomega_arg
            + ['--fitprefsmethod', fitprefsmethod]
        )
        subprocess.check_call(command)

        dpi_columns = ['dpi_{0}'.format(INDEX_TO_AA[a]) for a in range(N_AA)]
        diffprefs = pandas.read_csv(outprefix + '_diffprefsbysite.txt',
                                    sep='\t', comment='#')
        # Total absolute differential preference per site.
        diffprefs['total'] = diffprefs[dpi_columns].abs().sum(axis=1)

        for (site, aa) in self.targetaas.items():
            siteentry = diffprefs[diffprefs['site'] == site]
            self.assertTrue(len(siteentry) == 1, str(len(siteentry)))
            self.assertTrue((siteentry['dpi_{0}'.format(aa)] > 0).all())
        prefsbymethod[fitprefsmethod] = diffprefs

    # Compare every pair of methods once.
    ordered = sorted(prefsbymethod.items())
    for (i, (method1, prefs1)) in enumerate(ordered):
        total1 = prefs1['total'].values
        for (method2, prefs2) in ordered[i + 1:]:
            total2 = prefs2['total'].values
            (r, _pvalue) = scipy.stats.pearsonr(total1, total2)
            plt.scatter(total1, total2)
            plt.xlabel('fitprefsmethod{0}'.format(method1))
            plt.ylabel('fitprefsmethod{0}'.format(method2))
            plotfile = os.path.join(
                self.outdir, '{0}_vs_{1}.pdf'.format(method1, method2))
            plt.savefig(plotfile)
            self.assertTrue(r > 0.98, "Low correlation between "
                            "fitprefsmethods: {0}\nSee {1}"
                            .format(r, plotfile))

    # Remove the scratch file if it was left in the working directory.
    leftover = "custom_matrix_frequencies.txt"
    if os.path.isfile(leftover):
        os.remove(leftover)
def get_random_tree(L, species, scaled_tree_string, kappa, iteration):
    """Evolve a random ancestor along a scaled tree and summarise the result.

    The simulated alignment is written to
    ``simulated_alignment_<species[:-1]>_universal_<iteration + 1>.fasta``;
    no rate or info file is produced.

    Args:
        L: Passed to ``generate_ancestor`` to build the root sequence.
        species: Used (minus its last element/character) in the output name.
        scaled_tree_string: Newick string parsed with ``pyvolve.read_tree``.
        kappa: ``kappa`` parameter of the nucleotide model (state frequencies
            are fixed at 0.25 each).
        iteration: Zero-based counter; ``iteration + 1`` appears in the
            output file name.

    Returns:
        Tuple ``(pi, theta)`` computed by ``pi_value`` and ``theta_value``.
    """
    tree = pyvolve.read_tree(tree=scaled_tree_string)
    print('read in the tree')
    pyvolve.print_tree(tree)

    model = pyvolve.Model('nucleotide', {
        'kappa': kappa,
        'state_freqs': [0.25, 0.25, 0.25, 0.25],
    })

    root_sequence = generate_ancestor(L)
    print('generated an ancestor')

    evolver = pyvolve.Evolver(
        partitions=pyvolve.Partition(models=model, root_sequence=root_sequence),
        tree=tree,
    )
    output_name = ("simulated_alignment_" + str(species[:-1]) + "_universal_"
                   + str(iteration + 1) + ".fasta")
    evolver(ratefile=None, infofile=None, seqfile=output_name)
    print('evolved the sequences')

    tip_sequences = evolver.get_sequences()
    pi = pi_value(tip_sequences)
    theta = theta_value(tip_sequences)
    return pi, theta
def simulateAlignment(model, treeFile, alignmentPrefix, randomSeed=False):
    """
    Simulate an alignment given a model and tree (units = subs/site).

    Simulations done using `pyvolve`.

    Args:
        `model` (`phydmslib.models.Models` object)
            The model used for the simulations. Only models that can
            be passed to `pyvolve.Partitions` are supported.
        `treeFile` (str)
            Name of newick file used to simulate the sequences.
            The branch lengths should be in substitutions per site,
            which is the default units for all `phydms` outputs.
        `alignmentPrefix`
            Prefix for the files created by `pyvolve`.
        `randomSeed`
            Value passed to `random.seed` before simulating. The default
            `False` means "do not seed"; any other value, including `0`,
            is used as the seed.

    The result of this function is a simulated FASTA alignment
    file with the name having the prefix giving by `alignmentPrefix`
    and the suffix `'_simulatedalignment.fasta'`.
    """
    # Identity test on purpose: `0 == False` is true in Python, so the old
    # equality check silently ignored a seed of 0.
    if randomSeed is not False:
        random.seed(randomSeed)

    # Transform the branch lengths by dividing by the model `branchScale`
    tree = Bio.Phylo.read(treeFile, 'newick')
    for node in tree.get_terminals() + tree.get_nonterminals():
        if (node.branch_length is None) and (node == tree.root):
            # Give a root without a length a tiny placeholder length.
            node.branch_length = 1e-06
        else:
            node.branch_length /= model.branchScale

    # Round-trip the rescaled tree through a temp file for pyvolve; the file
    # is removed even if writing or parsing fails.
    fd, temp_path = mkstemp()
    os.close(fd)
    try:
        Bio.Phylo.write(tree, temp_path, 'newick')
        pyvolve_tree = pyvolve.read_tree(file=temp_path)
    finally:
        os.remove(temp_path)

    # Make the `pyvolve` partition
    partitions = pyvolvePartitions(model)

    # Simulate the alignment
    alignment = '{0}_simulatedalignment.fasta'.format(alignmentPrefix)
    info = '_temp_{0}info.txt'.format(alignmentPrefix)
    rates = '_temp_{0}_ratefile.txt'.format(alignmentPrefix)
    evolver = pyvolve.Evolver(partitions=partitions, tree=pyvolve_tree)
    evolver(seqfile=alignment, infofile=info, ratefile=rates)

    # Clean up the auxiliary files; only the alignment is kept.
    for f in [rates, info, "custom_matrix_frequencies.txt"]:
        if os.path.isfile(f):
            os.remove(f)
    assert os.path.isfile(alignment)
def engrave_tree(treestr, brk, nbranch, lfile):
    """Inscribe pyvolve model flags into a tree string according to the given pattern.

    The tree string is cut at every ``:``.  For branch index ``i`` listed in
    `brk`, the flag ``_bp<len(brk) - brk.index(i)>_`` is appended to each
    number matched in the piece following the i-th colon; other branches get
    no flag.  The flagged string (terminated with ``;``) is written to
    `lfile` followed by a newline.

    Returns:
        The tree parsed by ``pyvolve.read_tree`` from the flagged string.
    """
    n_breaks = len(brk)
    model_flags = []
    for i in xrange(0, nbranch):
        if i in brk:
            model_flags.append("_bp%d_" % (n_breaks - brk.index(i)))
        else:
            model_flags.append('')

    pieces = treestr.split(':')
    head, branches = pieces[0], pieces[1:]

    # Re-attach the colon removed by the split and append the branch's flag.
    number = re.compile(r'(\d+\.\d+[eE]?-?\d*)')
    flagged = [number.sub(r':\1' + flag, piece)
               for piece, flag in zip(branches, model_flags)]

    l_tree_string = head + ''.join(flagged) + ';'
    lfile.write(l_tree_string + '\n')
    return pyvolve.read_tree(tree=l_tree_string)
def sim_gtr(rates, freqs, alpha, tree, nsites):
    """Simulate a nucleotide alignment under custom rates with gamma heterogeneity.

    Args:
        rates: Six exchange rates, in the order AC, AG, AT, CG, CT, GT.
        freqs: State frequencies passed to the model as ``state_freqs``.
        alpha: Shape parameter of the gamma rate distribution (4 categories).
        tree: Newick string parsed with ``pyvolve.read_tree``.
        nsites: Number of positions to evolve.

    Returns:
        The simulated sequences converted with ``dict_to_matrix``.
    """
    import pyvolve

    # The last key used to be a second 'CG', which overwrote the fourth rate
    # and left 'GT' unset.
    rate_keys = ('AC', 'AG', 'AT', 'CG', 'CT', 'GT')
    custom_mu = dict(zip(rate_keys, rates))

    gtr_model = pyvolve.Model('nucleotide', {'mu': custom_mu, 'state_freqs': freqs},
                              alpha=alpha, num_categories=4)
    gtr_partition = pyvolve.Partition(models=gtr_model, size=nsites)
    tr = pyvolve.read_tree(tree=tree)
    gtr_evolver = pyvolve.Evolver(partitions=gtr_partition, tree=tr)
    gtr_evolver()
    return dict_to_matrix(gtr_evolver.get_sequences())
def sim_codon(freqs, omegas, tree, nsites):
    """Simulate a codon alignment under pyvolve's ``MG`` model.

    Args:
        freqs: State frequencies passed to the model as ``state_freqs``.
        omegas: Value passed to the model as ``omega``.
        tree: Newick string parsed with ``pyvolve.read_tree``.
        nsites: Alignment length in nucleotides; ``nsites // 3`` codons are
            evolved.

    Returns:
        The simulated sequences converted with ``dict_to_matrix``.
    """
    import pyvolve

    tr = pyvolve.read_tree(tree=tree)
    gy_model = pyvolve.Model('MG', {'omega': omegas, 'state_freqs': freqs})
    # The partition size counts codons, so the nucleotide count is divided by
    # 3.  Floor division keeps the size an int on Python 3, where `/` would
    # produce a float.
    gy_partition = pyvolve.Partition(models=gy_model, size=nsites // 3)
    gy_evolver = pyvolve.Evolver(partitions=gy_partition, tree=tr)
    gy_evolver()
    return dict_to_matrix(gy_evolver.get_sequences())
def evolveLinker(sequence, branchLength):
    """
    Evolve a non-domain sequence over a given distance using pyvolve.

    Only substitutions are simulated (no indels), under the JTT model.  The
    sequence is placed at the root of a two-leaf tree ``(A:BL,B:BL);`` whose
    branches both have length `branchLength`, and the sequence simulated at
    leaf ``A`` is returned.
    """
    distance = str(branchLength)
    two_leaf_tree = read_tree(tree="(A:" + distance + ",B:" + distance + ");")
    root_partition = Partition(models=Model("JTT"), root_sequence=sequence)

    evolver = Evolver(partitions=root_partition, tree=two_leaf_tree)
    evolver()
    evolved = evolver.get_sequences()
    return evolved["A"]
def get_accurate_c(L, kappa):
    """Simulate sequences on a fixed 6-tip tree and set up convergence bookkeeping.

    A random ancestor from ``generate_ancestor(L)`` is evolved along a
    hard-coded newick tree, keeping ancestral sequences.  The visible part of
    the function only initialises the per-strain and per-site containers.

    NOTE(review): the `kappa` argument is not used in the visible code; the
    model is built with a hard-coded kappa of 1.86836732388.
    NOTE(review): the body appears to be truncated -- it ends on a bare
    ``strain_names`` expression and nothing is returned in the visible part.
    """
    ancestor = generate_ancestor(L)
    print(ancestor)
    # phylogeny = pyvolve.read_tree(tree = '( (t1:0.5,t2:0.5)i1:0.5, (t3:0.5,t4:0.5)i2:0.5 , (t5:0.5,t6:0.5)i3:0.5, (t7:0.5,t8:0.5)i4:0.5 ) root;')
    phylogeny = pyvolve.read_tree(
        tree=
        '( ((t7:0.5,t8:0.5)i4:0.5,(t5:0.5,t6:0.5)i3:0.5)i1:0.5, (t3:0.5,t4:0.5)i2:0.5 ) root;'
    )
    pyvolve.print_tree(phylogeny)
    freqs = [0.25, 0.25, 0.25, 0.25]
    nuc_model = pyvolve.Model('nucleotide', {
        'kappa': 1.86836732388,
        'state_freqs': freqs
    })
    my_partition = pyvolve.Partition(models=nuc_model, root_sequence=ancestor)
    my_evolver = pyvolve.Evolver(partitions=my_partition, tree=phylogeny)
    # my_evolver()
    # write_anc=True / anc=True: ancestral (internal node) sequences are
    # written and returned along with the tips.
    my_evolver(write_anc=True)
    # strains = my_evolver.get_sequences()
    strains = my_evolver.get_sequences(anc=True)
    strain_names = list(strains.keys())  # pre-order traversal of the tree
    n = len(strain_names)
    print(strain_names)
    # One (initially empty) list per strain.
    c_sites = {}
    for key in strain_names:
        c_sites[key] = []
    site_counts = L * [
        None
    ]  # list of dictionaries to keep track of which nucleotides are at each convergent site; index = site; key = nucleotide, value = number of strains with that nucleotide
    strains_with_site = L * [
        None
    ]  # list of the strains that have a convergent mutation at each site; index = site
    # Fill the placeholders with a fresh dict/list per site so that the
    # entries are not shared between sites.
    for x in range(L):
        site_counts[x] = {'A': 0, 'T': 0, 'G': 0, 'C': 0}
        strains_with_site[x] = []
    # c_list_matrix = [[{} for x in range(n)] for y in range(n)] # matrix of the convergent mutation sites; the (i,j) entry is a dictionary of the convergent mutation sites between strain i and strain j; key = site, value = nucleotide
    c = 0
    strain_names
def execute(tree, model, length, out, numSim):
    """Simulate `numSim` alignments under a named pyvolve model.

    Args:
        tree: Path to the tree file read with ``pyvolve.read_tree``.
        model: Model name passed to ``pyvolve.Model``; also part of the
            output file names.
        length: Partition size (converted with ``int``).
        out: Output prefix.
        numSim: Number of simulations (converted with ``int``).

    Each simulation ``i`` is written to ``<out>.<model>-<i>.fa``.
    """
    # read in model, tree, and define partition
    pyvolveModel = pyvolve.Model(model)
    pyvolveTree = pyvolve.read_tree(file=tree)
    pyvolvePartition = pyvolve.Partition(models=pyvolveModel, size=int(length))

    # create evolver
    my_evolver = pyvolve.Evolver(tree=pyvolveTree, partitions=pyvolvePartition)
    # NOTE(review): this initial call runs one extra simulation with pyvolve's
    # default output arguments before the numbered runs; kept as in the
    # original because callers may rely on those files.
    my_evolver()

    print("Simulating sequences...")
    # create simulated sequences
    for i in range(int(numSim)):
        seqfile = str(out) + "." + str(model) + "-" + str(i) + ".fa"
        # Report the file that is actually written; the old message omitted
        # the model name and so named a file that was never created.
        print(seqfile)
        my_evolver(seqfile=seqfile)
def run(self, tree_file, sequences_folder):
    """Evolve sequences along the tree stored in `tree_file`.

    The first line of the file is parsed as a newick tree, sequences are
    simulated with ``self.model`` over ``self.size`` positions (ancestral
    sequences included) and written as FASTA into `sequences_folder`; the
    sequence names are then fixed up by ``self.correct_names``.

    Returns:
        None.  Returns early without writing anything when the first line of
        the file contains no ``(`` or is just ``;``.
    """
    with open(tree_file) as handle:
        first_line = handle.readline().strip()

    if "(" not in first_line or first_line == ";":
        return None

    my_tree = ete3.Tree(first_line, format=1)
    tree = pyvolve.read_tree(tree=my_tree.write(format=5), scale_tree = self.parameters["SCALING"])
    name_mapping = self.get_mapping_internal_names(tree, my_tree)

    evolver = pyvolve.Evolver(
        tree=tree,
        partitions=pyvolve.Partition(models=self.model, size=self.size),
    )

    fasta_file = tree_file.split("/")[-1].replace("_completetree.nwk", "_complete") + ".fasta"
    fasta_path = os.path.join(sequences_folder, fasta_file)
    evolver(seqfile=fasta_path, ratefile=None, infofile=None, write_anc=True)

    # Correct the names
    self.correct_names(os.path.join(sequences_folder, fasta_file), name_mapping)
def exampleFastaGenerator(nwkFile, fastaOutputLocation, seqLength, rate=1):
    """Simulate a nucleotide alignment along the tree in `nwkFile`.

    Args:
        nwkFile: Path to a newick tree file.
        fastaOutputLocation: Path of the FASTA file to write.
        seqLength: Number of positions to evolve.
        rate: Value used for all six nucleotide exchange rates.

    No rate or info file is written.
    """
    # Tree.  (The previous, unused `treeName` computation was removed: it
    # called `nwkFile.rindex('/')`, which raises ValueError for any path
    # without a slash.)
    phylogony = pyvolve.read_tree(file=nwkFile)

    # Rates: the same rate for every pair of nucleotides.
    mutationRates = {pair: rate for pair in ("AC", "AG", "AT", "CG", "CT", "GT")}

    # Model.
    model = pyvolve.Model("nucleotide", {"mu": mutationRates})
    partition = pyvolve.Partition(models=model, size=seqLength)

    # Evolver.
    evolver = pyvolve.Evolver(partitions=[partition], tree=phylogony)
    evolver(seqfile=fastaOutputLocation, ratefile=None, infofile=None)
def simulate_single_sequence(self, name, gene_length, tree_file, sequences_folder):
    """Simulate sequences on a two-leaf tree and keep only the one called `name`.

    A tree ``(<name>:1,B:1);`` is evolved with ``self.model`` over
    `gene_length` positions and written (with ancestral sequences) to the
    FASTA file derived from `tree_file` inside `sequences_folder`.  The file
    is then rewritten so that it only contains the entries whose header,
    minus its first character, equals `name`.
    """
    two_leaf_newick = "(A:1,B:1);".replace("A", name)
    tree = pyvolve.read_tree(tree=two_leaf_newick)
    evolver = pyvolve.Evolver(
        tree=tree,
        partitions=pyvolve.Partition(models=self.model, size=gene_length),
    )

    fasta_file = tree_file.split("/")[-1].replace("_completetree.nwk", "_complete") + ".fasta"
    fasta_path = os.path.join(sequences_folder, fasta_file)
    evolver(seqfile=fasta_path, ratefile=None, infofile=None, write_anc=True)

    # Select single sequence
    entries = [(header, sequence)
               for header, sequence in af.fasta_reader(os.path.join(sequences_folder, fasta_file))
               if header[1:] == name]
    af.fasta_writer(os.path.join(sequences_folder, fasta_file), entries)
def generateTree(tns, ntaxa, seqlen):
    """Simulate a pure-birth tree and evolve nucleotide sequences along it.

    The dendropy tree is written to ``/tmp/pyvt`` so pyvolve can read it; the
    simulated sequences are written to ``evolvedsequences.fasta`` in the
    current directory.

    Args:
        tns: dendropy taxon namespace used for the tree and the sequences.
        ntaxa: Number of extant tips.
        seqlen: Number of positions to evolve.

    Returns:
        The simulated dendropy tree.
    """
    tree_path = '/tmp/pyvt'

    # Construct the tree and save it as a newick file.
    t = dendropy.simulate.treesim.birth_death_tree(
        birth_rate=1.0, death_rate=0, taxon_namespace=tns, num_extant_tips=ntaxa)
    t.write(path=tree_path, schema='newick', suppress_rooting=True,
            suppress_internal_node_labels=True)

    # Default nucleotide model over `seqlen` positions.
    partition = pyvolve.Partition(models=pyvolve.Model("nucleotide"), size=seqlen)

    # Read the tree back in pyvolve's own representation.
    pyvolve_tree = pyvolve.read_tree(file=tree_path)

    # Simulate evolution without letting pyvolve write a sequence file.
    evolver = pyvolve.Evolver(tree=pyvolve_tree, partitions=partition)
    evolver(seqfile=None)

    matrix = dendropy.DnaCharacterMatrix.from_dict(evolver.get_sequences(),
                                                   taxon_namespace=tns)
    matrix.write(path="evolvedsequences.fasta", schema="fasta")
    return t
def simulate(tree_index, length):
    """Simulate codon sequences on one of the three bundled trees.

    Args:
        tree_index: Index (0-2) into ``["alpha", "beta", "charlie"]``; the
            tree is read from ``trees/<name>.tre``.
        length: Partition size.  For a codon model this is the number of
            codons, i.e. three times as many nucleotide sites.

    Returns:
        List of the simulated sequences (the values of the evolver's
        sequence dictionary).
    """
    tree_name = ("alpha", "beta", "charlie")[tree_index]
    my_tree = pyvolve.read_tree(file = "trees/" + tree_name + ".tre")

    # MG model parameters; corresponds to dN/dS = 0.65 / 0.98
    mg_parameters = {"beta": 0.65, "alpha": 0.98}
    partition = pyvolve.Partition(models = pyvolve.Model("MG", mg_parameters),
                                  size = length)

    # Evolve without writing rate or info files.
    evolver = pyvolve.Evolver(partitions = partition, tree = my_tree,
                              ratefile = None, infofile = None)
    evolver(ratefile = None, infofile = None)

    # Extract the simulated sequences.
    return list(evolver.get_sequences().values())
def cli(gnumber, glist, gtree, edprob, gsize, glen_range, dnds, tau=None,
        delrate=0.0, from_al=None, protlike=False, no_syn=False,
        sub_rate=1.0, min_cons=0.0, outdir=""):
    """Simulate genes along a genome tree and apply simulated C-to-U editing.

    The set of genomes comes from exactly one of `gnumber` (generate
    ``Genome_1..Genome_n``), `glist` (file with one name per line, ``#``
    lines ignored) or `gtree` (tree file).  Names prefixed with ``-`` or
    ``_`` in the list (``_`` in the tree) are recorded as genomes that must
    not be edited.  When no tree is given a random one is generated over the
    genome names.

    For each of `gsize` genes a codon alignment is simulated, optionally
    subjected to random deletions (`delrate`) and prefixed with ``ATG``
    (`protlike`), then edited with ``CtoUsimulate`` and saved with
    ``save_data``.

    Args:
        gnumber: Number of genomes to generate names for.
        glist: Path to a file listing genome names.
        gtree: Tree (file/newick accepted by ``Tree``) giving the genomes.
        edprob: Editing probability forwarded to ``CtoUsimulate``.
        gsize: Number of genes to simulate.
        glen_range: ``(low, high)`` bounds of the gene length in codons,
            passed to ``np.random.randint``.
        dnds: Pair ``(beta, alpha)`` used for the codon model.
        tau: Optional ``kappa`` for the codon model.
        delrate: Deletion rate; deletions are skipped when falsy.
        from_al: Optional alignment file to read codon frequencies from.
        protlike: If true, prepend ``ATG`` to every simulated sequence.
        no_syn, min_cons: Forwarded to ``CtoUsimulate``.
        sub_rate: Forwarded to ``pyvolve.read_tree`` as ``scale_tree``.
        outdir: Output directory forwarded to the simulation/saving helpers.

    Raises:
        NotImplementedError: if none of `gnumber`, `glist`, `gtree` is given.
    """
    gleaf = []
    no_edit = []
    tree = None
    if gnumber:
        gleaf = ['Genome_{}'.format(i) for i in range(1, gnumber + 1)]
    elif glist:
        with open(glist) as G:
            # Iterate over the open handle; the previous code referenced an
            # undefined name `Glist` and raised NameError.
            for line in G:
                line = line.strip()
                if line and not line.startswith('#'):
                    gleaf.append(line.strip('-_'))
                    if line.startswith('-') or line.startswith('_'):
                        no_edit.append(line.strip('-_'))
    elif gtree:
        tree = Tree(gtree)
        gleaf = tree.get_leaf_names()
        no_edit = [x.strip('_') for x in gleaf if x.startswith('_')]
        for node in tree:
            node.name = node.name.strip('_')
    else:
        raise NotImplementedError(
            "One of --gnumber, --glist and --gtree is needed !")

    if not tree:
        # No tree supplied: build a random one over the genome names.
        tree = Tree()
        tree.populate(len(gleaf), names_library=gleaf, random_branches=True)

    param_list = {"alpha": dnds[1], "beta": dnds[0]}
    if tau:
        param_list.update({"kappa": tau})
    if from_al:
        # read codons frequencies from an existing alignment
        f = pyvolve.ReadFrequencies("codon", file=from_al)
        param_list.update({'state_freqs': f.compute_frequencies()})

    phylogeny = pyvolve.read_tree(tree=tree.write(format=5),
                                  scale_tree=sub_rate)
    codon_model = pyvolve.Model("codon", param_list)

    sequences = []
    edited_sequences = []
    truth_tables = []
    # add height to tree
    tree = add_height_to_tree(tree)
    for i in range(gsize):
        # gene length is drawn from a uniform distribution (in codons) and
        # converted to nucleotides
        alen = np.random.randint(glen_range[0], glen_range[1]) * 3
        seq = simulate_genomes(codon_model, phylogeny, alen, outdir, i + 1)
        if delrate:
            seq = random_deletion(seq, tree, alen // 3, delrate)
        if protlike:
            for k in seq:
                seq[k] = 'ATG' + seq[k]
        sequences.append(seq)
        edited_seq, truth_table = CtoUsimulate(seq, tree, no_edit, edprob,
                                               no_syn=no_syn,
                                               min_cons=min_cons)
        edited_sequences.append(edited_seq)
        truth_tables.append(truth_table)
        save_data(tree, seq, edited_seq, truth_table, outdir, i + 1)
Simulate sequences along a phylogenetic tree using pyvolve @author: david """ import pyvolve "User defined params" mut_rate = 0.005 freqs = [0.25, 0.25, 0.25, 0.25] seq_length = 1000 kappa = 2.75 "Read in phylogeny along which Pyvolve should simulate" "Scale_tree sets absolute mutation rate" my_tree = pyvolve.read_tree(file = "AMR-sim.tre", scale_tree = mut_rate) #pyvolve.print_tree(my_tree) # Print the parsed phylogeny "Specify nucleotide substitution model with custom rates" #custom_mu = {"AC":0.5, "AG":0.25, "AT":1.23, "CG":0.55, "CT":1.22, "GT":0.47} #nuc_model = pyvolve.Model( "nucleotide", {"mu":custom_mu, "state_freqs":freqs} ) "Or just use an HKY model with kappa" nuc_model = pyvolve.Model( "nucleotide", {"kappa":kappa, "state_freqs":freqs}) "Define a Partition object which evolves set # of positions according to my_model" my_partition = pyvolve.Partition(models = nuc_model, size = seq_length) #my_partition = pyvolve.Partition(models = nuc_model, root_sequence = "GATAGAAC") # Or with a root seq "Define an Evolver instance to evolve a single partition" my_evolver = pyvolve.Evolver(partitions = my_partition, tree = my_tree)
# This example script demonstrates how to evolve according to an amino-acid model with sitewise rate heterogeneity. import pyvolve # Define a phylogeny, from a file containing a newick tree my_tree = pyvolve.read_tree(file="file_with_tree.tre") # Define a nucleotide model, as a pyvolve.Model object. For this example, we'll use default parameters, but see the example script custom_aminoacid.py for other options # To implement rate heterogeneity, do either of these: ## 1) Custom rates: Provide a list of rate_factors when defining a Model object. These rate factors will be assigned to sites with equal probability by default. To change this, provide probabilities with the argument `rate_probs`. ## 2) Gamma rates: Provide the keyword arguments num_categories and alpha when defining a Model object. <num_categories> rates will be drawn from a gamma distribution with shape and scale parameter each equal to <alpha>. These rates will be equiprobable, unless overridden by `rate_probs`. # Several model definitions are shown below (first argument can be a different model, as desired). # custom rates my_model1 = pyvolve.Model( "WAG", rate_factors=[0.3, 0.8, 1.5, 2.45]) # 25% of sites will have each factor. my_model2 = pyvolve.Model( "WAG", rate_factors=[0.3, 0.8, 1.5, 2.45], rate_probs=[0.7, 0.2, 0.05, 0.05] ) # 70% of sites evolve with rate of 0.3, 20% with a rate of 0.8, 5% with a rate of 1.5, and 5% with a rate of 2.45 # gamma rates my_model3 = pyvolve.Model("WAG", alpha=0.6, num_categories=5) # Assign the model to a pyvolve.Partition. The size argument indicates to evolve 250 positions my_partition = pyvolve.Partition(models=my_model2, size=250)
def main(): """Main body of script.""" codons = pyvolve.genetics.Genetics().codons codon_dict = pyvolve.genetics.Genetics().codon_dict pyrims = pyvolve.genetics.Genetics().pyrims purines = pyvolve.genetics.Genetics().purines args = vars(ParseArguments().parse_args()) print("Read the following command line arguments:") print("\n\t{0}".format("\n\t".join( ["{0} = {1}".format(key, value) for (key, value) in args.items()]))) print("\nPerforming simulation with pyvolve version {0}".format( pyvolve.__version__)) print("\nReading model params from {0}".format(args['modelparams'])) params = ReadParams(args['modelparams']) for (param, paramvalue) in params.items(): print("The value of {0} is {1}".format(param, paramvalue)) print("\nReading preferences from {0}".format(args['prefs'])) tup = dms_tools.file_io.ReadPreferences(args['prefs']) (sites, pis) = (tup[0], tup[2]) print("\nRead amino-acid preferences for {0} sites".format(len(pis))) tree = pyvolve.read_tree(file=args['tree']) # create models for simulation partitions = [] for r in sites: if params['diversifyingsitesA'] and (int(r) in params['diversifyingsitesA']): omega = params['diversifyingomegaA'] print r, omega elif params['diversifyingsitesB'] and ( int(r) in params['diversifyingsitesB']): omega = params['diversifyingomegaB'] print r, omega else: omega = 1.0 matrix = [] # matrix[x][y] is rate of substitution from x to y for (xi, x) in enumerate(codons): row = [] for (yi, y) in enumerate(codons): ntdiffs = [(x[j], y[j]) for j in range(3) if x[j] != y[j]] if len(ntdiffs) == 0: assert x == y row.append( 0) # will later be adjusted to make row sum to zero elif len(ntdiffs) > 1: # multi-nucleotide codon change row.append(0) else: # single nucleotide change (xnt, ynt) = ntdiffs[0] if (xnt in purines) == (ynt in purines): # transition qxy = params['kappa'] * params['phi{0}'.format(ynt)] else: # transversion qxy = params['phi{0}'.format(ynt)] (xaa, yaa) = (codon_dict[x], codon_dict[y]) if xaa == yaa: fxy = 1.0 else: pix = 
pis[r][xaa]**params['stringencyparameter'] piy = pis[r][yaa]**params['stringencyparameter'] if abs(pix - piy) < 1e-6: fxy = omega else: fxy = omega * math.log( piy / pix) / (1.0 - pix / piy) row.append(qxy * fxy * params['scalerate']) assert len(row) == len(codons) row[xi] = -sum(row) matrix.append(row) model = pyvolve.Model("custom", {"matrix": matrix}) partitions.append(pyvolve.Partition(models=model, size=1)) print("\nSimulating evolution, writing to {0}...".format( args['simulatedalignment'])) basename = os.path.splitext(args['simulatedalignment'])[0] evolver = pyvolve.Evolver(partitions=partitions, tree=tree) evolver( seqfile=args['simulatedalignment'], infofile='{0}_infofile.txt'.format(basename), ratefile='{0}_ratefile.txt'.format(basename), ) print("Finished simulation") uniqueseqs = set([]) uniquealignment = [] ninitial = 0 for seq in Bio.SeqIO.parse(args['simulatedalignment'], 'fasta'): ninitial += 1 seqstr = str(seq.seq) if seqstr not in uniqueseqs: uniqueseqs.add(seqstr) uniquealignment.append(seq) print( "\nAfter removing redundant sequences, we have shrunk {0} from {1} to {2} sequences" .format(args['simulatedalignment'], ninitial, len(uniquealignment))) Bio.SeqIO.write(uniquealignment, args['simulatedalignment'], 'fasta')
rates[j] = float(rates[j]) ############### Loop ########## for species in species_numbers: print species check_dir(path.join(outdir,species)) os.chdir(path.join(outdir,species)) for size in sizes: print size check_dir(path.join(outdir, species, size)) os.chdir(path.join(outdir, species, size)) tree = path.join(treedir, species, size, "tree_file") current_tree = pyvolve.read_tree(file = tree) for i in range(1,n_runs+1): check_dir(path.join(outdir, species, size, str(i))) os.chdir(path.join(outdir, species, size, str(i))) my_model = pyvolve.Model("codon", {"alpha":alphas, "beta":betas, "kappa":kappa}, rate_probs=rates) my_partition = pyvolve.Partition(models = my_model, size = n_sites) my_evolver = pyvolve.Evolver(partitions = my_partition, tree = current_tree) my_evolver()
if __name__ == "__main__":
    # Command-line entry point: simulate nucleotide sequences along a tree,
    # then report tree distances and run ANI on the simulated alignment.
    usage ='''
    python pyvolve-genseq.py <tree.nwk> <seq-size> [<scale> default=1 (no scale)]
    '''
    if len(sys.argv) < 3:
        sys.exit(usage)

    tree_f = sys.argv[1]
    outfiles = tree_f  # output files are named after the tree file
    size = sys.argv[2]
    # Default to 1 (no scaling) as documented in the usage text.  The previous
    # code overwrote the default with None when no scale was given.
    scale = float(sys.argv[3]) if len(sys.argv) > 3 else 1.0

    print("Reading tree..")
    my_tree = pyvolve.read_tree(file = tree_f, scale_tree=scale)
    my_model = pyvolve.Model("nucleotide")
    my_partition = pyvolve.Partition(models = my_model, size = int(size))

    print("Simulating sequences..")
    my_evolver = pyvolve.Evolver(tree = my_tree, partitions = my_partition)
    my_evolver(ratefile = "%s.%s.ratefile.txt" % (outfiles, size),
               infofile = "%s.%s.infofile.txt" % (outfiles, size),
               seqfile = "%s.%s.seqfile.fasta" % (outfiles, size) )

    print("Tree info..")
    tree_distances_info(tree_f, scale, int(size))

    print("Running ANI on sequences..")
    pyani_seq("%s.%s.seqfile.fasta" % (outfiles, size))
def _site_rate_matrix(codons, codon_dict, purines, siteprefs, params, omega):
    """Return the codon substitution-rate matrix for a single site.

    ``matrix[x][y]`` is the rate of substitution from codon ``x`` to codon
    ``y``. Only single-nucleotide changes receive a non-zero rate, and each
    diagonal entry is set so that its row sums to zero.

    Args:
        codons: Sequence of codon strings; fixes row and column order.
        codon_dict: Mapping from codon to its amino acid.
        purines: Collection of purine nucleotides, used to tell transitions
            from transversions.
        siteprefs: Mapping from amino acid to its preference at this site.
        params: Model parameters; reads ``'kappa'``, ``'phi<nt>'``,
            ``'stringencyparameter'`` and ``'scalerate'``.
        omega: Multiplier applied to amino-acid-changing substitutions.

    Returns:
        A list of rows (lists of numbers), one row per codon.
    """
    stringency = params['stringencyparameter']
    scalerate = params['scalerate']
    matrix = []
    for (xi, x) in enumerate(codons):
        row = []
        for y in codons:
            ntdiffs = [(x[j], y[j]) for j in range(3) if x[j] != y[j]]
            if len(ntdiffs) != 1:
                # Identical codons (diagonal, filled in below) and
                # multi-nucleotide changes both get a zero rate here.
                row.append(0)
                continue
            (xnt, ynt) = ntdiffs[0]
            if (xnt in purines) == (ynt in purines):
                # transition
                qxy = params['kappa'] * params['phi{0}'.format(ynt)]
            else:
                # transversion
                qxy = params['phi{0}'.format(ynt)]
            (xaa, yaa) = (codon_dict[x], codon_dict[y])
            if xaa == yaa:
                fxy = 1.0
            else:
                pix = siteprefs[xaa] ** stringency
                piy = siteprefs[yaa] ** stringency
                if abs(pix - piy) < 1e-6:
                    # Nearly equal preferences: use omega directly instead of
                    # evaluating the log-ratio expression below.
                    fxy = omega
                else:
                    fxy = omega * math.log(piy / pix) / (1.0 - pix / piy)
            row.append(qxy * fxy * scalerate)
        assert len(row) == len(codons)
        row[xi] = -sum(row)  # diagonal makes the row sum to zero
        matrix.append(row)
    return matrix


def main():
    """Main body of script.

    Reads command-line arguments, model parameters, amino-acid preferences
    and a tree; builds one single-codon Pyvolve partition per site from a
    custom rate matrix; simulates an alignment; then rewrites the alignment
    file with duplicate sequences removed.
    """
    # Build the genetics tables once instead of once per attribute.
    genetics = pyvolve.genetics.Genetics()
    codons = genetics.codons
    codon_dict = genetics.codon_dict
    purines = genetics.purines

    args = vars(ParseArguments().parse_args())
    print("Read the following command line arguments:")
    print("\n\t{0}".format("\n\t".join(
        ["{0} = {1}".format(key, value) for (key, value) in args.items()])))
    print("\nPerforming simulation with pyvolve version {0}".format(
        pyvolve.__version__))

    print("\nReading model params from {0}".format(args['modelparams']))
    params = ReadParams(args['modelparams'])
    for (param, paramvalue) in params.items():
        print("The value of {0} is {1}".format(param, paramvalue))

    print("\nReading preferences from {0}".format(args['prefs']))
    tup = dms_tools.file_io.ReadPreferences(args['prefs'])
    # NOTE(review): elements 0 and 2 are taken to be the site labels and the
    # per-site preference dicts -- confirm against ReadPreferences.
    (sites, pis) = (tup[0], tup[2])
    print("\nRead amino-acid preferences for {0} sites".format(len(pis)))

    tree = pyvolve.read_tree(file=args['tree'])

    # create models for simulation
    partitions = []
    for r in sites:
        if params['diversifyingsitesA'] and (
                int(r) in params['diversifyingsitesA']):
            omega = params['diversifyingomegaA']
            # Formatted print gives the same "r omega" output on Python 2
            # and 3; the old `print r,omega` statement fails on Python 3.
            print("{0} {1}".format(r, omega))
        elif params['diversifyingsitesB'] and (
                int(r) in params['diversifyingsitesB']):
            omega = params['diversifyingomegaB']
            print("{0} {1}".format(r, omega))
        else:
            omega = 1.0
        matrix = _site_rate_matrix(codons, codon_dict, purines, pis[r],
                                   params, omega)
        model = pyvolve.Model("custom", {"matrix": matrix})
        partitions.append(pyvolve.Partition(models=model, size=1))

    print("\nSimulating evolution, writing to {0}...".format(
        args['simulatedalignment']))
    basename = os.path.splitext(args['simulatedalignment'])[0]
    evolver = pyvolve.Evolver(partitions=partitions, tree=tree)
    evolver(
        seqfile=args['simulatedalignment'],
        infofile='{0}_infofile.txt'.format(basename),
        ratefile='{0}_ratefile.txt'.format(basename),
    )
    print("Finished simulation")

    # Keep the first record of every distinct sequence, in file order.
    uniqueseqs = set()
    uniquealignment = []
    ninitial = 0
    for seq in Bio.SeqIO.parse(args['simulatedalignment'], 'fasta'):
        ninitial += 1
        seqstr = str(seq.seq)
        if seqstr not in uniqueseqs:
            uniqueseqs.add(seqstr)
            uniquealignment.append(seq)
    print("\nAfter removing redundant sequences, we have shrunk {0} from {1} to {2} sequences".format(
        args['simulatedalignment'], ninitial, len(uniquealignment)))
    Bio.SeqIO.write(uniquealignment, args['simulatedalignment'], 'fasta')
def test_branchScale(self):
    """Check that simulated substitutions per site track the scaled branch.

    Sequences are simulated along a two-tip tree; the per-site Hamming
    distance and the branch length re-inferred by `TreeLikelihood` are both
    compared against `branchScale * t`.
    """
    scipy.random.seed(1)
    random.seed(1)

    # Build the model; mu is the only free parameter for these simulations.
    nsites = 50
    minpref = 0.01
    aminoacids = sorted(AA_TO_INDEX.keys())
    prefs = []
    for _ in range(nsites):
        siteprefs = scipy.random.dirichlet([1] * N_AA)
        siteprefs[siteprefs < minpref] = minpref
        siteprefs /= siteprefs.sum()
        prefs.append(dict(zip(aminoacids, siteprefs)))
    kappa, omega, beta, mu = 4.2, 0.4, 1.5, 0.3

    if self.MODEL == phydmslib.models.ExpCM:
        phi = scipy.random.dirichlet([7] * N_NT)
        model = phydmslib.models.ExpCM(
            prefs, kappa=kappa, omega=omega, beta=beta, mu=mu, phi=phi,
            freeparams=['mu'])
    elif self.MODEL == phydmslib.models.ExpCM_empirical_phi:
        g = scipy.random.dirichlet([7] * N_NT)
        model = phydmslib.models.ExpCM_empirical_phi(
            prefs, g, kappa=kappa, omega=omega, beta=beta, mu=mu,
            freeparams=['mu'])
    elif self.MODEL == phydmslib.models.YNGKP_M0:
        e_pw = scipy.asarray(
            [scipy.random.dirichlet([7] * N_NT) for i in range(3)])
        model = phydmslib.models.YNGKP_M0(e_pw, nsites)
    else:
        raise ValueError("Invalid MODEL: {0}".format(type(self.MODEL)))
    partitions = phydmslib.simulate.pyvolvePartitions(model)

    # The tree is two tips joined by a single branch of total length t.
    t = 0.04 / model.branchScale
    newicktree = '(tip1:{0},tip2:{0});'.format(t / 2.0)
    pyvolvetree = pyvolve.read_tree(tree=newicktree)
    temptree = '_temp.tree'
    with open(temptree, 'w') as handle:
        handle.write(newicktree)
    biotree = Bio.Phylo.read(temptree, 'newick')
    os.remove(temptree)

    # Simulate the two sequences repeatedly. Substitutions per site are
    # estimated heuristically from the Hamming distance, which is roughly
    # correct for short branches; averaging over replicates improves accuracy.
    alignment = '_temp_branchScale_simulatedalignment.fasta'
    info = '_temp_info.txt'
    rates = '_temp_ratefile.txt'
    evolver = pyvolve.Evolver(partitions=partitions, tree=pyvolvetree)
    nreplicates = 100
    nsubs = 0  # differing nucleotide positions, summed over replicates
    treedist = 0.0  # tip branch lengths inferred by `TreeLikelihood`
    for _ in range(nreplicates):
        evolver(seqfile=alignment, infofile=info, ratefile=rates)
        a = [(rec.description, str(rec.seq))
             for rec in Bio.SeqIO.parse(alignment, 'fasta')]
        seq1 = a[0][1]
        seq2 = a[1][1]
        assert len(seq1) == len(seq2) == nsites * 3
        for fname in (alignment, info, rates):
            if os.path.isfile(fname):
                os.remove(fname)
        nsubs += sum(1 for k in range(3 * nsites) if seq1[k] != seq2[k])
        tl = phydmslib.treelikelihood.TreeLikelihood(biotree, a, model)
        tl.maximizeLikelihood()
        treedist += sum(tip.branch_length
                        for tip in tl.tree.get_terminals())
    nsubs = nsubs / float(nsites * nreplicates)
    treedist = treedist / float(nreplicates)

    # nsubs should equal branchScale * t; rtol leaves slack because only a
    # finite number of sites was simulated.
    expected = model.branchScale * t
    self.assertTrue(
        scipy.allclose(nsubs, expected, rtol=0.2),
        "Simulated subs per site of {0} is not close to expected value of {1} (branchScale = {2}, t = {3})".format(
            nsubs, t * model.branchScale, model.branchScale, t))
    self.assertTrue(
        scipy.allclose(treedist, nsubs, rtol=0.2),
        "Simulated subs per site of {0} is not close to inferred branch length of {1}".format(
            nsubs, treedist))
# Usage example: /Users/fengqian/anaconda2/bin/python /Users/fengqian/Downloads/simulated_seqs.py /Users/fengqian/Downloads/UniMelb_shared-master/project/mosaic_data/Protein_translateable_pilot_upper_centroids.fasta /Users/fengqian/simulated_tree.txt /Users/fengqian/simulated_seqs.fasta ####################################################################### import sys, os import pyvolve import glob from mungo.fasta import FastaReader from collections import defaultdict input_fasta = sys.argv[1] input_tree_txt = sys.argv[2] output_seqs = sys.argv[3] #f = pyvolve.ReadFrequencies("amino_acid", file = "/Users/fengqian/Downloads/UniMelb_shared-master/project/mosaic_data/Protein_translateable_pilot_upper_centroids.fasta") #f = pyvolve.ReadFrequencies("amino_acid", file = "/data/cephfs/punim0609/qian_feng/snake_pipeline/data/Protein_translateable_pilot_upper_centroids.fasta") f = pyvolve.ReadFrequencies("amino_acid", file=input_fasta) frequencies = f.compute_frequencies() my_tree_1 = pyvolve.read_tree(file=input_tree_txt, scale_tree=0.5) my_model_1 = pyvolve.Model("MTMAM", {"state_freqs": frequencies}) my_partition_1 = pyvolve.Partition(models=my_model_1, size=200) my_evolver_1 = pyvolve.Evolver(partitions=my_partition_1, tree=my_tree_1) my_evolver_1(ratefile=None, infofile=None, seqfile=output_seqs) seqs = {} seq_list = [] count = 0 for h, s in FastaReader(output_seqs): seqs["seq" + str(count)] = s seq_list.append("seq" + str(count)) count += 1 ##organize the seq ID name with open(output_seqs, 'w') as outfile: for s in seq_list:
def setUp(self):
    """Create the tree, a simulated alignment and the model under test."""
    random.seed(1)
    scipy.random.seed(1)
    self.underflowfreq = 1

    # Tree: written to a scratch file so Bio.Phylo can read it from disk.
    self.newick = ('((node1:0.2,node2:0.3)node4:0.3,node3:0.5)node5:0.04;')
    treefile = '_temp.tree'
    with open(treefile, 'w') as handle:
        handle.write(self.newick)
    self.tree = Bio.Phylo.read(treefile, 'newick')
    os.remove(treefile)

    # Alignment: simulated with pyvolve under a YNGKP_M0 model whose
    # e_pw entries are all 0.25.
    pyvolvetree = pyvolve.read_tree(tree=self.newick)
    self.nsites = 50
    self.nseqs = self.tree.count_terminals()
    uniform_e_pw = scipy.ndarray((3, N_NT), dtype='float')
    uniform_e_pw.fill(0.25)
    simmodel = phydmslib.models.YNGKP_M0(uniform_e_pw, self.nsites)
    partitions = phydmslib.simulate.pyvolvePartitions(simmodel)
    alignment = '_temp_simulatedalignment.fasta'
    info = '_temp_info.txt'
    rates = '_temp_ratefile.txt'
    evolver = pyvolve.Evolver(partitions=partitions, tree=pyvolvetree)
    evolver(seqfile=alignment, infofile=info, ratefile=rates)
    self.alignment = [(rec.description, str(rec.seq))
                      for rec in Bio.SeqIO.parse(alignment, 'fasta')]
    for fname in (alignment, info, rates):
        os.remove(fname)
    assert len(self.alignment[0][1]) == self.nsites * 3
    assert len(self.alignment) == self.nseqs

    # Model: random preferences, floored at minpref and renormalised.
    minpref = 0.02
    g = scipy.random.dirichlet([5] * N_NT)
    aminoacids = sorted(AA_TO_INDEX.keys())
    prefs = []
    for _ in range(self.nsites):
        siteprefs = scipy.random.dirichlet([0.5] * N_AA)
        siteprefs[siteprefs < minpref] = minpref
        siteprefs /= siteprefs.sum()
        prefs.append(dict(zip(aminoacids, siteprefs)))

    if self.MODEL == phydmslib.models.ExpCM:
        self.model = phydmslib.models.ExpCM(prefs)
    elif self.MODEL == phydmslib.models.ExpCM_empirical_phi:
        self.model = phydmslib.models.ExpCM_empirical_phi(prefs, g)
    elif self.MODEL == phydmslib.models.ExpCM_empirical_phi_divpressure:
        divpressure = scipy.random.uniform(-1, 5, self.nsites)
        divpressure /= max(abs(divpressure))
        self.model = phydmslib.models.ExpCM_empirical_phi_divpressure(
            prefs, g, divpressure)
    elif self.MODEL == phydmslib.models.YNGKP_M0:
        random_e_pw = scipy.random.uniform(0.2, 0.8, size=(3, N_NT))
        random_e_pw = random_e_pw / random_e_pw.sum(axis=1, keepdims=True)
        self.model = phydmslib.models.YNGKP_M0(random_e_pw, self.nsites)
    else:
        raise ValueError("Invalid MODEL: {0}".format(self.MODEL))

    # Optionally wrap the model in a distribution model.
    if self.DISTRIBUTIONMODEL is not None:
        if (self.DISTRIBUTIONMODEL ==
                phydmslib.models.GammaDistributedOmegaModel):
            self.model = self.DISTRIBUTIONMODEL(self.model, ncats=4)
        else:
            raise ValueError("Invalid DISTRIBUTIONMODEL: {0}".format(
                self.DISTRIBUTIONMODEL))
# This example script demonstrates how to evolve according to a nucleotide model with sitewise rate heterogeneity. import pyvolve # Define a phylogeny, from a file containing a newick tree my_tree = pyvolve.read_tree(file = "file_with_tree.tre") # Define a nucleotide model, as a pyvolve.Model object. For this example, we'll use default parameters, but see the example script custom_nucleotide.py for other options # To implement rate heterogeneity, do either of these: ## 1) Custom rates: Provide a list of rate_factors when defining a Model object. These rate factors will be assigned to sites with equal probability by default. To change this, provide probabilities with the argument `rate_probs`. ## 2) Gamma rates: Provide the keyword arguments num_categories and alpha when defining a Model object. <num_categories> rates will be drawn from a gamma distribution with shape and scale parameter each equal to <alpha>. These rates will be equiprobable, unless overridden by `rate_probs`. # Several model definitions are shown below: # custom rates my_model1 = pyvolve.Model("nucleotide", rate_factors = [0.3, 0.8, 1.5, 2.45] ) # 25% of sites will have each factor. my_model2 = pyvolve.Model("nucleotide", rate_factors = [0.3, 0.8, 1.5, 2.45], rate_probs = [0.7, 0.2, 0.05, 0.05] ) # 70% of sites evolve with rate of 0.3, 20% with a rate of 0.8, 5% with a rate of 1.5, and 5% with a rate of 2.45 # gamma rates my_model3 = pyvolve.Model("nucleotide", alpha = 0.4, num_categories = 3) # Assign the model to a pyvolve.Partition. The size argument indicates to evolve 250 positions my_partition = pyvolve.Partition(models = my_model2, size = 250) # Evolve! my_evolver = pyvolve.Evolver(partitions = my_partition, tree = my_tree) my_evolver()
# This example script demonstrates how to evolve according to a nucleotide model with *branch* rate heterogeneity. The approach is the same for non-nucleotide models.

import pyvolve

# Define a phylogeny. For clarity, we define this tree with a string. The tree contains model flags for branches which should evolve according to new models. Flags are represented as _name_, where underscores surround the name.
# The newick string must have balanced parentheses: three opening and three closing here (an earlier version carried a stray extra ")" before the ";").
my_tree = pyvolve.read_tree(tree="((t1:0.5, t2:0.5):0.5_m1_,(t3:0.5, t4:0.5):0.5_m2_);")

# Define a model for each flag. Models should be given names with the keyword argument `name`. These names *MUST* have correspondingly named flags in the tree!
model1 = pyvolve.Model("nucleotide", {"kappa": 3.5}, name="m1")
model2 = pyvolve.Model("nucleotide", {"kappa": 4.75}, name="m2")
# We can also define, if we want, a model for the ROOT of the tree that is separate from either of these models.
rootmodel = pyvolve.Model("nucleotide", name="root")

# Define a partition with all models as a list. Include the argument `root_model_name` to indicate the NAME ATTRIBUTE of the model that should be used at the root of the tree. This name's corresponding object must be in the `models` list. Note that a separate root model is not needed - you could easily just start with _m1_ at the root, but you'd still need to give "m1" to `root_model_name`.
my_partition = pyvolve.Partition(models=[model1, model2, rootmodel],
                                 size=250,
                                 root_model_name="root")

# Evolve!
my_evolver = pyvolve.Evolver(partitions=my_partition, tree=my_tree)
my_evolver()
def setUp(self):
    """Set up for tests.

    Builds a model with random preferences (`self.model`), a second model
    with shuffled preferences (`self.realmodel`) used to simulate an
    alignment along the NP tree, per-site codon and amino-acid counts from
    that alignment, and a `TreeLikelihood` over tree, alignment and model.
    """
    scipy.random.seed(1)
    random.seed(1)
    nsites = 1
    minpref = 0.001
    self.prefs = []
    self.realprefs = []
    for r in range(nsites):
        rprefs = scipy.random.dirichlet([0.5] * N_AA)
        rprefs[rprefs < minpref] = minpref
        rprefs /= rprefs.sum()
        self.prefs.append(dict(zip(sorted(AA_TO_INDEX.keys()), rprefs)))
        # Shuffle in place so the simulation preferences differ from the
        # fitted model's; the dict above already holds the unshuffled values.
        scipy.random.shuffle(rprefs)
        self.realprefs.append(
            dict(zip(sorted(AA_TO_INDEX.keys()), rprefs)))
    self.kappa = 3.0
    self.omega = 3.0
    self.phi = scipy.random.dirichlet([5] * N_NT)
    self.model = self.MODEL(self.prefs, prior=None, kappa=self.kappa,
                            omega=self.omega, phi=self.phi)
    self.realmodel = phydmslib.models.ExpCM(
        self.realprefs, kappa=self.kappa, omega=self.omega, mu=10.0,
        phi=self.phi)

    treefile = os.path.abspath(
        os.path.join(os.path.dirname(__file__), './NP_data/NP_tree.newick'))
    self.tree = Bio.Phylo.read(treefile, 'newick')
    self.tree.root_at_midpoint()

    # simulate alignment using realmodel
    evolver = pyvolve.Evolver(
        partitions=phydmslib.simulate.pyvolvePartitions(self.realmodel),
        tree=pyvolve.read_tree(file=treefile))
    alignmentfile = '_temp_fitprefs_simulatedalignment.fasta'
    info = '_temp_info.txt'
    rates = '_temp_ratefile.txt'
    evolver(seqfile=alignmentfile, infofile=info, ratefile=rates)
    self.alignment = phydmslib.file_io.ReadCodonAlignment(
        alignmentfile, True)
    # Remove the scratch files before asserting so a failed check does not
    # leave them behind.
    for f in [alignmentfile, info, rates]:
        os.remove(f)
    assert len(self.alignment[0][1]) == nsites * 3

    self.codoncounts = {
        r: {INDEX_TO_CODON[c]: 0 for c in range(N_CODON)}
        for r in range(nsites)}
    self.aacounts = {
        r: {a: 0 for a in range(N_AA)} for r in range(nsites)}
    # Count each site's codon explicitly. The earlier loop reused a leaked
    # loop variable `r` and treated the whole sequence as one codon, which
    # is only correct while nsites == 1.
    for (head, seq) in self.alignment:
        for r in range(nsites):
            codon = seq[3 * r:3 * r + 3]
            self.codoncounts[r][codon] += 1
            self.aacounts[r][CODON_TO_AA[CODON_TO_INDEX[codon]]] += 1

    self.tl = phydmslib.treelikelihood.TreeLikelihood(
        self.tree, self.alignment, self.model)
def setUp(self):
    """Create the tree, branch lengths, simulated codons and test model."""
    random.seed(1)
    scipy.random.seed(1)

    # Tree: written to a scratch file so Bio.Phylo can read it from disk.
    self.newick = ('((node1:0.2,node2:0.3)node4:0.3,node3:0.5)node5:0.04;')
    treefile = '_temp.tree'
    with open(treefile, 'w') as handle:
        handle.write(self.newick)
    self.tree = Bio.Phylo.read(treefile, 'newick')
    os.remove(treefile)

    # Branch lengths keyed by node number (the final character of the
    # node name), leaving out the root.
    rootname = self.tree.root.name
    self.brlen = {
        int(nodename[-1]): float(length)
        for (nodename, length) in re.findall(
            r'(?P<name>node\d):(?P<brlen>\d+\.\d+)', self.newick)
        if nodename != rootname
    }

    # Alignment: simulated with pyvolve under a YNGKP_M0 model whose
    # e_pw entries are all 0.25.
    pyvolvetree = pyvolve.read_tree(tree=self.newick)
    self.nsites = 60
    self.nseqs = self.tree.count_terminals()
    uniform_e_pw = scipy.ndarray((3, N_NT), dtype='float')
    uniform_e_pw.fill(0.25)
    simmodel = phydmslib.models.YNGKP_M0(uniform_e_pw, self.nsites)
    partitions = phydmslib.simulate.pyvolvePartitions(simmodel)
    alignment = '_temp_simulatedalignment.fasta'
    info = '_temp_info.txt'
    rates = '_temp_ratefile.txt'
    evolver = pyvolve.Evolver(partitions=partitions, tree=pyvolvetree)
    evolver(seqfile=alignment, infofile=info, ratefile=rates)
    self.alignment = [(rec.description, str(rec.seq))
                      for rec in Bio.SeqIO.parse(alignment, 'fasta')]
    for fname in (alignment, info, rates):
        os.remove(fname)
    assert len(self.alignment[0][1]) == self.nsites * 3
    assert len(self.alignment) == self.nseqs

    # self.codons[node number][site] is the codon index at that tip and site.
    self.codons = {}
    for tip in self.tree.get_terminals():
        tipname = tip.name
        tipnumber = int(tipname[-1])
        self.codons[tipnumber] = {}
        tipseq = [s for (head, s) in self.alignment if tipname == head][0]
        for r in range(self.nsites):
            self.codons[tipnumber][r] = CODON_TO_INDEX[tipseq[3 * r:3 * r + 3]]

    # Model: g and the preferences are floored and then renormalised.
    minpref = 0.02
    g = scipy.random.dirichlet([5] * N_NT)
    g[g < 0.1] = 0.1
    g /= g.sum()
    aminoacids = sorted(AA_TO_INDEX.keys())
    prefs = []
    for _ in range(self.nsites):
        siteprefs = scipy.random.dirichlet([0.5] * N_AA)
        siteprefs[siteprefs < minpref] = minpref
        siteprefs /= siteprefs.sum()
        prefs.append(dict(zip(aminoacids, siteprefs)))

    if self.MODEL == phydmslib.models.ExpCM:
        self.model = phydmslib.models.ExpCM(prefs)
    elif self.MODEL == phydmslib.models.ExpCM_empirical_phi:
        self.model = phydmslib.models.ExpCM_empirical_phi(prefs, g)
    elif self.MODEL == phydmslib.models.ExpCM_empirical_phi_divpressure:
        divpressure = scipy.random.uniform(-1, 5, self.nsites)
        divpressure /= max(abs(divpressure))
        self.model = phydmslib.models.ExpCM_empirical_phi_divpressure(
            prefs, g, divpressure)
    elif self.MODEL == phydmslib.models.YNGKP_M0:
        random_e_pw = scipy.random.uniform(0.2, 0.8, size=(3, N_NT))
        random_e_pw = random_e_pw / random_e_pw.sum(axis=1, keepdims=True)
        self.model = phydmslib.models.YNGKP_M0(random_e_pw, self.nsites)
    else:
        raise ValueError("Invalid MODEL: {0}".format(self.MODEL))

    # Optionally wrap the model in a distribution model.
    if self.DISTRIBUTIONMODEL is not None:
        if self.DISTRIBUTIONMODEL in (
                phydmslib.models.GammaDistributedOmegaModel,
                phydmslib.models.GammaDistributedBetaModel):
            self.model = self.DISTRIBUTIONMODEL(self.model, ncats=4)
        else:
            raise ValueError("Invalid DISTRIBUTIONMODEL: {0}".format(
                self.DISTRIBUTIONMODEL))