def finalize():
    """Evolve sequences down every pruned tree with Pyvolve and record the leaves.

    For each ``(root, treestr)`` pair in ``GC.pruned_newick_trees`` the last
    character of the Newick string is dropped and the remainder is wrapped in
    an extra pair of parentheses, a Pyvolve partition is seeded with the root
    node's sequence, and the simulation is run.  Rate, info and sequence files
    are written to ``pyvolve_output/<label>_*``.  Every sequence returned by
    the evolver is appended to ``GC.final_sequences[cn_label][sample_time]``
    as a ``(leaf, seq)`` tuple; leaf names are split on ``|`` into exactly
    three fields (virus label, contact-network label, sample time).

    Raises:
        AssertionError: if Pyvolve rejects the tree or the partition.
    """
    if GC.random_number_seed is not None:
        from warnings import warn
        warn(
            "random_number_seed specified, but Pyvolve does not support seeding its random generator"
        )
    makedirs("pyvolve_output", exist_ok=True)
    # The result is not used below; the call is kept in case it has side
    # effects inside the TreeNode module -- TODO confirm and drop if not.
    MF.modules['TreeNode'].label_to_node()
    for root, treestr in GC.pruned_newick_trees:
        # Imported lazily (only once there is a tree to evolve) so that the
        # module can be loaded without Pyvolve installed.  Repeated imports
        # are a cheap sys.modules lookup.
        import pyvolve

        treestr = treestr.strip()
        label = root.get_label()
        rootseq = root.get_seq()
        if GC.VERBOSE:
            print('[%s] Pyvolve evolving sequences on tree: %s' %
                  (datetime.now(), treestr),
                  file=stderr)
            print('[%s] Pyvolve root sequence: %s' %
                  (datetime.now(), rootseq),
                  file=stderr)
        # NOTE(review): this comparison is true for every string except a lone
        # "(", so the tree is effectively always re-wrapped -- confirm intent.
        if treestr != '(':
            treestr = '(%s);' % treestr[:-1]
        try:
            tree = pyvolve.read_tree(tree=treestr)
            partition = pyvolve.Partition(models=GC.pyvolve_model,
                                          root_sequence=rootseq)
            evolver = pyvolve.Evolver(partitions=partition, tree=tree)
        except AssertionError as err:
            # Raise explicitly (not via `assert False`) so the message
            # survives `python -O`, and keep the Pyvolve error as the cause.
            raise AssertionError("Error setting up Pyvolve. Tree: %s" %
                                 treestr) from err
        ratefile = "pyvolve_output/%s_ratefile.txt" % label  # set each to None to not generate these files
        infofile = "pyvolve_output/%s_infofile.txt" % label
        seqfile = "pyvolve_output/%s_seqfile.fasta" % label
        evolver(ratefile=ratefile, infofile=infofile, seqfile=seqfile)
        seqs = evolver.get_sequences()  # use anc=True to get internal sequences as well

        # store leaf sequences in GlobalContext
        # GC.final_sequences[cn_node][t] = list of (label, seq) tuples
        if not hasattr(GC, 'final_sequences'):
            GC.final_sequences = {}
        for leaf, seq in seqs.items():
            _virus_label, cn_label, sample_time = leaf.split('|')
            by_time = GC.final_sequences.setdefault(cn_label, {})
            by_time.setdefault(float(sample_time), []).append((leaf, seq))
Exemplo n.º 2
0
    def run_u(self, tree_file, sequences_folder):
        """Simulate sequences on a gene tree after rescaling its branches.

        The first line of ``tree_file`` is read as a Newick string.  If it
        contains no ``(`` the method returns ``None`` without simulating.
        Otherwise each branch length is multiplied by the gene-family
        multiplier (looked up from the file name) and by the species-tree
        multiplier of the branch (looked up from the node-name prefix before
        the first ``_``); the root is renamed ``"Root"`` beforehand.  The
        alignment, including ancestral sequences, is written into
        ``sequences_folder`` and its names are then corrected in place.
        """
        with open(tree_file) as handle:
            newick = handle.readline().strip()

        if "(" not in newick or newick == ";":
            return None
        my_tree = ete3.Tree(newick, format=1)

        my_tree.get_tree_root().name = "Root"

        # Multipliers are applied per gene family first, then per
        # species-tree branch.
        family = tree_file.split("_")[-2].split("/")[-1]
        gf_multiplier = self.gf_multipliers[family]

        for node in my_tree.traverse():
            branch = node.name.split("_")[0]
            node.dist = node.dist * gf_multiplier * self.st_multipliers[branch]

        tree = pyvolve.read_tree(tree=my_tree.write(format=5),
                                 scale_tree=self.parameters["SCALING"])
        name_mapping = self.get_mapping_internal_names(tree, my_tree)
        partition = pyvolve.Partition(models=self.model, size=self.size)
        evolver = pyvolve.Evolver(tree=tree, partitions=partition)

        fasta_name = tree_file.split("/")[-1].replace("_completetree.nwk", "_") + "complete.fasta"
        fasta_path = os.path.join(sequences_folder, fasta_name)
        evolver(seqfile=fasta_path, ratefile=None, infofile=None, write_anc=True)

        # Correct the names
        self.correct_names(fasta_path, name_mapping)
Exemplo n.º 3
0
def evolve(newicks, sequence_size, scale_tree):
    """Simulate one nucleotide alignment per tree and concatenate them as PHYLIP.

    Args:
        newicks: sequence of Newick tree strings.
        sequence_size: number of sites to simulate for every tree.
        scale_tree: branch-length scaling factor passed to ``pyvolve.read_tree``.

    Returns:
        The name of the combined PHYLIP file, ``"temp_seq.phyl"``, written in
        the current working directory.  The per-tree FASTA and PHYLIP files
        are removed once they have been consumed.
    """
    phy_files = []
    my_model = pyvolve.Model("nucleotide")
    partition = pyvolve.Partition(models=my_model, size=sequence_size)
    for i, newick in enumerate(newicks):
        tree = pyvolve.read_tree(tree=newick, scale_tree=scale_tree)
        my_evolver = pyvolve.Evolver(tree=tree, partitions=partition)
        fasta_seqfile = "temp" + str(i) + ".fasta"
        phylip_seqfile = "temp" + str(i) + ".phyl"
        phy_files.append(phylip_seqfile)

        my_evolver(seqfile=fasta_seqfile, seqfmt="fasta", ratefile=None, infofile=None)
        fasta_to_phyl(fasta_seqfile, phylip_seqfile)

        # The FASTA file is only an intermediate for the PHYLIP conversion.
        os.remove(fasta_seqfile)

    phyl_output = "temp_seq.phyl"

    with open(phyl_output, 'w') as outfile:
        for fname in phy_files:
            with open(fname) as infile:
                outfile.write(infile.read())
                outfile.write("\n")
            os.remove(fname)

    return phyl_output
def get_random_tree(filename, tree_string, L, kappa):
	"""Simulate sequences on ``tree_string`` and summarise them as pi and theta.

	A root sequence is produced by ``generate_ancestor(L)`` (and printed),
	evolved along the tree under a nucleotide model with the given ``kappa``
	and uniform state frequencies, and the simulated sequences are passed to
	``pi_value`` and ``theta_value``.  ``filename`` is accepted but not used.

	Returns:
		dict with keys ``'pi'`` and ``'theta'``.
	"""
	phylogeny = pyvolve.read_tree(tree = tree_string)

	model_params = {'kappa': kappa, 'state_freqs': [0.25, 0.25, 0.25, 0.25]}
	nuc_model = pyvolve.Model('nucleotide', model_params)

	root_seq = generate_ancestor(L)
	print(root_seq)

	evolver = pyvolve.Evolver(
		partitions = pyvolve.Partition(models = nuc_model, root_sequence = root_seq),
		tree = phylogeny,
	)
	evolver()
	simulated_strains = evolver.get_sequences()

	summary = {}
	summary['pi'] = pi_value(simulated_strains)
	summary['theta'] = theta_value(simulated_strains)
	return summary


	
Exemplo n.º 5
0
def make_partition_set(cat_sizes, root_freq_set, model_set, model_assignment):
    """Build one ``pyvolve.Partition`` per site category.

    Args:
        cat_sizes: number of sites in each category.
        root_freq_set: per-category codon frequencies used to draw a random
            root sequence, or ``None`` to let Pyvolve pick the root sequence
            for a partition of the given size.
        model_set: per-category model (or list of models) for the partition.
        model_assignment: unused here; kept for interface compatibility.

    Returns:
        list of ``pyvolve.Partition`` objects, all with root model ``"bp0"``.
    """
    # Builtin zip replaces itertools.izip, which does not exist on Python 3.
    if root_freq_set is None:
        return [
            pyvolve.Partition(models=ms, size=nk, root_model_name="bp0")
            for (ms, nk) in zip(model_set, cat_sizes)
        ]

    root_seqs = [
        ''.join(np.random.choice(MOLECULES.codons, size=nk, p=freqs).flat)
        for (nk, freqs) in zip(cat_sizes, root_freq_set)
    ]
    return [
        pyvolve.Partition(models=ms,
                          root_sequence=root,
                          root_model_name="bp0")
        for (ms, root) in zip(model_set, root_seqs)
    ]
Exemplo n.º 6
0
def simulate_genomes(model, tree, asize, outdir, number):
    """Run one Pyvolve simulation and return the simulated sequences.

    Only the rate file is written to disk, as ``rate_<number>.fasta`` inside
    the value returned by ``mkdir(<outdir>/<number>)`` (presumably the created
    directory -- confirm against the helper).  No sequence or info file is
    produced.
    """
    run_dir = mkdir(os.path.join(outdir, str(number)))
    evolver = pyvolve.Evolver(
        tree=tree,
        partitions=pyvolve.Partition(models=model, size=asize),
    )
    rate_path = os.path.join(run_dir, "rate_{}.fasta".format(number))
    evolver(seqfile=None, ratefile=rate_path, infofile=None)
    return evolver.get_sequences()
Exemplo n.º 7
0
def get_random_tree(L, species, scaled_tree_string, kappa, iteration):
    """Simulate an alignment on a scaled tree and return its (pi, theta).

    The tree is parsed and printed, a root sequence is obtained from
    ``generate_ancestor(L)``, and sequences are evolved under a nucleotide
    model with the given ``kappa`` and uniform state frequencies.  The
    alignment is written to
    ``simulated_alignment_<species[:-1]>_universal_<iteration + 1>.fasta``;
    no rate or info file is written.  Progress messages go to stdout.

    Returns:
        tuple ``(pi, theta)`` computed by ``pi_value`` and ``theta_value``.
    """
    phylogeny = pyvolve.read_tree(tree=scaled_tree_string)
    print('read in the tree')
    pyvolve.print_tree(phylogeny)

    model_params = {'kappa': kappa, 'state_freqs': [0.25, 0.25, 0.25, 0.25]}
    nuc_model = pyvolve.Model('nucleotide', model_params)

    root_seq = generate_ancestor(L)
    print('generated an ancestor')

    evolver = pyvolve.Evolver(
        partitions=pyvolve.Partition(models=nuc_model, root_sequence=root_seq),
        tree=phylogeny,
    )
    alignment_name = "".join([
        "simulated_alignment_",
        str(species[:-1]),
        "_universal_",
        str(iteration + 1),
        ".fasta",
    ])
    evolver(ratefile=None, infofile=None, seqfile=alignment_name)
    print('evolved the sequences')

    simulated_strains = evolver.get_sequences()
    pi = pi_value(simulated_strains)
    theta = theta_value(simulated_strains)
    return pi, theta
Exemplo n.º 8
0
def simulate(f, seqfile, tree, mu_dict, length):
    """Simulate a single partition under a homogeneous mutation-selection model.

    Args:
        f: state frequencies passed to the model as ``state_freqs``.
        seqfile: path of the alignment file to write.
        tree: either a path to a Newick file or a Newick string; the file
            interpretation is tried first.
        mu_dict: mutation rates passed to the model as ``mu``.
        length: number of positions to simulate.

    No rate or info file is written.
    """
    try:
        my_tree = pyvolve.read_tree(file=tree)
    except Exception:
        # Not a readable tree file: treat the argument as a Newick string.
        # `Exception` (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate instead of being retried as a tree string.
        my_tree = pyvolve.read_tree(tree=tree)

    model = pyvolve.Model("MutSel", {'state_freqs': f, 'mu': mu_dict})

    part = pyvolve.Partition(size=length, models=model)
    e = pyvolve.Evolver(partitions=part, tree=my_tree)
    e(seqfile=seqfile, ratefile=None, infofile=None)
Exemplo n.º 9
0
def get_accurate_c(L, kappa):
    """Simulate sequences (with ancestors) on a fixed 6-tip tree and set up
    bookkeeping for counting convergent sites.

    A root sequence from ``generate_ancestor(L)`` is evolved along a
    hard-coded tree; tip and internal sequences are retrieved and per-strain /
    per-site containers are initialised.

    NOTE(review): the ``kappa`` argument is not used -- the model below is
    built with a hard-coded kappa value.  The visible body also ends right
    after the containers are initialised, so the counting step and the return
    value are not shown here.
    """

    ancestor = generate_ancestor(L)
    print(ancestor)

    # phylogeny = pyvolve.read_tree(tree = '(  (t1:0.5,t2:0.5)i1:0.5, (t3:0.5,t4:0.5)i2:0.5 ,  (t5:0.5,t6:0.5)i3:0.5, (t7:0.5,t8:0.5)i4:0.5  ) root;')
    phylogeny = pyvolve.read_tree(
        tree=
        '(  ((t7:0.5,t8:0.5)i4:0.5,(t5:0.5,t6:0.5)i3:0.5)i1:0.5, (t3:0.5,t4:0.5)i2:0.5  ) root;'
    )

    pyvolve.print_tree(phylogeny)

    # Uniform nucleotide frequencies.
    freqs = [0.25, 0.25, 0.25, 0.25]

    nuc_model = pyvolve.Model('nucleotide', {
        'kappa': 1.86836732388,
        'state_freqs': freqs
    })

    my_partition = pyvolve.Partition(models=nuc_model, root_sequence=ancestor)

    my_evolver = pyvolve.Evolver(partitions=my_partition, tree=phylogeny)
    # my_evolver()
    # write_anc=True also writes the ancestral (internal node) sequences.
    my_evolver(write_anc=True)
    # strains = my_evolver.get_sequences()
    strains = my_evolver.get_sequences(anc=True)
    strain_names = list(strains.keys())  # pre-order traversal of the tree
    n = len(strain_names)

    print(strain_names)

    # One (initially empty) list per strain, tips and internal nodes alike.
    c_sites = {}
    for key in strain_names:
        c_sites[key] = []

    site_counts = L * [
        None
    ]  # list of dictionaries to keep track of which nucleotides are at each convergent site; index = site; key = nucleotide, value = number of strains with that nucleotide
    strains_with_site = L * [
        None
    ]  # list of the strains that have a convergent mutation at each site; index = site
    # Give every site its own fresh dict / list (no shared references).
    for x in range(L):
        site_counts[x] = {'A': 0, 'T': 0, 'G': 0, 'C': 0}
        strains_with_site[x] = []
    # c_list_matrix = [[{} for x in range(n)] for y in range(n)] # matrix of the convergent mutation sites; the (i,j) entry is a dictionary of the convergent mutation sites between strain i and strain j; key = site, value = nucleotide

    c = 0
    strain_names
Exemplo n.º 10
0
def execute(tree, model, length, out, numSim):
    """Simulate ``numSim`` alignments on a tree under a named Pyvolve model.

    Args:
        tree: path to a file containing a Newick tree.
        model: model name passed to ``pyvolve.Model``; also embedded in the
            output file names.
        length: number of positions to simulate (converted with ``int``).
        out: output prefix; replicate ``i`` is written to
            ``<out>.<model>-<i>.fa``.
        numSim: number of replicates (converted with ``int``).
    """
    # read in model, tree, and define partition
    pyvolveModel = pyvolve.Model(model)
    pyvolveTree = pyvolve.read_tree(file=tree)
    pyvolvePartition = pyvolve.Partition(models=pyvolveModel, size=int(length))

    # create evolver
    my_evolver = pyvolve.Evolver(tree=pyvolveTree, partitions=pyvolvePartition)
    # Initial run with no arguments, kept from the original implementation;
    # it is written under Pyvolve's default output file names.
    my_evolver()

    print("Simulating sequences...")
    # create simulated sequences
    for i in range(int(numSim)):
        seqfile = str(out) + "." + str(model) + "-" + str(i) + ".fa"
        # Report the file that is actually written (the message used to omit
        # the model name and so named a file that was never created).
        print(seqfile)
        my_evolver(seqfile=seqfile)
Exemplo n.º 11
0
    def run(self, tree_file, sequences_folder):
        """Simulate sequences along the gene tree stored in ``tree_file``.

        The first line of the file is read as a Newick string.  If it
        contains no ``(`` the method returns ``None`` without simulating.
        Otherwise sequences (including ancestral ones) are evolved with
        ``self.model`` over ``self.size`` positions, written into
        ``sequences_folder``, and the names in the resulting FASTA file are
        corrected in place.
        """
        with open(tree_file) as handle:
            newick = handle.readline().strip()

        if "(" not in newick or newick == ";":
            return None
        my_tree = ete3.Tree(newick, format=1)

        tree = pyvolve.read_tree(tree=my_tree.write(format=5),
                                 scale_tree=self.parameters["SCALING"])
        name_mapping = self.get_mapping_internal_names(tree, my_tree)
        partition = pyvolve.Partition(models=self.model, size=self.size)
        evolver = pyvolve.Evolver(tree=tree, partitions=partition)

        fasta_name = tree_file.split("/")[-1].replace("_completetree.nwk", "_complete") + ".fasta"
        fasta_path = os.path.join(sequences_folder, fasta_name)
        evolver(seqfile=fasta_path, ratefile=None, infofile=None, write_anc=True)

        # Correct the names
        self.correct_names(fasta_path, name_mapping)
Exemplo n.º 12
0
    def simulate_single_sequence(self, name, gene_length, tree_file, sequences_folder):
        """Simulate a sequence for a single named tip.

        Sequences are evolved on a two-tip tree (``name`` and ``B``, both at
        distance 1), written to a FASTA file in ``sequences_folder`` (with
        ancestral sequences), and the file is then rewritten so that only the
        entries whose header, minus its first character, equals ``name``
        remain.
        """
        two_tip_newick = "(A:1,B:1);".replace("A", name)
        tree = pyvolve.read_tree(tree=two_tip_newick)
        partition = pyvolve.Partition(models=self.model, size=gene_length)
        evolver = pyvolve.Evolver(tree=tree, partitions=partition)

        fasta_name = tree_file.split("/")[-1].replace("_completetree.nwk", "_complete") + ".fasta"
        fasta_path = os.path.join(sequences_folder, fasta_name)
        evolver(seqfile=fasta_path, ratefile=None, infofile=None, write_anc=True)

        # Select single sequence
        kept = [
            (header, seq)
            for header, seq in af.fasta_reader(fasta_path)
            if header[1:] == name
        ]
        af.fasta_writer(fasta_path, kept)
Exemplo n.º 13
0
def exampleFastaGenerator(nwkFile, fastaOutputLocation, seqLength, rate=1):
    """Simulate a nucleotide alignment on the tree in ``nwkFile``.

    Args:
        nwkFile: path to a Newick tree file.
        fastaOutputLocation: path of the alignment file to write.
        seqLength: number of positions to simulate.
        rate: mutation rate applied to every nucleotide pair (default 1).

    No rate or info file is written.
    """
    # The unused tree-name computation was removed: it called
    # nwkFile.rindex('/'), which raises ValueError for a path with no slash.
    phylogony = pyvolve.read_tree(file=nwkFile)
    # Rates: the same rate for every unordered nucleotide pair.
    mutationRates = dict.fromkeys(("AC", "AG", "AT", "CG", "CT", "GT"), rate)
    # Model.
    model = pyvolve.Model("nucleotide", {"mu": mutationRates})
    partition = pyvolve.Partition(models=model, size=seqLength)
    # Evolver.
    evolver = pyvolve.Evolver(partitions=[partition], tree=phylogony)
    evolver(seqfile=fastaOutputLocation, ratefile=None, infofile=None)
Exemplo n.º 14
0
def evolve_nonconvergent_partition(g):
    """Simulate the non-convergent codon sites under the background Q matrix.

    Reads ``num_simulated_site``, ``num_convergent_site``, ``background_Q``
    and ``background_tree`` from ``g``, prints the site range being
    simulated, and writes rate, info and sequence files named
    ``tmp.csubst.simulate_nonconvergent*`` (no ancestral sequences).
    """
    n_convergent = g['num_convergent_site']
    n_simulated = g['num_simulated_site']
    # NOTE(review): with convergent sites present the printed start is
    # total - convergent + 1, not convergent + 1 -- confirm this is intended.
    site_start = 1 if n_convergent == 0 else n_simulated - n_convergent + 1
    site_end = n_simulated
    print('Codon site {}-{}; Non-convergent codons'.format(
        site_start, site_end))

    # Shallow copy of the background matrix for the model.
    q_matrix = copy.copy(g['background_Q'])
    with suppress_stdout_stderr():
        model = pyvolve.Model(model_type='custom',
                              name='root',
                              parameters={'matrix': q_matrix})
    partition = pyvolve.Partition(models=model,
                                  size=n_simulated - n_convergent)
    evolver = pyvolve.Evolver(partitions=partition, tree=g['background_tree'])
    output_files = {
        'ratefile': 'tmp.csubst.simulate_nonconvergent_ratefile.txt',
        'infofile': 'tmp.csubst.simulate_nonconvergent_infofile.txt',
        'seqfile': 'tmp.csubst.simulate_nonconvergent.fa',
    }
    evolver(write_anc=False, **output_files)
Exemplo n.º 15
0
def generateTree(tns, ntaxa, seqlen):
    """Simulate a pure-birth tree and evolve nucleotide sequences along it.

    Args:
        tns: dendropy taxon namespace used for the tree and the alignment.
        ntaxa: number of extant tips in the simulated tree.
        seqlen: number of nucleotide positions to simulate.

    Returns:
        The simulated dendropy tree.  The evolved sequences are written to
        ``evolvedsequences.fasta`` in the current working directory.
    """
    import os
    import tempfile

    # Construct the tree (birth rate 1.0, no extinction).
    t = dendropy.simulate.treesim.birth_death_tree(birth_rate=1.0, death_rate=0, taxon_namespace=tns, num_extant_tips=ntaxa)

    # Set pyvolve data type
    m1 = pyvolve.Model("nucleotide")
    p1 = pyvolve.Partition(models=m1, size=seqlen)

    # Hand the tree to pyvolve through a uniquely named temporary Newick
    # file.  This replaces the fixed path '/tmp/pyvt', which collides between
    # concurrent runs and does not exist on every platform.
    fd, newick_path = tempfile.mkstemp(prefix='pyvt', suffix='.nwk')
    os.close(fd)
    try:
        t.write(path=newick_path, schema='newick', suppress_rooting=True, suppress_internal_node_labels=True)
        pot = pyvolve.read_tree(file=newick_path)
    finally:
        os.remove(newick_path)

    # Simulate evolution without a sequence file (seqfile=None).
    e1 = pyvolve.Evolver(tree=pot, partitions=p1)
    e1(seqfile=None)

    seqs = e1.get_sequences()

    ds = dendropy.DnaCharacterMatrix.from_dict(seqs, taxon_namespace=tns)
    ds.write(path="evolvedsequences.fasta", schema="fasta")
    return t
Exemplo n.º 16
0
def simulate(tree_index, length):
    """Simulate codon sequences on one of three predefined trees.

    Args:
        tree_index: index into ("alpha", "beta", "charlie"); the tree is read
            from ``trees/<name>.tre``.
        length: number of positions to evolve (for a codon model each
            position is one codon).

    Returns:
        list of the simulated sequences, as returned by the evolver.
    """
    tree_name = ("alpha", "beta", "charlie")[tree_index]
    my_tree = pyvolve.read_tree(file="trees/" + tree_name + ".tre")

    # MG codon model; beta / alpha corresponds to dN/dS = 0.65 / 0.98.
    codon_model = pyvolve.Model("MG", {"beta": 0.65, "alpha": 0.98})
    partition = pyvolve.Partition(models=codon_model, size=length)

    # Evolve without writing rate or info files.
    evolver = pyvolve.Evolver(partitions=partition,
                              tree=my_tree,
                              ratefile=None,
                              infofile=None)
    evolver(ratefile=None, infofile=None)

    # Extract the sequences
    return list(evolver.get_sequences().values())
Exemplo n.º 17
0
import sys, os
import pyvolve
import glob
from mungo.fasta import FastaReader
from collections import defaultdict
# Command-line arguments:
#   1) FASTA file used to estimate amino-acid state frequencies
#   2) file containing the tree to simulate along
#   3) output FASTA file for the simulated sequences
input_fasta = sys.argv[1]
input_tree_txt = sys.argv[2]
output_seqs = sys.argv[3]

# Estimate amino-acid frequencies from the input alignment.
f = pyvolve.ReadFrequencies("amino_acid", file=input_fasta)
frequencies = f.compute_frequencies()
# Branch lengths are scaled by 0.5 when the tree is read.
my_tree_1 = pyvolve.read_tree(file=input_tree_txt, scale_tree=0.5)
my_model_1 = pyvolve.Model("MTMAM", {"state_freqs": frequencies})
# Simulate 200 positions; no rate or info file is written.
my_partition_1 = pyvolve.Partition(models=my_model_1, size=200)
my_evolver_1 = pyvolve.Evolver(partitions=my_partition_1, tree=my_tree_1)
my_evolver_1(ratefile=None, infofile=None, seqfile=output_seqs)

# Read the simulated sequences back, discarding the original headers and
# renaming them seq0, seq1, ... in file order.
seqs = {}
seq_list = []
count = 0
for h, s in FastaReader(output_seqs):
    seqs["seq" + str(count)] = s
    seq_list.append("seq" + str(count))
    count += 1
# Overwrite the output file with the renamed sequences.
with open(output_seqs, 'w') as outfile:
    for s in seq_list:
        outfile.write(">" + s + "\n" + seqs[s] + "\n")
Exemplo n.º 18
0
# Define a phylogeny, from a file containing a newick tree
my_tree = pyvolve.read_tree(file="file_with_tree.tre")

# Define the substitution model as a pyvolve.Model object. The examples below use the WAG amino-acid model with default parameters; see the example script custom_aminoacid.py for other options.

# To implement rate heterogeneity, do either of these:
## 1) Custom rates: Provide a list of rate_factors when defining a Model object. These rate factors will be assigned to sites with equal probability by default. To change this, provide probabilities with the argument `rate_probs`.
## 2) Gamma rates: Provide the keyword arguments num_categories and alpha when defining a Model object. <num_categories> rates will be drawn from a gamma distribution with shape and scale parameter each equal to <alpha>. These rates will be equiprobable, unless overridden by `rate_probs`.

# Several model definitions are shown below (first argument can be a different model, as desired).

# custom rates
my_model1 = pyvolve.Model(
    "WAG", rate_factors=[0.3, 0.8, 1.5,
                         2.45])  # 25% of sites will have each factor.
my_model2 = pyvolve.Model(
    "WAG",
    rate_factors=[0.3, 0.8, 1.5, 2.45],
    rate_probs=[0.7, 0.2, 0.05, 0.05]
)  # 70% of sites evolve with rate of 0.3, 20% with a rate of 0.8, 5% with a rate of 1.5, and 5% with a rate of 2.45

# gamma rates
my_model3 = pyvolve.Model("WAG", alpha=0.6, num_categories=5)

# Assign a model to a pyvolve.Partition (my_model2 is the one actually used here). The size argument indicates to evolve 250 positions
my_partition = pyvolve.Partition(models=my_model2, size=250)

# Evolve! Called without arguments, so output goes to Pyvolve's default file names.
my_evolver = pyvolve.Evolver(partitions=my_partition, tree=my_tree)
my_evolver()
Exemplo n.º 19
0
@author: david
"""
import pyvolve

"User defined params"
# Simulation settings: tree scaling factor, uniform nucleotide frequencies,
# alignment length, and the kappa parameter of the nucleotide model.
mut_rate = 0.005
freqs = [0.25, 0.25, 0.25, 0.25]
seq_length = 1000
kappa = 2.75

"Read in phylogeny along which Pyvolve should simulate"
"Scale_tree sets absolute mutation rate"
# Branch lengths in AMR-sim.tre are scaled by mut_rate when the tree is read.
my_tree = pyvolve.read_tree(file = "AMR-sim.tre", scale_tree = mut_rate)
#pyvolve.print_tree(my_tree) # Print the parsed phylogeny

"Specify nucleotide substitution model with custom rates"
# Alternative (disabled): fully custom pairwise mutation rates.
#custom_mu = {"AC":0.5, "AG":0.25, "AT":1.23, "CG":0.55, "CT":1.22, "GT":0.47}
#nuc_model = pyvolve.Model( "nucleotide", {"mu":custom_mu, "state_freqs":freqs} )

"Or just use an HKY model with kappa"
nuc_model = pyvolve.Model( "nucleotide", {"kappa":kappa, "state_freqs":freqs})

"Define a Partition object which evolves set # of positions according to my_model"
my_partition = pyvolve.Partition(models = nuc_model, size = seq_length)
#my_partition = pyvolve.Partition(models = nuc_model, root_sequence = "GATAGAAC") # Or with a root seq

"Define an Evolver instance to evolve a single partition"
my_evolver = pyvolve.Evolver(partitions = my_partition, tree = my_tree) 

"Evolve sequences with custom file names"
# Writes the rate file, info file and alignment under the given names.
my_evolver(ratefile = "AMR_ratefile.txt", infofile = "AMR_infofile.txt", seqfile = "AMR-seqsim.fasta" )
#!/bin/python3

import pyvolve ; import sys

# Command-line arguments:
#   1) file containing the tree to simulate along
#   2) ancestral sequence placed at the root of the tree
#   3) Pyvolve model name
#   4) omega value for the model (converted to float)
tree_variable=sys.argv[1]
anc_seq_variable=sys.argv[2]
model_type=sys.argv[3]
omega_value=float(sys.argv[4])



# Simulation:

my_tree = pyvolve.read_tree(file = tree_variable)

my_model = pyvolve.Model(model_type, {"omega": omega_value })

# The partition length is determined by the supplied root sequence.
my_partition = pyvolve.Partition(models = my_model, root_sequence = anc_seq_variable)

my_evolver = pyvolve.Evolver(tree = my_tree, partitions= my_partition)
# Called without arguments, so output goes to Pyvolve's default file names.
my_evolver()   

#pyvolve.print_tree(tree_variable)
Exemplo n.º 21
0
def main():
    """Simulate a codon alignment from site-specific amino-acid preferences.

    Steps, all driven by the parsed command-line arguments:

    1. Read model parameters (``modelparams``) and amino-acid preferences
       (``prefs``), and the tree (``tree``).
    2. For every site build a custom codon rate matrix: only
       single-nucleotide changes get a non-zero rate, equal to the mutation
       term (``kappa`` * ``phi`` of the target nucleotide for transitions,
       ``phi`` alone for transversions) times a fixation term derived from
       the preferences, times ``scalerate``.  Sites listed in
       ``diversifyingsitesA`` / ``diversifyingsitesB`` use the matching
       ``diversifyingomega`` value; all other sites use omega = 1.
    3. Evolve one single-site partition per site along the tree and write
       the alignment (``simulatedalignment``) plus info and rate files.
    4. Rewrite the alignment keeping only the first occurrence of each
       distinct sequence.
    """
    # A single Genetics instance provides all the lookup tables.
    genetics = pyvolve.genetics.Genetics()
    codons = genetics.codons
    codon_dict = genetics.codon_dict
    purines = genetics.purines

    args = vars(ParseArguments().parse_args())
    print("Read the following command line arguments:")
    print("\n\t{0}".format("\n\t".join(
        ["{0} = {1}".format(key, value) for (key, value) in args.items()])))

    print("\nPerforming simulation with pyvolve version {0}".format(
        pyvolve.__version__))

    print("\nReading model params from {0}".format(args['modelparams']))
    params = ReadParams(args['modelparams'])
    for (param, paramvalue) in params.items():
        print("The value of {0} is {1}".format(param, paramvalue))

    print("\nReading preferences from {0}".format(args['prefs']))
    tup = dms_tools.file_io.ReadPreferences(args['prefs'])
    (sites, pis) = (tup[0], tup[2])
    print("\nRead amino-acid preferences for {0} sites".format(len(pis)))

    tree = pyvolve.read_tree(file=args['tree'])

    # create models for simulation
    partitions = []
    for r in sites:
        # The former Python 2 `print r, omega` statements are written as a
        # single formatted string so the output is "r omega" on both
        # Python 2 and Python 3.
        if params['diversifyingsitesA'] and (int(r)
                                             in params['diversifyingsitesA']):
            omega = params['diversifyingomegaA']
            print("{0} {1}".format(r, omega))
        elif params['diversifyingsitesB'] and (
                int(r) in params['diversifyingsitesB']):
            omega = params['diversifyingomegaB']
            print("{0} {1}".format(r, omega))
        else:
            omega = 1.0
        matrix = []  # matrix[x][y] is rate of substitution from x to y
        for (xi, x) in enumerate(codons):
            row = []
            for y in codons:
                ntdiffs = [(x[j], y[j]) for j in range(3) if x[j] != y[j]]
                if len(ntdiffs) == 0:
                    assert x == y
                    row.append(
                        0)  # will later be adjusted to make row sum to zero
                elif len(ntdiffs) > 1:
                    # multi-nucleotide codon change
                    row.append(0)
                else:
                    # single nucleotide change
                    (xnt, ynt) = ntdiffs[0]
                    if (xnt in purines) == (ynt in purines):
                        # transition
                        qxy = params['kappa'] * params['phi{0}'.format(ynt)]
                    else:
                        # transversion
                        qxy = params['phi{0}'.format(ynt)]
                    (xaa, yaa) = (codon_dict[x], codon_dict[y])
                    if xaa == yaa:
                        # synonymous change
                        fxy = 1.0
                    else:
                        pix = pis[r][xaa]**params['stringencyparameter']
                        piy = pis[r][yaa]**params['stringencyparameter']
                        if abs(pix - piy) < 1e-6:
                            # (near-)equal preferences: avoid 0/0 in the
                            # expression below and use omega directly
                            fxy = omega
                        else:
                            fxy = omega * math.log(
                                piy / pix) / (1.0 - pix / piy)
                    row.append(qxy * fxy * params['scalerate'])
            assert len(row) == len(codons)
            # Diagonal entry makes the row sum to zero.
            row[xi] = -sum(row)
            matrix.append(row)
        model = pyvolve.Model("custom", {"matrix": matrix})
        partitions.append(pyvolve.Partition(models=model, size=1))

    print("\nSimulating evolution, writing to {0}...".format(
        args['simulatedalignment']))
    basename = os.path.splitext(args['simulatedalignment'])[0]
    evolver = pyvolve.Evolver(partitions=partitions, tree=tree)
    evolver(
        seqfile=args['simulatedalignment'],
        infofile='{0}_infofile.txt'.format(basename),
        ratefile='{0}_ratefile.txt'.format(basename),
    )
    print("Finished simulation")

    # Keep only the first record of every distinct sequence.
    uniqueseqs = set()
    uniquealignment = []
    ninitial = 0
    for seq in Bio.SeqIO.parse(args['simulatedalignment'], 'fasta'):
        ninitial += 1
        seqstr = str(seq.seq)
        if seqstr not in uniqueseqs:
            uniqueseqs.add(seqstr)
            uniquealignment.append(seq)
    print(
        "\nAfter removing redundant sequences, we have shrunk {0} from {1} to {2} sequences"
        .format(args['simulatedalignment'], ninitial, len(uniquealignment)))
    Bio.SeqIO.write(uniquealignment, args['simulatedalignment'], 'fasta')
Exemplo n.º 22
0
# This example script demonstrates how to evolve according to custom model with custom code

import pyvolve
import numpy as np

# Define a phylogeny, from a file containing a newick tree
my_tree = pyvolve.read_tree(file="file_with_tree.tre")

# Define a custom model with custom matrix and custom code (states). The matrix must be square and have the same dimension (in 1D) as the provided code. Note that code is a list because, in theory, you can specify multi-character (as in letters) states.
# Here: three states with equal off-diagonal rates; every row sums to zero.
matrix = np.array([[-0.5, 0.25, 0.25], [0.25, -0.5, 0.25], [0.25, 0.25, -0.5]])
code = ["0", "1", "2"]
my_model = pyvolve.Model("custom", {"matrix": matrix, "code": code})

# Evolve a single position under the custom model.
my_partition = pyvolve.Partition(models=my_model, size=1)

# Called without arguments, so output goes to Pyvolve's default file names.
my_evolver = pyvolve.Evolver(partitions=my_partition, tree=my_tree)
my_evolver()
Exemplo n.º 23
0
# Normalize the site-category probabilities so that they sum to one.
# NOTE(review): `sum` shadows the builtin from here on, and a total of zero
# would raise ZeroDivisionError below.
sum = 0.0
for i in categoryProbs:
    sum += i
for i in range(nCat):
    categoryProbs[i] = categoryProbs[i] / sum
# Only report the normalization when the original total was noticeably off 1.
if sum > 1.000001 or sum < 0.999999:
    print(
        "\n Normalizing probabilities of site categories. New probabilities:")
    print(categoryProbs)

#run pyvolve

print("Starting pyvolve timer")
import pyvolve
start = time.time()
pyvolveTree = pyvolve.read_tree(file=pathSimu + treeFile,
                                scale_tree=args.scale)
#pyvolveTree = pyvolve.read_tree(tree = tString2, scale_tree = args.scale)
# NOTE(review): within this block the model gets gamma rate heterogeneity
# (alpha=0.5) with one category per entry of categoryRates; the normalized
# categoryProbs are not passed to it here -- confirm that is intended.
nucModel = pyvolve.Model("custom", {"matrix": mutMatrix},
                         alpha=0.5,
                         num_categories=len(categoryRates))
# The reference sequence is used as the root sequence of the simulation.
partitions = pyvolve.Partition(models=nucModel, root_sequence=ref)
my_evolver = pyvolve.Evolver(tree=pyvolveTree, partitions=partitions)
my_evolver(seqfile=pathSimu + outputFile,
           algorithm=1)  # Algorithm = 1 uses the Gillespie algorithm.
# Elapsed wall-clock time of the Pyvolve run, in seconds.
time2 = time.time() - start
print("Pyvolve timer ended")
print(time2)

# Stop the script here.
exit()
# This example script demonstrates how to evolve according to a nucleotide model with several partitions.
# In this example, the first partition has gamma-distributedsitewise rate heterogeneity, the second partition is homogenous, and the third partition has custom sitewise rate heterogeneity.
# All models use default mutation-rate parameters

import pyvolve

# Define a phylogeny, from a file containing a newick tree
my_tree = pyvolve.read_tree(file="file_with_tree.tre")

# Define first model and partition (gamma-distributed rates, 4 categories). This partition has a length of 50 positions
model1 = pyvolve.Model("nucleotide", alpha=0.7, num_categories=4)
part1 = pyvolve.Partition(models=model1, size=50)

# Define second model and partition (no rate heterogeneity). This partition has a length of 20 positions
model2 = pyvolve.Model("nucleotide")
part2 = pyvolve.Partition(models=model2, size=20)

# Define third model and partition (custom rate factors and probabilities). This partition has a length of 100 positions
model3 = pyvolve.Model("nucleotide",
                       rate_factors=[0.5, 1.6, 4.1],
                       rate_probs=[0.75, 0.2, 0.05])
part3 = pyvolve.Partition(models=model3, size=100)

# Provide all partitions *in the order in which they should be evolved* to Evolver and evolve
my_evolver = pyvolve.Evolver(partitions=[part1, part2, part3], tree=my_tree)
my_evolver()
Exemplo n.º 25
0
def evolve_convergent_partitions(g):
    """Simulate the convergent codon sites on the foreground tree with Pyvolve.

    For each of ``g['num_convergent_site']`` sites a one-codon partition is
    built. Every partition holds a ``'root'`` model based on
    ``g['background_Q']`` and one model per foreground lineage (``'m1'``,
    ``'m2'``, ...) based on the Q matrix returned by
    ``apply_percent_biased_sub`` for the amino acids returned by
    ``get_biased_amino_acids``. Per-site and overall summaries of the share
    of nonsynonymous rate directed at the biased amino acids are printed.

    Args:
        g: Parameter dictionary. Keys read here: ``tree``,
            ``num_convergent_site``, ``num_simulated_site``,
            ``convergent_amino_acids``, ``codon_table``,
            ``pyvolve_codon_orders``, ``background_Q``,
            ``percent_biased_sub``, ``all_nsy_cdn_index``,
            ``all_syn_cdn_index``, ``foreground_omega`` and
            ``foreground_tree``.

    Returns:
        None. The evolver is asked to write
        ``tmp.csubst.simulate_convergent_ratefile.txt``,
        ``tmp.csubst.simulate_convergent_infofile.txt`` and
        ``tmp.csubst.simulate_convergent.fa`` (with ``write_anc=False``).
    """
    n_lineages = foreground.get_num_foreground_lineages(tree=g['tree'])
    lineage_names = ['m' + str(k + 1) for k in range(n_lineages)]
    model_names = ['root'] + lineage_names

    per_site_template = (
        'Total in Q toward the codons before and after the bias introduction = '
        '{:,.1f}% ({:,.1f}/{:,.1f}) and {:,.1f}% ({:,.1f}/{:,.1f})')

    site_partitions = []
    biased_fractions = []
    site_indices = numpy.arange(g['num_convergent_site'])
    for site_number, _ in enumerate(site_indices, start=1):
        # The helper calls below keep their original order; some of them may
        # be stochastic, so reordering could change the simulated outcome.
        aas = get_biased_amino_acids(g['convergent_amino_acids'],
                                     g['codon_table'])
        header = 'Codon site {}; Biased amino acids = {}; '.format(
            site_number, ''.join(aas))
        print(header, end='')

        target_index = get_biased_nonsynonymous_substitution_index(
            aas, g['codon_table'], g['pyvolve_codon_orders'])
        site_Q = apply_percent_biased_sub(
            mat=g['background_Q'],
            percent_biased_sub=g['percent_biased_sub'],
            target_index=target_index,
            biased_aas=aas,
            codon_table=g['codon_table'],
            codon_orders=g['pyvolve_codon_orders'],
            all_nsy_cdn_index=g['all_nsy_cdn_index'],
            all_syn_cdn_index=g['all_syn_cdn_index'],
            foreground_omega=g['foreground_omega'],
        )

        # Biased share of the nonsynonymous rate after the bias is applied ...
        nsy_after = get_total_Q(site_Q, g['all_nsy_cdn_index'])
        biased_after = get_total_biased_Q(site_Q, aas, g['codon_table'],
                                          g['pyvolve_codon_orders'])
        share_after = biased_after / nsy_after
        # ... and the same quantity for the untouched background matrix.
        nsy_before = get_total_Q(g['background_Q'], g['all_nsy_cdn_index'])
        biased_before = get_total_biased_Q(g['background_Q'], aas,
                                           g['codon_table'],
                                           g['pyvolve_codon_orders'])
        share_before = biased_before / nsy_before
        print(per_site_template.format(share_before * 100, biased_before,
                                       nsy_before, share_after * 100,
                                       biased_after, nsy_after))
        biased_fractions.append(share_after)

        site_models = []
        for name in model_names:
            # The root keeps the background matrix; every foreground model
            # gets its own shallow copy of the biased matrix.
            source_Q = g['background_Q'] if name == 'root' else site_Q
            q_copy = copy.copy(source_Q)
            with suppress_stdout_stderr():
                site_model = pyvolve.Model(model_type='custom',
                                           name=name,
                                           parameters={'matrix': q_copy})
            site_models.append(site_model)
        site_partitions.append(
            pyvolve.Partition(models=site_models,
                              size=1,
                              root_model_name='root'))

    if biased_fractions:
        mean_fraction = numpy.array(biased_fractions).mean()
    else:
        mean_fraction = 0

    all_sites_template = (
        '{:,.2f}% of substitutions in {} sites in the foreground branches are '
        'expected to result from the introduced bias in Q matrix.')
    convergent_share = g['num_convergent_site'] / g['num_simulated_site']
    print(all_sites_template.format(mean_fraction * convergent_share * 100,
                                    g['num_simulated_site']))

    convergent_sites_template = (
        '{:,.2f}% of substitutions in {} convergent sites in the foreground branches are '
        'expected to result from the introduced bias in Q matrix.')
    print(convergent_sites_template.format(mean_fraction * 100,
                                           g['num_convergent_site']))

    evolver = pyvolve.Evolver(partitions=site_partitions,
                              tree=g['foreground_tree'])
    evolver(ratefile='tmp.csubst.simulate_convergent_ratefile.txt',
            infofile='tmp.csubst.simulate_convergent_infofile.txt',
            seqfile='tmp.csubst.simulate_convergent.fa',
            write_anc=False)
Exemplo n.º 26
0
def pyvolvePartitions(model, divselection=None):
    """Get list of `pyvolve` partitions for `model`.

    One single-codon partition is built per site of `model`, each with a
    custom codon rate matrix. Only codon pairs that differ at exactly one
    nucleotide get a non-zero rate; each diagonal entry is set to minus the
    sum of its row.

    Args:
        `model` (`phydmslib.models.Models` object)
            The model used for the simulations. Currently only
            certain `Models` are supported (e.g., `YNGKP`,
            `ExpCM`)
        `divselection` (`None` or 2-tuple `(divomega, divsites)`)
            Set this option if you want to simulate a subset of sites
            as under diversifying selection (e.g., an `omega` different
            than that used by `model`). In this case, `divomega` is
            the omega for this subset of sites, and `divsites` is a list
            of the sites in 1, 2, ... numbering.

    Returns:
        `partitions` (`list` of `pyvolve.Partition` objects)
            Can be fed into `pyvolve.Evolver` to simulate evolution.

    Raises:
        AssertionError: if an entry of `divsites` is outside
            `1 <= r <= model.nsites`.
        ValueError: if `type(model)` is not one of the handled model types.
    """
    # Local import: `scipy.zeros` was only a deprecated NumPy alias in the
    # SciPy root namespace, so the array is allocated with NumPy directly.
    import numpy

    # A single Genetics instance provides all the lookup tables needed here.
    genetics = pyvolve.genetics.Genetics()
    codons = genetics.codons
    codon_dict = genetics.codon_dict
    purines = genetics.purines

    if divselection:
        (divomega, divsites) = divselection
    else:
        # `divomega` is never read when `divsites` is empty.
        divomega = None
        divsites = []

    assert all([1 <= r <= model.nsites for r in divsites])

    # Exact type comparisons are kept on purpose: switching to `isinstance`
    # would change which branch a subclass of these models takes.
    model_type = type(model)
    expcm_types = [
        phydmslib.models.ExpCM,
        phydmslib.models.ExpCM_empirical_phi,
        phydmslib.models.ExpCM_empirical_phi_divpressure,
    ]

    ncodons = len(codons)
    partitions = []
    for r in range(model.nsites):
        matrix = numpy.zeros((ncodons, ncodons), dtype='float')
        for (xi, x) in enumerate(codons):
            for (yi, y) in enumerate(codons):
                ntdiffs = [(x[j], y[j]) for j in range(3) if x[j] != y[j]]
                if len(ntdiffs) != 1:
                    # Identical codons and multi-nucleotide changes keep rate 0.
                    continue
                (xnt, ynt) = ntdiffs[0]
                qxy = 1.0
                # Both purines or both non-purines: a transition, scaled by kappa.
                if (xnt in purines) == (ynt in purines):
                    qxy *= model.kappa
                (xaa, yaa) = (codon_dict[x], codon_dict[y])
                fxy = 1.0
                if xaa != yaa:
                    # Nonsynonymous change: apply the appropriate omega.
                    if (model_type ==
                            phydmslib.models.ExpCM_empirical_phi_divpressure):
                        fxy *= model.omega * (
                            1 + model.omega2 * model.deltar[r])
                    elif r + 1 in divsites:
                        fxy *= divomega
                    else:
                        fxy *= model.omega
                if model_type in expcm_types:
                    qxy *= model.phi[NT_TO_INDEX[ynt]]
                    pix = model.pi[r][AA_TO_INDEX[xaa]]**model.beta
                    piy = model.pi[r][AA_TO_INDEX[yaa]]**model.beta
                    # Skip the ratio term when the two values are (almost)
                    # equal, where the expression would be 0/0.
                    if abs(pix - piy) > ALMOST_ZERO:
                        fxy *= math.log(piy / pix) / (1.0 - pix / piy)
                elif model_type == phydmslib.models.YNGKP_M0:
                    for p in range(3):
                        qxy *= model.phi[p][NT_TO_INDEX[y[p]]]
                else:
                    raise ValueError("Can't handle model type {0}".format(
                        type(model)))
                matrix[xi][yi] = model.mu * qxy * fxy
            # The diagonal is still 0 here, so this is minus the off-diagonal sum.
            matrix[xi][xi] = -matrix[xi].sum()

        # create model in way that captures annoying print statements in pyvolve
        old_stdout = sys.stdout
        with open(os.devnull, 'w') as devnull:
            sys.stdout = devnull
            try:
                m = pyvolve.Model("custom", {"matrix": matrix})
            finally:
                # Always restore the real stdout, even if pyvolve raises.
                sys.stdout = old_stdout
        partitions.append(pyvolve.Partition(models=m, size=1))

    return partitions
Exemplo n.º 27
0
def get_c(L, kappa):
    """Simulate four strains from one ancestor and count convergent mutations.

    An ancestor of length `L` is obtained from `generate_ancestor`, evolved
    with Pyvolve along a fixed four-tip tree under a nucleotide model with
    equal state frequencies, and the tip sequences are compared with the
    ancestor. A (site, nucleotide) pair counts as one convergent mutation
    when at least two strains carry that nucleotide at that site and it
    differs from the ancestral nucleotide.

    Two defects of the earlier implementation are fixed here:
      * `kappa` was ignored in favour of a hard-coded 1.86836732388; callers
        that relied on that value must now pass it explicitly.
      * strains were de-duplicated by comparing their sequences, so two
        strains with identical sequences were counted as one and their
        shared mutations were missed. Every strain is now counted once.

    Args:
        L: Number of sites in the ancestor (passed to `generate_ancestor`).
        kappa: Value of the `kappa` parameter of the Pyvolve nucleotide model.

    Returns:
        The number of convergent (site, nucleotide) pairs, as an int.

    Side effects:
        Prints the ancestor, the tree and the result, and calls the evolver
        with its default arguments.
    """
    ancestor = generate_ancestor(L)
    print(ancestor)

    phylogeny = pyvolve.read_tree(
        tree='((t1:0.5,t2:0.5)i1:0.5,(t3:0.5,t4:0.5)i2:0.5)root;')
    pyvolve.print_tree(phylogeny)

    nuc_model = pyvolve.Model('nucleotide', {
        'kappa': kappa,
        'state_freqs': [0.25, 0.25, 0.25, 0.25],
    })
    my_partition = pyvolve.Partition(models=nuc_model, root_sequence=ancestor)
    my_evolver = pyvolve.Evolver(partitions=my_partition, tree=phylogeny)
    my_evolver()
    strains = my_evolver.get_sequences()
    sequences = list(strains.values())

    c = 0
    for site in range(L):
        ancestral_base = ancestor[site]
        # Number of strains carrying each non-ancestral nucleotide at this site.
        derived_counts = {}
        for sequence in sequences:
            base = sequence[site]
            if base != ancestral_base:
                derived_counts[base] = derived_counts.get(base, 0) + 1
        # A nucleotide shared by q >= 2 strains is one convergent mutation.
        c += sum(1 for count in derived_counts.values() if count >= 2)

    print(c)
    return c
Exemplo n.º 28
0
def main(strain, seedFilepath, gffFilepath):
    """Simulate evolved genomes from a seed genome and write them as FASTA.

    The seed sequence is split into alternating non-coding and coding
    regions using the GFF annotation. Non-coding regions are copied
    unchanged. Coding regions are evolved with Pyvolve under the "ECMrest"
    model along the tree in ``tmp.tree``, with the first and last three
    bases (start and stop codon) held fixed. The regions are then
    concatenated per simulated genome and written to
    ``../data/dnaseq/<strain>_sim<i>.dnaseq``.

    Args:
        strain: Name used as the prefix of the output genome ids.
        seedFilepath: FASTA file; only its first record is used as the seed.
        gffFilepath: Annotation file read with ``read_gff``; rows must
            provide ``start``, ``end`` and ``strand``.

    Raises:
        ValueError: if the seed FASTA file contains no record.
        AssertionError: if the number of evolved records for a region
            differs from the number of simulated genomes.
    """
    # Only the first record of the seed file is used.
    seedRec = next(iter(SeqIO.parse(seedFilepath, "fasta")), None)
    if seedRec is None:
        raise ValueError(
            "no FASTA record found in {}".format(seedFilepath))
    gff_df = read_gff(gffFilepath)

    # Build the ordered list of (category, start, end, strand) regions:
    # "nc" = non-coding gap before/after a feature, "c" = coding feature.
    prv = 0
    pos_lst = []
    for _, row in gff_df.iterrows():
        pos_lst.append(("nc", prv, row["start"] - 1, "+"))
        pos_lst.append(("c", row["start"] - 1, row["end"], row["strand"]))
        prv = row["end"]
    pos_lst.append(("nc", prv, len(seedRec), "+"))

    # configuration for evolution
    treeFilepath = "tmp.tree"
    mytree = pyvolve.read_tree(file=treeFilepath)
    ncm = pyvolve.Model("nucleotide")  # non-coding model (currently unused)
    cm = pyvolve.Model("ECMrest")  # coding model

    num_genomes = 4  # assuming tree has 4 nodes
    outputSeq_lst = [Seq("") for _ in range(num_genomes)]
    for category, start, end, strand in pos_lst:
        # get rootSeq according to start, end, strand info
        rootSeq = seedRec.seq[start:end]
        if strand == "-":
            rootSeq = rootSeq.reverse_complement()
        rootSeq = str(rootSeq)

        # get simulated sequences
        if category == "nc":
            # Evolution of non-coding regions is disabled: every simulated
            # genome receives an unchanged copy of the seed region.
            rec_lst = [SeqRecord(Seq(rootSeq)) for _ in range(num_genomes)]
        elif category == "c":
            partition = pyvolve.Partition(
                models=cm,
                root_sequence=rootSeq[3:-3])  # remove start & stop codon
            # Evolver takes its partition(s) through the `partitions` keyword.
            evolver = pyvolve.Evolver(partitions=partition, tree=mytree)
            rec_lst = get_evolved(evolver)
            for rec in rec_lst:
                # put the untouched start and stop codons back
                rec.seq = rootSeq[:3] + rec.seq + rootSeq[-3:]
        assert len(rec_lst) == len(outputSeq_lst)

        # concat to outputSeq_lst, restoring the original strand orientation
        for i, rec in enumerate(rec_lst):
            simSeq = rec.seq
            if strand == "-":
                simSeq = simSeq.reverse_complement()
            outputSeq_lst[i] += simSeq

    for i, outputSeq in enumerate(outputSeq_lst):
        genomeId = "{}_sim{}".format(strain, i + 1)
        outFilepath = "../data/dnaseq/{}.dnaseq".format(genomeId)
        with open(outFilepath, "w") as f:
            seqname = "{}:seq".format(genomeId)
            rec = SeqRecord(outputSeq, id=seqname, description="")
            SeqIO.write(rec, f, "fasta")
        print("DONE: output {}".format(outFilepath))
Exemplo n.º 29
0
# This example script demonstrates how to evolve according to a nucleotide model with *branch* rate heterogeneity. The approach is the same for non-nucleotide models.

import pyvolve

# Define a phylogeny. For clarity, we define this tree with a string. The tree contains model flags for branches which should evolve according to new models. Flags are represented as _name_, where underscores surround the name.
# The Newick string must have balanced parentheses: one outer pair for the root plus one pair per internal node.
my_tree = pyvolve.read_tree(
    tree="((t1:0.5, t2:0.5):0.5_m1_,(t3:0.5, t4:0.5):0.5_m2_);")

# Define a model for each flag. Models should be given names with the keyword argument `name`. These names *MUST* have correspondingly named flags in the tree!
model1 = pyvolve.Model("nucleotide", {"kappa": 3.5}, name="m1")
model2 = pyvolve.Model("nucleotide", {"kappa": 4.75}, name="m2")
# We can also define, if we want, a model for the ROOT of the tree that is separate from either of these models.
rootmodel = pyvolve.Model("nucleotide", name="root")

# Define a partition with all models as a list. Include the argument `root_model_name` to indicate the NAME ATTRIBUTE of the model that should be used at the root of the tree. This name's corresponding object must be in the `models` list. Note that a separate root model is not needed - you could easily just start with _m1_ at the root, but you'd still need to give "m1" to `root_model_name`.
my_partition = pyvolve.Partition(models=[model1, model2, rootmodel],
                                 size=250,
                                 root_model_name="root")

# Evolve!
my_evolver = pyvolve.Evolver(partitions=my_partition, tree=my_tree)
my_evolver()
Exemplo n.º 30
0
        # have a relevant (and low) chance of back mutations (i.e. two changes at the same site)
        # as these are very rare for the organism studied.

        # read tree and determine root to tip distance
        max_rtt = max([x['dist_to_root'] for x in t2n])
        scaling_factor = prop_bases_mutated / max_rtt
        for node in bdtree.traverse():
            node.dist = node.dist * scaling_factor
        bdtree.write(outfile=tree_filename, format=3)

        # now we use the pyvolve module, see http://sjspielman.org/pyvolve/
        # Spielman, SJ and Wilke, CO. 2015.
        # Pyvolve: A flexible Python module for simulating sequences along phylogenies. PLOS ONE. 10(9): e0139047.
        t = pyvolve.read_tree(tree=bdtree.write(format=3))
        m = pyvolve.Model("nucleotide")
        p = pyvolve.Partition(models=m, root_sequence=miniseq)
        e = pyvolve.Evolver(partitions=p, tree=t)

        # Run evolution
        e()

        # Recover sequences from the evolution;
        # write output to file.
        simulated_sequences = e.get_sequences()
        with open(sequence_filename, 'wt') as f:
            for key in sorted(simulated_sequences.keys()):
                f.write("{0}\t{1}\n".format(key, simulated_sequences[key]))
        with open(fasta_filename, 'wt') as f:
            for key in sorted(simulated_sequences.keys()):
                f.write(">{0}\n{1}\n".format(key, simulated_sequences[key]))