def finalize():
    """Evolve sequences down every pruned tree with Pyvolve and record the results.

    For each ``(root, treestr)`` pair in ``GC.pruned_newick_trees`` the Newick
    string is parsed by Pyvolve, sequences are evolved from the root's sequence
    under ``GC.pyvolve_model``, and Pyvolve's rate/info/sequence files are
    written to ``pyvolve_output/<root label>_*``. Every sequence returned by
    ``evolver.get_sequences()`` is then appended to
    ``GC.final_sequences[cn_label][sample_time]`` as a ``(leaf, seq)`` tuple,
    where the leaf label is split on ``'|'`` into three fields and the third is
    converted to ``float``.

    Raises:
        AssertionError: if Pyvolve raises an assertion while the tree,
            partition or evolver is being set up (the offending tree string is
            included in the message).
    """
    if GC.random_number_seed is not None:
        from warnings import warn
        warn(
            "random_number_seed specified, but Pyvolve does not support seeding its random generator"
        )
    makedirs("pyvolve_output", exist_ok=True)
    # NOTE(review): the returned mapping is not used below; the call is kept in
    # case label_to_node() has side effects -- confirm and drop it otherwise.
    MF.modules['TreeNode'].label_to_node()
    for root, treestr in GC.pruned_newick_trees:
        # Imported lazily (only once there is a tree to evolve) and *before*
        # the try block. Importing inside an ``except NameError`` handler made
        # ``pyvolve`` a function-local name, so the first attempt always failed
        # and the AssertionError handler never guarded the real setup.
        import pyvolve

        # run Pyvolve
        treestr = treestr.strip()
        label = root.get_label()
        rootseq = root.get_seq()
        if GC.VERBOSE:
            print('[%s] Pyvolve evolving sequences on tree: %s' %
                  (datetime.now(), treestr),
                  file=stderr)
            print('[%s] Pyvolve root sequence: %s' %
                  (datetime.now(), rootseq),
                  file=stderr)
        # Wrap a tree that is not already parenthesised (e.g. a single leaf),
        # dropping its trailing ';' first. The previous test compared the whole
        # string with '(' and therefore wrapped every tree a second time.
        if not treestr.startswith('('):
            treestr = '(%s);' % treestr[:-1]
        try:
            tree = pyvolve.read_tree(tree=treestr)
            partition = pyvolve.Partition(models=GC.pyvolve_model,
                                          root_sequence=rootseq)
            evolver = pyvolve.Evolver(partitions=partition, tree=tree)
        except AssertionError as err:
            # Raised explicitly (not via ``assert False``) so the message
            # survives ``python -O``.
            raise AssertionError(
                "Error setting up Pyvolve. Tree: %s" % treestr) from err
        ratefile = "pyvolve_output/%s_ratefile.txt" % label  # set each to None to not generate these files
        infofile = "pyvolve_output/%s_infofile.txt" % label
        seqfile = "pyvolve_output/%s_seqfile.fasta" % label
        evolver(ratefile=ratefile, infofile=infofile, seqfile=seqfile)
        seqs = evolver.get_sequences()  # use anc=True to get internal sequences as well

        # store leaf sequences in GlobalContext
        # GC.final_sequences[cn_node][t] = list of (label, seq) tuples
        if not hasattr(GC, 'final_sequences'):
            GC.final_sequences = {}
        for leaf, seq in seqs.items():
            _, cn_label, sample_time = leaf.split('|')
            sample_time = float(sample_time)
            by_time = GC.final_sequences.setdefault(cn_label, {})
            by_time.setdefault(sample_time, []).append((leaf, seq))
def simulate(f, seqfile, tree, mu_dict, length):
    """Simulate a single partition under a homogeneous mutation-selection model.

    Args:
        f: state frequencies, passed to the model as ``state_freqs``.
        seqfile: path the simulated alignment is written to.
        tree: either the path of a Newick file or a Newick string. It is tried
            as a file first and, if that fails, as a literal tree string.
        mu_dict: mutation rates, passed to the model as ``mu``.
        length: number of positions, passed as the partition ``size``.
    """
    try:
        my_tree = pyvolve.read_tree(file=tree)
    except Exception:
        # Not readable as a file: fall back to treating ``tree`` as Newick
        # text. ``Exception`` (rather than a bare ``except``) lets
        # KeyboardInterrupt and SystemExit propagate.
        my_tree = pyvolve.read_tree(tree=tree)

    model = pyvolve.Model("MutSel", {'state_freqs': f, 'mu': mu_dict})

    part = pyvolve.Partition(size=length, models=model)
    e = pyvolve.Evolver(partitions=part, tree=my_tree)
    # Only the alignment is wanted; suppress the rate and info files.
    e(seqfile=seqfile, ratefile=None, infofile=None)
예제 #3
0
def simulate(f, seqfile, tree, mu_dict, length):
    """Simulate one partition under a homogeneous mutation-selection model.

    Args:
        f: state frequencies (``state_freqs`` of the ``MutSel`` model).
        seqfile: output path for the simulated alignment.
        tree: a Newick file path, or a Newick string used when reading it as a
            file fails.
        mu_dict: mutation rates (``mu`` of the ``MutSel`` model).
        length: partition ``size``.
    """
    try:
        my_tree = pyvolve.read_tree(file=tree)
    except Exception:
        # ``tree`` could not be read as a file, so interpret it as the tree
        # itself. Catching ``Exception`` instead of using a bare ``except``
        # keeps Ctrl-C and interpreter shutdown from being swallowed here.
        my_tree = pyvolve.read_tree(tree=tree)

    model = pyvolve.Model("MutSel", {'state_freqs': f, 'mu': mu_dict})

    part = pyvolve.Partition(size=length, models=model)
    e = pyvolve.Evolver(partitions=part, tree=my_tree)
    # Write the alignment only; no rate file and no info file.
    e(seqfile=seqfile, ratefile=None, infofile=None)
def get_random_tree(filename, tree_string, L, kappa):
    """Simulate nucleotide sequences down ``tree_string`` and summarise them.

    A nucleotide model with the given ``kappa`` and uniform state frequencies
    is evolved from an ancestor produced by ``generate_ancestor(L)`` (which is
    also printed). ``filename`` is not used by this function.

    Returns:
        dict with the keys ``'pi'`` and ``'theta'``, holding the results of
        ``pi_value`` and ``theta_value`` on the simulated sequences.
    """
    tree = pyvolve.read_tree(tree=tree_string)

    model_params = {'kappa': kappa, 'state_freqs': [0.25, 0.25, 0.25, 0.25]}
    model = pyvolve.Model('nucleotide', model_params)

    root_sequence = generate_ancestor(L)
    print(root_sequence)

    evolver = pyvolve.Evolver(
        partitions=pyvolve.Partition(models=model, root_sequence=root_sequence),
        tree=tree,
    )
    evolver()
    tips = evolver.get_sequences()

    summary = {}
    summary['pi'] = pi_value(tips)
    summary['theta'] = theta_value(tips)
    return summary


	
예제 #5
0
    def test_OnSimulatedData(self):
        """Simulate an alignment with five special sites and check ``phydms`` finds them.

        Five sites are sampled with a fixed seed and passed to
        ``pyvolvePartitions`` together with the value ``200.0``. After
        simulating with pyvolve, ``phydms --omegabysite`` is run and the
        per-site output must report omega > 2 and P < 0.08 for every sampled
        site, with at most one other site having omega > 2 and P < 0.05.
        """
        random.seed(1)
        chosen_sites = random.sample(range(self.nsites), 5)

        tree = pyvolve.read_tree(file=self.tree)
        partitions = phydmslib.simulate.pyvolvePartitions(
            self.model, (200.0, chosen_sites))
        evolver = pyvolve.Evolver(partitions=partitions, tree=tree)

        prefix = os.path.join(self.outdir, self.modelname)
        alignment_path = prefix + '_simulatedalignment.fasta'
        evolver(seqfile=alignment_path,
                infofile=prefix + '_temp_info.txt',
                ratefile=prefix + '_temp_ratefile.txt')

        command = ['phydms', alignment_path, self.tree, self.modelarg, prefix,
                   '--omegabysite', '--brlen', 'scale']
        subprocess.check_call(command)

        omegas = pandas.read_csv(prefix + '_omegabysite.txt', sep='\t',
                                 comment='#')
        is_chosen = omegas['site'].isin(chosen_sites)
        chosen_rows = omegas[is_chosen]
        self.assertTrue(len(chosen_rows) == len(chosen_sites))
        self.assertTrue(
            (chosen_rows['omega'].values > 2).all(),
            "Not all divpressure sites have omega > 2:\n{0}".format(chosen_rows))
        self.assertTrue(
            (chosen_rows['P'].values < 0.08).all(),
            "Not all divpressure sites have P < 0.08:\n{0}".format(chosen_rows))

        # Sites flagged as significant although they were not among the sampled ones.
        spurious_mask = (omegas['omega'] > 2) & (omegas['P'] < 0.05) & (~is_chosen)
        nspurious = len(omegas[spurious_mask])
        self.assertTrue(nspurious <= 1, "{0} spurious sites".format(nspurious))

        # Remove the scratch file left in the working directory, if present.
        leftover = "custom_matrix_frequencies.txt"
        if os.path.isfile(leftover):
            os.remove(leftover)
예제 #6
0
    def run_u(self, tree_file, sequences_folder):
        """Simulate sequences for one gene tree after rescaling its branches.

        The first line of ``tree_file`` is read as a Newick string; if it has
        no ``"("`` the method returns ``None`` without simulating. Every branch
        length is multiplied by the gene-family multiplier (looked up from the
        file name) and by the species-tree multiplier of the node (looked up
        from the part of the node name before the first ``"_"``). The FASTA
        file, including ancestral sequences, is written into
        ``sequences_folder`` and its names are then fixed by
        ``self.correct_names``.
        """
        with open(tree_file) as handle:
            newick = handle.readline().strip()

        if "(" not in newick or newick == ";":
            return None
        my_tree = ete3.Tree(newick, format=1)

        my_tree.get_tree_root().name = "Root"

        # Multipliers are applied per family first, then per species-tree branch.
        family_key = tree_file.split("_")[-2].split("/")[-1]
        gf_multiplier = self.gf_multipliers[family_key]

        for node in my_tree.traverse():
            branch_key = node.name.split("_")[0]
            node.dist = node.dist * gf_multiplier * self.st_multipliers[branch_key]

        tree = pyvolve.read_tree(tree=my_tree.write(format=5),
                                 scale_tree=self.parameters["SCALING"])
        name_mapping = self.get_mapping_internal_names(tree, my_tree)
        partition = pyvolve.Partition(models=self.model, size=self.size)
        evolver = pyvolve.Evolver(tree=tree, partitions=partition)

        base_name = tree_file.split("/")[-1]
        fasta_file = base_name.replace("_completetree.nwk", "_") + "complete.fasta"
        fasta_path = os.path.join(sequences_folder, fasta_file)
        evolver(seqfile=fasta_path, ratefile=None, infofile=None, write_anc=True)

        # Correct the names
        self.correct_names(fasta_path, name_mapping)
예제 #7
0
def evolve(newicks, sequence_size, scale_tree):
    """Simulate one alignment per Newick tree and concatenate them in PHYLIP form.

    Each tree in ``newicks`` is evolved under a default nucleotide model with
    ``sequence_size`` positions. The FASTA output of every run is converted
    with ``fasta_to_phyl`` and the per-tree PHYLIP files are appended, each
    followed by a newline, to ``temp_seq.phyl``. The intermediate
    ``temp<i>.fasta`` / ``temp<i>.phyl`` files are removed.

    Args:
        newicks: sequence of Newick strings.
        sequence_size: partition size handed to pyvolve.
        scale_tree: branch-length scaling factor handed to ``pyvolve.read_tree``.

    Returns:
        The name of the combined PHYLIP file (``"temp_seq.phyl"``).
    """
    my_model = pyvolve.Model("nucleotide")
    partition = pyvolve.Partition(models=my_model, size=sequence_size)

    phy_files = []
    for i, newick in enumerate(newicks):
        tree = pyvolve.read_tree(tree=newick, scale_tree=scale_tree)
        my_evolver = pyvolve.Evolver(tree=tree, partitions=partition)
        fasta_seqfile = "temp" + str(i) + ".fasta"
        phylip_seqfile = "temp" + str(i) + ".phyl"
        phy_files.append(phylip_seqfile)

        my_evolver(seqfile=fasta_seqfile, seqfmt="fasta", ratefile=None, infofile=None)
        fasta_to_phyl(fasta_seqfile, phylip_seqfile)

        # The FASTA file is only an intermediate for the PHYLIP conversion.
        os.remove(fasta_seqfile)

    phyl_output = "temp_seq.phyl"

    with open(phyl_output, 'w') as outfile:
        for fname in phy_files:
            with open(fname) as infile:
                outfile.write(infile.read())
                outfile.write("\n")
            os.remove(fname)

    return phyl_output
예제 #8
0
파일: utils.py 프로젝트: vinnub/Ig_SHM
 def get_pyvolve_phylogeny_from_nxgraph(G, root_seq):
     """Build the pyvolve tree that corresponds to the clonal graph ``G``.

     The graph is serialised with ``networkx_to_newick`` and every ``'a'``
     character in the result gets ``':<1 / len(root_seq)>'`` appended before
     the string is parsed by pyvolve.
     """
     newick = networkx_to_newick(G)
     per_site_length = 1 / len(root_seq)
     annotated = newick.replace('a', 'a:' + str(per_site_length))
     return pyvolve.read_tree(tree=annotated)
예제 #9
0
    def setUp(self):
        """Set up parameters for test.

        Seeds the random number generators, parses a small fixed tree, draws
        random amino-acid preferences for ``self.nsites`` sites, simulates an
        alignment with pyvolve under an ``ExpCM`` built from those preferences,
        and finally builds ``self.model`` from ``self.MODEL`` and
        ``self.DISTRIBUTIONMODEL``.

        Raises:
            ValueError: if ``self.MODEL`` or ``self.DISTRIBUTIONMODEL`` is not
                one of the values handled at the end of this method.
        """
        # Fixed seeds for both generators used below.
        random.seed(1)
        scipy.random.seed(1)

        self.underflowfreq = 1

        # define tree
        self.newick = ('((node1:0.2,node2:0.3)node4:0.3,node3:0.5)node5:0.04;')
        # The Newick string is written to a scratch file so Bio.Phylo can read
        # it; the file is deleted straight afterwards.
        tempfile = '_temp.tree'
        with open(tempfile, 'w') as f:
            f.write(self.newick)
        self.tree = Bio.Phylo.read(tempfile, 'newick')
        os.remove(tempfile)

        # amino-acid preferences
        self.nsites = 50
        prefs = []
        minpref = 0.02
        # NOTE(review): ``g`` is not used anywhere below; presumably the draw is
        # kept because it advances the seeded generator -- confirm before removing.
        g = scipy.random.dirichlet([5] * N_NT)
        for r in range(self.nsites):
            rprefs = scipy.random.dirichlet([0.5] * N_AA)
            # Raise every preference to at least ``minpref``, then renormalise
            # so the preferences of the site sum to one again.
            rprefs[rprefs < minpref] = minpref
            rprefs /= rprefs.sum()
            prefs.append(dict(zip(sorted(AA_TO_INDEX.keys()), rprefs)))

        # simulate alignment with pyvolve
        pyvolvetree = pyvolve.read_tree(tree=self.newick)
        self.nseqs = self.tree.count_terminals()
        expcm = phydmslib.models.ExpCM(prefs)
        partitions = phydmslib.simulate.pyvolvePartitions(expcm)
        alignment = '_temp_simulatedalignment.fasta'
        info = '_temp_info.txt'
        rates = '_temp_ratefile.txt'
        evolver = pyvolve.Evolver(partitions=partitions, tree=pyvolvetree)
        evolver(seqfile=alignment, infofile=info, ratefile=rates)
        # Keep the alignment in memory as (description, sequence) pairs, then
        # delete the three files pyvolve wrote.
        self.alignment = [(s.description, str(s.seq))
                          for s in Bio.SeqIO.parse(alignment, 'fasta')]
        for f in [alignment, info, rates]:
            os.remove(f)
        # Sanity checks: three nucleotides per site, one sequence per tree tip.
        assert len(self.alignment[0][1]) == self.nsites * 3
        assert len(self.alignment) == self.nseqs

        # define model
        if self.MODEL == phydmslib.models.ExpCM:
            self.model = phydmslib.models.ExpCM(prefs)
        else:
            raise ValueError("Invalid MODEL: {0}".format(self.MODEL))
        # Optionally wrap the base model in a distribution model.
        if self.DISTRIBUTIONMODEL is None:
            pass
        elif (self.DISTRIBUTIONMODEL ==
              phydmslib.models.GammaDistributedOmegaModel):
            self.model = self.DISTRIBUTIONMODEL(self.model, ncats=4)
        else:
            raise ValueError("Invalid DISTRIBUTIONMODEL: {0}".format(
                self.DISTRIBUTIONMODEL))
예제 #10
0
def get_pyvolve_tree(tree, foreground_scaling_factor=1):
    """Rescale foreground branches of ``tree`` in place and parse it with pyvolve.

    The ``dist`` of every node whose ``is_foreground`` is truthy is multiplied
    by ``foreground_scaling_factor``; a message is printed when the factor
    differs from 1. The tree is then serialised by ``get_pyvolve_newick`` and
    the pyvolve tree parsed from that string is returned.
    """
    if foreground_scaling_factor != 1:
        message = 'Foreground branches are rescaled by {}.'
        print(message.format(foreground_scaling_factor))

    foreground_nodes = (n for n in tree.traverse() if n.is_foreground)
    for fg_node in foreground_nodes:
        fg_node.dist *= foreground_scaling_factor

    return pyvolve.read_tree(tree=get_pyvolve_newick(tree=tree))
예제 #11
0
    def test_OnSimulatedData(self):
        """Run ``phydms`` on the simulated data.

        An alignment is simulated with pyvolve from ``self.model``, then
        ``phydms --diffprefsbysite`` is run once per ``--fitprefsmethod``
        ('1' and '2'). For each run every site in ``self.targetaas`` must have
        exactly one row with a positive ``dpi_<aa>`` value, and the per-site
        totals of the two methods must have a Pearson correlation above 0.98.
        """
        random.seed(1)
        numpy.random.seed(1)
        partitions = phydmslib.simulate.pyvolvePartitions(self.model)
        evolver = pyvolve.Evolver(partitions=partitions,
                                  tree=pyvolve.read_tree(file=self.tree))
        simulateprefix = os.path.join(self.outdir, self.modelname)
        simulatedalignment = simulateprefix + '_simulatedalignment.fasta'
        info = simulateprefix + '_temp_info.txt'
        rates = simulateprefix + '_temp_ratefile.txt'
        evolver(seqfile=simulatedalignment, infofile=info, ratefile=rates)

        # fit-prefs method -> DataFrame of that run's differential preferences
        prefsbymethod = {}
        for fitprefsmethod in ['1', '2']:
            outprefix = simulateprefix + '_fitprefsmethod{0}'.format(
                    fitprefsmethod)
            subprocess.check_call(['phydms', simulatedalignment, self.tree,
                                   self.modelarg, outprefix,
                                   '--diffprefsbysite', '--brlen', 'scale',
                                   '--ncpus', '-1', '--diffprefsprior',
                                   'invquadratic,150,0.5'] +
                                  self.gammaomega_arg +
                                  ['--fitprefsmethod', fitprefsmethod])
            diffprefsbysitefile = outprefix + '_diffprefsbysite.txt'
            # Column names of the per-amino-acid differential preferences.
            aas = ['dpi_{0}'.format(INDEX_TO_AA[a]) for a in range(N_AA)]
            diffprefs = pandas.read_csv(diffprefsbysitefile, sep='\t',
                                        comment='#')
            # Per-site sum of absolute differential preferences.
            diffprefs['total'] = diffprefs[aas].abs().sum(axis=1)
            for (site, a) in self.targetaas.items():
                siteentry = diffprefs[diffprefs['site'] == site]
                self.assertTrue(len(siteentry) == 1, str(len(siteentry)))
                self.assertTrue((siteentry['dpi_{0}'.format(a)] > 0).all())

            prefsbymethod[fitprefsmethod] = diffprefs

        # Compare each pair of methods once (i + 1 skips self and earlier pairs).
        for (i, (method1, prefs1)) in enumerate(sorted(prefsbymethod.items())):
            total1 = prefs1['total'].values
            for (method2, prefs2) in sorted(prefsbymethod.items())[i + 1:]:
                total2 = prefs2['total'].values
                (r, p) = scipy.stats.pearsonr(total1, total2)
                # The scatter plot is saved before asserting so it is available
                # when the assertion fails (the failure message names the file).
                plt.scatter(total1, total2)
                plt.xlabel('fitprefsmethod{0}'.format(method1))
                plt.ylabel('fitprefsmethod{0}'.format(method2))
                plotfile = os.path.join(self.outdir, '{0}_vs_{1}.pdf'.format(
                        method1, method2))
                plt.savefig(plotfile)
                self.assertTrue(r > 0.98, "Low correlation between "
                                "fitprefsmethods: {0}\nSee {1}"
                                .format(r, plotfile))

        # Remove the scratch file left in the working directory, if present.
        for f in ["custom_matrix_frequencies.txt"]:
            if os.path.isfile(f):
                os.remove(f)
예제 #12
0
def get_random_tree(L, species, scaled_tree_string, kappa, iteration):
    """Simulate sequences on ``scaled_tree_string`` and return their pi and theta.

    A nucleotide model with the given ``kappa`` and uniform state frequencies
    is evolved from ``generate_ancestor(L)``. The alignment is written to
    ``simulated_alignment_<species[:-1]>_universal_<iteration + 1>.fasta``;
    no rate or info file is produced. Progress messages and the parsed tree
    are printed along the way.

    Returns:
        The tuple ``(pi, theta)`` computed by ``pi_value`` and ``theta_value``
        from the simulated sequences.
    """
    tree = pyvolve.read_tree(tree=scaled_tree_string)
    print('read in the tree')
    pyvolve.print_tree(tree)

    model_params = {'kappa': kappa, 'state_freqs': [0.25, 0.25, 0.25, 0.25]}
    model = pyvolve.Model('nucleotide', model_params)

    root_sequence = generate_ancestor(L)
    print('generated an ancestor')

    evolver = pyvolve.Evolver(
        partitions=pyvolve.Partition(models=model, root_sequence=root_sequence),
        tree=tree,
    )
    alignment_name = ("simulated_alignment_" + str(species[:-1]) +
                      "_universal_" + str(iteration + 1) + ".fasta")
    evolver(ratefile=None, infofile=None, seqfile=alignment_name)
    print('evolved the sequences')

    tips = evolver.get_sequences()
    pi = pi_value(tips)
    theta = theta_value(tips)
    return pi, theta
예제 #13
0
def simulateAlignment(model, treeFile, alignmentPrefix, randomSeed=False):
    """
    Simulate an alignment given a model and tree (units = subs/site).

    Simulations done using `pyvolve`.

    Args:
        `model` (`phydmslib.models.Models` object)
            The model used for the simulations. Only
            models that can be passed to `pyvolve.Partitions`
            are supported.
        `treeFile` (str)
            Name of newick file used to simulate the sequences.
            The branch lengths should be in substitutions per site,
            which is the default units for all `phydms` outputs.
        `alignmentPrefix`
            Prefix for the files created by `pyvolve`.
        `randomSeed`
            Seed passed to `random.seed` before simulating. The
            default `False` leaves the generator untouched; any
            other value, including `0`, is used as the seed.

    The result of this function is a simulated FASTA alignment
    file with the name having the prefix giving by `alignmentPrefix`
    and the suffix `'_simulatedalignment.fasta'`.
    """
    # Identity test rather than ``== False``: ``0 == False`` is true in
    # Python, so a seed of 0 used to be silently ignored.
    if randomSeed is not False:
        random.seed(randomSeed)

    #Transform the branch lengths by dividing by the model `branchScale`
    tree = Bio.Phylo.read(treeFile, 'newick')
    for node in tree.get_terminals() + tree.get_nonterminals():
        if node.branch_length is None and node == tree.root:
            # A root without a branch length gets a tiny placeholder length.
            node.branch_length = 1e-06
        else:
            node.branch_length /= model.branchScale

    # pyvolve reads the rescaled tree from a scratch file. The descriptor is
    # closed immediately (the file is reopened by name) and the file is
    # removed even if writing or parsing fails.
    fd, temp_path = mkstemp()
    os.close(fd)
    try:
        Bio.Phylo.write(tree, temp_path, 'newick')
        pyvolve_tree = pyvolve.read_tree(file=temp_path)
    finally:
        os.remove(temp_path)

    #Make the `pyvolve` partition
    partitions = pyvolvePartitions(model)

    #Simulate the alignment
    alignment = '{0}_simulatedalignment.fasta'.format(alignmentPrefix)
    info = '_temp_{0}info.txt'.format(alignmentPrefix)
    rates = '_temp_{0}_ratefile.txt'.format(alignmentPrefix)
    evolver = pyvolve.Evolver(partitions=partitions, tree=pyvolve_tree)
    evolver(seqfile=alignment, infofile=info, ratefile=rates)
    # Only the alignment is kept; remove the auxiliary files if they exist.
    for f in [rates, info, "custom_matrix_frequencies.txt"]:
        if os.path.isfile(f):
            os.remove(f)
    assert os.path.isfile(alignment)
예제 #14
0
def engrave_tree(treestr, brk, nbranch, lfile):
    """Inscribe pyvolve model flags into a tree string according to the given pattern.

    The tree string is split on ``':'``. For the piece that follows the i-th
    colon, every number matched by the branch-length pattern is rewritten as
    ``:<number><flag>``, where the flag is ``_bp<k>_`` if ``i`` occurs in
    ``brk`` (``k = len(brk) - position of the first occurrence of i``) and
    empty otherwise. The flagged string is written to ``lfile`` followed by a
    newline and parsed by pyvolve.

    Args:
        treestr: Newick string with branch lengths.
        brk: sequence of branch indices that receive a model flag.
        nbranch: number of branch indices to consider (``0 .. nbranch - 1``).
        lfile: writable file-like object that receives the flagged string.

    Returns:
        The pyvolve tree parsed from the flagged string.
    """
    # Position of the first occurrence of each flagged branch index; replaces
    # a ``brk.index`` scan per branch.
    first_position = {}
    for position, branch in enumerate(brk):
        first_position.setdefault(branch, position)
    n_breaks = len(brk)

    # ``range`` instead of ``xrange`` so the function also runs on Python 3.
    model_flags = [
        "_bp%d_" % (n_breaks - first_position[i]) if i in first_position else ''
        for i in range(nbranch)
    ]
    branch_strs = re.split(r':', treestr)
    flagged = (re.sub(r'(\d+\.\d+[eE]?-?\d*)', r':\1' + f, b)
               for b, f in zip(branch_strs[1:], model_flags))
    l_tree_string = branch_strs[0] + ''.join(flagged) + ';'
    lfile.write(l_tree_string + '\n')
    ltree = pyvolve.read_tree(tree=l_tree_string)
    return ltree
예제 #15
0
def sim_gtr(rates, freqs, alpha, tree, nsites):
    """Simulate a nucleotide alignment under GTR with gamma rate heterogeneity.

    Args:
        rates: six exchange rates, in the order AC, AG, AT, CG, CT, GT.
        freqs: state frequencies passed to the model as ``state_freqs``.
        alpha: gamma shape parameter (four rate categories are used).
        tree: Newick string of the tree to simulate on.
        nsites: number of sites (partition ``size``).

    Returns:
        ``dict_to_matrix`` applied to the simulated sequences.
    """
    import pyvolve

    # One key per unordered nucleotide pair. The list used to contain 'CG'
    # twice, so the sixth rate overwrote CG and GT was never set.
    pair_names = ['AC', 'AG', 'AT', 'CG', 'CT', 'GT']
    custom_mu = dict(zip(pair_names, rates))

    gtr_model = pyvolve.Model('nucleotide', {'mu': custom_mu, 'state_freqs': freqs},
                              alpha=alpha, num_categories=4)
    gtr_partition = pyvolve.Partition(models=gtr_model, size=nsites)
    tr = pyvolve.read_tree(tree=tree)
    gtr_evolver = pyvolve.Evolver(partitions=gtr_partition, tree=tr)
    gtr_evolver()
    return dict_to_matrix(gtr_evolver.get_sequences())
예제 #16
0
def sim_codon(freqs, omegas, tree, nsites):
    """Simulate a codon alignment under an ``MG`` model.

    Args:
        freqs: state frequencies passed to the model as ``state_freqs``.
        omegas: value passed to the model as ``omega``.
        tree: Newick string of the tree to simulate on.
        nsites: alignment length in nucleotides; the partition evolves
            ``nsites // 3`` codons.

    Returns:
        ``dict_to_matrix`` applied to the simulated sequences.
    """
    import pyvolve

    tr = pyvolve.read_tree(tree=tree)

    gy_model = pyvolve.Model('MG', {'omega': omegas, 'state_freqs': freqs})
    # The partition size counts codons, so the nucleotide count is divided by
    # three. Floor division keeps the size an integer on Python 3, where
    # ``nsites / 3`` would be a float.
    gy_partition = pyvolve.Partition(models=gy_model, size=nsites // 3)
    gy_evolver = pyvolve.Evolver(partitions=gy_partition, tree=tr)
    gy_evolver()
    return dict_to_matrix(gy_evolver.get_sequences())
예제 #17
0
def evolveLinker(sequence, branchLength):
    """
    Evolve a non-domain sequence over a given distance with pyvolve.

    The sequence is placed at the root of a two-leaf tree, ``(A:BL,B:BL);``,
    whose branches both have length ``branchLength``, and evolved under the
    JTT model (substitutions only, no indels). The sequence simulated for
    leaf ``A`` is returned.
    """
    model = Model("JTT")
    partition = Partition(models=model, root_sequence=sequence)
    length_text = str(branchLength)
    two_leaf_tree = read_tree(
        tree="(A:" + length_text + ",B:" + length_text + ");")
    evolver = Evolver(partitions=partition, tree=two_leaf_tree)
    evolver()
    simulated = evolver.get_sequences()
    return simulated["A"]
예제 #18
0
def get_accurate_c(L, kappa):
    """Simulate sequences on a fixed six-tip tree and set up per-site bookkeeping.

    An ancestor from ``generate_ancestor(L)`` is evolved along a hard-coded
    tree and the tip and ancestral sequences are collected. The function then
    initialises the containers used to track convergent sites.

    NOTE(review): the visible body stops after ``c = 0`` and a bare
    ``strain_names`` expression and returns nothing, so this definition looks
    truncated -- confirm against the complete source.

    Args:
        L: length passed to ``generate_ancestor``; also the number of per-site
            entries created in ``site_counts`` and ``strains_with_site``.
        kappa: not used in the visible code; the model below uses a hard-coded
            kappa value instead.
    """

    ancestor = generate_ancestor(L)
    print(ancestor)

    # Earlier tree (four cherries directly under the root), kept for reference:
    # phylogeny = pyvolve.read_tree(tree = '(  (t1:0.5,t2:0.5)i1:0.5, (t3:0.5,t4:0.5)i2:0.5 ,  (t5:0.5,t6:0.5)i3:0.5, (t7:0.5,t8:0.5)i4:0.5  ) root;')
    phylogeny = pyvolve.read_tree(
        tree=
        '(  ((t7:0.5,t8:0.5)i4:0.5,(t5:0.5,t6:0.5)i3:0.5)i1:0.5, (t3:0.5,t4:0.5)i2:0.5  ) root;'
    )

    pyvolve.print_tree(phylogeny)

    freqs = [0.25, 0.25, 0.25, 0.25]

    # NOTE(review): kappa is hard-coded here although the function takes a
    # ``kappa`` argument -- confirm whether that is intended.
    nuc_model = pyvolve.Model('nucleotide', {
        'kappa': 1.86836732388,
        'state_freqs': freqs
    })

    my_partition = pyvolve.Partition(models=nuc_model, root_sequence=ancestor)

    my_evolver = pyvolve.Evolver(partitions=my_partition, tree=phylogeny)
    # my_evolver()
    # write_anc / anc request the ancestral sequences in addition to the tips.
    my_evolver(write_anc=True)
    # strains = my_evolver.get_sequences()
    strains = my_evolver.get_sequences(anc=True)
    strain_names = list(strains.keys())  # pre-order traversal of the tree
    n = len(strain_names)

    print(strain_names)

    # strain name -> list, initially empty for every strain
    c_sites = {}
    for key in strain_names:
        c_sites[key] = []

    site_counts = L * [
        None
    ]  # list of dictionaries to keep track of which nucleotides are at each convergent site; index = site; key = nucleotide, value = number of strains with that nucleotide
    strains_with_site = L * [
        None
    ]  # list of the strains that have a convergent mutation at each site; index = site
    # Give every site its own counter dict and its own list.
    for x in range(L):
        site_counts[x] = {'A': 0, 'T': 0, 'G': 0, 'C': 0}
        strains_with_site[x] = []
    # c_list_matrix = [[{} for x in range(n)] for y in range(n)] # matrix of the convergent mutation sites; the (i,j) entry is a dictionary of the convergent mutation sites between strain i and strain j; key = site, value = nucleotide

    c = 0
    strain_names
예제 #19
0
def execute(tree, model, length, out, numSim):
    """Simulate ``numSim`` alignments under a named pyvolve model.

    Args:
        tree: path of the Newick file to simulate on.
        model: model name passed to ``pyvolve.Model``; it is also part of the
            output file names.
        length: partition size (converted with ``int``).
        out: prefix of the output files.
        numSim: number of alignments to simulate (converted with ``int``).

    Alignment ``i`` is written to ``<out>.<model>-<i>.fa`` and that name is
    printed before it is written.
    """
    # read in model, tree, and define partition
    pyvolveModel = pyvolve.Model(model)
    pyvolveTree = pyvolve.read_tree(file=tree)
    pyvolvePartition = pyvolve.Partition(models=pyvolveModel, size=int(length))

    # create evolver
    my_evolver = pyvolve.Evolver(tree=pyvolveTree, partitions=pyvolvePartition)
    # NOTE(review): this initial run uses pyvolve's default output names and is
    # not one of the numbered simulations -- confirm it is still needed.
    my_evolver()

    print("Simulating sequences...")
    # create simulated sequences
    for i in range(int(numSim)):
        # Build the name once so the progress message shows the file that is
        # actually written (the message used to omit the model name).
        seqfile = str(out) + "." + str(model) + "-" + str(i) + ".fa"
        print(seqfile)
        my_evolver(seqfile=seqfile)
예제 #20
0
    def run(self, tree_file, sequences_folder):
        """Simulate sequences, ancestors included, for the tree in ``tree_file``.

        The first line of the file is read as a Newick string; if it has no
        ``"("`` the method returns ``None`` without simulating. Otherwise the
        tree is handed to pyvolve with the ``SCALING`` parameter as scale
        factor, the FASTA file is written into ``sequences_folder`` and its
        names are fixed by ``self.correct_names``.
        """
        with open(tree_file) as handle:
            newick = handle.readline().strip()

        if "(" not in newick or newick == ";":
            return None
        my_tree = ete3.Tree(newick, format=1)

        tree = pyvolve.read_tree(tree=my_tree.write(format=5),
                                 scale_tree=self.parameters["SCALING"])
        name_mapping = self.get_mapping_internal_names(tree, my_tree)
        partition = pyvolve.Partition(models=self.model, size=self.size)
        evolver = pyvolve.Evolver(tree=tree, partitions=partition)

        base_name = tree_file.split("/")[-1]
        fasta_file = base_name.replace("_completetree.nwk", "_complete") + ".fasta"
        fasta_path = os.path.join(sequences_folder, fasta_file)
        evolver(seqfile=fasta_path, ratefile=None, infofile=None, write_anc=True)

        # Correct the names
        self.correct_names(fasta_path, name_mapping)
예제 #21
0
파일: phynn.py 프로젝트: rdvelazquez/PhyNN
def exampleFastaGenerator(nwkFile, fastaOutputLocation, seqLength, rate=1):
    """Simulate a nucleotide alignment on the tree in ``nwkFile``.

    Args:
        nwkFile: path of the Newick file to simulate on.
        fastaOutputLocation: path the simulated alignment is written to.
        seqLength: number of positions (partition ``size``).
        rate: the single mutation rate used for all six nucleotide pairs.
    """
    # The unused ``treeName`` computation was removed: its
    # ``nwkFile.rindex('/')`` raised ValueError for any path without a slash.
    # Tree.
    phylogony = pyvolve.read_tree(file=nwkFile)
    # Rates: the same value for every unordered nucleotide pair.
    mutationRates = {
        pair: rate
        for pair in ("AC", "AG", "AT", "CG", "CT", "GT")
    }
    # Model.
    model = pyvolve.Model("nucleotide", {"mu": mutationRates})
    partition = pyvolve.Partition(models=model, size=seqLength)
    # Evolver: write the alignment only, no rate or info file.
    evolver = pyvolve.Evolver(partitions=[partition], tree=phylogony)
    evolver(seqfile=fastaOutputLocation, ratefile=None, infofile=None)
예제 #22
0
    def simulate_single_sequence(self, name, gene_length, tree_file, sequences_folder):
        """Simulate one sequence called ``name`` and keep only that entry.

        A two-leaf tree is built from the template ``(A:1,B:1);`` with ``A``
        replaced by ``name`` and evolved for ``gene_length`` positions. The
        FASTA file is written into ``sequences_folder`` and then rewritten so
        that it contains only the entries whose header, without its first
        character, equals ``name``.
        """
        newick = "(A:1,B:1);".replace("A", name)
        tree = pyvolve.read_tree(tree=newick)
        partition = pyvolve.Partition(models=self.model, size=gene_length)
        evolver = pyvolve.Evolver(tree=tree, partitions=partition)

        base_name = tree_file.split("/")[-1]
        fasta_file = base_name.replace("_completetree.nwk", "_complete") + ".fasta"
        fasta_path = os.path.join(sequences_folder, fasta_file)
        evolver(seqfile=fasta_path, ratefile=None, infofile=None, write_anc=True)

        # Select single sequence
        entries = [(n, v) for n, v in af.fasta_reader(fasta_path)
                   if n[1:] == name]
        af.fasta_writer(fasta_path, entries)
예제 #23
0
def generateTree(tns, ntaxa, seqlen):
    """Simulate a pure-birth tree and a matching nucleotide alignment.

    A birth-death tree (birth rate 1.0, death rate 0) with ``ntaxa`` extant
    tips is drawn with dendropy and saved as Newick to ``/tmp/pyvt``. pyvolve
    reads that file and evolves ``seqlen`` positions under the default
    nucleotide model without writing a sequence file; the sequences are
    instead written to ``evolvedsequences.fasta`` through dendropy.

    Returns:
        The dendropy tree.
    """
    newick_path = '/tmp/pyvt'

    # Construct the tree and save it as a newick file
    birth_tree = dendropy.simulate.treesim.birth_death_tree(
        birth_rate=1.0,
        death_rate=0,
        taxon_namespace=tns,
        num_extant_tips=ntaxa)
    birth_tree.write(path=newick_path,
                     schema='newick',
                     suppress_rooting=True,
                     suppress_internal_node_labels=True)

    # Set pyvolve data type
    model = pyvolve.Model("nucleotide")
    partition = pyvolve.Partition(models=model, size=seqlen)

    # Read the tree back from the file dendropy wrote
    pyvolve_tree = pyvolve.read_tree(file=newick_path)

    # Simulate evolution with no save file
    evolver = pyvolve.Evolver(tree=pyvolve_tree, partitions=partition)
    evolver(seqfile=None)

    matrix = dendropy.DnaCharacterMatrix.from_dict(evolver.get_sequences(),
                                                   taxon_namespace=tns)
    matrix.write(path="evolvedsequences.fasta", schema="fasta")
    return birth_tree
예제 #24
0
def simulate(tree_index,length):
    """
    Simulate codon sequences with pyvolve on one of three named trees.

    Args:
        tree_index: index into ``["alpha", "beta", "charlie"]``; the tree is
            read from ``trees/<name>.tre``.
        length: partition size; for a codon model this is a number of codons,
            i.e. three times as many nucleotide sites.

    Returns:
        list of the simulated sequences, one per entry returned by
        ``get_sequences()``.
    """
    tree_map = ["alpha","beta","charlie"]
    tree = tree_map[tree_index]
    my_tree = pyvolve.read_tree(file = "trees/"+tree+".tre")

    # MG model parameterised by beta and alpha; the unused alternative
    # ``{"omega": 0.65}`` dictionary was removed.
    parameters_alpha_beta = {"beta": 0.65, "alpha": 0.98} # Corresponds to dN/dS = 0.65 / 0.98
    my_model = pyvolve.Model("MG", parameters_alpha_beta)

    # Assign the model to a pyvolve.Partition of ``length`` positions.
    my_partition = pyvolve.Partition(models = my_model, size = length)

    # Evolve without writing a rate file or an info file.
    my_evolver = pyvolve.Evolver(partitions = my_partition, tree = my_tree, ratefile = None, infofile = None)
    my_evolver(ratefile = None, infofile = None)

    # Extract the sequences
    simulated_sequences = list(my_evolver.get_sequences().values())
    return simulated_sequences
예제 #25
0
def cli(gnumber,
        glist,
        gtree,
        edprob,
        gsize,
        glen_range,
        dnds,
        tau=None,
        delrate=0.0,
        from_al=None,
        protlike=False,
        no_syn=False,
        sub_rate=1.0,
        min_cons=0.0,
        outdir=""):
    """Simulate genes along a genome tree, apply C-to-U editing and save the results.

    The genomes come from exactly one source, checked in this order:
    ``gnumber`` (names ``Genome_1 .. Genome_<gnumber>``), ``glist`` (a file
    with one name per line; blank lines and lines starting with ``#`` are
    skipped) or ``gtree`` (a tree whose leaves are the genomes). In a list a
    leading ``-`` or ``_`` marks a genome as not edited, in a tree a leading
    ``_`` does; the markers are stripped from the names. Without ``gtree`` a
    random tree is generated over the genome names.

    For each of the ``gsize`` genes a length is drawn from ``glen_range``,
    sequences are simulated with a pyvolve codon model, optionally subjected
    to random deletions (``delrate``) and prefixed with ``ATG``
    (``protlike``), edited by ``CtoUsimulate`` and written by ``save_data``.

    Args:
        dnds: pair whose first element is used as the codon model ``beta`` and
            whose second element is used as ``alpha``.
        tau: if given, passed to the codon model as ``kappa``.
        from_al: if given, an alignment file from which codon frequencies are
            computed and used as ``state_freqs``.
        sub_rate: scale factor applied to the tree by ``pyvolve.read_tree``.
        edprob, no_syn, min_cons: forwarded to ``CtoUsimulate``.
        outdir: forwarded to ``simulate_genomes`` and ``save_data``.

    Raises:
        NotImplementedError: if none of ``gnumber``, ``glist`` and ``gtree``
            is given.
    """
    gleaf = []
    no_edit = []
    tree = None
    if gnumber:
        gleaf = ['Genome_{}'.format(i) for i in range(1, gnumber + 1)]
    elif glist:
        with open(glist) as G:
            # Iterate over the open handle; this loop used to reference an
            # undefined name (``Glist``) and raised NameError.
            for line in G:
                line = line.strip()
                if line and not line.startswith('#'):
                    gleaf.append(line.strip('-_'))
                    if line.startswith(('-', '_')):
                        no_edit.append(line.strip('-_'))
    elif gtree:
        tree = Tree(gtree)
        gleaf = tree.get_leaf_names()
        no_edit = [x.strip('_') for x in gleaf if x.startswith('_')]
        for node in tree:
            node.name = node.name.strip('_')

    else:
        raise NotImplementedError(
            "One of --gnumber, --glist and --gtree is needed !")

    if not tree:
        # No tree supplied: build a random one over the genome names.
        tree = Tree()
        tree.populate(len(gleaf), names_library=gleaf, random_branches=True)

    param_list = {"alpha": dnds[1], "beta": dnds[0]}
    if tau:
        param_list.update({"kappa": tau})

    if from_al:  # read codons frequencies from an existing alignment
        f = pyvolve.ReadFrequencies("codon", file=from_al)
        param_list.update({'state_freqs': f.compute_frequencies()})

    #print(tree.get_ascii(show_internal=True, attributes=['name', 'dist']))
    phylogeny = pyvolve.read_tree(tree=tree.write(format=5),
                                  scale_tree=sub_rate)
    codon_model = pyvolve.Model("codon", param_list)  #, neutral_scaling=True)
    sequences = []
    edited_sequences = []
    truth_tables = []
    # add height to tree
    tree = add_height_to_tree(tree)

    for i in range(gsize):
        # gene length is given from an uniform distribution
        alen = np.random.randint(glen_range[0], glen_range[1]) * 3
        seq = simulate_genomes(codon_model, phylogeny, alen, outdir, i + 1)
        if delrate:
            seq = random_deletion(seq, tree, alen // 3, delrate)
        if protlike:
            for k in seq:
                seq[k] = 'ATG' + seq[k]
        sequences.append(seq)
        edited_seq, truth_table = CtoUsimulate(seq,
                                               tree,
                                               no_edit,
                                               edprob,
                                               no_syn=no_syn,
                                               min_cons=min_cons)
        edited_sequences.append(edited_seq)
        truth_tables.append(truth_table)
        save_data(tree, seq, edited_seq, truth_table, outdir, i + 1)
예제 #26
0
"""
Simulate sequences along a phylogenetic tree using pyvolve

@author: david
"""
import pyvolve

# Script: set up a pyvolve simulation of nucleotide sequences along the tree in
# "AMR-sim.tre". The bare string literals below act as section headings.

"User defined params"
mut_rate = 0.005  # used as the scale_tree factor when the tree is read
freqs = [0.25, 0.25, 0.25, 0.25]  # state frequencies for the nucleotide model
seq_length = 1000  # number of positions in the partition
kappa = 2.75  # kappa parameter of the nucleotide model

"Read in phylogeny along which Pyvolve should simulate"
"Scale_tree sets absolute mutation rate"
my_tree = pyvolve.read_tree(file = "AMR-sim.tre", scale_tree = mut_rate)
#pyvolve.print_tree(my_tree) # Print the parsed phylogeny

"Specify nucleotide substitution model with custom rates"
# Alternative kept for reference (disabled): custom per-pair mutation rates.
#custom_mu = {"AC":0.5, "AG":0.25, "AT":1.23, "CG":0.55, "CT":1.22, "GT":0.47}
#nuc_model = pyvolve.Model( "nucleotide", {"mu":custom_mu, "state_freqs":freqs} )

"Or just use an HKY model with kappa"
nuc_model = pyvolve.Model( "nucleotide", {"kappa":kappa, "state_freqs":freqs})

"Define a Partition object which evolves set # of positions according to my_model"
my_partition = pyvolve.Partition(models = nuc_model, size = seq_length)
#my_partition = pyvolve.Partition(models = nuc_model, root_sequence = "GATAGAAC") # Or with a root seq

"Define an Evolver instance to evolve a single partition"
# The evolver is only constructed here; no simulation is run in this fragment.
my_evolver = pyvolve.Evolver(partitions = my_partition, tree = my_tree) 
예제 #27
0
# This example script demonstrates how to evolve according to an amino-acid model with sitewise rate heterogeneity.

import pyvolve

# Define a phylogeny, from a file containing a newick tree
my_tree = pyvolve.read_tree(file="file_with_tree.tre")

# Define an amino-acid model, as a pyvolve.Model object. For this example, we'll use the WAG model with default parameters, but see the example script custom_aminoacid.py for other options

# To implement rate heterogeneity, do either of these:
## 1) Custom rates: Provide a list of rate_factors when defining a Model object. These rate factors will be assigned to sites with equal probability by default. To change this, provide probabilities with the argument `rate_probs`.
## 2) Gamma rates: Provide the keyword arguments num_categories and alpha when defining a Model object. <num_categories> rates will be drawn from a gamma distribution with shape and scale parameter each equal to <alpha>. These rates will be equiprobable, unless overridden by `rate_probs`.

# Several model definitions are shown below (first argument can be a different model, as desired).
# Only my_model2 is used by the partition defined at the end of this script.

# custom rates
my_model1 = pyvolve.Model(
    "WAG", rate_factors=[0.3, 0.8, 1.5,
                         2.45])  # 25% of sites will have each factor.
my_model2 = pyvolve.Model(
    "WAG",
    rate_factors=[0.3, 0.8, 1.5, 2.45],
    rate_probs=[0.7, 0.2, 0.05, 0.05]
)  # 70% of sites evolve with rate of 0.3, 20% with a rate of 0.8, 5% with a rate of 1.5, and 5% with a rate of 2.45

# gamma rates
my_model3 = pyvolve.Model("WAG", alpha=0.6, num_categories=5)

# Assign the model to a pyvolve.Partition. The size argument indicates to evolve 250 positions
my_partition = pyvolve.Partition(models=my_model2, size=250)
예제 #28
0
def main():
    """Simulate a codon alignment with pyvolve from site-specific preferences.

    Steps, in order:

    1. Parse the command line with ``ParseArguments`` and echo the arguments.
    2. Read the model parameters (``args['modelparams']``) and the per-site
       amino-acid preferences (``args['prefs']``).
    3. For every site, build a custom codon rate matrix and wrap it in a
       one-site ``pyvolve.Partition``.
    4. Evolve the partitions along the tree in ``args['tree']``, writing the
       alignment to ``args['simulatedalignment']`` and the pyvolve info and
       rate files next to it.
    5. Rewrite the alignment file in place, keeping only the first
       occurrence of each distinct sequence.
    """
    # One Genetics instance supplies all the lookup tables used below.
    genetics = pyvolve.genetics.Genetics()
    codons = genetics.codons
    codon_dict = genetics.codon_dict
    purines = genetics.purines

    args = vars(ParseArguments().parse_args())
    print("Read the following command line arguments:")
    print("\n\t{0}".format("\n\t".join(
        ["{0} = {1}".format(key, value) for (key, value) in args.items()])))

    print("\nPerforming simulation with pyvolve version {0}".format(
        pyvolve.__version__))

    print("\nReading model params from {0}".format(args['modelparams']))
    params = ReadParams(args['modelparams'])
    for (param, paramvalue) in params.items():
        print("The value of {0} is {1}".format(param, paramvalue))

    print("\nReading preferences from {0}".format(args['prefs']))
    tup = dms_tools.file_io.ReadPreferences(args['prefs'])
    (sites, pis) = (tup[0], tup[2])
    print("\nRead amino-acid preferences for {0} sites".format(len(pis)))

    tree = pyvolve.read_tree(file=args['tree'])

    # create models for simulation
    partitions = []
    for r in sites:
        # Sites listed as diversifying get their own omega; all others use 1.
        if params['diversifyingsitesA'] and (int(r)
                                             in params['diversifyingsitesA']):
            omega = params['diversifyingomegaA']
            # Single pre-formatted argument prints identically under
            # Python 2 and Python 3 (the old ``print r, omega`` statement
            # is a syntax error on Python 3).
            print("{0} {1}".format(r, omega))
        elif params['diversifyingsitesB'] and (
                int(r) in params['diversifyingsitesB']):
            omega = params['diversifyingomegaB']
            print("{0} {1}".format(r, omega))
        else:
            omega = 1.0
        matrix = []  # matrix[x][y] is rate of substitution from x to y
        for (xi, x) in enumerate(codons):
            row = []
            for y in codons:
                ntdiffs = [(x[j], y[j]) for j in range(3) if x[j] != y[j]]
                if not ntdiffs:
                    assert x == y
                    # diagonal: adjusted below to make the row sum to zero
                    row.append(0)
                elif len(ntdiffs) > 1:
                    # multi-nucleotide codon change
                    row.append(0)
                else:
                    # single nucleotide change
                    (xnt, ynt) = ntdiffs[0]
                    if (xnt in purines) == (ynt in purines):
                        # transition (both purines or both non-purines)
                        qxy = params['kappa'] * params['phi{0}'.format(ynt)]
                    else:
                        # transversion
                        qxy = params['phi{0}'.format(ynt)]
                    (xaa, yaa) = (codon_dict[x], codon_dict[y])
                    if xaa == yaa:
                        # synonymous change
                        fxy = 1.0
                    else:
                        pix = pis[r][xaa]**params['stringencyparameter']
                        piy = pis[r][yaa]**params['stringencyparameter']
                        if abs(pix - piy) < 1e-6:
                            # Nearly equal preferences: skip the log-ratio
                            # expression, whose denominator approaches zero.
                            fxy = omega
                        else:
                            fxy = omega * math.log(
                                piy / pix) / (1.0 - pix / piy)
                    row.append(qxy * fxy * params['scalerate'])
            assert len(row) == len(codons)
            row[xi] = -sum(row)
            matrix.append(row)
        model = pyvolve.Model("custom", {"matrix": matrix})
        partitions.append(pyvolve.Partition(models=model, size=1))

    print("\nSimulating evolution, writing to {0}...".format(
        args['simulatedalignment']))
    basename = os.path.splitext(args['simulatedalignment'])[0]
    evolver = pyvolve.Evolver(partitions=partitions, tree=tree)
    evolver(
        seqfile=args['simulatedalignment'],
        infofile='{0}_infofile.txt'.format(basename),
        ratefile='{0}_ratefile.txt'.format(basename),
    )
    print("Finished simulation")

    # Keep only the first record for each distinct sequence string.
    uniqueseqs = set()
    uniquealignment = []
    ninitial = 0
    for seq in Bio.SeqIO.parse(args['simulatedalignment'], 'fasta'):
        ninitial += 1
        seqstr = str(seq.seq)
        if seqstr not in uniqueseqs:
            uniqueseqs.add(seqstr)
            uniquealignment.append(seq)
    print(
        "\nAfter removing redundant sequences, we have shrunk {0} from {1} to {2} sequences"
        .format(args['simulatedalignment'], ninitial, len(uniquealignment)))
    Bio.SeqIO.write(uniquealignment, args['simulatedalignment'], 'fasta')
예제 #29
0
    rates[j] = float(rates[j])

############### Loop ##########

# For every species count and tree size, run n_runs codon simulations, each
# inside its own outdir/<species>/<size>/<run> directory.
for species in species_numbers:
    # Parenthesised single argument prints the same on Python 2 and 3;
    # the bare ``print species`` statement is a syntax error on Python 3.
    print(species)
    species_dir = path.join(outdir, species)
    check_dir(species_dir)
    os.chdir(species_dir)

    for size in sizes:
        print(size)
        size_dir = path.join(outdir, species, size)
        check_dir(size_dir)
        os.chdir(size_dir)

        tree = path.join(treedir, species, size, "tree_file")
        current_tree = pyvolve.read_tree(file = tree)

        for i in range(1, n_runs + 1):

            run_dir = path.join(outdir, species, size, str(i))
            check_dir(run_dir)
            # The evolver is called without file names, so change into the
            # run directory before evolving.
            os.chdir(run_dir)

            my_model = pyvolve.Model("codon", {"alpha":alphas, "beta":betas, "kappa":kappa}, rate_probs=rates)

            my_partition = pyvolve.Partition(models = my_model, size = n_sites)
            my_evolver = pyvolve.Evolver(partitions = my_partition, tree = current_tree)
            my_evolver()
            


예제 #30
0
if __name__ == "__main__":
    # Simulate nucleotide sequences along a tree with pyvolve, then report
    # tree distances and run ANI on the simulated sequences.

    usage = '''
    python pyvolve-genseq.py <tree.nwk> <seq-size> [<scale> default=1 (no scale)]
    '''
    if len(sys.argv) < 3:
        sys.exit(usage)

    tree_f = sys.argv[1]
    outfiles = tree_f  # output files are named after the tree file
    size = sys.argv[2]
    # Optional third argument scales the tree. Default to 1.0 (no scaling),
    # as the usage text states; the previous code overwrote its default with
    # None and passed that on to read_tree and tree_distances_info.
    scale = float(sys.argv[3]) if len(sys.argv) > 3 else 1.0

    seqfile = "%s.%s.seqfile.fasta" % (outfiles, size)

    print("Reading tree..")
    my_tree = pyvolve.read_tree(file = tree_f, scale_tree=scale)
    my_model = pyvolve.Model("nucleotide")
    my_partition = pyvolve.Partition(models = my_model, size = int(size))

    print("Simulating sequences..")
    my_evolver = pyvolve.Evolver(tree = my_tree, partitions = my_partition)
    my_evolver(ratefile = "%s.%s.ratefile.txt" % (outfiles, size),
               infofile = "%s.%s.infofile.txt" % (outfiles, size),
               seqfile = seqfile)

    print("Tree info..")
    tree_distances_info(tree_f, scale, int(size))

    print("Running ANI on sequences..")
    pyani_seq(seqfile)
예제 #31
0
def main():
    """Simulate a codon alignment with pyvolve from site-specific preferences.

    Parses the command line with ``ParseArguments``, reads the model
    parameters (``args['modelparams']``) and per-site amino-acid preferences
    (``args['prefs']``), builds one custom codon rate matrix per site, and
    evolves the resulting one-site partitions along the tree in
    ``args['tree']``. The alignment is written to
    ``args['simulatedalignment']`` (with the pyvolve info and rate files next
    to it) and then rewritten in place with duplicate sequences removed.
    """
    # One Genetics instance supplies all the lookup tables used below.
    genetics = pyvolve.genetics.Genetics()
    codons = genetics.codons
    codon_dict = genetics.codon_dict
    purines = genetics.purines

    args = vars(ParseArguments().parse_args())
    print("Read the following command line arguments:")
    print("\n\t{0}".format("\n\t".join(["{0} = {1}".format(key, value) for (key, value) in args.items()])))

    print("\nPerforming simulation with pyvolve version {0}".format(pyvolve.__version__))

    print("\nReading model params from {0}".format(args['modelparams']))
    params = ReadParams(args['modelparams'])
    for (param, paramvalue) in params.items():
        print("The value of {0} is {1}".format(param, paramvalue))

    print("\nReading preferences from {0}".format(args['prefs']))
    tup = dms_tools.file_io.ReadPreferences(args['prefs'])
    (sites, pis) = (tup[0], tup[2])
    print("\nRead amino-acid preferences for {0} sites".format(len(pis)))

    tree = pyvolve.read_tree(file=args['tree'])

    # create models for simulation
    partitions = []
    for r in sites:
        # Sites listed as diversifying get their own omega; all others use 1.
        if params['diversifyingsitesA'] and (int(r) in params['diversifyingsitesA']):
            omega = params['diversifyingomegaA']
            # Pre-formatted single argument: same output on Python 2 and 3,
            # unlike the Python-2-only ``print r,omega`` statement.
            print("{0} {1}".format(r, omega))
        elif params['diversifyingsitesB'] and (int(r) in params['diversifyingsitesB']):
            omega = params['diversifyingomegaB']
            print("{0} {1}".format(r, omega))
        else:
            omega = 1.0
        matrix = [] # matrix[x][y] is rate of substitution from x to y
        for (xi, x) in enumerate(codons):
            row = []
            for y in codons:
                ntdiffs = [(x[j], y[j]) for j in range(3) if x[j] != y[j]]
                if not ntdiffs:
                    assert x == y
                    row.append(0) # will later be adjusted to make row sum to zero
                elif len(ntdiffs) > 1:
                    # multi-nucleotide codon change
                    row.append(0)
                else:
                    # single nucleotide change
                    (xnt, ynt) = ntdiffs[0]
                    if (xnt in purines) == (ynt in purines):
                        # transition (both purines or both non-purines)
                        qxy = params['kappa'] * params['phi{0}'.format(ynt)]
                    else:
                        # transversion
                        qxy = params['phi{0}'.format(ynt)]
                    (xaa, yaa) = (codon_dict[x], codon_dict[y])
                    if xaa == yaa:
                        # synonymous change
                        fxy = 1.0
                    else:
                        pix = pis[r][xaa]**params['stringencyparameter']
                        piy = pis[r][yaa]**params['stringencyparameter']
                        if abs(pix - piy) < 1e-6:
                            # Nearly equal preferences: skip the log-ratio
                            # expression, whose denominator approaches zero.
                            fxy = omega
                        else:
                            fxy = omega * math.log(piy / pix) / (1.0 - pix / piy)
                    row.append(qxy * fxy * params['scalerate'])
            assert len(row) == len(codons)
            row[xi] = -sum(row)
            matrix.append(row)
        model = pyvolve.Model("custom", {"matrix":matrix})
        partitions.append(pyvolve.Partition(models=model, size=1))

    print("\nSimulating evolution, writing to {0}...".format(args['simulatedalignment']))
    basename = os.path.splitext(args['simulatedalignment'])[0]
    evolver = pyvolve.Evolver(partitions=partitions, tree=tree)
    evolver(
            seqfile=args['simulatedalignment'],
            infofile='{0}_infofile.txt'.format(basename),
            ratefile='{0}_ratefile.txt'.format(basename),
            )
    print("Finished simulation")

    # Keep only the first record for each distinct sequence string.
    uniqueseqs = set()
    uniquealignment = []
    ninitial = 0
    for seq in Bio.SeqIO.parse(args['simulatedalignment'], 'fasta'):
        ninitial += 1
        seqstr = str(seq.seq)
        if seqstr not in uniqueseqs:
            uniqueseqs.add(seqstr)
            uniquealignment.append(seq)
    print("\nAfter removing redundant sequences, we have shrunk {0} from {1} to {2} sequences".format(args['simulatedalignment'], ninitial, len(uniquealignment)))
    Bio.SeqIO.write(uniquealignment, args['simulatedalignment'], 'fasta')
예제 #32
0
    def test_branchScale(self):
        """Simulate evolution, ensure scaled branches match number of subs.

        Builds the model selected by ``self.MODEL``, simulates a two-tip
        tree with pyvolve ``nreplicates`` times, and checks that (a) the
        per-site substitution count estimated from codon differences is
        close to ``model.branchScale * t`` and (b) the branch length
        inferred by ``TreeLikelihood`` is close to that same count.

        Raises:
            ValueError: if ``self.MODEL`` is not one of the handled models.
        """

        scipy.random.seed(1)
        random.seed(1)

        # define model, only free parameter is mu for testing simulations
        nsites = 50
        prefs = []
        minpref = 0.01
        for r in range(nsites):
            rprefs = scipy.random.dirichlet([1] * N_AA)
            rprefs[rprefs < minpref] = minpref
            rprefs /= rprefs.sum()
            prefs.append(dict(zip(sorted(AA_TO_INDEX.keys()), rprefs)))
        kappa = 4.2
        omega = 0.4
        beta = 1.5
        mu = 0.3
        if self.MODEL == phydmslib.models.ExpCM:
            phi = scipy.random.dirichlet([7] * N_NT)
            model = phydmslib.models.ExpCM(prefs,
                                           kappa=kappa,
                                           omega=omega,
                                           beta=beta,
                                           mu=mu,
                                           phi=phi,
                                           freeparams=['mu'])
        elif self.MODEL == phydmslib.models.ExpCM_empirical_phi:
            g = scipy.random.dirichlet([7] * N_NT)
            model = phydmslib.models.ExpCM_empirical_phi(prefs,
                                                         g,
                                                         kappa=kappa,
                                                         omega=omega,
                                                         beta=beta,
                                                         mu=mu,
                                                         freeparams=['mu'])
        elif self.MODEL == phydmslib.models.YNGKP_M0:
            e_pw = scipy.asarray(
                [scipy.random.dirichlet([7] * N_NT) for i in range(3)])
            model = phydmslib.models.YNGKP_M0(e_pw, nsites)
        else:
            # Report the offending model itself; type(self.MODEL) would only
            # name its metaclass.
            raise ValueError("Invalid MODEL: {0}".format(self.MODEL))
        # Every supported model is converted to pyvolve partitions the same way.
        partitions = phydmslib.simulate.pyvolvePartitions(model)

        # tree is two sequences separated by a single branch
        t = 0.04 / model.branchScale
        newicktree = '(tip1:{0},tip2:{0});'.format(t / 2.0)
        pyvolvetree = pyvolve.read_tree(tree=newicktree)
        temptree = '_temp.tree'
        with open(temptree, 'w') as f:
            f.write(newicktree)
        try:
            biotree = Bio.Phylo.read(temptree, 'newick')
        finally:
            # remove the scratch tree even if parsing fails
            os.remove(temptree)

        # Simulate evolution of two sequences separated by a single branch.
        # Then estimate subs per site in a heuristic way that will be
        # roughly correct for short branches. Do this all several times
        # and average results to get better accuracy.
        alignment = '_temp_branchScale_simulatedalignment.fasta'
        info = '_temp_info.txt'
        rates = '_temp_ratefile.txt'
        evolver = pyvolve.Evolver(partitions=partitions, tree=pyvolvetree)
        nsubs = 0  # subs in simulated seqs (estimate from Hamming distance)
        treedist = 0.0  # distance inferred by `TreeLikelihood`
        nreplicates = 100
        for i in range(nreplicates):
            evolver(seqfile=alignment, infofile=info, ratefile=rates)
            try:
                a = [(s.description, str(s.seq))
                     for s in Bio.SeqIO.parse(alignment, 'fasta')]
            finally:
                # clean up pyvolve output before any assertion can abort
                for f in [alignment, info, rates]:
                    if os.path.isfile(f):
                        os.remove(f)
            assert len(a[0][1]) == len(a[1][1]) == nsites * 3
            for r in range(nsites):
                codon1 = a[0][1][3 * r:3 * r + 3]
                codon2 = a[1][1][3 * r:3 * r + 3]
                nsubs += sum(1 for j in range(3) if codon1[j] != codon2[j])
            tl = phydmslib.treelikelihood.TreeLikelihood(biotree, a, model)
            tl.maximizeLikelihood()
            treedist += sum([n.branch_length for n in tl.tree.get_terminals()])
        nsubs /= float(nsites * nreplicates)
        treedist /= float(nreplicates)

        # We expect nsubs = branchScale * t, but build in some tolerance
        # with rtol since we simulated finite number of sites.
        self.assertTrue(
            scipy.allclose(nsubs, model.branchScale * t, rtol=0.2),
            ("Simulated subs per site of {0} is not close "
             "to expected value of {1} (branchScale = {2}, t = {3})").format(
                 nsubs, t * model.branchScale, model.branchScale, t))
        self.assertTrue(
            scipy.allclose(treedist, nsubs, rtol=0.2),
            ("Simulated subs per site of {0} is not close to inferred "
             "branch length of {1}").format(nsubs, treedist))
예제 #33
0
# Usage example: /Users/fengqian/anaconda2/bin/python /Users/fengqian/Downloads/simulated_seqs.py /Users/fengqian/Downloads/UniMelb_shared-master/project/mosaic_data/Protein_translateable_pilot_upper_centroids.fasta /Users/fengqian/simulated_tree.txt /Users/fengqian/simulated_seqs.fasta
#######################################################################
import sys, os
import pyvolve
import glob
from mungo.fasta import FastaReader
from collections import defaultdict
# Positional arguments: reference FASTA, tree file, output FASTA.
input_fasta = sys.argv[1]
input_tree_txt = sys.argv[2]
output_seqs = sys.argv[3]

# Amino-acid state frequencies are computed from the reference FASTA.
f = pyvolve.ReadFrequencies("amino_acid", file=input_fasta)
frequencies = f.compute_frequencies()

# Evolve 200 positions under MTMAM along the tree (read with scale_tree=0.5)
# and write only the sequence file.
my_tree_1 = pyvolve.read_tree(file=input_tree_txt, scale_tree=0.5)
my_model_1 = pyvolve.Model("MTMAM", {"state_freqs": frequencies})
my_partition_1 = pyvolve.Partition(models=my_model_1, size=200)
my_evolver_1 = pyvolve.Evolver(tree=my_tree_1, partitions=my_partition_1)
my_evolver_1(seqfile=output_seqs, ratefile=None, infofile=None)

# Read the simulated sequences back and give them sequential IDs
# (seq0, seq1, ...), remembering the order in seq_list.
seqs = {}
seq_list = []
count = 0
for h, s in FastaReader(output_seqs):
    seq_id = "seq" + str(count)
    seqs[seq_id] = s
    seq_list.append(seq_id)
    count += 1
##organize the seq ID name
with open(output_seqs, 'w') as outfile:
    for s in seq_list:
예제 #34
0
    def setUp(self):
        """Build the tree, a pyvolve-simulated alignment, and the test model."""
        random.seed(1)
        scipy.random.seed(1)

        self.underflowfreq = 1

        # Tree: write the Newick string to a scratch file and parse it back.
        self.newick = ('((node1:0.2,node2:0.3)node4:0.3,node3:0.5)node5:0.04;')
        treepath = '_temp.tree'
        with open(treepath, 'w') as handle:
            handle.write(self.newick)
        self.tree = Bio.Phylo.read(treepath, 'newick')
        os.remove(treepath)

        # Alignment: simulate with pyvolve under a YNGKP_M0 model whose
        # e_pw entries are all 0.25.
        pyvolvetree = pyvolve.read_tree(tree=self.newick)
        self.nsites = 50
        self.nseqs = self.tree.count_terminals()
        flat_e_pw = scipy.ndarray((3, N_NT), dtype='float')
        flat_e_pw.fill(0.25)
        simulation_model = phydmslib.models.YNGKP_M0(flat_e_pw, self.nsites)
        evolver = pyvolve.Evolver(
            partitions=phydmslib.simulate.pyvolvePartitions(simulation_model),
            tree=pyvolvetree)
        alignment = '_temp_simulatedalignment.fasta'
        info = '_temp_info.txt'
        rates = '_temp_ratefile.txt'
        evolver(seqfile=alignment, infofile=info, ratefile=rates)
        self.alignment = [(record.description, str(record.seq))
                          for record in Bio.SeqIO.parse(alignment, 'fasta')]
        for scratch in (alignment, info, rates):
            os.remove(scratch)
        assert len(self.alignment[0][1]) == self.nsites * 3
        assert len(self.alignment) == self.nseqs

        # Model inputs: g is drawn first, then one preference vector per
        # site, with every entry floored at minpref and renormalized.
        minpref = 0.02
        g = scipy.random.dirichlet([5] * N_NT)
        amino_acids = sorted(AA_TO_INDEX.keys())
        prefs = []
        for _ in range(self.nsites):
            site_prefs = scipy.random.dirichlet([0.5] * N_AA)
            site_prefs[site_prefs < minpref] = minpref
            site_prefs /= site_prefs.sum()
            prefs.append(dict(zip(amino_acids, site_prefs)))

        # Instantiate whichever substitution model this test class targets.
        model_cls = self.MODEL
        if model_cls == phydmslib.models.ExpCM:
            self.model = phydmslib.models.ExpCM(prefs)
        elif model_cls == phydmslib.models.ExpCM_empirical_phi:
            self.model = phydmslib.models.ExpCM_empirical_phi(prefs, g)
        elif model_cls == phydmslib.models.ExpCM_empirical_phi_divpressure:
            divpressure = scipy.random.uniform(-1, 5, self.nsites)
            divpressure /= max(abs(divpressure))
            self.model = phydmslib.models.ExpCM_empirical_phi_divpressure(
                prefs, g, divpressure)
        elif model_cls == phydmslib.models.YNGKP_M0:
            e_pw = scipy.random.uniform(0.2, 0.8, size=(3, N_NT))
            e_pw = e_pw / e_pw.sum(axis=1, keepdims=True)
            self.model = phydmslib.models.YNGKP_M0(e_pw, self.nsites)
        else:
            raise ValueError("Invalid MODEL: {0}".format(self.MODEL))

        # Optionally wrap the model in a distribution model.
        distribution = self.DISTRIBUTIONMODEL
        if distribution is not None:
            if distribution == phydmslib.models.GammaDistributedOmegaModel:
                self.model = distribution(self.model, ncats=4)
            else:
                raise ValueError("Invalid DISTRIBUTIONMODEL: {0}".format(
                    self.DISTRIBUTIONMODEL))
# This example script demonstrates how to evolve according to a nucleotide model with sitewise rate heterogeneity. 

import pyvolve

# Phylogeny, read from a file holding a newick tree.
my_tree = pyvolve.read_tree(file="file_with_tree.tre")

# The nucleotide models below use default parameters apart from the rate
# heterogeneity settings; see custom_nucleotide.py for other options.
#
# Two ways to get sitewise rate heterogeneity:
#   1) Custom rates: pass a list of `rate_factors` to the Model. Factors are
#      assigned to sites with equal probability unless `rate_probs` supplies
#      the probabilities.
#   2) Gamma rates: pass `num_categories` and `alpha` to the Model.
#      <num_categories> rates are drawn from a gamma distribution with shape
#      and scale parameter each equal to <alpha>; they are equiprobable
#      unless overridden by `rate_probs`.

# Custom rates: 25% of sites get each factor.
my_model1 = pyvolve.Model(
    "nucleotide",
    rate_factors=[0.3, 0.8, 1.5, 2.45],
)
# Custom rates with probabilities: 70% of sites evolve at 0.3, 20% at 0.8,
# 5% at 1.5 and 5% at 2.45.
my_model2 = pyvolve.Model(
    "nucleotide",
    rate_factors=[0.3, 0.8, 1.5, 2.45],
    rate_probs=[0.7, 0.2, 0.05, 0.05],
)

# Gamma rates.
my_model3 = pyvolve.Model("nucleotide", alpha=0.4, num_categories=3)

# Attach my_model2 to a Partition; size=250 evolves 250 positions.
my_partition = pyvolve.Partition(models=my_model2, size=250)

# Evolve!
my_evolver = pyvolve.Evolver(partitions=my_partition, tree=my_tree)
my_evolver()
# This example script demonstrates how to evolve according to a nucleotide model with *branch* rate heterogeneity. The approach is the same for non-nucleotide models.

import pyvolve

# Define a phylogeny. For clarity, we define this tree with a string. The tree contains model flags for branches which should evolve according to new models. Flags are represented as _name_, where underscores surround the name.
# NOTE: the parentheses must balance; the earlier string carried one extra ")" before the final ";".
my_tree = pyvolve.read_tree(tree="((t1:0.5, t2:0.5):0.5_m1_,(t3:0.5, t4:0.5):0.5_m2_);")

# Define a model for each flag. Models should be given names with the keyword argument `name`. These names *MUST* have correspondingly named flags in the tree!
model1 = pyvolve.Model("nucleotide", {"kappa": 3.5}, name="m1")
model2 = pyvolve.Model("nucleotide", {"kappa": 4.75}, name="m2")
rootmodel = pyvolve.Model(
    "nucleotide", name="root"
)  # We can also define, if we want, a model for the ROOT of the tree that is separate from either of these models.

# Define a partition with all models as a list. Include the argument `root_model_name` to indicate the NAME ATTRIBUTE of the model that should be used at the root of the tree. This name's corresponding object must be in the `models` list. Note that a separate root model is not needed - you could easily just start with _m1_ at the root, but you'd still need to give "m1" to `root_model_name`.
my_partition = pyvolve.Partition(models=[model1, model2, rootmodel], size=250, root_model_name="root")

# Evolve!
my_evolver = pyvolve.Evolver(partitions=my_partition, tree=my_tree)
my_evolver()
    def setUp(self):
        """Set up for tests.

        Creates the model under test (``self.model``, built from
        ``self.prefs``) and a separate ``ExpCM`` (``self.realmodel``, built
        from shuffled preferences) that is used to simulate the alignment
        with pyvolve along the NP tree. Also tallies per-site codon and
        amino-acid counts from the simulated alignment and builds the
        ``TreeLikelihood`` object ``self.tl``.
        """
        scipy.random.seed(1)
        random.seed(1)

        nsites = 1
        minpref = 0.001
        self.prefs = []
        self.realprefs = []
        for r in range(nsites):
            rprefs = scipy.random.dirichlet([0.5] * N_AA)
            rprefs[rprefs < minpref] = minpref
            rprefs /= rprefs.sum()
            self.prefs.append(dict(zip(sorted(AA_TO_INDEX.keys()), rprefs)))
            # Shuffle in place so the "real" preferences are a permutation
            # of the ones given to the model under test.
            scipy.random.shuffle(rprefs)
            self.realprefs.append(dict(zip(sorted(AA_TO_INDEX.keys()),
                                           rprefs)))
        self.kappa = 3.0
        self.omega = 3.0
        self.phi = scipy.random.dirichlet([5] * N_NT)
        self.model = self.MODEL(self.prefs,
                                prior=None,
                                kappa=self.kappa,
                                omega=self.omega,
                                phi=self.phi)
        self.realmodel = phydmslib.models.ExpCM(self.realprefs,
                                                kappa=self.kappa,
                                                omega=self.omega,
                                                mu=10.0,
                                                phi=self.phi)

        treefile = os.path.abspath(
            os.path.join(os.path.dirname(__file__),
                         './NP_data/NP_tree.newick'))
        self.tree = Bio.Phylo.read(treefile, 'newick')
        self.tree.root_at_midpoint()

        # simulate alignment using realmodel
        evolver = pyvolve.Evolver(
            partitions=phydmslib.simulate.pyvolvePartitions(self.realmodel),
            tree=pyvolve.read_tree(file=treefile))
        alignmentfile = '_temp_fitprefs_simulatedalignment.fasta'
        info = '_temp_info.txt'
        rates = '_temp_ratefile.txt'
        evolver(seqfile=alignmentfile, infofile=info, ratefile=rates)
        self.alignment = phydmslib.file_io.ReadCodonAlignment(
            alignmentfile, True)
        # Remove the scratch files before asserting so a failed check does
        # not leave them behind.
        for f in [alignmentfile, info, rates]:
            os.remove(f)
        assert len(self.alignment[0][1]) == nsites * 3

        self.codoncounts = {
            r: {INDEX_TO_CODON[c]: 0 for c in range(N_CODON)}
            for r in range(nsites)
        }
        self.aacounts = {
            r: {a: 0 for a in range(N_AA)}
            for r in range(nsites)
        }
        # Tally every site explicitly. The earlier loop reused the leaked
        # loop variable `r` and treated the whole sequence as one codon,
        # which is only correct while nsites == 1.
        for (head, seq) in self.alignment:
            for r in range(nsites):
                codon = seq[3 * r:3 * r + 3]
                self.codoncounts[r][codon] += 1
                self.aacounts[r][CODON_TO_AA[CODON_TO_INDEX[codon]]] += 1

        self.tl = phydmslib.treelikelihood.TreeLikelihood(
            self.tree, self.alignment, self.model)
예제 #38
0
    def setUp(self):
        """Build the tree, branch lengths, simulated codons, and test model."""
        random.seed(1)
        scipy.random.seed(1)

        # Tree: write the Newick string to a scratch file and parse it back.
        self.newick = ('((node1:0.2,node2:0.3)node4:0.3,node3:0.5)node5:0.04;')
        treepath = '_temp.tree'
        with open(treepath, 'w') as handle:
            handle.write(self.newick)
        self.tree = Bio.Phylo.read(treepath, 'newick')
        os.remove(treepath)

        # Branch lengths keyed by node number (the last character of the
        # node name); the root is left out.
        root_name = self.tree.root.name
        node_lengths = re.findall(
            r'(?P<name>node\d):(?P<brlen>\d+\.\d+)', self.newick)
        self.brlen = {
            int(name[-1]): float(length)
            for (name, length) in node_lengths
            if name != root_name
        }

        # Alignment: simulate with pyvolve under a YNGKP_M0 model whose
        # e_pw entries are all 0.25.
        pyvolvetree = pyvolve.read_tree(tree=self.newick)
        self.nsites = 60
        self.nseqs = self.tree.count_terminals()
        flat_e_pw = scipy.ndarray((3, N_NT), dtype='float')
        flat_e_pw.fill(0.25)
        simulation_model = phydmslib.models.YNGKP_M0(flat_e_pw, self.nsites)
        evolver = pyvolve.Evolver(
            partitions=phydmslib.simulate.pyvolvePartitions(simulation_model),
            tree=pyvolvetree)
        alignment = '_temp_simulatedalignment.fasta'
        info = '_temp_info.txt'
        rates = '_temp_ratefile.txt'
        evolver(seqfile=alignment, infofile=info, ratefile=rates)
        self.alignment = [(record.description, str(record.seq))
                          for record in Bio.SeqIO.parse(alignment, 'fasta')]
        for scratch in (alignment, info, rates):
            os.remove(scratch)
        assert len(self.alignment[0][1]) == self.nsites * 3
        assert len(self.alignment) == self.nseqs

        # self.codons[node number][site] -> codon index, for each tip.
        self.codons = {}
        for terminal in self.tree.get_terminals():
            tip_name = terminal.name
            tip_seq = [s for (head, s) in self.alignment
                       if tip_name == head][0]
            self.codons[int(tip_name[-1])] = {
                r: CODON_TO_INDEX[tip_seq[3 * r:3 * r + 3]]
                for r in range(self.nsites)
            }

        # Model inputs: g is drawn first (floored at 0.1 and renormalized),
        # then one preference vector per site, floored at minpref.
        minpref = 0.02
        g = scipy.random.dirichlet([5] * N_NT)
        g[g < 0.1] = 0.1
        g /= g.sum()
        amino_acids = sorted(AA_TO_INDEX.keys())
        prefs = []
        for _ in range(self.nsites):
            site_prefs = scipy.random.dirichlet([0.5] * N_AA)
            site_prefs[site_prefs < minpref] = minpref
            site_prefs /= site_prefs.sum()
            prefs.append(dict(zip(amino_acids, site_prefs)))

        # Instantiate whichever substitution model this test class targets.
        model_cls = self.MODEL
        if model_cls == phydmslib.models.ExpCM:
            self.model = phydmslib.models.ExpCM(prefs)
        elif model_cls == phydmslib.models.ExpCM_empirical_phi:
            self.model = phydmslib.models.ExpCM_empirical_phi(prefs, g)
        elif model_cls == phydmslib.models.ExpCM_empirical_phi_divpressure:
            divpressure = scipy.random.uniform(-1, 5, self.nsites)
            divpressure /= max(abs(divpressure))
            self.model = phydmslib.models.ExpCM_empirical_phi_divpressure(
                prefs, g, divpressure)
        elif model_cls == phydmslib.models.YNGKP_M0:
            e_pw = scipy.random.uniform(0.2, 0.8, size=(3, N_NT))
            e_pw = e_pw / e_pw.sum(axis=1, keepdims=True)
            self.model = phydmslib.models.YNGKP_M0(e_pw, self.nsites)
        else:
            raise ValueError("Invalid MODEL: {0}".format(self.MODEL))

        # Optionally wrap the model in a distribution model.
        distribution = self.DISTRIBUTIONMODEL
        if distribution is not None:
            if distribution == phydmslib.models.GammaDistributedOmegaModel:
                self.model = distribution(self.model, ncats=4)
            elif distribution == phydmslib.models.GammaDistributedBetaModel:
                self.model = distribution(self.model, ncats=4)
            else:
                raise ValueError("Invalid DISTRIBUTIONMODEL: {0}".format(
                    self.DISTRIBUTIONMODEL))