Пример #1
0
def _evolve_sequence(tree, L, gtr):
    """
    Produce a random sequence of length L and evolve it along a tree.

    Args:
     - tree: a tree object, or a string that is handed to ``Phylo.read``
       and parsed as newick.
     - L: length of the random root sequence.
     - gtr: substitution model; must provide ``alphabet``, ``Pi``,
       ``profile_map`` and ``propagate_profile``.

    Returns:
        Tuple ``(root_seq, aln)``: the sampled root sequence (numpy array of
        alphabet characters) and a MultipleSeqAlignment holding the evolved
        sequences of the terminal nodes.

    Side effects: every node of the tree gets a ``ref_seq`` attribute and
    every non-root node an ``up`` attribute pointing to its parent.
    """
    if isinstance(tree, str):
        tree = Phylo.read(tree, 'newick')

    # Draw the root sequence from the equilibrium frequencies. The length
    # follows the L argument (it used to be hard-coded to 1000).
    root_seq = np.random.choice(gtr.alphabet, p=gtr.Pi, size=L)
    tree.root.ref_seq = root_seq
    print("Started sequence evolution...")

    # Relies on find_clades() yielding a parent before its children
    # (Bio.Phylo's default preorder), so that `up` and the parent's
    # `ref_seq` are already set when a child is processed.
    for node in tree.find_clades():

        for c in node.clades:
            c.up = node

        # Only the root carries a sequence at this point; skip it.
        if hasattr(node, 'ref_seq'):
            continue

        t = node.branch_length
        p = gtr.propagate_profile(
            treetime.seq_utils.seq2prof(node.up.ref_seq, gtr.profile_map), t)
        # normalize the profile so that every row is a probability vector
        p = (p.T / p.sum(axis=1)).T

        # sample the state of every site from its row of the profile
        ref_seq_idxs = np.array([
            int(np.random.choice(np.arange(p.shape[1]), p=p[k]))
            for k in np.arange(p.shape[0])
        ])
        node.ref_seq = np.array([gtr.alphabet[k] for k in ref_seq_idxs])

    records = [
        Align.SeqRecord(Align.Seq("".join(k.ref_seq)), id=k.name, name=k.name)
        for k in tree.get_terminals()
    ]

    aln = Align.MultipleSeqAlignment(records)
    print("Sequence evolution done...")
    return root_seq, aln
Пример #2
0
            genome, gene = entry.name.split('|')
        else:
            genome, gene = entry.name.split('_')

        if genome in genomes[aln]:
            sys.exit('\t**Error, duplicated genome in %s: %s' %(aln, genome))

        genomes[aln].add(genome)

# Every genome that occurs in at least one of the alignments.
genome_union = set.union(*genomes.values())

# Per-genome bookkeeping: a counter of missing marker genes and an initially
# empty sequence record labelled with the genome name.
missing_genes = {}
concatenation = {}
for genome in genome_union:
    missing_genes[genome] = 0
    concatenation[genome] = Align.SeqRecord(
        Align.Seq('', aln_alphabet),
        id=genome,
        name=genome,
        description=genome,
    )

#
# Fill the per-genome records with the marker sequences from each alignment.
# NOTE(review): despite its name and the original remark ("number of genes
# added"), total_genes is incremented by the alignment length below, not by a
# gene count -- confirm the intent.
total_genes      = 0.0 # running total, updated once per alignment file
# presumably the 1-based start column of the next partition -- verify
current_position = 1
# NOTE(review): opened in binary mode and not closed in the visible code;
# presumably written to and closed further down -- verify.
partitions       = open('%s/concatenated_partitions' %output_folder, 'wb')
for aln in os.listdir(aln_folder):

    # Each directory entry is parsed as a single FASTA alignment.
    tmp_aln      = AlignIO.read( '%s/%s' %(aln_folder, aln), 'fasta' )
    aln_length   = tmp_aln.get_alignment_length() # expected alignment size, kept so it can be compared against the individual sequences
    total_genes += aln_length
Пример #3
0
#!/usr/bin/env python

import Bio.Align as align, Bio.SeqIO as sio, Bio, Bio.AlignIO as aio
import sys
import subprocess as spc

# Two unaligned test sequences of different length.
seqs = [
    align.SeqRecord(align.Seq('ATGATGGGGGATGATG')),
    align.SeqRecord(align.Seq('ATGATGATGATG')),
]

# Run muscle as a filter: FASTA goes in on stdin, the alignment comes back on
# stdout (requested with -clw and parsed below as Clustal).  The command is
# fixed, so it is passed as an argument list instead of through a shell.
m_proc = spc.Popen(['muscle', '-clw'],
                   stdin=spc.PIPE,
                   stdout=spc.PIPE,
                   stderr=spc.PIPE)

sio.write(seqs, m_proc.stdin, "fasta")
m_proc.stdin.close()  # signal end of input to muscle

# NOTE: this rebinds `align`, shadowing the Bio.Align module alias from the
# imports; the name is kept because it is the script's result variable.
align = aio.read(m_proc.stdout, "clustal")

# Release the remaining pipes and reap the child so that no file descriptors
# or zombie process are left behind.
m_proc.stdout.close()
m_proc.stderr.close()
m_proc.wait()

# Parenthesised form works as a statement on Python 2 and a call on Python 3.
print(align)
Пример #4
0
def evolve_seq(treefile,
               basename,
               mu=0.0001,
               L=1000,
               mygtr=None):
    """
    Generate a random sequence of a given length, and evolve it on the tree

    Args:
     - treefile: filename for the tree, on which a sequence should be evolved.
     - basename: filename prefix to save alignments.
     - mu: mutation rate. The units of the mutation rate should be consistent with
     the tree branch length
     - L: sequence length.
     - mygtr: GTR model for sequence evolution. When omitted (None), a fresh
     standard 'jc' model is created for this call. Note that ``mu`` is
     written onto the model that is used, including one passed by the caller.

    Returns:
        Tuple ``(aln, full_aln, mu_real)``: two alignments built from the
        evolved sequences of the terminal nodes, and the realised mutation
        rate (fraction of changed sites summed over all branches, divided by
        the total branch length; NaN if the total branch length is zero).

    Side effects: writes ``basename + '.aln.ev.fasta'`` and
    ``basename + '.aln.ev_full.fasta'``; annotates tree nodes with ``up``,
    ``ref_seq`` and ``ref_mutations``.
    """
    from treetime import seq_utils
    from Bio import Phylo, AlignIO
    import numpy as np

    # Build the default model per call: a model created in the signature
    # would be shared between calls and mutated by the assignment below.
    if mygtr is None:
        mygtr = treetime.GTR.standard('jc')

    mygtr.mu = mu
    tree = Phylo.read(treefile, 'newick')
    tree.root.ref_seq = np.random.choice(mygtr.alphabet, p=mygtr.Pi, size=L)
    print("Started sequence evolution...")
    mu_real = 0.0
    total_branch_length = 0

    # Relies on find_clades() yielding a parent before its children
    # (Bio.Phylo's default preorder), so that `up` and the parent's
    # `ref_seq` exist when a child is processed.
    for node in tree.find_clades():
        for c in node.clades:
            c.up = node
        # Only the root carries a sequence at this point; skip it.
        if hasattr(node, 'ref_seq'):
            continue
        t = node.branch_length
        p = mygtr.propagate_profile(
            seq_utils.seq2prof(node.up.ref_seq, mygtr.profile_map), t)
        # normalize the profile so that every row is a probability vector
        p = (p.T / p.sum(axis=1)).T
        # sample the state of every site from its row of the profile
        ref_seq_idxs = np.array([
            int(np.random.choice(np.arange(p.shape[1]), p=p[k]))
            for k in np.arange(p.shape[0])
        ])
        node.ref_seq = np.array([mygtr.alphabet[k] for k in ref_seq_idxs])
        # Built-in zip replaces itertools.izip, which does not exist on
        # Python 3; the resulting list is the same.
        node.ref_mutations = [
            (anc, pos, der)
            for pos, (anc, der) in enumerate(zip(node.up.ref_seq,
                                                 node.ref_seq))
            if anc != der
        ]
        mu_real += 1.0 * (node.ref_seq != node.up.ref_seq).sum() / L
        total_branch_length += t

    # Avoid a ZeroDivisionError for a tree without any branch length.
    if total_branch_length:
        mu_real /= total_branch_length
    else:
        mu_real = float('nan')
    print("Mutation rate is {}".format(mu_real))

    def _leaf_records():
        # One record per terminal node, holding its evolved sequence.
        return [
            Align.SeqRecord(Align.Seq("".join(k.ref_seq)),
                            id=k.name, name=k.name)
            for k in tree.get_terminals()
        ]

    # Two independent record lists, so the returned alignments do not share
    # SeqRecord objects (as in the original implementation).
    aln = Align.MultipleSeqAlignment(_leaf_records())
    full_aln = Align.MultipleSeqAlignment(_leaf_records())
    print("Sequence evolution done...")

    # save results
    AlignIO.write(aln, basename + '.aln.ev.fasta', 'fasta')
    AlignIO.write(full_aln, basename + '.aln.ev_full.fasta', 'fasta')

    return aln, full_aln, mu_real