Пример #1
0
def dnadist(seqs, output=None, verbose=True, force=False, args=None):
    """
    Run the PHYLIP program dnadist on an alignment.

    seqs    -- alignment mapping names to sequences (checked by
               validate_seqs)
    output  -- if given, the program's "outfile" is renamed to
               "../" + output instead of being parsed
    verbose -- passed through to exec_phylip
    force   -- not used by this function; kept for interface compatibility
    args    -- input text handed to exec_phylip (defaults to "y")

    Returns labels when output is given, otherwise (labels, matrix) where
    matrix is read from "outfile" by read_dist_matrix.
    """
    if args is None:
        args = "y"

    validate_seqs(seqs)
    cwd = create_temp_dir()
    util.tic("dnadist on %d of length %d" % (len(seqs), len(seqs.values()[0])))

    # create input; the handle is closed explicitly so the alignment is
    # completely written before the external program reads it
    with open("infile", "w") as align_out:
        labels = write_phylip_align(align_out, seqs)

    # run phylip
    exec_phylip("dnadist", args, verbose)

    util.toc()

    # hand over the raw output file if the caller asked for it
    if output is not None:
        os.rename("outfile", "../" + output)
        cleanup_temp_dir(cwd)
        return labels

    # otherwise parse the matrix (the names in the file are not needed)
    _names, mat = read_dist_matrix("outfile")
    cleanup_temp_dir(cwd)
    return labels, mat
Пример #2
0
    def _test_ml_speed(self):
        """Time ten calls of find_ml_branch_lengths_hky on the flies data."""

        # model parameters
        kappa = 1.59
        bgfreq = [.258, .267, .266, .209]

        # input data
        tree = treelib.readTree("test/data/flies.nt/0/0.tree")
        align = fasta.readFasta("test/data/flies.nt/0/0.align")

        nodes = sorted(tree.nodes.values(), key=lambda node: node.dist)

        util.tic("find ML")
        for _ in xrange(10):
            result = spidir.find_ml_branch_lengths_hky(
                tree,
                util.mget(align, tree.leafNames()),
                bgfreq,
                kappa,
                maxiter=10)
        util.toc()

        # only the state after the last call is recorded
        dists = [[node.dist for node in nodes]]
        likes = [result]
Пример #3
0
def sample_thread(arg, seqs, rho=1.5e-8, mu=2.5e-8, popsize=1e4,
                  times=None, ntimes=20, maxtime=200000, verbose=False):
    """
    Thread one more sequence into an ARG.

    The sequence that is added is the first key of seqs that is not among
    the names returned by arg2ctrees.  The result of
    argweaver_sample_thread is converted back with ctrees2arg and returned.
    """
    if times is None:
        times = argweaver.get_time_points(
            ntimes=ntimes, maxtime=maxtime, delta=.01)
    popsizes = [popsize] * len(times)

    if verbose:
        util.tic("sample thread")

    ctrees, names = arg2ctrees(arg, times)

    # sequences already in the ARG, in the order of names
    ordered = [seqs[name] for name in names]

    # append the first sequence that is not in the ARG yet
    # (IndexError if every sequence is already present)
    unthreaded = [key for key in seqs.keys() if key not in names]
    new_name = unthreaded[0]
    names.append(new_name)
    ordered.append(seqs[new_name])

    nseqs = len(ordered)
    seqlen = len(ordered[0])
    cseqs = (C.c_char_p * nseqs)(*ordered)

    ctrees = argweaver_sample_thread(
        ctrees, times, len(times),
        popsizes, rho, mu,
        cseqs, nseqs, seqlen, None)
    arg = ctrees2arg(ctrees, names, times, verbose=verbose)

    if verbose:
        util.toc()

    return arg
Пример #4
0
def calc_joint_prob(arg, seqs, ntimes=20, mu=2.5e-8, rho=1.5e-8, popsizes=1e4,
                    times=None, verbose=False, delete_arg=True):
    """
    Return the value of argweaver_joint_prob for an ARG and sequences.

    A scalar popsizes (int or float) is expanded to one entry per time
    point.  When delete_arg is true, the C trees built from arg are passed
    to delete_local_trees before returning.
    """
    if times is None:
        times = argweaver.get_time_points(
            ntimes=ntimes, maxtime=80000, delta=.01)
    if isinstance(popsizes, (float, int)):
        popsizes = [popsizes] * len(times)

    if verbose:
        util.tic("calc likelihood")

    ctrees, names = arg2ctrees(arg, times)
    cseqs, nseqs, seqlen = seqs2cseqs(seqs, names)

    prob = argweaver_joint_prob(
        ctrees, times, len(times), popsizes, mu, rho, cseqs, nseqs, seqlen)
    if delete_arg:
        delete_local_trees(ctrees)

    if verbose:
        util.toc()

    return prob
    def addEvents(self, eventsfile):
        """
        Insert one Events row per (family, species tree node).

        Families are taken from the Families table; families without an
        entry in the events file are skipped.
        """
        if not tableExists(self.cur, "Events"):
            self.makeEventsTable()

        util.tic("add events")
        events_lookup = tablelib.read_table(eventsfile).lookup("partid")

        self.cur.execute("SELECT famid FROM Families;")
        famids = [record[0] for record in self.cur]

        insert_sql = """INSERT INTO Events VALUES
                    ("%s", "%s", %d, %d, %d, %d);"""
        # column suffixes of the per-species counts, in insert order
        suffixes = ("-genes", "-dup", "-loss", "-appear")

        for famid in famids:
            if famid not in events_lookup:
                continue
            row = events_lookup[famid]

            for sp in self.stree.nodes:
                sp = str(sp)
                counts = tuple(row[sp + suffix] for suffix in suffixes)
                self.cur.execute(insert_sql % ((famid, sp) + counts))
        util.toc()
    def addGoTerms(self, gofile):
        """Insert GO terms and gene-to-term links read from gofile."""

        if not tableExists(self.cur, "GoTerms"):
            self.makeGoTermsTable()

        util.tic("add go terms")
        goterms = tablelib.read_table(gofile)
        by_gene = goterms.groupby("orf")
        by_goid = goterms.groupby("goid")

        # one GoTerms row per goid, taken from the first record of its group
        for goid in by_goid:
            first = by_goid[goid][0]

            # NOTE(review): a double quote inside the term text is only
            # reported here, it is not escaped before building the SQL
            if '"' in first["term"]:
                print(first)

            self.cur.execute("""INSERT INTO GoTerms VALUES ("%s", "%s")""" %
                             (first["goid"], first["term"]))

        # one GeneGoTerms row per (gene, term record)
        for gene, records in by_gene.iteritems():
            for record in records:
                self.cur.execute(
                    """INSERT INTO GeneGoTerms VALUES ("%s", "%s");""" %
                    (gene, record["goid"]))
        util.toc()
    def addFamilies(self, eventsfile, discard=()):
        """
        Insert one Families row per row of the events table.

        eventsfile -- table file read with tablelib.read_table; the family
                      id is taken from its "partid" column
        discard    -- iterable of family ids to skip (each skip is logged)

        For every family the tree and alignment are loaded to compute the
        total branch length and the median sequence length.
        """
        if not tableExists(self.cur, "Families"):
            self.makeFamiliesTable()

        util.tic("add families")
        events_tab = tablelib.read_table(eventsfile)
        familyGeneNames = self.makeFamilyGeneNames()
        discard = set(discard)

        insert_sql = """INSERT INTO Families VALUES
                ("%s", "%s", %f, %f, %f, %d, %d, %d,
                "%s");"""

        for row in events_tab:
            famid = row["partid"]
            if famid in discard:
                util.logger("discarding '%s'" % famid)
                continue

            tree = treelib.read_tree(self.getTreeFile(famid))
            treelen = sum(x.dist for x in tree)
            seqs = fasta.read_fasta(self.getFastaFile(famid))
            seqlen = stats.median([len(seq) for seq in seqs.values()])

            # look the pair up once; families without names get two blanks
            names = familyGeneNames.get(famid, ("", ""))

            self.cur.execute(
                insert_sql %
                (famid,
                 names[0],
                 row["famrate"], treelen, seqlen * 3,
                 row["dup"], row["loss"], row["genes"],
                 names[1]))
        util.toc()
Пример #8
0
    def recon_helper(self, nsearch=1000):
        """Evaluate nsearch proposals and return self.maxrecon."""

        self.maxp = -util.INF
        self.maxrecon = None
        proposal = self.proposer.init_proposal()
        fallback = proposal.copy()

        for step in xrange(nsearch):
            if step % 10 == 0:
                print("search %d" % step)

            # score the current proposal
            util.tic("eval")
            prob = self.eval_proposal(proposal)
            util.toc()

            # let the search keep or discard it, then move to the next one
            util.tic("prop")
            self.eval_search(prob, proposal)
            proposal = self.proposer.next_proposal()
            util.toc()

        # nothing was kept by the search: fall back to the initial proposal
        if not self.maxrecon:
            self.maxrecon = fallback

        # rename locus tree nodes (coal_tree names are left untouched)
        dlcoal.rename_nodes(self.maxrecon.locus_tree, self.name_internal)

        return self.maxrecon
Пример #9
0
    def addGoTerms(self, gofile):
        """Fill the GoTerms and GeneGoTerms tables from the table in gofile."""

        if not tableExists(self.cur, "GoTerms"):
            self.makeGoTermsTable()

        util.tic("add go terms")
        goterms = tablelib.read_table(gofile)
        terms_by_gene = goterms.groupby("orf")
        terms_by_goid = goterms.groupby("goid")

        # the first record of each goid group supplies the term text
        for goid in terms_by_goid:
            head = terms_by_goid[goid][0]

            # NOTE(review): terms containing a double quote are printed but
            # not escaped before being formatted into the statement
            if '"' in head["term"]:
                print(head)

            self.cur.execute("""INSERT INTO GoTerms VALUES ("%s", "%s")""" %
                             (head["goid"], head["term"]))

        # link every gene to each of its term records
        for gene, records in terms_by_gene.iteritems():
            for record in records:
                self.cur.execute(
                    """INSERT INTO GeneGoTerms VALUES ("%s", "%s");""" %
                    (gene, record["goid"]))
        util.toc()
Пример #10
0
def trainTree(conf, stree, gene2species):
    """
    Learn model parameters from tree files and write them out.

    The tree files are the shell-expanded entries of conf["REST"]; the
    parameters returned by Spidir.learnModel are written to conf["param"].
    """
    treefiles = []
    for arg in conf["REST"]:
        treefiles.extend(util.shellparser(arg))

    util.tic("reading trees")
    trees = []
    prog = progress.ProgressBar(len(treefiles))
    for treefile in treefiles:
        prog.update()
        tree = treelib.read_tree(treefile)
        trees.append(tree)

        # even out top two branches: each gets half of their combined length
        first = tree.root.children[0]
        second = tree.root.children[1]
        half = (first.dist + second.dist) / 2.0
        first.dist = half
        second.dist = half

    util.toc()

    params = Spidir.learnModel(trees, stree, gene2species, conf["trainstats"],
                               filenames=treefiles)

    Spidir.writeParams(conf["param"], params)
Пример #11
0
    def _test_ml_speed(self):
        """Run find_ml_branch_lengths_hky ten times inside a util timer."""

        # substitution model parameters
        kappa = 1.59
        bgfreq = [.258, .267, .266, .209]

        # test data
        tree = treelib.readTree("test/data/flies.nt/0/0.tree")
        align = fasta.readFasta("test/data/flies.nt/0/0.align")

        nodes = sorted(tree.nodes.values(), key=lambda node: node.dist)

        util.tic("find ML")
        for _ in xrange(10):
            seqs = util.mget(align, tree.leafNames())
            result = spidir.find_ml_branch_lengths_hky(
                tree, seqs, bgfreq, kappa, maxiter=10)
        util.toc()

        # keep the branch lengths and result of the final call only
        dists = [[node.dist for node in nodes]]
        likes = [result]
Пример #12
0
def setTreeDistances(conf, tree, distmat, genes):
    """
    Fit branch lengths of tree and record the fit in tree.data.

    Uses parsimony_C or mlhkydist_C when pyspidir is available and the
    matching key is in conf; otherwise falls back to least squares on
    distmat.  tree.data["error"] is always set.
    """
    if isDebug(DEBUG_MED):
        util.tic("fit branch lengths")

    use_parsimony = pyspidir and "parsimony" in conf
    use_ml = (not use_parsimony) and pyspidir and "mlhkydist" in conf

    if use_parsimony:
        # estimate branch lengths with parsimony
        parsimony_C(conf["aln"], tree)
        total = sum(node.dist for node in tree.nodes.itervalues())
        tree.data["error"] = total

    elif use_ml:
        # estimate branch lengths with ML
        maxiter = 3 * len(tree.nodes)
        tree.data["distlogl"] = mlhkydist_C(
            conf["aln"], tree, conf["bgfreq"], conf["tsvratio"], maxiter)
        tree.data["error"] = 0.0

    else:
        # perform LSE
        lse = phylo.least_square_error(tree, distmat, genes)

        # an all-zero tree can come out of the greedy search; give every
        # branch a small length so the division below is defined
        if sum(node.dist for node in tree.nodes.values()) == 0:
            for node in tree.nodes.values():
                node.dist = .01

        resid_norm = math.sqrt(scipy.dot(lse.resids, lse.resids))
        treelen = sum(node.dist for node in tree.nodes.values())
        tree.data["error"] = resid_norm / treelen

        setBranchError(conf, tree, lse.resids, lse.paths, lse.edges,
                       lse.topmat)

    if isDebug(DEBUG_MED):
        util.toc()
Пример #13
0
    def draw_placed(self):
        """Build one group from the fragment, gene and match drawings."""
        util.tic("create draw code")

        # fragments
        vis = [self.frag_widget(frag) for frag in self.frags]

        # genes, each moved to its layout position
        for reg, pos in self.region_layout.iteritems():
            vis.append(
                translate(pos.x, pos.y,
                          self.gene_widget(self.db.get_region(reg))))

        # matches; the same set is handed to every draw_matches call
        drawn = set()
        vis.extend(
            self.draw_matches(frag.genome, frag.chrom, frag.start,
                              frag.end, drawn)
            for frag in self.frags)

        util.toc()

        self.groupid = group(*vis)
        return self.groupid
Пример #14
0
    def addFamilies(self, eventsfile, discard=()):
        """
        Populate the Families table from the events table in eventsfile.

        eventsfile -- path read with tablelib.read_table; each row's
                      "partid" is used as the family id
        discard    -- iterable of family ids that are logged and skipped

        The tree and alignment of each family are read to derive the summed
        branch length and the median sequence length.
        """
        if not tableExists(self.cur, "Families"):
            self.makeFamiliesTable()

        util.tic("add families")
        events_tab = tablelib.read_table(eventsfile)
        familyGeneNames = self.makeFamilyGeneNames()
        discard = set(discard)

        insert_sql = """INSERT INTO Families VALUES 
                                ("%s", "%s", %f, %f, %f, %d, %d, %d,
                                 "%s");"""

        for row in events_tab:
            famid = row["partid"]
            if famid in discard:
                util.logger("discarding '%s'" % famid)
                continue

            tree = treelib.read_tree(self.getTreeFile(famid))
            treelen = sum(x.dist for x in tree)
            seqs = fasta.read_fasta(self.getFastaFile(famid))
            seqlen = stats.median([len(seq) for seq in seqs.values()])

            # single lookup; a family without names gets two empty strings
            names = familyGeneNames.get(famid, ("", ""))

            self.cur.execute(
                insert_sql %
                (famid, names[0],
                 row["famrate"], treelen, seqlen * 3, row["dup"], row["loss"],
                 row["genes"], names[1]))
        util.toc()
Пример #15
0
def dnadist(seqs, output=None, verbose=True, force = False, args=None):
    """
    Compute a distance matrix with the PHYLIP program dnadist.

    seqs    -- alignment mapping names to sequences (checked by
               validate_seqs)
    output  -- optional file name; when given, "outfile" is renamed to
               "../" + output and not parsed
    verbose -- forwarded to exec_phylip
    force   -- unused in this function; kept so existing callers still work
    args    -- input text for exec_phylip, "y" when omitted

    Returns labels if output is given, else the pair (labels, matrix).
    """
    if args is None:
        args = "y"

    validate_seqs(seqs)
    cwd = create_temp_dir()
    util.tic("dnadist on %d of length %d" % (len(seqs), len(seqs.values()[0])))

    # create input and make sure it is closed (flushed) before phylip runs
    with open("infile", "w") as align_out:
        labels = write_phylip_align(align_out, seqs)

    # run phylip
    exec_phylip("dnadist", args, verbose)

    util.toc()

    # parse output
    if output is not None:
        os.rename("outfile", "../" + output)
        cleanup_temp_dir(cwd)
        return labels

    # the names stored in the matrix file are not used, only the matrix
    _names, mat = read_dist_matrix("outfile")
    cleanup_temp_dir(cwd)
    return labels, mat
Пример #16
0
    def addEvents(self, eventsfile):
        """
        Fill the Events table from the events file.

        For every famid in the Families table that also appears in the
        events file, one row is inserted per node of self.stree.
        """
        if not tableExists(self.cur, "Events"):
            self.makeEventsTable()

        util.tic("add events")
        events_lookup = tablelib.read_table(eventsfile).lookup("partid")

        self.cur.execute("SELECT famid FROM Families;")
        famids = [record[0] for record in self.cur]

        insert_sql = """INSERT INTO Events VALUES 
                                ("%s", "%s", %d, %d, %d, %d);"""
        # per-species column suffixes, in the order they are inserted
        suffixes = ("-genes", "-dup", "-loss", "-appear")

        for famid in famids:
            if famid not in events_lookup:
                continue
            row = events_lookup[famid]

            for sp in self.stree.nodes:
                sp = str(sp)
                counts = tuple(row[sp + suffix] for suffix in suffixes)
                self.cur.execute(insert_sql % ((famid, sp) + counts))
        util.toc()
Пример #17
0
 def processFunc():
     """
     Start blastall on the next batch of query sequences.

     Reads and updates the enclosing closure dict ("oldtmp", "index",
     "time") and uses seqs, split, prog, databaseFile and options from the
     enclosing scope.  Returns the os.popen pipe of the started command, or
     False when there are no sequences left.
     """
     all_names = seqs.keys()
     total = len(all_names)

     # remove old query tempfile if one exists
     if closure["oldtmp"] is not None:
         os.remove(closure["oldtmp"])
         # forget the removed file so a later call cannot remove it twice
         closure["oldtmp"] = None
         elapse = util.toc()
         closure["time"] += elapse

         done = closure["index"]
         util.log("blasted %d of %d sequences (%.1f%%), elapse %.0f m, left %.0f m" % (
             done, total,
             100 * float(done) / total,
             closure["time"] / 60.0,
             elapse / split * (total - done) / 60.0))

     util.tic()

     # find new subset of query sequences
     i = closure["index"]
     names = all_names[i:i+split]

     # if no more sequences then quit
     if not names:
         return False

     # start blast
     # NOTE(review): the command line is assembled by string formatting and
     # run through a shell; the values are assumed to be trusted
     tmpfile = util.tempfile(".", "blastp", ".fasta")
     seqs.write(tmpfile, names = names)
     pipe = os.popen("blastall -p %s -d %s -i %s -m 8 -e .1 %s" % \
         (prog, databaseFile, tmpfile, options))

     # update variables
     closure["oldtmp"] = tmpfile
     closure["index"] = i + split

     return pipe
Пример #18
0
def boot_proml(seqs, iters = 100, seed = 1, jumble=5, output=None,
               verbose=True, force = False):
    """
    Run PHYLIP seqboot followed by proml on an alignment.

    seqs    -- alignment mapping names to sequences (checked by
               validate_seqs)
    iters   -- number passed to proml and number of trees read back
    seed    -- seed passed to both seqboot and proml
    jumble  -- jumble count passed to proml
    output  -- if given, "outtree" is renamed to "../" + output
    verbose -- forwarded to exec_phylip
    force   -- not used by this function; kept for interface compatibility

    Returns labels when output is given, otherwise the list of trees read
    from "outtree".
    """
    validate_seqs(seqs)
    cwd = create_temp_dir()
    util.tic("bootProml on %d of length %d" % (len(seqs), len(seqs.values()[0])))

    # create input; closed explicitly so seqboot sees the complete file
    with open("infile", "w") as align_out:
        labels = write_phylip_align(align_out, seqs)

    exec_phylip("seqboot", "y\n%d" % seed, verbose)

    # the seqboot output becomes the proml input
    os.rename("outfile", "infile")
    exec_phylip("proml", "m\nD\n%d\n%d\n%d\ny" % (iters, seed, jumble), verbose)

    util.toc()

    # hand over the raw tree file if requested
    if output is not None:
        os.rename("outtree", "../" + output)
        cleanup_temp_dir(cwd)
        return labels

    # read tree samples; the handle is closed even if parsing fails
    trees = []
    with open("outtree") as infile:
        for _ in xrange(iters):
            tree = treelib.Tree()
            tree.read_newick(infile)
            rename_tree_with_names(tree, labels)
            trees.append(tree)
    cleanup_temp_dir(cwd)
    return trees
Пример #19
0
 def processFunc():
     """
     Launch blastall for the next slice of split query sequences.

     State is kept in the enclosing closure dict ("oldtmp", "index",
     "time"); seqs, split, prog, databaseFile and options also come from
     the enclosing scope.  Returns the pipe from os.popen, or False once
     every sequence has been handed out.
     """
     all_names = seqs.keys()
     total = len(all_names)

     # remove old query tempfile if one exists
     if closure["oldtmp"] is not None:
         os.remove(closure["oldtmp"])
         # clear the entry so the removed file is not removed again later
         closure["oldtmp"] = None
         elapse = util.toc()
         closure["time"] += elapse

         done = closure["index"]
         util.log("blasted %d of %d sequences (%.1f%%), elapse %.0f m, left %.0f m" % (
             done, total,
             100 * float(done) / total,
             closure["time"] / 60.0,
             elapse / split * (total - done) / 60.0))

     util.tic()

     # find new subset of query sequences
     i = closure["index"]
     names = all_names[i:i+split]

     # if no more sequences then quit
     if not names:
         return False

     # start blast
     # NOTE(review): the shell command is built with string formatting;
     # prog, databaseFile and options are assumed to be trusted values
     tmpfile = util.tempfile(".", "blastp", ".fasta")
     seqs.write(tmpfile, names = names)
     pipe = os.popen("blastall -p %s -d %s -i %s -m 8 -e .1 %s" % \
         (prog, databaseFile, tmpfile, options))

     # update variables
     closure["oldtmp"] = tmpfile
     closure["index"] = i + split

     return pipe
    def walk(node):
        """
        Merge partitions bottom-up through the tree.

        Children are processed first; every internal node then gets
        node.parts from mergeAvg or mergeBuh applied to the parts of its
        first two children.
        """
        for child in node.children:
            walk(child)

        if node.is_leaf():
            return

        def blast_files_between(names1, names2):
            # blast files registered for any pair (name1, name2)
            return [blastFileLookup[name1][name2]
                    for name1 in names1
                    for name2 in names2
                    if name1 in blastFileLookup and
                    name2 in blastFileLookup[name1]]

        leaves1 = node.children[0].leaf_names()
        leaves2 = node.children[1].leaf_names()

        # determine sibling blast files
        blastfiles = blast_files_between(leaves1, leaves2)

        # determine outgroup blast files: pairs between this node's leaves
        # and the parent's remaining leaves
        outblastfiles = []
        if node.parent:
            inleaves = leaves1 + leaves2
            outleaves = set(node.parent.leaf_names()) - set(inleaves)
            outblastfiles = blast_files_between(inleaves, outleaves)

        util.tic("merging")
        util.logger("leaves1: ", leaves1)
        util.logger("leaves2: ", leaves2)

        parts1 = node.children[0].parts
        parts2 = node.children[1].parts
        if "merge" in conf and conf["merge"] == "avg":
            node.parts = mergeAvg(conf, genes, parts1, parts2,
                                  blastfiles, outblastfiles)
        else:
            node.parts = mergeBuh(conf, genes, parts1, parts2, blastfiles)

        nparts = len(node.parts)
        if "output" in conf and nparts > 0:
            partfile = conf["output"] + str(node.name) + ".part"
            util.write_delim(partfile, node.parts)

        util.logger("number of parts: ", nparts)
        if nparts > 0:
            util.logger("largest part:", max(map(len, node.parts)))

        util.toc()
Пример #21
0
    def _test_ml(self):
        """Run 40 single-iteration ML fits and plot their convergence."""

        # model parameters
        kappa = 1.59
        bgfreq = [.258, .267, .266, .209]

        # input data
        tree = treelib.readTree("test/data/flies.nt/0/0.tree")
        align = fasta.readFasta("test/data/flies.nt/0/0.align")

        nodes = sorted(tree.nodes.values(), key=lambda node: node.dist)

        likes = []
        dists = []

        util.tic("find ML")
        for _ in range(40):
            result = spidir.find_ml_branch_lengths_hky(
                tree,
                util.mget(align, tree.leafNames()),
                bgfreq,
                kappa,
                parsinit=False,
                maxiter=1)

            # record branch lengths and the returned value after each call
            dists.append([node.dist for node in nodes])
            likes.append(result)
        util.toc()

        print(likes)

        prep_dir("test/output/ml/")

        # distances plot: one line per branch across the calls
        util.rplot_start("test/output/ml/ml_branches.pdf")
        first_branch = util.cget(dists, 0)
        util.rplot("plot", first_branch,
                   ylim=[0, max(dists[0])],
                   t="l",
                   main="branch length convergence",
                   xlab="iterations",
                   ylab="branch lengths (sub/site)")
        for branch in zip(*dists):
            util.rplot("lines", branch)
        util.rplot_end(True)

        print(util.cget(dists, 4))

        # likelihood plot
        util.rplot_start("test/output/ml/ml_likelihood.pdf")
        util.rplot("plot", likes,
                   t="l",
                   xlab="iterations",
                   ylab="log likelihood",
                   main="likelihood convergence")
        util.rplot_end(True)
Пример #22
0
def draw_raxml_tree(tr, adef):
    """Convert a RAxML tree to newick, unroot it and draw it on stdout."""
    # serialize the tree
    util.tic("Tree to string...")
    newick = raxml.tree_to_string(tr, adef)
    util.toc()

    # parse, unroot and draw
    util.tic("Drawing tree...")
    unrooted = treelib.unroot(treelib.parse_newick(newick))
    treelib.draw_tree(unrooted, out=sys.stdout, minlen=5, maxlen=5)
    util.toc()
Пример #23
0
def draw_raxml_tree(tr, adef):
    """Draw the unrooted form of a RAxML tree on stdout, timing both steps."""
    util.tic("Tree to string...")
    tree_text = raxml.tree_to_string(tr, adef)
    util.toc()

    util.tic("Drawing tree...")
    parsed = treelib.parse_newick(tree_text)
    treelib.draw_tree(treelib.unroot(parsed),
                      out=sys.stdout, minlen=5, maxlen=5)
    util.toc()
def pamp(seqs, tree, seqtype="dna", saveOutput="", verbose=False, safe=True):
    """
    Run the external program pamp on an alignment and tree.

    seqs       -- alignment; with safe and seqtype "dna" it is first mapped
                  through removeStopCodons
    tree       -- tree written to the "tree" input file
    seqtype    -- "dna" or "pep"; anything else raises Exception
    saveOutput -- if non-empty, the temp directory is saved there with
                  phylip.save_temp_dir instead of being cleaned up
    verbose    -- if false, the program's output is sent to /dev/null

    Returns (tree2, aln) taken from the parsed results file "out".
    """
    if safe and seqtype == "dna":
        seqs = alignlib.mapalign(seqs, valfunc=removeStopCodons)

    phylip.validate_seqs(seqs)
    cwd = phylip.create_temp_dir()

    util.tic("pamp on %d of length %d" % (len(seqs), len(seqs.values()[0])))

    # create input
    nex = nexus.Nexus("align", "w")
    nex.write_matrix(seqs.keys(), seqs.values(), seqtype, seqs.alignlen())
    nex.close()
    with open("tree", "w") as treefile:
        treefile.write("%d 1\n" % len(tree.leaves()))
        tree.write(treefile, writeData=lambda x: "")

    # create control file; "with" closes it even when seqtype is rejected
    ctlfile = "pamp.ctl"
    with open(ctlfile, "w") as out:
        out.write("seqfile = align\n")
        out.write("treefile = tree\n")
        out.write("outfile = out\n")

        if seqtype == "dna":
            out.write("seqtype = 0\n")
        elif seqtype == "pep":
            out.write("seqtype = 2\n")
        else:
            raise Exception("unknown seqtype '%s'" % seqtype)
        out.write("ncatG = 8\n")
        out.write("nhomo = 0\n")

    # run pamp on the control file written above (the command previously
    # named "paml.ctl", a file this function never creates)
    command = "pamp " + ctlfile
    if not verbose:
        command += " > /dev/null"
    os.system(command)

    res = PamlResults("out")
    aln = res.getPampReconstruction()
    aln.write("recon.mfa")
    tree2 = res.getBranchNames()
    renameTreeAlign(tree2, aln)

    if saveOutput != "":
        phylip.save_temp_dir(cwd, saveOutput)
    else:
        phylip.cleanup_temp_dir(cwd)

    util.toc()

    return tree2, aln
Пример #25
0
def pamp(seqs, tree, seqtype="dna", saveOutput="", verbose=False, safe=True):
    """
    Write pamp input files, run pamp and parse its results.

    seqs       -- alignment; when safe is true and seqtype is "dna" it is
                  passed through alignlib.mapalign with removeStopCodons
    tree       -- tree written to the file "tree"
    seqtype    -- "dna" or "pep"; other values raise Exception
    saveOutput -- non-empty: keep the temp directory via
                  phylip.save_temp_dir; empty: clean it up
    verbose    -- when false, program output is redirected to /dev/null

    Returns the pair (tree2, aln) obtained from PamlResults("out").
    """
    if safe and seqtype == "dna":
        seqs = alignlib.mapalign(seqs, valfunc=removeStopCodons)

    phylip.validate_seqs(seqs)
    cwd = phylip.create_temp_dir()

    util.tic("pamp on %d of length %d" % (len(seqs), len(seqs.values()[0])))

    # create input
    nex = nexus.Nexus("align", "w")
    nex.write_matrix(seqs.keys(), seqs.values(), seqtype, seqs.alignlen())
    nex.close()
    with open("tree", "w") as treefile:
        treefile.write("%d 1\n" % len(tree.leaves()))
        tree.write(treefile, writeData=lambda x: "")

    # create control file; the handle is closed even if seqtype is invalid
    ctlfile = "pamp.ctl"
    with open(ctlfile, "w") as out:
        out.write("seqfile = align\n")
        out.write("treefile = tree\n")
        out.write("outfile = out\n")

        if seqtype == "dna":
            out.write("seqtype = 0\n")
        elif seqtype == "pep":
            out.write("seqtype = 2\n")
        else:
            raise Exception("unknown seqtype '%s'" % seqtype)
        out.write("ncatG = 8\n")
        out.write("nhomo = 0\n")

    # run pamp with the control file created above; the old command line
    # pointed at "paml.ctl", which is never written here
    command = "pamp " + ctlfile
    if not verbose:
        command += " > /dev/null"
    os.system(command)

    res = PamlResults("out")
    aln = res.getPampReconstruction()
    aln.write("recon.mfa")
    tree2 = res.getBranchNames()
    renameTreeAlign(tree2, aln)

    if saveOutput != "":
        phylip.save_temp_dir(cwd, saveOutput)
    else:
        phylip.cleanup_temp_dir(cwd)

    util.toc()

    return tree2, aln
Пример #26
0
def resample_arg_region(arg, seqs, region_start, region_end,
                        ntimes=20, rho=1.5e-8, mu=2.5e-8,
                        popsizes=1e4, times=None, carg=False,
                        refine=1, verbose=False):
    """
    Resample an ARG between region_start and region_end.

    A scalar popsizes is expanded to one value per time point.  Returns the
    pair (trees, names) when carg is true, otherwise the result of
    ctrees2arg.
    """
    if times is None:
        times = argweaver.get_time_points(
            ntimes=ntimes, maxtime=80000, delta=.01)
    if isinstance(popsizes, (float, int)):
        popsizes = [popsizes] * len(times)

    if verbose:
        util.tic("resample arg")

        # convert arg to c++ (timed separately when verbose)
        util.tic("convert arg")
    ctrees, names = arg2ctrees(arg, times)
    if verbose:
        util.toc()

    # keep the ARG's sequence order and append every sequence that is
    # not in the ARG yet
    known = set(names)
    names.extend(name for name, _seq in seqs.items() if name not in known)
    cseqs, nseqs, seqlen = seqs2cseqs(seqs, names)

    # the length actually passed on is taken from the first named sequence
    seqlen = len(seqs[names[0]])

    # resample arg
    ctrees = argweaver_resample_arg_region(
        ctrees, times, len(times),
        popsizes, rho, mu, cseqs, nseqs, seqlen,
        region_start, region_end, refine)

    # keep the C form or convert arg back to python
    if carg:
        arg = (ctrees, names)
    else:
        arg = ctrees2arg(ctrees, names, times, verbose=verbose)

    if verbose:
        util.toc()

    return arg
Пример #27
0
    def _test_ml(self):
        """Plot branch lengths and returned values over 40 one-step ML fits."""

        # substitution model parameters
        kappa = 1.59
        bgfreq = [.258, .267, .266, .209]

        # test data
        tree = treelib.readTree("test/data/flies.nt/0/0.tree")
        align = fasta.readFasta("test/data/flies.nt/0/0.align")

        nodes = sorted(tree.nodes.values(), key=lambda node: node.dist)

        likes = []
        dists = []

        util.tic("find ML")
        for _ in range(40):
            seqs = util.mget(align, tree.leafNames())
            result = spidir.find_ml_branch_lengths_hky(
                tree, seqs, bgfreq, kappa,
                parsinit=False,
                maxiter=1)

            # snapshot after every call
            dists.append([node.dist for node in nodes])
            likes.append(result)
        util.toc()

        print(likes)

        prep_dir("test/output/ml/")

        # distances plot
        util.rplot_start("test/output/ml/ml_branches.pdf")
        util.rplot("plot",
                   util.cget(dists, 0),
                   ylim=[0, max(dists[0])],
                   t="l",
                   main="branch length convergence",
                   xlab="iterations",
                   ylab="branch lengths (sub/site)")
        for branch in zip(*dists):
            util.rplot("lines", branch)
        util.rplot_end(True)

        print(util.cget(dists, 4))

        # likelihood plot
        util.rplot_start("test/output/ml/ml_likelihood.pdf")
        util.rplot("plot",
                   likes,
                   t="l",
                   xlab="iterations",
                   ylab="log likelihood",
                   main="likelihood convergence")
        util.rplot_end(True)
Пример #28
0
def sample_all_arg(seqs, ntimes=20, rho=1.5e-8, mu=2.5e-8, popsizes=1e4,
                   refine=1, times=None, verbose=False, carg=False,
                   prob_path_switch=.1):
    """
    Sample an ARG for all sequences, starting from a trunk ARG.

    The trunk is made from the first sequence of seqs.  Returns the pair
    (trees, names) when carg is true, otherwise the result of ctrees2arg.
    """
    if times is None:
        times = argweaver.get_time_points(
            ntimes=ntimes, maxtime=80000, delta=.01)
    if isinstance(popsizes, (float, int)):
        popsizes = [popsizes] * len(times)

    if verbose:
        util.tic("resample arg")

        # convert arg to c++ (timed separately when verbose)
        util.tic("convert arg")

    trunk_len = len(seqs.values()[0])
    arg = argweaver.make_trunk_arg(0, trunk_len, name=seqs.keys()[0])
    ctrees, names = arg2ctrees(arg, times)
    if verbose:
        util.toc()

    # sequences in ARG order first, then every sequence not in the ARG yet
    ordered = [seqs[name] for name in names]
    known = set(names)
    for name, seq in seqs.items():
        if name in known:
            continue
        names.append(name)
        ordered.append(seq)

    # resample arg
    nseqs = len(ordered)
    seqlen = len(seqs[names[0]])
    cseqs = (C.c_char_p * nseqs)(*ordered)
    ctrees = argweaver_resample_all_arg(
        ctrees, times, len(times),
        popsizes, rho, mu,
        cseqs, nseqs,
        seqlen, refine, prob_path_switch)

    if carg:
        arg = (ctrees, names)
    else:
        # convert arg back to python
        arg = ctrees2arg(ctrees, names, times, verbose=verbose)

    if verbose:
        util.toc()

    return arg
Пример #29
0
    def walk(node):
        """
        Recursively merge the parts of each internal node's two children.

        Children are visited before their parent; leaves are left
        untouched.  The merged result is stored in node.parts.
        """
        for child in node.children:
            walk(child)

        if node.is_leaf():
            return

        def pair_files(names1, names2):
            # collect the blast file of every registered (name1, name2) pair
            files = []
            for name1 in names1:
                for name2 in names2:
                    if name1 in blastFileLookup and \
                       name2 in blastFileLookup[name1]:
                        files.append(blastFileLookup[name1][name2])
            return files

        leaves1 = node.children[0].leaf_names()
        leaves2 = node.children[1].leaf_names()

        # determine sibling blast files
        blastfiles = pair_files(leaves1, leaves2)

        # determine outgroup blast files: go up one level and pair this
        # node's leaves with the parent's other leaves
        outblastfiles = []
        if node.parent:
            inleaves = leaves1 + leaves2
            outleaves = set(node.parent.leaf_names()) - set(inleaves)
            outblastfiles = pair_files(inleaves, outleaves)

        util.tic("merging")
        util.logger("leaves1: ", leaves1)
        util.logger("leaves2: ", leaves2)

        use_avg = "merge" in conf and conf["merge"] == "avg"
        if use_avg:
            node.parts = mergeAvg(conf, genes, node.children[0].parts,
                                  node.children[1].parts, blastfiles,
                                  outblastfiles)
        else:
            node.parts = mergeBuh(conf, genes, node.children[0].parts,
                                  node.children[1].parts, blastfiles)

        has_parts = len(node.parts) > 0
        if "output" in conf and has_parts:
            util.write_delim(conf["output"] + str(node.name) + ".part",
                             node.parts)

        util.logger("number of parts: ", len(node.parts))
        if has_parts:
            util.logger("largest part:", max(map(len, node.parts)))

        util.toc()
Пример #30
0
def ctrees2arg(trees, names, times, verbose=False, delete_arg=True):
    """
    Build a python ARG from the C local-trees structure `trees`.

    names      -- leaf names; must number (nnodes + 1) / 2
    times      -- time points forwarded to treeset2arg()
    verbose    -- when true, time the conversion with util.tic()/util.toc()
    delete_arg -- when true, release `trees` with delete_local_trees()
                  once the python ARG has been built

    Returns the ARG produced by treeset2arg().
    """

    if verbose:
        util.tic("convert arg")

    # dimensions of the C structure
    nnodes = get_local_trees_nnodes(trees)
    ntrees = get_local_trees_ntrees(trees)

    # zero-filled buffers, one row per local tree, for the C side to fill
    ptrees = [[0] * nnodes for _ in range(ntrees)]
    ages = [[0] * nnodes for _ in range(ntrees)]
    sprs = [[0, 0, 0, 0] for _ in range(ntrees)]
    blocklens = [0] * ntrees

    get_local_trees_ptrees(trees, ptrees, ages, sprs, blocklens)

    # slice every row so that only plain python lists remain
    ptrees = [row[:nnodes] for row in ptrees]
    ages = [row[:nnodes] for row in ages]
    sprs = [row[:4] for row in sprs]

    # consecutive (start, end) coordinates from the block lengths
    blocks = []
    pos = 0
    for length in blocklens:
        blocks.append((pos, pos + length))
        pos = pos + length

    assert ((nnodes + 1) / 2) == len(names)

    arg = treeset2arg(ptrees, ages, sprs, blocks, names, times)

    if delete_arg:
        delete_local_trees(trees)

    if verbose:
        util.toc()

    return arg
    def addGenes(self, species, gff_files, region_filter=lambda x: x):
        """
        Populate the Genes table from GFF files.

        species and gff_files are walked in lockstep; every region read
        from a file (through `region_filter`) becomes one row keyed by the
        region's "ID" attribute.  Only the first record of a gene id is
        inserted.  The family column is "NONE" for genes without a family
        and for families with fewer than two genes.
        """

        # create the table on first use
        if not tableExists(self.cur, "Genes"):
            self.makeGenesTable()

        seen = set()

        util.tic("add genes")
        for _sp, gff_path in zip(species, gff_files):
            for region in gff.read_gff(gff_path, regionFilter=region_filter):
                gene = region.data["ID"]

                # family id, unless the gene is unknown or a singleton
                famid = "NONE"
                if gene in self.fams.genelookup:
                    candidate = self.fams.getFamid(gene)
                    if len(self.fams.getGenes(candidate)) >= 2:
                        famid = candidate

                # skip repeated gene ids
                if gene in seen:
                    continue
                seen.add(gene)

                assert region.start <= region.end

                # common name / description when annotated, else blank
                common, desc = "", ""
                if gene in self.gene2name:
                    common = self.gene2name[gene]["name"]
                    desc = self.gene2name[gene]["description"]

                # NOTE(review): the statement is assembled by string
                # formatting; only double quotes in the description are
                # stripped.
                cmd = ("""INSERT INTO Genes VALUES
                         ("%s", "%s", "%s", "%s", %d, %d, %d, "%s", "%s");""" %
                       (gene, common, self.gene2species(gene),
                        region.seqname, region.start, region.end,
                        region.strand, desc.replace('"', ''), famid))

                self.cur.execute(cmd)
        util.toc()
Пример #32
0
    def addGenes(self, species, gff_files, region_filter=lambda x: x):
        """
        Fill the Genes table with one row per gene found in the GFF files.

        The files in `gff_files` are paired with `species` by position.
        Regions are read through `region_filter`; a gene id that was
        already inserted is skipped.  Genes that belong to no family, or
        to a family with fewer than two genes, get the family id "NONE".
        """

        # make sure the table exists before inserting
        if not tableExists(self.cur, "Genes"):
            self.makeGenesTable()

        inserted = set()

        util.tic("add genes")
        for _species_name, filename in zip(species, gff_files):
            for reg in gff.read_gff(filename, regionFilter=region_filter):
                gene = reg.data["ID"]

                # look up the gene family
                if gene not in self.fams.genelookup:
                    famid = "NONE"
                else:
                    famid = self.fams.getFamid(gene)
                    if len(self.fams.getGenes(famid)) < 2:
                        famid = "NONE"

                # each gene id is written once
                if gene in inserted:
                    continue
                inserted.add(gene)

                assert reg.start <= reg.end

                # optional annotation
                if gene not in self.gene2name:
                    common = ""
                    desc = ""
                else:
                    common = self.gene2name[gene]["name"]
                    desc = self.gene2name[gene]["description"]

                # NOTE(review): SQL built by string formatting; only
                # double quotes in the description are removed.
                values = (gene,
                          common,
                          self.gene2species(gene),
                          reg.seqname,
                          reg.start,
                          reg.end,
                          reg.strand,
                          desc.replace('"', ''),
                          famid)
                cmd = """INSERT INTO Genes VALUES 
                               ("%s", "%s", "%s", "%s", %d, %d, %d, "%s", "%s");""" % values

                self.cur.execute(cmd)
        util.toc()
Пример #33
0
def resample_mcmc_arg(arg, seqs, ntimes=20,
                      rho=1.5e-8, mu=2.5e-8, popsizes=1e4,
                      refine=1, times=None, verbose=False, carg=False,
                      window=200000, niters2=5):
    """
    Resample an ARG for the sequences `seqs` with the C MCMC routine.

    Sequences in `seqs` that are not yet leaves of `arg` are appended to
    the name list handed to the sampler.  A scalar `popsizes` is expanded
    to one value per time point.  With carg=True the result is the pair
    (trees, names); otherwise it is converted back to a python ARG.
    """
    if times is None:
        times = argweaver.get_time_points(
            ntimes=ntimes, maxtime=80000, delta=.01)
    if isinstance(popsizes, (float, int)):
        popsizes = [popsizes] * len(times)

    # outer timer for the whole call, inner timer for the conversion
    if verbose:
        util.tic("resample arg")
        util.tic("convert arg")
    trees, names = arg2ctrees(arg, times)
    if verbose:
        util.toc()

    # keep the ARG's own leaf order, then add sequences not in the ARG yet
    in_arg = set(names)
    names = list(names) + [name for name in seqs if name not in in_arg]
    seqs2, nseqs, seqlen = seqs2cseqs(seqs, names)

    # run the sampler
    trees = argweaver_resample_mcmc_arg(
        trees, times, len(times),
        popsizes, rho, mu,
        seqs2, nseqs, seqlen, refine, niters2, window)

    if carg:
        result = (trees, names)
    else:
        # back to a python ARG
        result = ctrees2arg(trees, names, times, verbose=verbose)

    if verbose:
        util.toc()

    return result
Пример #34
0
def dndsMatrix(seqs, saveOutput="", verbose=False, safe=True):
    """
    Run the external `yn00` program on an alignment and read back the
    dN and dS distance matrices.

    seqs       -- alignment (mapping of name -> sequence)
    saveOutput -- if not "", the temp directory is kept via
                  phylip.save_temp_dir(); otherwise it is cleaned up
    verbose    -- if false, yn00's stdout is sent to /dev/null
    safe       -- if true, sequences are first mapped through
                  removeStopCodons and an unreadable result yields
                  all-zero dummy matrices instead of an exception

    Returns (dnmat, dsmat).  Raises Exception("could not read dn or ds
    matrices") when the matrices cannot be read and safe is false; the
    temp directory is cleaned up (or saved) and the timer closed before
    the exception propagates.
    """

    if safe:
        seqs = alignlib.mapalign(seqs, valfunc=removeStopCodons)

    phylip.validate_seqs(seqs)
    cwd = phylip.create_temp_dir()

    util.tic("yn00 on %d of length %d" % (len(seqs), len(seqs.values()[0])))

    # create input
    labels = phylip.write_phylip_align(file("seqfile.phylip", "w"), seqs)
    util.write_list(file("labels", "w"), labels)

    # create control file
    out = file("yn00.ctl", "w")
    print >> out, "seqfile = seqfile.phylip"
    print >> out, "outfile = outfile"
    out.close()

    # run yn00
    if verbose:
        os.system("yn00 yn00.ctl")
    else:
        os.system("yn00 yn00.ctl > /dev/null")

    failed = False
    try:
        dnmat = phylip.read_dist_matrix("2YN.dN")
        dsmat = phylip.read_dist_matrix("2YN.dS")
    except Exception:
        # could not make distance matrix
        # (Exception, not a bare except, so KeyboardInterrupt/SystemExit
        # still propagate)
        if safe:
            # make dummy matrices; every row is its own list so that
            # editing one cell does not change the whole column
            n = len(labels)
            dnmat = labels, [[0] * n for _ in range(n)]
            dsmat = labels, [[0] * n for _ in range(n)]
        else:
            # defer the raise until the temp dir has been dealt with
            failed = True

    if saveOutput != "":
        phylip.save_temp_dir(cwd, saveOutput)
    else:
        phylip.cleanup_temp_dir(cwd)

    util.toc()

    if failed:
        raise Exception("could not read dn or ds matrices")

    return dnmat, dsmat
def dndsMatrix(seqs, saveOutput="", verbose=False, safe=True):
    """
    Compute dN and dS distance matrices for an alignment by running the
    external `yn00` program in a temporary directory.

    seqs       -- alignment (mapping of name -> sequence)
    saveOutput -- "" to discard the temp directory, otherwise the name
                  passed to phylip.save_temp_dir()
    verbose    -- show yn00's stdout when true
    safe       -- when true, strip stop codons first (removeStopCodons)
                  and fall back to all-zero matrices if the yn00 output
                  cannot be read

    Returns (dnmat, dsmat).  When safe is false and the output cannot be
    read, Exception("could not read dn or ds matrices") is raised after
    the temp directory was saved or removed and the timer stopped.
    """

    if safe:
        seqs = alignlib.mapalign(seqs, valfunc=removeStopCodons)

    phylip.validate_seqs(seqs)
    cwd = phylip.create_temp_dir()

    util.tic("yn00 on %d of length %d" % (len(seqs), len(seqs.values()[0])))

    # create input
    labels = phylip.write_phylip_align(file("seqfile.phylip", "w"), seqs)
    util.write_list(file("labels", "w"), labels)

    # create control file
    out = file("yn00.ctl", "w")
    print >>out, "seqfile = seqfile.phylip"
    print >>out, "outfile = outfile"
    out.close()

    # run yn00
    if verbose:
        os.system("yn00 yn00.ctl")
    else:
        os.system("yn00 yn00.ctl > /dev/null")

    read_failed = False
    try:
        dnmat = phylip.read_dist_matrix("2YN.dN")
        dsmat = phylip.read_dist_matrix("2YN.dS")
    except Exception:
        # could not make distance matrix; a bare except would also have
        # swallowed KeyboardInterrupt and SystemExit
        if safe:
            # dummy matrices with independent rows (list multiplication
            # of the outer list would alias a single row n times)
            size = len(labels)
            dnmat = labels, [[0] * size for _ in range(size)]
            dsmat = labels, [[0] * size for _ in range(size)]
        else:
            # raise only after the temp dir has been handled below
            read_failed = True

    if saveOutput != "":
        phylip.save_temp_dir(cwd, saveOutput)
    else:
        phylip.cleanup_temp_dir(cwd)

    util.toc()

    if read_failed:
        raise Exception("could not read dn or ds matrices")

    return dnmat, dsmat
    def addPfamGenes(self, pfamfile):
        """
        Add pfam domain hits from the table file `pfamfile`.

        Each row contributes one GenePfamDomains record built from its
        "locus", "pfam_acc", "start", "end", "score" and "evalue" columns.
        Everything from the first "." onwards is dropped from the
        accession.
        """

        # NOTE(review): the existence check names "PfamDomains" while the
        # inserts go to "GenePfamDomains" -- confirm makePfamTable()
        # creates the latter.
        if not tableExists(self.cur, "PfamDomains"):
            self.makePfamTable()

        util.tic("add pfam domains")

        pfams = tablelib.read_table(pfamfile)

        # raw string: "\." is not a valid escape in a normal literal;
        # compiled once instead of once per row
        suffix = re.compile(r"\..*$")

        for row in pfams:
            name = suffix.sub("", row["pfam_acc"])

            # NOTE(review): %f keeps six decimals, so very small e-values
            # are written as 0.000000 -- confirm this is acceptable.
            self.cur.execute("""INSERT INTO GenePfamDomains VALUES
                                ("%s", "%s", %d, %d, %f, %f);""" %
                             (row["locus"], name, row["start"],
                              row["end"], row["score"], row["evalue"]))
        util.toc()
Пример #37
0
    def addPfamGenes(self, pfamfile):
        """
        Load pfam domain hits from `pfamfile` into GenePfamDomains.

        One record is inserted per table row, using the columns "locus",
        "pfam_acc" (cut at its first "."), "start", "end", "score" and
        "evalue".
        """

        # NOTE(review): "PfamDomains" is checked but "GenePfamDomains" is
        # written -- verify against makePfamTable().
        if not tableExists(self.cur, "PfamDomains"):
            self.makePfamTable()

        util.tic("add pfam domains")

        pfams = tablelib.read_table(pfamfile)

        # raw-string pattern (a plain "\." is an invalid escape sequence),
        # hoisted out of the row loop
        version_suffix = re.compile(r"\..*$")

        for row in pfams:
            name = version_suffix.sub("", row["pfam_acc"])

            # NOTE(review): "%f" rounds to six decimals; tiny e-values end
            # up stored as 0.000000.
            self.cur.execute("""INSERT INTO GenePfamDomains VALUES
                                ("%s", "%s", %d, %d, %f, %f);""" %
                             (row["locus"], name, row["start"], row["end"],
                              row["score"], row["evalue"]))
        util.toc()
Пример #38
0
def argweaver_forward_algorithm(arg, seqs, rho=1.5e-8,
                                mu=2.5e-8, popsizes=1e4, times=None,
                                ntimes=20, maxtime=180000,
                                verbose=False,
                                prior=[], internal=False, slow=False):
    """
    Run the C forward algorithm and return its table as python lists.

    `arg` may be a C arg, i.e. a (trees, names) pair, or a python ARG that
    is converted with arg2ctrees().  Sequences named by the ARG come
    first, followed by the remaining sequences of `seqs`.  A scalar
    `popsizes` is expanded to one value per time point.

    Returns one list per alignment column, cut to that column's number of
    states.
    """
    if times is None:
        times = argweaver.get_time_points(
            ntimes=ntimes, maxtime=maxtime, delta=.01)
    if isinstance(popsizes, (float, int)):
        popsizes = [popsizes] * len(times)

    if verbose:
        util.tic("forward")

    trees, names = arg if is_carg(arg) else arg2ctrees(arg, times)

    # sequences in ARG order, then everything not in the ARG
    seqs2 = [seqs[node] for node in names]
    seqs2.extend(seqs[name] for name in seqs.keys() if name not in names)
    nseqs = len(seqs2)
    seqlen = len(seqs2[0])

    cseqs = (C.c_char_p * nseqs)(*seqs2)
    has_prior = len(prior) > 0
    fw = argweaver_forward_alg(trees, times, len(times),
                               popsizes, rho, mu,
                               cseqs, nseqs,
                               seqlen, has_prior, prior, internal,
                               slow)

    # number of states per column, filled in by the C call
    nstates = [0] * seqlen
    argweaver_get_nstates(trees, len(times), internal, nstates)

    # copy the used part of every row before the C matrix is released
    probs = [row[:count] for row, count in zip(fw, nstates)]

    delete_forward_matrix(fw, seqlen)

    if verbose:
        util.toc()

    return probs
Пример #39
0
def boot_neighbor(seqs,
                  iters=100,
                  seed=None,
                  output=None,
                  verbose=True,
                  force=False):
    """
    Bootstrap neighbor-joining trees with PHYLIP.

    Runs seqboot, protdist and neighbor in a temporary directory.

    seqs    -- alignment (mapping of name -> sequence)
    iters   -- number of bootstrap replicates
    seed    -- random seed handed to PHYLIP; a random odd number is drawn
               when None
    output  -- if given, "outtree" is moved to "../" + output and the
               sequence labels are returned
    verbose -- forwarded to exec_phylip()
    force   -- unused here

    Returns the labels when `output` is given, otherwise a list of
    `iters` trees read from "outtree".
    """

    if seed is None:
        # random.randint (the original "randInt" does not exist and
        # raised AttributeError whenever no seed was supplied)
        seed = random.randint(0, 1000) * 2 + 1

    validate_seqs(seqs)
    cwd = create_temp_dir()
    util.tic("boot_neighbor on %d of length %d" %
             (len(seqs), len(seqs.values()[0])))

    # create input
    labels = write_phylip_align(file("infile", "w"), seqs)

    exec_phylip("seqboot", "r\n%d\ny\n%d" % (iters, seed), verbose)

    # each program reads the previous program's output as "infile"
    os.rename("outfile", "infile")
    exec_phylip("protdist", "m\nd\n%d\ny" % iters, verbose)

    os.rename("outfile", "infile")
    exec_phylip("neighbor", "m\n%d\n%d\ny" % (iters, seed), verbose)

    util.toc()

    # read tree samples
    if output is not None:
        os.rename("outtree", "../" + output)
        cleanup_temp_dir(cwd)
        return labels
    else:
        trees = []
        infile = file("outtree")
        for i in xrange(iters):
            tree = treelib.Tree()
            tree.read_newick(infile)
            # NOTE(review): boot_proml calls rename_tree_with_names();
            # confirm that this singular spelling exists.
            rename_tree_with_name(tree, labels)
            trees.append(tree)
        infile.close()
        cleanup_temp_dir(cwd)
        return trees
Пример #40
0
def est_popsizes_trees(arg, times, step, verbose=False):
    """
    Estimate population sizes from the local trees of `arg`.

    The ARG is converted with arg2ctrees() and handed to the C estimator
    together with `times` and `step`.  Returns a list of
    len(times) - 1 values.  The converted trees are released afterwards
    unless `arg` is itself a C arg.
    """

    if verbose:
        util.tic("convert arg")
    trees, _names = arg2ctrees(arg, times)

    # close the conversion timer and open the estimation timer
    if verbose:
        util.toc()
        util.tic("estimate popsizes")

    ntimes = len(times)
    popsizes = [0.0 for _ in range(ntimes - 1)]
    argweaver_est_popsizes_trees(trees, times, ntimes, step, popsizes)

    if verbose:
        util.toc()

    caller_owns_trees = is_carg(arg)
    if not caller_owns_trees:
        delete_local_trees(trees)

    return popsizes
Пример #41
0
def boot_neighbor(seqs, iters=100, seed=None, output=None, 
                 verbose=True, force=False):
    """
    Build bootstrapped neighbor-joining trees by chaining the PHYLIP
    programs seqboot, protdist and neighbor in a temporary directory.

    seqs    -- alignment (mapping of name -> sequence)
    iters   -- number of bootstrap replicates
    seed    -- seed given to PHYLIP; None draws a random odd number
    output  -- optional file name; when given, "outtree" is moved to
               "../" + output and the labels are returned
    verbose -- forwarded to exec_phylip()
    force   -- unused here

    Returns the sequence labels (when `output` is given) or a list of
    `iters` trees.
    """

    if seed is None:
        # fixed: the random module has randint, not randInt, so the
        # default seed used to fail with AttributeError
        seed = random.randint(0, 1000) * 2 + 1

    validate_seqs(seqs)
    cwd = create_temp_dir()
    util.tic("boot_neighbor on %d of length %d" % (len(seqs), len(seqs.values()[0])))

    # create input
    labels = write_phylip_align(file("infile", "w"), seqs)

    exec_phylip("seqboot", "r\n%d\ny\n%d" % (iters, seed), verbose)

    # the output of one step becomes the input of the next
    os.rename("outfile", "infile")
    exec_phylip("protdist", "m\nd\n%d\ny" % iters, verbose)

    os.rename("outfile", "infile")
    exec_phylip("neighbor", "m\n%d\n%d\ny" % (iters, seed), verbose)

    util.toc()

    # read tree samples
    if output is not None:
        os.rename("outtree", "../" + output)
        cleanup_temp_dir(cwd)
        return labels
    else:
        trees = []
        infile = file("outtree")
        for i in xrange(iters):
            tree = treelib.Tree()
            tree.read_newick(infile)
            # NOTE(review): boot_proml uses rename_tree_with_names();
            # verify the singular name is defined.
            rename_tree_with_name(tree, labels)
            trees.append(tree)
        infile.close()
        cleanup_temp_dir(cwd)
        return trees
Пример #42
0
def boot_proml(seqs, iters=100, seed=1, jumble=5, output=None,
               verbose=True, force=False):
    """
    Bootstrap trees with the PHYLIP programs seqboot and proml.

    seqs    -- alignment (mapping of name -> sequence)
    iters, seed, jumble -- numbers written into the proml answers
    output  -- when given, "outtree" is moved to "../" + output and the
               sequence labels are returned
    verbose -- forwarded to exec_phylip()
    force   -- unused here

    Returns the labels, or a list of `iters` trees when no output file
    was requested.
    """
    validate_seqs(seqs)
    cwd = create_temp_dir()
    nseqs = len(seqs)
    seqlen = len(seqs.values()[0])
    util.tic("bootProml on %d of length %d" % (nseqs, seqlen))

    # write the alignment, resample it, then feed the replicates to proml
    labels = write_phylip_align(file("infile", "w"), seqs)

    exec_phylip("seqboot", "y\n%d" % seed, verbose)

    os.rename("outfile", "infile")
    proml_answers = "m\nD\n%d\n%d\n%d\ny" % (iters, seed, jumble)
    exec_phylip("proml", proml_answers, verbose)

    util.toc()

    # caller wants the raw tree file
    if output is not None:
        os.rename("outtree", "../" + output)
        cleanup_temp_dir(cwd)
        return labels

    # otherwise parse one tree per replicate
    samples = []
    treefile = file("outtree")
    for _ in xrange(iters):
        sample = treelib.Tree()
        sample.read_newick(treefile)
        rename_tree_with_names(sample, labels)
        samples.append(sample)
    treefile.close()
    cleanup_temp_dir(cwd)
    return samples
Пример #43
0
def sample_arg(seqs, ntimes=20, rho=1.5e-8, mu=2.5e-8, popsizes=1e4,
               refine=0, nremove=1, times=None, verbose=False,
               carg=False):
    """
    Sample an ARG for the sequences in `seqs` with the C sampler.

    Time points default to argweaver.get_time_points(); a scalar
    `popsizes` is repeated for every time point.  Returns (trees, names)
    when carg is true, otherwise a python ARG.
    """
    if times is None:
        times = argweaver.get_time_points(
            ntimes=ntimes, maxtime=80000, delta=.01)
    if isinstance(popsizes, (float, int)):
        popsizes = [popsizes] * len(times)

    if verbose:
        util.tic("sample arg")

    # names and sequences in matching order
    pairs = list(seqs.items())
    names = [name for name, _seq in pairs]
    seqs2 = [seq for _name, seq in pairs]

    # hand the sequences to the sampler as a C string array
    nseqs = len(seqs)
    cseqs = (C.c_char_p * nseqs)(*seqs2)
    trees = argweaver_sample_arg_refine(
        times, len(times),
        popsizes, rho, mu,
        cseqs, nseqs, len(seqs2[0]), refine,
        nremove)

    if carg:
        result = (trees, names)
    else:
        # convert to python
        result = ctrees2arg(trees, names, times, verbose=verbose)

    if verbose:
        util.toc()

    return result
Пример #44
0
def test_forward():

    k = 4
    n = 1e4
    rho = 1.5e-8 * 20
    mu = 2.5e-8 * 20
    length = int(100e3 / 20)
    times = argweaver.get_time_points(ntimes=100)

    arg = arglib.sample_arg_smc(k, 2 * n, rho, start=0, end=length)
    muts = arglib.sample_arg_mutations(arg, mu)
    seqs = arglib.make_alignment(arg, muts)

    print "muts", len(muts)
    print "recomb", len(arglib.get_recomb_pos(arg))

    argweaver.discretize_arg(arg, times)

    # remove chrom
    new_name = "n%d" % (k - 1)
    arg = argweaver.remove_arg_thread(arg, new_name)

    carg = argweaverc.arg2ctrees(arg, times)

    util.tic("C fast")
    probs1 = argweaverc.argweaver_forward_algorithm(carg, seqs, times=times)
    util.toc()

    util.tic("C slow")
    probs2 = argweaverc.argweaver_forward_algorithm(carg,
                                                    seqs,
                                                    times=times,
                                                    slow=True)
    util.toc()

    for i, (col1, col2) in enumerate(izip(probs1, probs2)):
        for a, b in izip(col1, col2):
            fequal(a, b, rel=.0001)
Пример #45
0
def proml_treelk(aln, tree, verbose=True, force=False, args="u\ny"):
    """
    Run PHYLIP proml on alignment `aln` with `tree` as the user tree.

    args    -- answers sent to proml (the default selects the user tree)
    verbose -- forwarded to exec_phylip()
    force   -- unused here

    Returns (logl, tree): the log likelihood read from "outfile" and the
    tree read from "outtree".
    """
    validate_seqs(aln)
    cwd = create_temp_dir()

    nseqs = len(aln)
    seqlen = len(aln.values()[0])
    util.tic("proml on %d of length %d" % (nseqs, seqlen))

    # input alignment and user tree
    labels = write_phylip_align(file("infile", "w"), aln)
    write_in_tree("intree", tree, labels)

    exec_phylip("proml", args, verbose)

    # likelihood first, then the resulting tree
    result = (read_logl("outfile"), read_out_tree("outtree", labels))

    cleanup_temp_dir(cwd)
    util.toc()

    return result
Пример #46
0
def proml_treelk(aln, tree, verbose=True, force = False, args="u\ny"):
    """
    Evaluate `tree` on alignment `aln` with PHYLIP proml.

    The alignment and the tree are written to a temporary directory,
    proml is run with the answers in `args`, and the results are read
    back.  `force` is unused here.

    Returns a (log likelihood, tree) pair.
    """
    validate_seqs(aln)
    tmpdir_token = create_temp_dir()

    timer_label = "proml on %d of length %d" % (len(aln),
                                                len(aln.values()[0]))
    util.tic(timer_label)

    # write proml's inputs
    labels = write_phylip_align(file("infile", "w"), aln)
    write_in_tree("intree", tree, labels)

    # run phylip
    exec_phylip("proml", args, verbose)

    # parse the likelihood and the tree
    logl = read_logl("outfile")
    out_tree = read_out_tree("outtree", labels)

    cleanup_temp_dir(tmpdir_token)
    util.toc()

    return logl, out_tree
Пример #47
0
    def recon(self, nsearch=1000):
        """Perform reconciliation"""
        
        self.init_search()
        proposal = self.proposer.init_proposal()
        self.maxrecon = proposal.copy()
        for i in xrange(nsearch):
            if i % 10 == 0:
                print "search", i

            util.tic("eval")
            p = self.eval_proposal(proposal)
            util.toc()

            util.tic("prop")
            self.eval_search(p, proposal)
            proposal = self.proposer.next_proposal()
            util.toc()
        
        # rename locus tree nodes
        dlcoal.rename_nodes(self.maxrecon.locus_tree, self.name_internal)
        
        return self.maxrecon
Пример #48
0
def resample_arg_regions(arg, seqs, niters, width=1000,
                         ntimes=20, rho=1.5e-8, mu=2.5e-8,
                         popsize=1e4, times=None, carg=False,
                         verbose=False):
    """
    Repeatedly resample the window of the ARG that holds the most
    recombinations.

    arg     -- python ARG or C arg, i.e. a (trees, names) pair
    seqs    -- alignment (mapping of name -> sequence)
    niters  -- number of resampling rounds
    width   -- window width passed to stats.iter_window_index()
    rho, mu, times, carg -- forwarded to argweaver.resample_arg_region()
    ntimes, popsize -- unused here
    verbose -- time and log each round; also forwarded to the resampler

    Returns the resampled ARG.  If no window can be chosen (for example
    when there are no recombination positions) the loop stops and the
    current ARG is returned.
    """
    seqlen = len(seqs.values()[0])

    if is_carg(arg):
        # a C arg is converted (without freeing it) only to read off the
        # recombination positions
        trees, names = arg
        arg2 = ctrees2arg(trees, names, times, verbose=verbose,
                          delete_arg=False)
        recomb_pos = [x.pos for x in arg2 if x.event == "recomb"]
    else:
        recomb_pos = [x.pos for x in arg if x.event == "recomb"]

    for it in range(niters):
        # pick the window with the most recombinations, padded by 10 and
        # kept 10 away from both sequence ends
        region = None
        maxr = 0
        for i, j, a, b in stats.iter_window_index(recomb_pos, width):
            r = j - i + 1
            if r > maxr:
                maxr = r
                region = [max(recomb_pos[i]-10, 10),
                          min(recomb_pos[j]+10, seqlen - 10)]

        if region is None:
            # previously this hit an unbound (or stale) `region`
            break

        if verbose:
            util.tic("sample ARG region %s" % region)
        # the caller's verbosity is passed on; it used to be forced to
        # True, next to a leftover debug print of the whole ARG
        arg = argweaver.resample_arg_region(arg, seqs, region[0], region[1],
                                            rho=rho, mu=mu, times=times,
                                            carg=carg, verbose=verbose)
        if not carg:
            # positions are only refreshed for python ARGs
            recomb_pos = [x.pos for x in arg if x.event == "recomb"]
            if verbose:
                util.logger("%d: # recombs %d" % (it, len(recomb_pos)))
        if verbose:
            util.toc()

    return arg
    def draw_placed(self):
        """
        Build the drawing group for all placed fragments.

        Fragment widgets come first, then one translated gene widget per
        laid-out region, then the matches of every fragment.  The group
        id is stored in self.groupid and returned.
        """
        util.tic("create draw code")

        # fragments
        vis = [self.frag_widget(frag) for frag in self.frags]

        # genes, moved to their layout position
        vis.extend(
            translate(layout.x, layout.y,
                      self.gene_widget(self.db.get_region(reg)))
            for reg, layout in self.region_layout.iteritems())

        # matches; `drawn` is shared across all fragments
        drawn = set()
        vis.extend(
            self.draw_matches(frag.genome, frag.chrom,
                              frag.start, frag.end, drawn)
            for frag in self.frags)

        util.toc()

        self.groupid = group(*vis)
        return self.groupid
Пример #50
0
# check arguments
# (parser, options and args are defined earlier in the script, outside
# this excerpt)

# at least one search iteration is required
if options.niter < 1:
    parser.error("--niter must be >= 1: %d" % options.niter)

# exactly one positional argument: the input tree file
if len(args) != 1:
    parser.error("must specify input file")

# =============================
# main file

treefile = args[0]
# presumably swaps the tree extension for the alignment extension to find
# the matching alignment file -- confirm against util.replace_ext
seqfile = util.replace_ext(treefile, options.treeext, options.alignext)
out = util.open_stream(options.output, "w")

# set up RAxML and optimize the model on the input tree and alignment
util.tic("Initializing RAXML and optimizing...")
module = raxml.RAxML()
module.optimize_model(treefile, seqfile, options.extra)
util.toc()

# reset the tree: zero all branch lengths and drop bootstrap values
tree = treelib.read_tree(treefile)
for node in tree:
    node.dist = 0
    if "boot" in node.data:
        del node.data["boot"]
# hash of the unrooted copy; the set records which hashes were seen
treehash = phylo.hash_tree(treelib.unroot(tree, newCopy=True))
treehashes = set([treehash])

# NOTE(review): the excerpt ends inside this loop; as shown, nothing in
# the while body changes treehash.
for i in xrange(options.niter):
    while treehash in treehashes:
        util.log("random spr")
Пример #51
0
def align2tree(prog,
               seqs,
               verbose=True,
               force=False,
               args=None,
               usertree=None,
               saveOutput="",
               bootiter=1,
               seed=1,
               jumble=1):
    """
    Build a tree (or bootstrap trees) from an alignment with a PHYLIP
    program.

    prog       -- name of the PHYLIP program to run
    seqs       -- alignment (mapping of name -> sequence)
    verbose    -- forwarded to exec_phylip()
    force      -- unused here
    args       -- answers sent to the program; defaults to "y"
    usertree   -- optional tree written to "intree"; adds the "u" answer
    saveOutput -- if not "", keep the temp directory via save_temp_dir()
    bootiter   -- values above 1 first run seqboot and add the
                  bootstrap answers
    seed, jumble -- numbers written into the bootstrap answers

    Returns a single tree when bootiter == 1, otherwise whatever
    read_out_tree() returns for `bootiter` replicates.  If PHYLIP gave
    up, star trees over the sequence names are returned instead: one
    tree when bootiter == 1, else a list of `bootiter` star trees.
    """
    validate_seqs(seqs)
    cwd = create_temp_dir()

    util.tic("%s on %d of length %d" %
             (prog, len(seqs), len(seqs.values()[0])))

    # create input
    labels = write_phylip_align(file("infile", "w"), seqs)
    util.write_list(file("labels", "w"), labels)

    # initialize default arguments
    if args is None:
        args = "y"

    # create user tree if given
    if usertree is not None:
        write_in_tree("intree", usertree, labels)
        args = "u\n" + args  # add user tree option

    # bootstrap alignment if needed
    if bootiter > 1:
        exec_phylip("seqboot", "r\n%d\ny\n%d" % (bootiter, seed), verbose)
        os.rename("outfile", "infile")

        # add bootstrap arguments
        args = "m\nD\n%d\n%d\n%d\n%s" % (bootiter, seed, jumble, args)

    # run phylip
    exec_phylip(prog, args, verbose)

    def make_star_tree():
        # root with every sequence name as a direct child
        star = treelib.Tree()
        star.make_root()
        for key in seqs:
            star.add_child(star.root, treelib.TreeNode(key))
        return star

    # check for PHYLIP GIVE UP
    if is_phylip_give_up("outfile"):
        if bootiter == 1:
            result = make_star_tree()
        else:
            # this path used to leave `trees` unassigned and failed at
            # the return statement
            result = [make_star_tree() for _ in range(bootiter)]
    else:
        # parse tree(s)
        result = read_out_tree("outtree", labels, bootiter)

        # parse likelihood (single-tree runs only)
        if bootiter == 1 and prog in ["dnaml", "proml"]:
            result.data["logl"] = read_logl("outfile")

    if saveOutput != "":
        save_temp_dir(cwd, saveOutput)
    else:
        cleanup_temp_dir(cwd)

    util.toc()

    return result
Пример #52
0
 def wrapper(*args, **kwargs):
     """Call the wrapped `func` inside a util.tic()/util.toc() timer
     labelled with the function's name and return its result."""
     util.tic(func.__name__)
     try:
         return func(*args, **kwargs)
     finally:
         # close the timer even when func raises, so tic/toc stay paired
         util.toc()
Пример #53
0
def mergeBuh(conf, genes, parts1, parts2, blastfiles):
    """Merge by Best Unidirectional Hits

    NOTE: this function is disabled -- the `assert False` below fires
    before any work is done.  The comments describe what the remaining
    code is written to do.

    conf       -- settings; the keys read are "bitspersite", "coverage",
                  "signif" and "relcutoff"
    genes      -- mapping gene -> record with a "length" entry
    parts1, parts2 -- the two partitionings (lists of gene lists); genes
                  seen in hits but missing from them are appended as new
                  single-gene parts
    blastfiles -- (filename, order) pairs; a true `order` means the query
                  gene belongs to side 1 and the subject to side 2

    Written to return the merged list of parts.
    """

    # don't use this code without double checking it
    assert False

    lookup1 = item2part(parts1)
    lookup2 = item2part(parts2)

    # best[(side, index)] = (best score seen, part on the other side)
    best = util.Dict(dim=1, default=(0, None))

    # pass 1: find the best-scoring hit of every part
    util.tic("read hits")
    for blastfile, order in blastfiles:
        util.tic("determine best hits '%s'" % os.path.basename(blastfile))
        for hit in blast.BlastReader(blastfile):
            # orient the hit so that gene1 is on side 1, gene2 on side 2
            if order:
                gene1 = blast.query(hit)
                gene2 = blast.subject(hit)
                alnlen1 = blast.queryLength(hit)
                alnlen2 = blast.subjectLength(hit)
            else:
                gene2 = blast.query(hit)
                gene1 = blast.subject(hit)
                alnlen2 = blast.queryLength(hit)
                alnlen1 = blast.subjectLength(hit)
            score = blast.bitscore(hit)

            # coverage is the larger aligned fraction of the two genes
            len1 = genes[gene1]["length"]
            len2 = genes[gene2]["length"]
            coverage = max(alnlen1 / float(len1), alnlen2 / float(len2))

            # discard a hit that does not pass basic cutoffs
            if blast.bitscore(hit) / float(blast.alignLength(hit)) < \
                   conf["bitspersite"] or \
               coverage < conf["coverage"] or \
               blast.evalue(hit) > conf["signif"]:
                continue

            #if blast.evalue(hit) > conf["signif"]:
            #    continue

            # partition key (side, index); unseen genes get a new part
            if gene1 in lookup1:
                part1 = (0, lookup1[gene1])
            else:
                parts1.append([gene1])
                lookup1[gene1] = len(parts1) - 1
                part1 = (0, len(parts1) - 1)

            if gene2 in lookup2:
                part2 = (1, lookup2[gene2])
            else:
                parts2.append([gene2])
                lookup2[gene2] = len(parts2) - 1
                part2 = (1, len(parts2) - 1)

            # record the hit as the best of either part if it scores higher
            if score > best[part1][0]:
                best[part1] = (score, part2)
            if score > best[part2][0]:
                best[part2] = (score, part1)
        util.toc()

    util.toc()

    # pass 2: union parts joined by hits close enough to the best score
    util.tic("determine clusters")
    # NOTE(review): `sets` is the dict created on the next line, so
    # `sets.UnionFind` looks up an attribute on that dict -- looks like a
    # clash with a module named `sets`; confirm before enabling.
    sets = {}
    for gene in best:
        sets[gene] = sets.UnionFind([gene])

    for blastfile, order in blastfiles:
        util.tic("read hits '%s'" % os.path.basename(blastfile))
        for hit in blast.BlastReader(blastfile):
            # same orientation and filtering as in pass 1
            if order:
                gene1 = blast.query(hit)
                gene2 = blast.subject(hit)
                alnlen1 = blast.queryLength(hit)
                alnlen2 = blast.subjectLength(hit)
            else:
                gene2 = blast.query(hit)
                gene1 = blast.subject(hit)
                alnlen2 = blast.queryLength(hit)
                alnlen1 = blast.subjectLength(hit)
            score = blast.bitscore(hit)

            len1 = genes[gene1]["length"]
            len2 = genes[gene2]["length"]
            coverage = max(alnlen1 / float(len1), alnlen2 / float(len2))

            # discard a hit that does not pass basic cutoffs
            if blast.bitscore(hit) / float(blast.alignLength(hit)) < \
                   conf["bitspersite"] or \
               coverage < conf["coverage"] or \
               blast.evalue(hit) > conf["signif"]:
                continue

            #if blast.evalue(hit) > conf["signif"]:
            #    continue

            # every gene passing the cutoffs was given a part in pass 1
            part1 = (0, lookup1[gene1])
            part2 = (1, lookup2[gene2])

            # join when the score reaches relcutoff times a part's best
            if score >= best[part1][0] * conf["relcutoff"]:
                sets[part1].union(sets[part2])
            if score >= best[part2][0] * conf["relcutoff"]:
                sets[part2].union(sets[part1])
        util.toc()

    # one representative per cluster
    sets = util.unique([x.root() for x in sets.values()])

    # expand every cluster of (side, index) keys into a list of genes
    # (the loop variable below shadows the builtin `set`)
    parts = []
    joining = (parts1, parts2)
    for set in sets:
        parts.append([])
        for i, row in set.members():
            parts[-1].extend(joining[i][row])
    util.toc()

    return parts
Пример #54
0
def mergeAvg(conf, genes, parts1, parts2, blastfiles, outblastfiles):
    """
    Merge two partitionings by best average bit score.

    Every part is joined with the part on the other side that has the
    highest average hit score to it, provided that average also exceeds
    the average outgroup hit score of both parts.

    conf       -- settings; reads "bitspersite", "coveragesmall",
                  "coveragebig", "signif" and, optionally, "accept"
                  (when set, both genes of a hit must be in it)
    genes      -- mapping gene -> record with a "length" entry
    parts1, parts2 -- the two partitionings (lists of gene lists); genes
                  that appear in accepted hits but in no part are
                  appended in place as new single-gene parts
    blastfiles -- (filename, order) pairs for hits between the two
                  sides; a true `order` means the query is on side 1
    outblastfiles -- (filename, order) pairs for hits against the
                  outgroup; a true `order` means the query is the
                  ingroup gene

    Returns the merged list of parts (lists of genes).
    """
    lookup1 = item2part(parts1)
    lookup2 = item2part(parts2)

    # value is [sum, total]
    hits = util.Dict(dim=2, default=[0, 0])

    if "accept" in conf:
        accept = conf["accept"]
    else:
        accept = False

    util.tic("read hits")
    for blastfile, order in blastfiles:
        util.tic("determine best hits '%s'" % os.path.basename(blastfile))
        for hit in blast.BlastReader(blastfile):
            # orient the hit so that gene1 is on side 1, gene2 on side 2
            if order:
                gene1 = blast.query(hit)
                gene2 = blast.subject(hit)
                alnlen1 = blast.queryLength(hit)
                alnlen2 = blast.subjectLength(hit)
            else:
                gene2 = blast.query(hit)
                gene1 = blast.subject(hit)
                alnlen2 = blast.queryLength(hit)
                alnlen1 = blast.subjectLength(hit)
            score = blast.bitscore(hit)

            len1 = genes[gene1]["length"]
            len2 = genes[gene2]["length"]
            coveragesmall = min(alnlen1 / float(len1), alnlen2 / float(len2))
            coveragebig = max(alnlen1 / float(len1), alnlen2 / float(len2))

            # discard a hit that does not pass basic cutoffs
            if blast.bitscore(hit) / float(blast.alignLength(hit)) < \
                   conf["bitspersite"] or \
               coveragesmall < conf["coveragesmall"] or \
               coveragebig < conf["coveragebig"] or \
               blast.evalue(hit) > conf["signif"]:
                continue

            # optional whitelist of genes
            if accept and \
               (gene1 not in accept or
                gene2 not in accept):
                continue

            # create a key for a partition: (side, index)
            if gene1 in lookup1:
                part1 = (0, lookup1[gene1])
            else:
                parts1.append([gene1])
                lookup1[gene1] = len(parts1) - 1
                part1 = (0, len(parts1) - 1)

            if gene2 in lookup2:
                part2 = (1, lookup2[gene2])
            else:
                parts2.append([gene2])
                lookup2[gene2] = len(parts2) - 1
                part2 = (1, len(parts2) - 1)

            # both directions share one [sum, total] accumulator
            val = hits[part1][part2]
            val[0] += score
            val[1] += 1
            hits[part2][part1] = val

        util.toc()
    util.toc()

    util.tic("read outgroup hits")
    outbest = util.Dict(default=[0, 0])
    for blastfile, order in outblastfiles:
        util.tic("determine best hits '%s'" % os.path.basename(blastfile))
        for hit in blast.BlastReader(blastfile):
            # only the ingroup gene of an outgroup hit matters
            if order:
                genein = blast.query(hit)
            else:
                genein = blast.subject(hit)
            score = blast.bitscore(hit)

            # create a key for a partition: (side, index)
            if genein in lookup1:
                partin = (0, lookup1[genein])
            elif genein in lookup2:
                # this used to test `gene1`, a leftover from the loop
                # above, instead of the ingroup gene of this hit
                partin = (1, lookup2[genein])
            else:
                continue

            val = outbest[partin]
            val[0] += score
            val[1] += 1

        util.toc()
    util.toc()

    assert len(parts1) == len(unionPart(parts1))
    assert len(parts2) == len(unionPart(parts2))

    util.tic("determine clusters")
    # the dict must not be called `sets`: that name is needed for the
    # module providing UnionFind
    clusters = {}
    for i in xrange(len(parts1)):
        clusters[(0, i)] = sets.UnionFind([(0, i)])
    for i in xrange(len(parts2)):
        clusters[(1, i)] = sets.UnionFind([(1, i)])

    # merge top avg hits
    for part1 in hits:
        o1 = outbest[part1]
        outavg1 = float(o1[0]) / max(o1[1], 1)

        top = 0
        toppart = None

        for part2, (tot, num) in hits[part1].iteritems():
            avg = float(tot) / num
            o2 = outbest[part2]
            outavg2 = float(o2[0]) / max(o2[1], 1)

            # must beat both outgroup averages and the current best
            if avg > outavg1 and avg > outavg2 and avg > top:
                top = avg
                toppart = part2

        if toppart:
            clusters[part1].union(clusters[toppart])

    # one representative per cluster
    roots = util.unique([x.root() for x in clusters.values()])

    # create partition of genes
    parts = []
    joining = (parts1, parts2)
    for cluster in roots:
        parts.append([])
        for i, row in cluster:
            parts[-1].extend(joining[i][row])
    util.toc()

    assert len(parts) == len(unionPart(parts))

    return parts
Пример #55
0
def mrbayes(aln,
            nexfilename="",
            seqtype="pep",
            options=None,
            usertree=None,
            bootiter=0,
            verbose=True,
            saveOutput=""):
    """Run MrBayes (the `mb` executable) on an alignment and return its tree.

    Parameters:
        aln         -- mapping of sequence name -> sequence; must provide
                       keys() and values() in matching order
        nexfilename -- path of the Nexus input file to write.  When empty, a
                       temporary directory is created with
                       phylip.create_temp_dir() and "infile.nex" is used
        seqtype     -- sequence type forwarded to writeNexus and
                       writeMrbayesOptions
        options     -- dict of MrBayes options.  NOTE: a dict passed in by
                       the caller is modified in place (defaults are filled
                       in, "burninfrac"/"relburnin" are overwritten and
                       "extra" is appended to)
        usertree    -- accepted for interface compatibility; not used here
        bootiter    -- accepted for interface compatibility; not used here
        verbose     -- when false, the output of `mb` is sent to /dev/null
        saveOutput  -- when non-empty and a temporary directory was created,
                       that directory is handed to phylip.save_temp_dir()
                       instead of being cleaned up

    Returns the tree parsed from "<nexfilename>.con", with any sequence
    names that were rewritten for MrBayes restored to their originals.
    """
    util.tic("mrbayes on %d of length %d" % (len(aln), len(aln.values()[0])))

    # Only work inside a temporary directory when the caller did not choose
    # where the Nexus file should live.
    if nexfilename == "":
        cwd = phylip.create_temp_dir()
        nexfilename = "infile.nex"
    else:
        cwd = None

    # setup options
    if not options:
        options = {}
    setDefaultOptions(options)

    options["burninfrac"] = .25
    options["relburnin"] = "yes"

    # force best binary tree (if possible)
    options["extra"] += "sumt contype=allcompat;"

    # get gene names; "+" is replaced by "_" in the names written to the
    # Nexus file, and namemap remembers how to undo that on the result tree
    names = []
    namemap = {}

    for key in aln.keys():
        if "+" in key:
            key2 = key.replace("+", "_")
            names.append(key2)
            namemap[key2] = key
        else:
            names.append(key)

    # write input file followed by the mrbayes options; the context manager
    # guarantees the handle is flushed and closed even if a writer raises
    with open(nexfilename, "w") as out:
        writeNexus(out, names, aln.values(), seqtype, options)
        writeMrbayesOptions(out, options, seqtype=seqtype)

    # exec mrbayes
    # NOTE(review): nexfilename is interpolated into a shell command line
    # unescaped -- confirm callers never pass untrusted file names.
    if verbose:
        os.system("echo exe %s | mb" % nexfilename)
    else:
        os.system("echo exe %s | mb >/dev/null 2>&1" % nexfilename)

    # read tree (and close the consensus file instead of leaking the handle)
    with open(nexfilename + ".con") as confile:
        tree = readNexusConTree(confile)

    # clean up
    if cwd is not None:
        if saveOutput != "":
            phylip.save_temp_dir(cwd, saveOutput)
        else:
            phylip.cleanup_temp_dir(cwd)

    util.toc()

    # restore the original names that contained "+"
    for tmpname, origname in namemap.iteritems():
        tree.rename(tmpname, origname)

    return tree
Пример #56
0
    #################################################
    # timing
    #
    # Disabled benchmark (guarded by `if 0`): builds a 100-column table as
    # text and times how long readTable takes to parse it.
    if 0:
        from rasmus import util

        # first line declares 100 "int" column types, second line is the
        # header row holding the column names "0".."99"
        text = [
            "##types:" + "int\t" * 99 + "int", "\t".join(map(str, range(100)))
        ]

        # 10000 data rows, each made of 100 tab-separated "1" values
        for i in range(10000):
            text.append("1\t" * 99 + "1")
        text = "\n".join(text)

        # wrap the text in an in-memory stream for readTable
        stream = StringIO.StringIO(text)

        util.tic("read table")
        tab = readTable(stream)
        util.toc()

    #################################################
    # specialized types
    if 1:
        text = """\
##types:str	int	strand_type
name	num	strand
matt	123	+
alex	456	-
mike	789	+
john	0	+
"""
Пример #57
0
    T2 = treelib.unroot(T)
    treelib.draw_tree(T2, out=sys.stdout, minlen=5, maxlen=5)
    util.toc()


# Script body: set up a raxml analysis for a tree/alignment pair, optimize
# the model, report the best likelihood, then load the tree with treelib
# and strip its branch lengths and "boot" annotations.

# first positional argument is the tree file; the alignment file name is
# derived from it using the tree/alignment extensions given in the options
treefile = args[0]
seqfile = util.replace_ext(treefile, options.treeext, options.alignext)
out = util.open_stream(options.output, "w")

# create the raxml analysis definition and tree objects, then initialize
# them from a raxmlHPC-style command line
adef = raxml.new_analdef()
raxml.init_adef(adef)
tr = raxml.new_tree()
cmd = "raxmlHPC -t %s -s %s %s" % (treefile, seqfile, options.extra)
# NOTE(review): split(" ") yields empty-string arguments when options.extra
# is empty or contains repeated spaces, and breaks paths containing spaces
# -- confirm init_program tolerates this.
raxml.init_program(adef, tr, cmd.split(" "))

util.tic("Optimizing model...")
raxml.optimize_model(adef, tr)
util.toc()

# draw_raxml_tree(tr, adef)

util.tic("Getting parameters for LH...")
bestVector, bestLH, weightSum = raxml.compute_best_LH(tr)
util.log("bestLH: %.3f" % bestLH)
util.toc()

# re-read the input tree and reset it: zero every branch length and drop
# any "boot" entry stored in the node data
tree = treelib.read_tree(treefile)
for node in tree:
    node.dist = 0
    if "boot" in node.data:
        del node.data["boot"]
Пример #58
0
            self.window_size = (0, 0)
            self.window_pos = (0, 0)
            self.vsash_pos = 0
            self.hsash_pos = 0
            self.apps = []
            self.apps2 = []

        def read(self, filename):
            """Load this object from `filename` by delegating to parser.read.

            `parser` comes from the enclosing scope; this object is passed
            as the target and `filename` as the source.
            """
            parser.read(self, filename)

        def write(self, filename):
            """Save this object to `filename` by delegating to parser.write.

            `parser` comes from the enclosing scope; this object is passed
            as the source and `filename` as the destination.
            """
            parser.write(self, filename)

    from rasmus import util

    util.tic("run")

    infile = StringIO.StringIO("""<?xml version="1.0" encoding="UTF-8"?>
       <notebook>
       <window_size>1053,905</window_size>
<window_pos>0,0</window_pos>
<vsash_pos>0</vsash_pos>
<hsash_pos>250</hsash_pos>
<external_apps>
<app>web_browser</app>
<app>image_editor</app>
</external_apps>
<external_apps2>
<app><name>web_browser</name><prog>firefox</prog></app>
<app><name>image_editor</name><prog>gimp</prog></app>
</external_apps2>
Пример #59
0
def phyml(seqs,
          verbose=True,
          args=None,
          usertree=None,
          seqtype="pep",
          saveOutput="",
          bootiter=0,
          opttree=True,
          optbranches=True,
          nrates=4):
    """Run the `phyml` program on an alignment and return the resulting tree.

    Parameters:
        seqs        -- mapping of sequence name -> sequence (checked with
                       phylip.validate_seqs)
        verbose     -- forwarded to phylip.exec_phylip
        args        -- complete phyml argument string; when None it is built
                       from seqtype, bootiter, nrates, the starting tree and
                       the optimize flags
        usertree    -- optional starting tree; it is unrooted and written to
                       "intree".  Without it "BIONJ" is passed to phyml
        seqtype     -- "dna" (HKY model) or "pep" (JTT model); only consulted
                       when args is None
        saveOutput  -- when non-empty the temporary directory is handed to
                       phylip.save_temp_dir(), otherwise it is cleaned up
        bootiter    -- number of bootstrap iterations; 1 is treated as 0
        opttree     -- whether the first optimize flag is "y" or "n"
        optbranches -- whether the second optimize flag is "y" or "n"
        nrates      -- value substituted into the argument string for the
                       number of rate categories

    Returns the tree read from "infile_phyml_tree.txt"; the value parsed
    from "infile_phyml_lk.txt" is stored in tree.data["logl"].

    Raises AssertionError when args is None and seqtype is neither "dna"
    nor "pep".
    """
    phylip.validate_seqs(seqs)
    cwd = phylip.create_temp_dir()

    util.tic("phyml on %d of length %d" % (len(seqs), len(seqs.values()[0])))

    # create input; the files are closed explicitly so their contents are
    # on disk before phyml is launched
    with open("infile", "w") as infile:
        labels = phylip.write_phylip_align(infile, seqs)
    with open("labels", "w") as labelfile:
        util.write_list(labelfile, labels)

    options = "y"

    # only bootstrap when iterations are above 1
    if bootiter == 1:
        bootiter = 0

    if usertree is not None:
        usertree = treelib.unroot(usertree)
        phylip.write_in_tree("intree", usertree, labels)
        treefile = "intree"
    else:
        treefile = "BIONJ"

    # two "y "/"n " flags: topology optimization, then branch optimization
    optimize = ("y " if opttree else "n ") + ("y " if optbranches else "n ")

    if args is None:
        if seqtype == "dna":
            args = "infile 0 s 1 %d HKY e e %d e %s %s" % \
                (bootiter, nrates, treefile, optimize)
        elif seqtype == "pep":
            args = "infile 1 s 1 %d JTT e %d e %s %s" % \
                (bootiter, nrates, treefile, optimize)
        else:
            # raised explicitly (rather than `assert False`) so the check
            # still fires when Python runs with -O
            raise AssertionError("unknown sequence type '%s'" % seqtype)

    phylip.exec_phylip("phyml %s" % args, options, verbose)

    # parse tree
    tree = phylip.read_out_tree("infile_phyml_tree.txt", labels)

    # parse likelihood
    with open("infile_phyml_lk.txt") as lkfile:
        tree.data["logl"] = float(lkfile.read())

    if saveOutput != "":
        phylip.save_temp_dir(cwd, saveOutput)
    else:
        phylip.cleanup_temp_dir(cwd)
    util.toc()

    return tree
Пример #60
0
    def update(self):
        """Advance the position by one and redraw once a threshold is passed.

        Nothing is drawn while the position has not moved beyond the current
        threshold (self.prog).  Otherwise the threshold is raised by
        int(step * end), the bar is redrawn and the pad refreshed; if the
        position has reached the end, the closing "|" is written and curses
        is shut down.
        """
        self.pos += 1

        # still within the current threshold: nothing to redraw
        if self.pos <= self.prog:
            return

        self.prog += int(self.step * self.end)
        self.printBar()
        self.pad.refresh()

        # the end-of-bar handling only runs on a redraw step
        if self.pos != self.end:
            return

        self.pad.addstr(1, 1 + self.bar, "|\n")

        import curses
        curses.endwin()

    def printBar(self):
        """Draw the "*" characters needed to bring the bar up to date.

        The target length is (pos / end) * width; only the part beyond what
        is already drawn (self.bar) is written to the pad, and self.bar is
        advanced by that amount.
        """
        # float() forces true division: with integer pos and end, Python 2
        # would floor the fraction to 0 until the very last step
        fraction = float(self.pos) / self.end
        amount = int((fraction * self.width) - self.bar)
        self.pad.addstr(1, 1 + self.bar, "*" * amount)
        self.bar += amount


# Manual demo: when run as a script, drive a FancyProgressBar of 100 steps,
# pausing briefly between updates, and time the whole run with util.tic/toc.
if __name__ == "__main__":
    import time

    util.tic("hi")

    prog = FancyProgressBar(100)
    for i in range(100):
        # short sleep so the bar's progress is visible
        time.sleep(.01)
        prog.update()
    util.toc()