Example #1
def load_stored_data(self):
    self.cogs = db.get_task_data(self.taskid, DATATYPES.cogs)
    self.cog_analysis = db.get_task_data(self.taskid,
                                         DATATYPES.cog_analysis)
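All of these load_stored_data examples follow the same pattern: task attributes are rehydrated from the job database with db.get_task_data, keyed by the task id plus a DATATYPES constant, mirroring the db.add_task_data calls shown in Example #6 below. A minimal round-trip sketch, assuming the pipeline's db and DATATYPES objects are importable and using a made-up task id:

    # Hedged sketch: db and DATATYPES come from the pipeline; "task01" is a
    # hypothetical id used only for illustration.
    cogs = [["seqA", "seqB"], ["seqC", "seqD"]]
    db.add_task_data("task01", DATATYPES.cogs, cogs)  # store the payload
    db.dataconn.commit()  # as done after registering constraints in Example #6
    assert db.get_task_data("task01", DATATYPES.cogs) == cogs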
Example #2
def load_stored_data(self):
    self.best_model = db.get_task_data(self.taskid, DATATYPES.best_model)
    self.model_ranking = db.get_task_data(self.taskid,
                                          DATATYPES.model_ranking)
Example #3
def load_stored_data(self):
    # self.tree_file = db.get_dataid(self.taskid, DATATYPES.tree)
    self.stats = db.get_task_data(self.taskid, DATATYPES.tree_stats)
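Note the commented-out line above: db.get_dataid returns a handle to a stored datum rather than the datum itself, which db.get_data then resolves; the same two-step pattern appears in Example #6 when dumping the phylip alignment. A hedged sketch of the difference, again with a hypothetical task id:

    # Hedged sketch: all three accessors appear on this page; "task01" is made up.
    stats = db.get_task_data("task01", DATATYPES.tree_stats)  # payload directly
    tree_id = db.get_dataid("task01", DATATYPES.tree)         # id only
    tree_newick = db.get_data(tree_id)                        # resolve id -> payload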
Example #4
def split_tree(task_tree_node, task_outgroups, main_tree, alg_path, npr_conf,
               threadid, target_cladeids):
    """Browses a task tree from root to leaves and yields next
    suitable nodes for NPR iterations. Each yielded node comes with
    the set of target and outgroup tips.
    """
    def processable_node(_n):
        """This an internal function that returns true if a given node
        is suitable for a NPR iteration. It can be used as
        "is_leaf_fn" when traversing a tree.

        Note that this function uses several variables which change within the
        split_tree function, so must be kept within its namespace.

        """
        is_leaf = False
        for wkname, wkfilter in npr_conf.npr_workflows:
            # if node is not in the targets or does not meet size filters, skip
            # workflow
            if _n is master_node or \
               (_TARGET_NODES and _n not in _TARGET_NODES) or \
               (target_cladeids and _n.cladeid not in target_cladeids) or \
               len(n2content[_n]) < max(wkfilter.get("min_size", 3), 3) or \
               ("max_size" in wkfilter and len(n2content[_n]) > wkfilter["max_size"]):
                continue

            # If a seq_sim filter is used, calculate node stats
            if ALG and ("min_seq_sim" in wkfilter
                        or "max_seq_sim" in wkfilter):
                if not hasattr(_n, "seqs_mean_ident"):
                    log.log(20, "Calculating node sequence stats...")
                    mx, mn, avg, std = get_seqs_identity(
                        ALG, [__n.name for __n in n2content[_n]])
                    _n.add_features(seqs_max_ident=mx,
                                    seqs_min_ident=mn,
                                    seqs_mean_ident=avg,
                                    seqs_std_ident=std)
                    log.log(
                        20,
                        "mx=%s, mn=%s, avg=%s, std=%s" % (mx, mn, avg, std))

                # each bound may be absent, so guard before comparing
                if ("min_seq_sim" in wkfilter
                        and _n.seqs_mean_ident < wkfilter["min_seq_sim"]):
                    continue

                if ("max_seq_sim" in wkfilter
                        and _n.seqs_mean_ident > wkfilter["max_seq_sim"]):
                    continue

            else:
                _n.add_features(seqs_max_ident=None,
                                seqs_min_ident=None,
                                seqs_mean_ident=None,
                                seqs_std_ident=None)

            if "min_support" in wkfilter:
                # If we are optimizing only lowly supported nodes, and nodes are
                # optimized without an outgroup, our target node is actually the
                # parent of lowly supported nodes. Therefore, I check if support
                # is low in children nodes, and return this node if so.
                if not npr_conf.use_outgroup:
                    if not [
                            _ch for _ch in _n.children
                            if _ch.support <= wkfilter["min_support"]
                    ]:
                        continue
                # Otherwise, just skip the node if its support is above the min support
                elif _n.support > wkfilter["min_support"]:
                    continue

            # At this point, the node has passed all the filters of this
            # workflow, so it can be optimized
            is_leaf = True
            _n._target_wkname = wkname
            break

        return is_leaf

    log.log(20, "Loading tree content...")
    n2content = main_tree.get_cached_content()
    if alg_path:
        log.log(20, "Loading associated alignment to check seq. similarity")
        raw_alg = db.get_task_data(*alg_path.split("."))
        ALG = SeqGroup(raw_alg)
    else:
        ALG = None

    log.log(20, "Finding next NPR nodes...")
    # task_tree_node is actually a node in main_tree, since it has been
    # already merged
    trees_to_browse = [task_tree_node]
    npr_nodes = 0
    # load current tree content, so we can check that we are not reconstructing
    # exactly the same tree
    tasktree_content = set([leaf.name for leaf in n2content[task_tree_node]
                            ]) | set(task_outgroups)
    while trees_to_browse:
        master_node = trees_to_browse.pop()

        # if custom taxa levels are defined as targets, find them in this
        # subtree
        # this container is used by the processable_node function
        _TARGET_NODES = defaultdict(list)
        opt_levels = GLOBALS[threadid].get('_optimized_levels', None)
        if opt_levels is not None:
            # any descendant of the already processed node is suitable for
            # selection. If the ancestor of the level's species is above the
            # task_tree_node, it will be discarded
            avail_nodes = set(master_node.get_descendants())
            for lin in opt_levels:
                sp2lin, lin2sp = GLOBALS["lineages"]
                optimized, strict_monophyly = opt_levels[lin]
                if not optimized:
                    ancestor = main_tree.get_common_ancestor(*lin2sp[lin])
                    if ancestor in avail_nodes:
                        # check that the node satisfies level monophyly config
                        ancestor_content = set(
                            [x.name for x in n2content[ancestor]])
                        if not strict_monophyly or lin2sp[
                                lin] == ancestor_content:
                            _TARGET_NODES[ancestor].append(lin)
                        elif strict_monophyly:
                            log.log(
                                26,
                                "Discarding not monophyletic level @@11:%s@@1:"
                                % lin)
                    else:
                        log.log(26, "Discarding upper clade @@11:%s@@1:" % lin)

        for node in master_node.iter_leaves(is_leaf_fn=processable_node):
            if opt_levels:
                log.log(
                    28, "Trying to optimizing custom tree level: @@11:%s@@1:" %
                    _TARGET_NODES[node])
                for lin in _TARGET_NODES[node]:
                    # Marks the level as optimized, so it is not computed again
                    opt_levels[lin][0] = True

            log.log(
                28, "Found possible target node of size %s branch support %f" %
                (len(n2content[node]), node.support))
            log.log(28, "First suitable workflow: %s" % (node._target_wkname))

            # Finds best outgroup for the target node
            if npr_conf.use_outgroup:
                splitterconfname, _ = npr_conf.tree_splitter
                splitterconf = GLOBALS[threadid][splitterconfname]
                #seqs, outs = select_outgroups(node, n2content, splitterconf)
                #seqs, outs = select_closest_outgroup(node, n2content, splitterconf)
                seqs, outs = select_sister_outgroup(node, n2content,
                                                    splitterconf)
            else:
                seqs = set([_i.name for _i in n2content[node]])
                outs = set()

            if seqs | outs == tasktree_content:
                log.log(
                    26,
                    "Discarding target node of size %s, due to identity with its parent node"
                    % len(n2content[node]))
                #print tasktree_content
                #print seqs
                #print outs
                trees_to_browse.append(node)
            else:
                npr_nodes += 1
                yield node, seqs, outs, node._target_wkname
    log.log(28, "%s nodes will be optimized", npr_nodes)
Example #5
def load_stored_data(self):
    self.kept_columns[:] = []  # clear the existing list in place
    # the stored datum is the full list of kept columns (cf. its use in
    # Example #6), so extend rather than append the list itself
    self.kept_columns.extend(
        db.get_task_data(self.taskid, DATATYPES.kept_alg_columns))
Example #6
def process_task(task, wkname, npr_conf, nodeid2info):
    alignerconf, alignerclass = npr_conf.aligner
    cleanerconf, cleanerclass = npr_conf.alg_cleaner
    mtesterconf, mtesterclass = npr_conf.model_tester
    treebuilderconf, treebuilderclass = npr_conf.tree_builder
    if not treebuilderclass:
        # Allows dumping alignments in workflows with no tree tasks
        treebuilderclass = DummyTree

    splitterconf, splitterclass = npr_conf.tree_splitter

    conf = GLOBALS[task.configid]
    seqtype = task.seqtype
    nodeid = task.nodeid
    ttype = task.ttype
    taskid = task.taskid
    threadid = task.threadid
    node_info = nodeid2info[nodeid]
    size = task.size  # node_info.get("size", 0)
    target_seqs = node_info.get("target_seqs", [])
    out_seqs = node_info.get("out_seqs", [])

    if not treebuilderclass or size < 4:
        # Allows dumping alignments in workflows with no tree tasks, or when
        # tree inference does not make sense given the number of sequences.
        # DummyTree will produce a fake, fully collapsed newick tree.
        treebuilderclass = DummyTree

    # If more than one outgroup is used, enable the use of a constraint tree
    if out_seqs and len(out_seqs) > 1:
        constrain_id = nodeid
    else:
        constrain_id = None

    new_tasks = []
    if ttype == "msf":
        # Register tree constraints
        constrain_tree = "(%s, (%s));" %(','.join(sorted(task.out_seqs)),
                                         ','.join(sorted(task.target_seqs)))
        _outs = "\n".join([">%s\n0" %name for name in sorted(task.out_seqs)])
        _tars = "\n".join([">%s\n1" %name for name in sorted(task.target_seqs)])
        constrain_alg = '\n'.join([_outs, _tars])
        db.add_task_data(nodeid, DATATYPES.constrain_tree, constrain_tree)
        db.add_task_data(nodeid, DATATYPES.constrain_alg, constrain_alg)
        # Since the creation of some Task objects may require this info,
        # commit right now.
        db.dataconn.commit()

        # Register node
        db.add_node(task.threadid,
                    task.nodeid, task.cladeid,
                    task.target_seqs,
                    task.out_seqs)

        nodeid2info[nodeid]["size"] = task.size
        nodeid2info[nodeid]["target_seqs"] = task.target_seqs
        nodeid2info[nodeid]["out_seqs"] = task.out_seqs
        alg_task = alignerclass(nodeid, task.multiseq_file,
                                seqtype, conf, alignerconf)
        alg_task.size = task.size
        new_tasks.append(alg_task)


    elif ttype == "alg" or ttype == "acleaner":
        if ttype == "alg":
            nodeid2info[nodeid]["alg_path"] = task.alg_fasta_file
        elif ttype == "acleaner":
            nodeid2info[nodeid]["alg_clean_path"] = task.clean_alg_fasta_file

        alg_fasta_file = getattr(task, "clean_alg_fasta_file",
                                 task.alg_fasta_file)
        alg_phylip_file = getattr(task, "clean_alg_phylip_file",
                                  task.alg_phylip_file)

        # Calculate alignment stats
        # cons_mean, cons_std = get_trimal_conservation(task.alg_fasta_file,
        #                                        conf["app"]["trimal"])
        #
        # max_identity = get_trimal_identity(task.alg_fasta_file,
        #                                 conf["app"]["trimal"])
        # log.info("Conservation: %0.2f +-%0.2f", cons_mean, cons_std)
        # log.info("Max. Identity: %0.2f", max_identity)
        #import time
        #t1 = time.time()
        #mx, mn, mean, std = get_identity(task.alg_fasta_file)
        #print time.time()-t1
        #log.log(26, "Identity: max=%0.2f min=%0.2f mean=%0.2f +- %0.2f",
        #        mx, mn, mean, std)
        #t1 = time.time()

        if seqtype == "aa" and npr_conf.switch_aa_similarity < 1:
            try:
                alg_stats = db.get_task_data(taskid, DATATYPES.alg_stats)
            except Exception:
                alg_stats = {}

            if ttype == "alg":
                algfile = pjoin(GLOBALS["input_dir"], task.alg_phylip_file)
                dataid = DATATYPES.alg_phylip
            elif ttype == "acleaner":
                algfile = pjoin(GLOBALS["input_dir"], task.clean_alg_phylip_file)
                dataid = DATATYPES.clean_alg_phylip

            if "i_mean" not in alg_stats:
                log.log(24, "Calculating alignment stats...")
                # dump data if necessary (algfile and dataid were selected
                # above according to the task type)
                if not pexist(algfile):
                    # dump phylip alg
                    with open(algfile, "w") as ALG_OUT:
                        ALG_OUT.write(db.get_data(db.get_dataid(taskid, dataid)))

                mx, mn, mean, std = get_statal_identity(algfile,
                                                        conf["app"]["statal"])
                alg_stats = {"i_max":mx, "i_mean":mean, "i_min":mn, "i_std":std}
                db.add_task_data(taskid, DATATYPES.alg_stats, alg_stats)

            log.log(22, "Alignment stats (sequence similarity):")
            log.log(22, "   max: %(i_max)0.2f, min:%(i_min)0.2f, avg:%(i_mean)0.2f+-%(i_std)0.2f" %
                    (alg_stats))

        else:
            alg_stats = {"i_max":-1, "i_mean":-1, "i_min":-1, "i_std":-1}

        #print time.time()-t1
        #log.log(24, "Identity: max=%0.2f min=%0.2f mean=%0.2f +- %0.2f",
        #        mx, mn, mean, std)
        task.max_ident = alg_stats["i_max"]
        task.min_ident = alg_stats["i_min"]
        task.mean_ident = alg_stats["i_mean"]
        task.std_ident = alg_stats["i_std"]
        next_task = None

        if ttype == "alg" and cleanerclass:
            next_task = cleanerclass(nodeid, seqtype, alg_fasta_file,
                                     alg_phylip_file,
                                     conf, cleanerconf)
        else:
            # Converts aa alignment into nt if necessary
            if  seqtype == "aa" and \
                    "nt" in GLOBALS["seqtypes"] and \
                    task.mean_ident >= npr_conf.switch_aa_similarity:
                log.log(28, "@@2:Switching to codon alignment!@@1: amino-acid sequence similarity: %0.2f >= %0.2f" %\
                        (task.mean_ident, npr_conf.switch_aa_similarity))
                alg_fasta_file = "%s.%s" %(taskid, DATATYPES.alg_nt_fasta)
                alg_phylip_file = "%s.%s" %(taskid, DATATYPES.alg_nt_phylip)
                try:
                    alg_fasta_file = db.get_dataid(taskid, DATATYPES.alg_nt_fasta)
                    alg_phylip_file = db.get_dataid(taskid, DATATYPES.alg_nt_phylip)
                except ValueError:
                    log.log(22, "Calculating codon alignment...")

                    source_alg = pjoin(GLOBALS["input_dir"], task.alg_fasta_file)
                    if ttype == "alg":
                        kept_columns = []
                    elif ttype == "acleaner":
                        # if original alignment was trimmed, use it as reference
                        # but make the nt alignment only on the kept columns
                        kept_columns = db.get_task_data(taskid, DATATYPES.kept_alg_columns)

                    if not pexist(source_alg):
                        with open(source_alg, "w") as SRC:
                            SRC.write(db.get_task_data(taskid, DATATYPES.alg_fasta))

                    nt_alg = switch_to_codon(source_alg, kept_columns=kept_columns)
                    db.add_task_data(taskid, DATATYPES.alg_nt_fasta, nt_alg.write())
                    db.add_task_data(taskid, DATATYPES.alg_nt_phylip, nt_alg.write(format='iphylip_relaxed'))

                npr_conf = IterConfig(conf, wkname, task.size, "nt")
                seqtype = "nt"

            if mtesterclass:
                next_task = mtesterclass(nodeid, alg_fasta_file,
                                         alg_phylip_file,
                                         constrain_id,
                                         conf, mtesterconf)
            elif treebuilderclass:
                next_task = treebuilderclass(nodeid, alg_phylip_file,
                                             constrain_id,
                                             None, seqtype,
                                             conf, treebuilderconf)
        if next_task:
            next_task.size = task.size
            new_tasks.append(next_task)

    elif ttype == "mchooser":
        if treebuilderclass:
            alg_fasta_file = task.alg_fasta_file
            alg_phylip_file = task.alg_phylip_file
            model = task.best_model
            tree_task = treebuilderclass(nodeid, alg_phylip_file,
                                         constrain_id,
                                         model, seqtype,
                                         conf, treebuilderconf)
            tree_task.size = task.size
            new_tasks.append(tree_task)

    elif ttype == "tree":
        treemerge_task = splitterclass(nodeid, seqtype,
                                       task.tree_file, conf, splitterconf)
            #if conf["tree_splitter"]["_outgroup_size"]:
            #    treemerge_task = TreeSplitterWithOutgroups(nodeid, seqtype, task.tree_file, main_tree, conf)
            #else:
            #    treemerge_task = TreeSplitter(nodeid, seqtype, task.tree_file, main_tree, conf)

        treemerge_task.size = task.size
        new_tasks.append(treemerge_task)

    elif ttype == "treemerger":
        if not task.task_tree:
            task.finish()

        log.log(24, "Saving task tree...")
        annotate_node(task.task_tree, task)
        db.update_node(nid=task.nodeid,
                       runid=task.threadid,
                       newick=db.encode(task.task_tree))
        db.commit()

        # treebuilderclass is a class, not an instance, so compare by identity
        if treebuilderclass is not DummyTree and npr_conf.max_iters > 1:
            current_iter = get_iternumber(threadid)
            if npr_conf.max_iters and current_iter >= npr_conf.max_iters:
                log.warning("Maximum number of iterations reached!")
            else:
                # Add new nodes
                source_seqtype = "aa" if "aa" in GLOBALS["seqtypes"] else "nt"
                ttree, mtree = task.task_tree, task.main_tree
                log.log(26, "Processing tree: %s seqs, %s outgroups",
                        len(target_seqs), len(out_seqs))
                alg_path = node_info.get("clean_alg_path", node_info["alg_path"])
                for node, seqs, outs, wkname in get_next_npr_node(threadid, ttree,
                                                          task.out_seqs, mtree,
                                                          alg_path, npr_conf):
                    log.log(24, "Registering new node: %s seqs, %s outgroups",
                            len(seqs), len(outs))
                    new_task_node = Msf(seqs, outs, seqtype=source_seqtype)
                    new_task_node.target_wkname = wkname
                    new_tasks.append(new_task_node)
    return new_tasks
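The constraint data registered in the "msf" branch above is plain string assembly and can be reproduced in isolation. With toy sequence names (hypothetical, for illustration only), the newick string groups all outgroups against all targets, and the companion "alignment" encodes group membership as a single binary column per sequence:

    out_seqs = {"out1", "out2"}
    target_seqs = {"tgtA", "tgtB", "tgtC"}
    constrain_tree = "(%s, (%s));" % (','.join(sorted(out_seqs)),
                                      ','.join(sorted(target_seqs)))
    _outs = "\n".join(">%s\n0" % name for name in sorted(out_seqs))
    _tars = "\n".join(">%s\n1" % name for name in sorted(target_seqs))
    constrain_alg = '\n'.join([_outs, _tars])
    print(constrain_tree)  # -> (out1,out2, (tgtA,tgtB,tgtC));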