Example #1
File: cog_creator.py Project: Ward9250/ete
    def finish(self):
        tm_start = time.ctime()
        all_species = self.targets | self.outgroups
        cogs, cog_analysis = brh_cogs2(db, all_species,
                                       missing_factor=self.missing_factor,
                                       seed_sp=self.seed)
        self.raw_cogs = cogs
        self.cog_analysis = cog_analysis
        self.cogs = []
        for co in cogs:
            # self.cogs.append(map(encode_seqname, co))
            encoded_names = db.translate_names(co)
            if len(encoded_names) != len(co):
                print(set(co) - set(encoded_names.keys()))
                raise DataError("Some sequence ids could not be translated")
            self.cogs.append(list(encoded_names.values()))

        # Sort each COG internally, then sort the list of COGs by the md5
        # hash of its content: effectively random, but stable across runs.
        for co in self.cogs:
            co.sort()
        self.cogs.sort(key=lambda x: md5(','.join(x)))
        log.log(28, "%s COGs detected" % len(self.cogs))
        tm_end = time.ctime()
        #open(pjoin(self.taskdir, "__time__"), "w").write(
        #    '\n'.join([tm_start, tm_end]))
        CogSelectorTask.store_data(self, self.cogs, self.cog_analysis)
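Every example on this page passes plain strings to md5() and sorts or joins the results, so the project's md5 is presumably a thin hashlib wrapper that returns the hex digest as a string. A minimal sketch, assuming exactly that behavior:

import hashlib

def md5(text):
    # assumed helper: accept str (or bytes) and return the hex digest string
    if isinstance(text, str):
        text = text.encode("utf-8")
    return hashlib.md5(text).hexdigest()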
Example #2
File: master_task.py Project: Ward9250/ete
    def load_task_info(self):
        '''Initialize task information: generate a unique task id based on
        the node, arguments and sibling jobs, and set the task working
        directory.'''

        # Create a task id based on the target node and job arguments. Tasks
        # with the same target and parameters yield the same id, so it is
        # easy to check whether a task has already been completed in the
        # working path.
        if not self.taskid:
            args_id = md5(','.join(sorted(["%s %s" %(str(pair[0]), str(pair[1]))
                                           for pair in six.iteritems(self.args)])))

            unique_id = md5(','.join([self.nodeid, self._config_id, args_id] +\
                                         sorted([getattr(j, "jobid", "taskid")
                                                 for j in self.jobs])))
            self.taskid = unique_id
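The point of sorting the "key value" pairs before hashing is that the resulting taskid is independent of dict iteration order. A small self-contained check of that property (hypothetical args_digest helper, standard hashlib instead of the project's md5 wrapper):

import hashlib

def args_digest(args):
    # sort the rendered pairs so insertion order cannot affect the digest
    joined = ','.join(sorted("%s %s" % (k, v) for k, v in args.items()))
    return hashlib.md5(joined.encode()).hexdigest()

assert args_digest({"-m": "LG", "-b": 100}) == args_digest({"-b": 100, "-m": "LG"})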
Example #3
File: concat_alg.py Project: Ward9250/ete
    def __init__(self, cogs, seqtype, conf, confname, workflow_checksum):
        self.confname = confname
        self.conf = conf
        #self.cogs_hard_limit = int(conf[confname]["_max_cogs"])
        #used_cogs = cogs[:self.cogs_hard_limit]
        used_cogs = cogs

        cog_string = '#'.join([','.join(sorted(c)) for c in used_cogs])
        cog_keyid = md5(cog_string)  # this will be used as the nodeid
        base_args = {}
        ConcatAlgTask.__init__(self, cog_keyid, "concat_alg", "ConcatAlg",
                               workflow_checksum=workflow_checksum,
                               base_args=base_args, extra_args=conf[confname])
        self.avail_cogs = len(cogs)
        self.used_cogs = len(used_cogs)
        self.cogs = used_cogs
        self.seqtype = seqtype
        self.cog_ids = set()

        self.job2alg = {}
        self.job2model = {}
        if seqtype == "aa":
            self.default_model = conf[confname]["_default_aa_model"]
        elif seqtype == "nt":
            self.default_model = conf[confname]["_default_nt_model"]

        self.genetree_workflow = conf[confname]["_workflow"][1:]
        self.init()
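Each COG is sorted before joining, so the nodeid derived from cog_keyid does not depend on the order of members inside a COG (the order of the COGs themselves still matters). A quick illustration with hypothetical sequence names:

import hashlib

def cog_key(cogs):
    # mirrors the recipe above: sort members, join with ',' and '#', hash
    cog_string = '#'.join([','.join(sorted(c)) for c in cogs])
    return hashlib.md5(cog_string.encode()).hexdigest()

a = [["sp1_g1", "sp2_g7"], ["sp1_g3", "sp3_g2"]]
b = [["sp2_g7", "sp1_g1"], ["sp3_g2", "sp1_g3"]]
assert cog_key(a) == cog_key(b)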
Example #4
File: db.py Project: Ward9250/ete
def add_task_data(taskid, datatype, data, duplicates="OR IGNORE"):
    data_id = md5(str(data))
    # The conflict clause must be interpolated, but every value is bound
    # through sqlite placeholders to avoid quoting/escaping problems.
    cmd = "INSERT %s INTO task (taskid, status) VALUES (?, 'D')" % duplicates
    datacursor.execute(cmd, (taskid,))

    cmd = ("INSERT %s INTO task2data (taskid, datatype, md5) VALUES (?, ?, ?)"
           % duplicates)
    datacursor.execute(cmd, (taskid, datatype, data_id))

    cmd = "INSERT %s INTO data (md5, data) VALUES (?, ?)" % duplicates
    datacursor.execute(cmd, (data_id, zencode(data, data_id)))
    autocommit()
    return data_id
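Because data_id is md5(data), the data table is content-addressed: re-inserting an identical payload with "OR IGNORE" is a harmless no-op, which is what makes the function idempotent. A hypothetical, self-contained demonstration of that idea (in-memory sqlite, simplified schema):

import hashlib
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE data (md5 TEXT PRIMARY KEY, data TEXT)")

def put(data):
    data_id = hashlib.md5(data.encode()).hexdigest()
    conn.execute("INSERT OR IGNORE INTO data (md5, data) VALUES (?, ?)",
                 (data_id, data))
    return data_id

assert put("hello") == put("hello")  # same content, same id
assert conn.execute("SELECT COUNT(*) FROM data").fetchone()[0] == 1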
Example #5
File: master_job.py Project: Ward9250/ete
    def __init__(self, bin, args, jobname=None, parent_ids=None):
        # Used at execution time
        self.status = None
        # How to run the app
        self.bin = bin
        # command line arguments
        self.args = args
        # Default number of cores used by the job. If more than 1,
        # this attribute should be changed
        self.cores = 1
        self.exec_type = "insitu"
        self.jobname = jobname

        # Generate the unique job identifier based on the params of the app.
        # Some params include path names that would prevent recycling the
        # job, so they are cleaned first.
        def clean(x):
            # strip run-specific directories so identical jobs hash equally
            return basename(x) if (GLOBALS["basedir"] in x or
                                   GLOBALS["tasks_dir"] in x) else x
        parsed_id_string = ["%s %s" % (clean(str(pair[0])), clean(str(pair[1])))
                            for pair in six.iteritems(self.args)]
        #print '\n'.join(map(str, self.args.items()))

        self.jobid = md5(','.join(sorted([md5(e) for e in
                                          parsed_id_string])))
        # self.jobid = md5(','.join(sorted([md5(str(pair)) for pair in
        #                                  self.args.iteritems()])))
        if parent_ids:
            self.jobid = md5(','.join(sorted(parent_ids + [self.jobid])))

        if not self.jobname:
            self.jobname = re.sub("[^0-9a-zA-Z]", "-", basename(self.bin))

        self.ifdone_cmd = ""
        self.iffail_cmd = ""
        self.set_jobdir(pjoin(GLOBALS["tasks_dir"], self.jobid))
        self.input_files = {}
        self.dependencies = set()
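Reducing path arguments to their basename before hashing is what lets a job started from a different working directory recycle an earlier jobid. A hypothetical reconstruction of that property (simplified clean/jobid helpers):

import hashlib
from os.path import basename

def jobid(args, basedir):
    def clean(x):
        # drop run-specific directories so identical jobs hash equally
        return basename(x) if basedir in x else x
    parts = ["%s %s" % (clean(str(k)), clean(str(v)))
             for k, v in args.items()]
    digests = sorted(hashlib.md5(p.encode()).hexdigest() for p in parts)
    return hashlib.md5(','.join(digests).encode()).hexdigest()

assert (jobid({"-i": "/run_A/tasks/alg.fasta"}, "/run_A") ==
        jobid({"-i": "/run_B/tasks/alg.fasta"}, "/run_B"))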
Example #6
File: master_task.py Project: Ward9250/ete
    def __init__(self, nodeid, task_type, task_name, base_args=None,
                 extra_args=None):

        if not base_args: base_args = {}
        if not extra_args: extra_args = {}

        self.taskid = None

        # This defines which task processor should be used
        # (e.g. genetree, sptree).
        self.task_processor = None

        # Nodeid is used to identify the tree node associated with
        # the task. It is calculated as a hash string based on the
        # list of sequence IDs grouped by the node.
        self.nodeid = nodeid

        # task type: "alg|tree|acleaner|mchooser|etc."
        self.ttype = task_type

        # Used only to name directories and identify task in log
        # messages
        self.tname = task_name

        # Task status: (D)one, (R)unning, (W)aiting or (Un)finished
        #self.status_file = None
        #self.inkey_file = None
        #self.info_file = None
        self.status = "W"
        self.all_status = None

        # keeps a counter of how many cores are being used by running jobs
        self.cores_used = 0

        self.job_status = {}

        # Set arguments that could be sent to jobs
        self.args = merge_arg_dicts(extra_args, base_args, parent=self)

        # Extract all internal config values associated with this task and
        # generate its unique id (later used to generate the taskid)
        self._config_id = md5(','.join(sorted(
            ["%s %s" % (str(pair[0]), str(pair[1]))
             for pair in six.iteritems(extra_args)
             if pair[0].startswith("_")])))
        self.dependencies = set()
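Only keys starting with "_" feed _config_id, so cosmetic options cannot invalidate previously computed tasks. A hypothetical check of that filter with plain hashlib:

import hashlib

def config_id(extra_args):
    pairs = ["%s %s" % (k, v) for k, v in extra_args.items()
             if k.startswith("_")]  # internal settings only
    return hashlib.md5(','.join(sorted(pairs)).encode()).hexdigest()

assert (config_id({"_model": "LG", "verbose": True}) ==
        config_id({"_model": "LG", "verbose": False}))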
Example #7
File: supermatrix.py Project: fmaguire/ete
def process_task(task, wkname, npr_conf, nodeid2info):
    cogconf, cogclass = npr_conf.cog_selector
    concatconf, concatclass = npr_conf.alg_concatenator
    treebuilderconf, treebuilderclass = npr_conf.tree_builder
    splitterconf, splitterclass = npr_conf.tree_splitter

    threadid, nodeid, seqtype, ttype = (task.threadid, task.nodeid,
                                        task.seqtype, task.ttype)
    cladeid, targets, outgroups = db.get_node_info(threadid, nodeid)

    if not treebuilderclass or task.size < 4:
        # Allows dumping alignments in workflows with no tree tasks, or when
        # tree inference does not make sense given the number of sequences.
        # DummyTree will produce a fake, fully collapsed newick tree.
        treebuilderclass = DummyTree

    if outgroups and len(outgroups) > 1:
        constrain_id = nodeid
    else:
        constrain_id = None

    node_info = nodeid2info[nodeid]
    conf = GLOBALS[task.configid]
    new_tasks = []
    if ttype == "cog_selector":

        # Generate an md5 id based on the genetree configuration workflow
        # used for the concat alg task. If anything changes, the concat alg
        # id will change and the associated tree will be rebuilt.
        config_blocks = set([wkname])
        for key, value in six.iteritems(conf[wkname]):
            if isinstance(value, (list, tuple, set)):
                for elem in value:
                    if isinstance(elem, str) and elem.startswith("@"):
                        config_blocks.add(elem[1:])
            elif isinstance(value, str) and value.startswith("@"):
                config_blocks.add(value[1:])
        config_checksum = md5(''.join(["[%s]\n%s" % (x, dict_string(conf[x]))
                                       for x in sorted(config_blocks)]))

        # THIS PART HAS BEEN MOVED TO COG_SELECTOR TASK
        # Check that current selection of cogs will cover all target and
        # outgroup species
        #cog_hard_limit = int(conf[concatconf]["_max_cogs"])
        #sp_repr = defaultdict(int)
        #for co in task.raw_cogs[:cog_hard_limit]:
        #    for sp, seq in co:
        #        sp_repr[sp] += 1
        #missing_sp = (targets | outgroups) - set(sp_repr.keys())
        #if missing_sp:
        #    raise TaskError("missing species under current cog selection: %s" %missing_sp)
        #else:
        #    log.log(28, "Analysis of current COG selection:")
        #    for sp, ncogs in sorted(sp_repr.items(), key=lambda x:x[1]):
        #        log.log(28, "   % 30s species present in % 6d COGs" %(sp, ncogs))

        # register concat alignment task. NodeId associated to concat_alg tasks
        # and all its children jobs should take into account cog information and
        # not only species and outgroups included.

        concat_job = concatclass(task.cogs, seqtype, conf, concatconf,
                                 config_checksum)
        db.add_node(threadid,
                    concat_job.nodeid, cladeid,
                    targets, outgroups)

        # Register tree constraints
        constrain_tree = "(%s, (%s));" % (','.join(sorted(outgroups)),
                                          ','.join(sorted(targets)))
        _outs = "\n".join([">%s\n0" % name for name in sorted(outgroups)])
        _tars = "\n".join([">%s\n1" % name for name in sorted(targets)])
        constrain_alg = '\n'.join([_outs, _tars])
        db.add_task_data(concat_job.nodeid, DATATYPES.constrain_tree, constrain_tree)
        db.add_task_data(concat_job.nodeid, DATATYPES.constrain_alg, constrain_alg)
        db.dataconn.commit() # since the creation of some Task objects
                             # may require this info, I need to commit
                             # right now.
        concat_job.size = task.size
        new_tasks.append(concat_job)

    elif ttype == "concat_alg":
        # register tree for concat alignment, using constraint tree if
        # necessary
        alg_id = db.get_dataid(task.taskid, DATATYPES.concat_alg_phylip)
        try:
            parts_id = db.get_dataid(task.taskid, DATATYPES.model_partitions)
        except ValueError:
            parts_id = None

        nodeid2info[nodeid]["size"] = task.size
        nodeid2info[nodeid]["target_seqs"] = targets
        nodeid2info[nodeid]["out_seqs"] = outgroups

        tree_task = treebuilderclass(nodeid, alg_id,
                                     constrain_id, None,
                                     seqtype, conf, treebuilderconf,
                                     parts_id=parts_id)
        tree_task.size = task.size
        new_tasks.append(tree_task)

    elif ttype == "tree":
        merger_task = splitterclass(nodeid, seqtype, task.tree_file, conf, splitterconf)
        merger_task.size = task.size
        new_tasks.append(merger_task)

    elif ttype == "treemerger":
        # Let's merge with the main tree
        if not task.task_tree:
            task.finish()

        log.log(24, "Saving task tree...")
        annotate_node(task.task_tree, task)
        db.update_node(nid=task.nodeid, runid=task.threadid,
                       newick=db.encode(task.task_tree))
        db.commit()

        # treebuilderclass holds a class, not an instance, so test identity
        # rather than isinstance() (which would always be False here)
        if treebuilderclass is not DummyTree and npr_conf.max_iters > 1:
            current_iter = get_iternumber(threadid)
            if npr_conf.max_iters and current_iter >= npr_conf.max_iters:
                log.warning("Maximum number of iterations reached!")
            else:
                # Add new nodes
                source_seqtype = "aa" if "aa" in GLOBALS["seqtypes"] else "nt"
                ttree, mtree = task.task_tree, task.main_tree

                log.log(26, "Processing tree: %s seqs, %s outgroups",
                        len(targets), len(outgroups))

                target_cladeids = None
                if tobool(conf[splitterconf].get("_find_ncbi_targets", False)):
                    tcopy = mtree.copy()
                    ncbi.connect_database()
                    tax2name, tax2track = ncbi.annotate_tree_with_taxa(tcopy, None)
                    #tax2name, tax2track = ncbi.annotate_tree_with_taxa(tcopy, "fake") # for testing sptree example
                    n2content = tcopy.get_cached_content()
                    broken_branches, broken_clades, broken_clade_sizes, tax2name = ncbi.get_broken_branches(tcopy, n2content)
                    log.log(28, 'restricting NPR to broken clades: ' +
                            colorify(', '.join(["%s" % tax2name[x]
                                                for x in broken_clades]), "wr"))
                    target_cladeids = set()
                    for branch in broken_branches:
                        print(branch.get_ascii(attributes=['spname', 'taxid'], compact=True))
                        print(["%s"%tax2name[x] for x in broken_branches[branch]])
                        target_cladeids.add(branch.cladeid)

                for node, seqs, outs, wkname in get_next_npr_node(
                        task.configid, ttree, task.out_seqs, mtree,
                        None,  # None avoids alignment checks
                        npr_conf, target_cladeids):
                    log.log(24, "Adding new node: %s seqs, %s outgroups",
                            len(seqs), len(outs))
                    new_task_node = cogclass(seqs, outs,
                                             source_seqtype, conf, cogconf)
                    new_task_node.target_wkname = wkname
                    new_tasks.append(new_task_node)
                    db.add_node(threadid,
                                new_task_node.nodeid, new_task_node.cladeid,
                                new_task_node.targets,
                                new_task_node.outgroups)
    return new_tasks
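For reference, a worked example (hypothetical species names) of the constraint payloads built in the "cog_selector" branch above: a two-level newick constraint plus a binary pseudo-alignment that marks outgroups with 0 and targets with 1:

outgroups = {"spB", "spA"}
targets = {"spD", "spC"}

constrain_tree = "(%s, (%s));" % (','.join(sorted(outgroups)),
                                  ','.join(sorted(targets)))
_outs = "\n".join([">%s\n0" % name for name in sorted(outgroups)])
_tars = "\n".join([">%s\n1" % name for name in sorted(targets)])
constrain_alg = '\n'.join([_outs, _tars])

assert constrain_tree == "(spA,spB, (spC,spD));"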