def ncbi_place(self, state):
    """
    Find all reference species whose taxonomy overlaps the requested taxids.

    The taxids are read from ``self.state["taxids"]``. Each entry may hold a
    single taxid or several taxids separated by commas and/or whitespace;
    non-string entries (e.g. integers) are converted to strings first.
    Requires the database to be loaded.

    :param state: dict giving the database file paths under
        ``state["dbinfo"]["files"]`` (keys ``"taxinfo"`` and ``"scmgs"``)
    :return: dict with ``"placements"`` (a list of ``{"n": node}`` dicts) and
        ``"genomes"`` (the set of matching nodes)
    """
    # NOTE(review): the taxids come from self.state while the file paths come
    # from the ``state`` argument; presumably both are the same object -
    # confirm against the caller.

    # Normalise the taxids into a set of clean string tokens. Turning commas
    # into spaces and splitting on whitespace accepts "1,2", "1, 2" and "1 2"
    # alike and drops empty tokens.
    taxa = set()
    for entry in self.state["taxids"]:
        taxa.update(str(entry).replace(",", " ").split())

    info = load_tax_info(state["dbinfo"]["files"]["taxinfo"])

    # keep every node whose taxonomy entries share at least one taxid with
    # the requested ones
    nodes = {node for node, lineage in info.items() if taxa & set(lineage)}

    # make sure these are also in our SCMG set
    scmgs = load_SCMGs(state["dbinfo"]["files"]["scmgs"])
    nodes = nodes & set(scmgs.keys())

    placements = [{"n": node} for node in nodes]
    logging.info(
        "Located {} species corresponding to the provided taxids".format(
            len(nodes)))
    return {"placements": placements, "genomes": nodes}
def test_scmg_loading_gz(self):
    # The gzipped fixture must load into this exact mapping of keys to sets.
    loaded = load_SCMGs(TESTDATA_SCMG_GZ)
    wanted = {
        "A": {"1", "2"},
        "B": {"1", "2", "3"},
        "C": {"1"},
    }
    self.assertEqual(wanted, loaded)
def hard_set_computation(set_path, genomes, prevalence=98, atmost=500, set_size=20):
    """
    Compute a marker set from the list of genomes passed in.

    Starting at a prevalence of 100 and lowering it in steps of 0.5 down to
    ``prevalence`` (inclusive), ``percentage_sets`` is evaluated on the marker
    sets of the given genomes. The first result holding at least ``set_size``
    entries is returned.

    :param set_path: path handed to ``load_SCMGs`` to load the per-genome sets
    :param genomes: sized collection of genome identifiers to look up
    :param prevalence: lowest prevalence that is still tried
    :param atmost: forwarded unchanged to ``percentage_sets``
    :param set_size: minimal size a set must have to be accepted
    :return: the first sufficiently large set, or None if none was found
    """
    scmg = load_SCMGs(set_path)

    # The per-genome lookup does not depend on the prevalence, so do it once
    # up front. This also reports every missing genome a single time instead
    # of once per prevalence step.
    marker_sets = []
    for genome in genomes:
        try:
            marker_sets.append(scmg[genome])
        except KeyError:
            logging.warning(
                "Database missing markes for '{}'. This should not be the case. Make sure the database is not corrupted"
                .format(genome))

    set_prevalence = 100
    biggest = 0
    while set_prevalence >= prevalence:
        logging.debug(
            "Searching for Marker set at {} prevalence across {} genomes".
            format(set_prevalence, len(genomes)))
        # hand over a fresh list on every step, as the original code did
        s = percentage_sets(list(marker_sets), set_prevalence, atmost)
        biggest = max(biggest, len(s))
        if len(s) >= set_size:
            logging.debug("Largest set we found had {} SCMGs".format(biggest))
            logging.debug("Found set of size {} with prevalence {}".format(
                len(s), set_prevalence))
            return s
        set_prevalence = set_prevalence - 0.5

    logging.debug("Largest set we found had {} SCMGs".format(biggest))
    return None
def __init__(
    self,
    tree_v,
    placement,
    setp,
    set_species=5,
    set_size=50,
    set_prevalence=98,
    set_atmost=500,
    dynamic_root=False,
    set_selection="lm",
    use_ncbi=False,
    training=False,
    taxinfo=None,
):
    """
    Build the tree from ``tree_v`` and select a marker set for the placements.

    :param tree_v: value handed to ``Tree`` to construct ``self.t``
    :param placement: dict whose ``"placements"`` entries each carry an
        ``"n"`` value that is either a string or a list
    :param setp: path handed to ``load_SCMGs``
    :param set_species: forwarded as ``set_species`` to the set search
    :param set_size: forwarded as ``min_set_size`` to the set search
    :param set_prevalence: forwarded as ``min_prevalence`` to the set search
    :param set_atmost: forwarded as ``set_atmost`` to the set search
    :param dynamic_root: if true, re-root the tree on the node farthest from
        the LCA of the placements
    :param set_selection: forwarded as ``sort_using`` (non-NCBI search only)
    :param use_ncbi: if true, use ``_find_best_ncbi_set`` instead of
        ``_find_best_set``
    :param training: forwarded to ``_find_best_set``; the summary log line is
        only written when this is exactly ``False``
    :param taxinfo: handed to ``load_tax_info`` and to the NCBI set search
    """
    self.t = Tree(tree_v)

    # flatten the placement names into one list; values that are neither a
    # list nor a str are skipped
    places = []
    for entry in placement["placements"]:
        name = entry["n"]
        if type(name) is list:
            places.extend(name)
        elif type(name) is str:
            places.append(name)

    # root the tree to get best clade patterns
    if dynamic_root:
        logging.debug("Will use most distant entry to LCA as outgroup")
        self.lca = self.LCA(places)
        farthest = self.lca.get_farthest_node()
        self.t.set_outgroup(farthest[0])

    # load in all marker genes
    scmg = load_SCMGs(setp)
    self.known_leafes = set(load_tax_info(taxinfo).keys())

    logging.debug(
        "Starting to look for scmg set, selection based on {}".format(
            set_selection))

    # keyword arguments shared by both search strategies
    shared_options = {
        "min_set_size": set_size,
        "set_atmost": set_atmost,
        "set_species": set_species,
        "min_prevalence": set_prevalence,
    }
    if use_ncbi:
        logging.debug("Will use NCBI tree instead of eukcc tree")
        self.marker_set = self._find_best_ncbi_set(
            places, scmg, taxinfo=taxinfo, **shared_options)
    else:
        # expose final prevalence
        self.marker_set = self._find_best_set(
            places,
            scmg,
            training=training,
            sort_using=set_selection,
            **shared_options)

    if training is not False or self.marker_set is None:
        return

    chosen = self.marker_set
    logging.debug(
        "Defined SCMG set with {} marker genes with a single copy prevalence of {} percent covering {} related genomes supported by {}/{} placements"
        .format(
            len(chosen.profiles),
            chosen.prevalence,
            len(chosen.leafes),
            len(chosen.covered),
            len(chosen.all_places),
        ))
def test_scmg_loading_csv(self):
    # Both fixtures must load to equal results.
    from_csv = load_SCMGs(TESTDATA_SCMG)
    from_gz = load_SCMGs(TESTDATA_SCMG_GZ)
    self.assertEqual(from_csv, from_gz)
def test_scmg_missing(self):
    # Loading a path that does not exist must raise FileNotFoundError.
    self.assertRaises(FileNotFoundError, load_SCMGs, "adfhjdshjskfjf")