def genNamesDict(resolver, logger, parentid=None):
    """Return a dictionary containing all names and metadata.

    Args:
        resolver: object exposing ``retrieve(key)``. It is queried for
            'query_name', 'classification_path', 'classification_path_ids'
            and 'classification_path_ranks'.
        logger: passed through to ``etools.findChildren``.
        parentid: taxon ID shared by all names. If falsy, it is worked out
            from the lineages.

    Returns:
        A tuple ``(namesdict, allrankids, parentid)`` where ``namesdict``
        maps each TaxDict key to a dict with keys "txids", "unique_name"
        and "rank", ``allrankids`` is the list of unique lineage IDs (as
        ints) and ``parentid`` is the given or derived parent ID.
    """
    # extract lists from resolver; spaces in query names become underscores
    qnames = [re.sub(r"\s", "_", e) for e in resolver.retrieve('query_name')]
    ranks = resolver.retrieve('classification_path_ranks')
    # in order to get name, ID and rank for the context let's use zipped
    # lineages: one list of (name, ID, rank) tuples per query name
    lids = resolver.retrieve('classification_path_ids')
    lineages = zip(resolver.retrieve('classification_path'), lids,
                   resolver.retrieve('classification_path_ranks'))
    # materialise every lineage: they are iterated repeatedly below and a
    # bare zip() is a one-shot iterator on Python 3
    lineages = [list(zip(e1, e2, e3)) for e1, e2, e3 in lineages]
    # make taxdict
    taxdict = TaxDict(idents=qnames, ranks=ranks, lineages=lineages)
    # all IDs that appear in any lineage, as ints; the set is kept for fast
    # membership tests, the list is what gets returned
    claimed = {int(e) for path in lids for e in path}
    allrankids = list(claimed)
    namesdict = {}
    for key in taxdict.keys():
        entry = taxdict[key]
        context = entry['cident']  # Contextual data
        if context:
            # unpack context name, ID and rank
            cname, cident, crank = context
            namesdict[key] = {"txids": [int(cident)], "unique_name": cname,
                              "rank": crank}
            continue
        # no context data: find unclaimed IDs by searching the children of
        # the resolved ID
        rident = entry['ident']  # Resolved ID
        rank = entry['rank']  # Resolved rank
        children = etools.findChildren(rident, logger=logger, next=True)
        # txids is assigned on every path so that a value from a previous
        # name can never leak into this one
        if not children:
            # no children at all, just use rident
            txids = [str(rident)]
        else:
            unclaimed = [int(e) for e in children]
            unclaimed = [e for e in unclaimed if e not in claimed]
            if len(unclaimed) > 5:
                # choose random subset of unclaimed
                txids = random.sample(unclaimed, 5)
            elif unclaimed:
                # five or fewer: use them all
                txids = unclaimed
            else:
                # if there are no unclaimed, just use children
                txids = children
        namesdict[key] = {"txids": txids,
                          "unique_name": 'Non-unique resolution',
                          "rank": rank}
    # if no parent id given, work one out: the last node of the first
    # lineage that is present in every lineage
    if not parentid:
        first = lineages[0]
        shared_bool = [all(each in e for e in lineages) for each in first]
        try:
            first_unshared = shared_bool.index(False)
        except ValueError:
            # every node is shared (e.g. only one name): use the last node
            first_unshared = len(shared_bool)
        # NOTE(review): if the very first node is already unshared, the
        # index wraps to -1 and the last node of the first lineage is used,
        # as before -- confirm whether that case should be an error.
        parentid = first[first_unshared - 1]
        parentid = int(parentid[1])  # second one in tuple is the ID
    return namesdict, allrankids, parentid
def getOutgroup(namesdict, parentid, logger, outgroupid=None, minrecords=1000):
    """Return namesdict with suitable outgroup.

    Args:
        namesdict: dictionary of names; it is modified in place by adding
            an "outgroup" entry and is also returned.
        parentid: taxon ID of the group containing all names.
        logger: passed through to the ``etools`` calls.
        outgroupid: if given, used directly as the outgroup instead of
            searching for one.
        minrecords: a candidate is only accepted if its nucleotide record
            count is greater than this.

    Returns:
        ``namesdict`` with ``namesdict["outgroup"]`` set to a dict with
        keys "txids" (list of ints), "unique_name" and "rank".

    Raises:
        TaxonomicRankError: if the search reaches taxon '131567' or a taxon
            that is reported as its own parent.
    """
    # TODO: too complex, consider breaking up
    def findParent(taxid):
        record = etools.eFetch(taxid, logger=logger, db="taxonomy")[0]
        return record['ParentTaxId']

    def getTaxIdMetaData(ncbi_ids):
        # with several IDs, describe the first one and flag it 'et al.'
        several = len(ncbi_ids) > 1
        query = ncbi_ids[0] if several else ncbi_ids
        record = etools.eFetch(query, logger=logger, db="taxonomy")[0]
        rank, name = record['Rank'], record['ScientificName']
        if several:
            rank, name = rank + ' et al.', name + ' et al.'
        return rank, name

    # loop until a suitable outgroup is found. Criteria are:
    #  1. ids returned must belong to a sister group of all ids of
    #     names given
    #  2. ids must have nucleotide data (i.e. avoid returning extinct
    #     organisms)
    # assumptions:
    #  1. NCBI taxonomy is not paraphyletic
    if outgroupid:
        outgroup_ids = [outgroupid]
    else:
        # IDs are compared as strings throughout so that a mix of int and
        # str IDs cannot defeat the checks below
        parentid = str(parentid)
        outgroup_ids = []
        while not outgroup_ids:
            # if parent id are Cellular Orgs, likely name resolution error
            # or names given are too diverse
            if parentid == '131567':
                raise TaxonomicRankError()
            # get parent of parent
            grandparentid = str(findParent(parentid))
            if grandparentid == parentid:
                # no further level to climb: the children found below would
                # be the ingroup's own children, not sister groups
                raise TaxonomicRankError()
            # find all children
            candidates = etools.findChildren(grandparentid, logger=logger,
                                             next=True)
            for candidate in candidates:
                # filter out the child that is the ingroup
                if str(candidate) == parentid:
                    continue
                # search genbank for nuc records
                term = 'txid' + str(candidate) + '[PORGN]'
                nuc_record = etools.eSearch(term, logger=logger)
                # there must be more than minrecords nuc records
                if int(nuc_record['Count']) > minrecords:
                    outgroup_ids.append(candidate)
            # make grandparentid the new parentid
            parentid = grandparentid
    # add outgroup_ids to namesdict
    rank, unique_name = getTaxIdMetaData(outgroup_ids)
    # convert to ints
    outgroup_ids = [int(e) for e in outgroup_ids]
    namesdict["outgroup"] = {"txids": outgroup_ids,
                             "unique_name": unique_name, "rank": rank}
    return namesdict