示例#1
0
def genNamesDict(resolver, logger, parentid=None):
    """Return a dictionary containing all names and metadata.

    Args:
        resolver: object whose ``retrieve(key)`` method is called with
            'query_name', 'classification_path', 'classification_path_ids'
            and 'classification_path_ranks'; each result is iterated once
            per resolved name.
        logger: passed through to ``etools.findChildren``.
        parentid: taxonomic ID to return as the shared parent. When falsy
            it is worked out from the lineages.

    Returns:
        A tuple ``(namesdict, allrankids, parentid)`` where ``namesdict``
        maps each key of the TaxDict to a dict with the keys "txids",
        "unique_name" and "rank", ``allrankids`` is a list of the unique
        lineage IDs as ints, and ``parentid`` is the given or derived
        parent ID.

    Raises:
        ValueError: if no parentid is given and the first lineage shares
            no leading entry with all the other lineages.
    """
    # extract lists from resolver
    qnames = resolver.retrieve('query_name')
    qnames = [re.sub(r"\s", "_", e) for e in qnames]  # no spaces
    ranks = resolver.retrieve('classification_path_ranks')
    # in order to get name, ID and rank for the context let's use a zipped
    #  lineage IDs
    lids = resolver.retrieve('classification_path_ids')
    lineages = zip(resolver.retrieve('classification_path'), lids,
                   resolver.retrieve('classification_path_ranks'))
    # materialise each lineage: it is searched with `in` and indexed below,
    #  which a one-shot zip iterator (Python 3) cannot support
    lineages = [list(zip(e1, e2, e3)) for e1, e2, e3 in lineages]
    # make taxdict
    taxdict = TaxDict(idents=qnames, ranks=ranks, lineages=lineages)
    # get all ranks, as unique ints
    allrankids = list(set(int(e) for ids in lids for e in ids))
    claimed = set(allrankids)  # constant-time membership tests in the loop
    # init namesdict
    namesdict = {}
    # loop through taxdict
    for key in taxdict.keys():
        cident = taxdict[key]['cident']  # Contextual data
        if cident:
            # if there is context data ....
            cname, cident, crank = cident  # unpack context name, ID and rank
            namesdict[key] = {"txids": [int(cident)], "unique_name": cname,
                              "rank": crank}
            continue
        # find unclaimed IDs using allrankids and searching children
        rident = taxdict[key]['ident']  # Resolved ID
        rank = taxdict[key]['rank']  # Resolved rank
        # find ids in the next level
        children = etools.findChildren(rident, logger=logger, next=True)
        # txids is decided afresh for every name; it must never carry over
        #  from a previous iteration
        txids = None
        if children:
            unclaimed = [int(e) for e in children]
            unclaimed = [e for e in unclaimed if e not in claimed]
            if len(unclaimed) > 5:
                # choose random subset of unclaimed
                txids = random.sample(unclaimed, 5)
            elif unclaimed:
                # few enough to use them all
                txids = unclaimed
            else:
                # if there are no unclaimed, just use children
                txids = children
        if txids is None:
            # if no children at all, just use rident
            txids = [str(rident)]
        namesdict[key] = {"txids": txids,
                          "unique_name": 'Non-unique resolution',
                          "rank": rank}
    # if no parent id given, work one out
    if not parentid:
        first = lineages[0]
        # count the leading entries of the first lineage that occur in
        #  every lineage
        nshared = 0
        for each in first:
            if not all(each in e for e in lineages):
                break
            nshared += 1
        if nshared == 0:
            # indexing with -1 here would silently pick the *last* entry
            raise ValueError('Lineages share no common parent')
        # last shared entry; when every entry is shared this is the end of
        #  the lineage
        parentid = int(first[nshared - 1][1])  # second one in tuple
    return namesdict, allrankids, parentid
示例#2
0
def getOutgroup(namesdict, parentid, logger, outgroupid=None, minrecords=1000):
    """Return namesdict with suitable outgroup.

    Args:
        namesdict: dictionary to which an "outgroup" entry is added (it is
            modified in place and also returned).
        parentid: taxonomic ID of the ingroup's parent; only used when no
            outgroupid is given.
        logger: passed through to the ``etools`` calls.
        outgroupid: taxonomic ID to use as the outgroup. When falsy an
            outgroup is searched for.
        minrecords: a candidate is accepted only if its nucleotide record
            count is strictly greater than this.

    Returns:
        ``namesdict`` with the key "outgroup" mapping to a dict with the
        keys "txids" (list of ints), "unique_name" and "rank".

    Raises:
        TaxonomicRankError: if the search reaches taxonomic ID '131567'
            without finding an outgroup.
    """
    # TODO: too complex, consider breaking up
    def findParent(parentid):
        # parent ID as given in the taxonomy record of parentid
        return etools.eFetch(parentid, logger=logger,
                             db="taxonomy")[0]['ParentTaxId']

    def getTaxIdMetaData(ncbi_id):
        # rank and scientific name of the first ID; ' et al.' is appended
        #  to both when more than one ID was given
        etal_bool = False
        if len(ncbi_id) > 1:
            ncbi_id = ncbi_id[0]
            etal_bool = True
        record = etools.eFetch(ncbi_id, logger=logger, db="taxonomy")[0]
        metadata = [record['Rank'], record['ScientificName']]
        if etal_bool:
            metadata = [e + ' et al.' for e in metadata]
        return metadata[0], metadata[1]
    # loop until a suitable outgroup is found. Criteria are:
    #  1. ids returned must belong to a sister group of all ids of
    #   names given
    #  2. ids must have nucleotide data (i.e.avoid returning extinct organisms)
    # assumptions:
    #  1. NCBI taxonomy is not paraphyletic
    if not outgroupid:
        # make sure parentid is string
        parentid = str(parentid)
        outgroup_ids = []
        while not outgroup_ids:
            # if parent id are Cellular Orgs, likely name resolution error
            #  or names given are too diverse
            if parentid == '131567':
                raise TaxonomicRankError()
            # get parent of parent
            grandparentid = findParent(parentid)
            # find all children
            candidates = etools.findChildren(grandparentid, logger=logger,
                                             next=True)
            # filter out children that are in ingroup; compare as strings
            #  so the ingroup is excluded whether IDs arrive as int or str
            candidates = [e for e in candidates if str(e) != parentid]
            # search genbank for nuc records
            for candidate in candidates:
                term = 'txid' + str(candidate) + '[PORGN]'
                nuc_record = etools.eSearch(term, logger=logger)
                # there must be more than minrecords nuc records
                if int(nuc_record['Count']) > minrecords:
                    outgroup_ids.append(candidate)
            # make grandparentid the new parentid, kept as a string so the
            #  comparisons above stay valid on the next pass
            parentid = str(grandparentid)
    else:
        outgroup_ids = [outgroupid]
    # add outgroup_ids to namesdict
    rank, unique_name = getTaxIdMetaData(outgroup_ids)
    # convert to ints
    outgroup_ids = [int(e) for e in outgroup_ids]
    namesdict["outgroup"] = {"txids": outgroup_ids, "unique_name": unique_name,
                             "rank": rank}
    return namesdict