def addDivisionColumn(clusterTable, outputName):
    """Copy a cluster table, inserting a "Division" column after column one.

    The first whitespace-separated field of every data row is expected to
    look like "<taxid>_<anything>"; the integer in front of the first
    underscore is used as the taxonomy id.  The first non-blank line is
    treated as a header and receives the literal column title "Division".
    Blank lines are ignored and are not counted.

    Args:
        clusterTable: path of the whitespace-delimited input table.
        outputName: path of the tab-delimited file to create.

    Raises:
        ValueError: a data row's first field does not start with an integer.
        Exception: whatever ``taxRecords[taxId].getDivision()`` raises when
            no record exists for an id (presumably KeyError -- depends on
            the object returned by eutilsWrapper.getTaxa).
    """
    # Pass 1: collect the distinct taxonomy ids.  The header line also goes
    # through here and is reported as unparseable, as it always was.
    taxIds = set()
    with open(clusterTable) as table:
        for line in table:
            pieces = line.split()
            if not pieces:
                continue  # blank line: nothing to parse
            name = pieces[0]
            try:
                taxIds.add(int(name.split("_")[0]))
            except ValueError:
                print("Unable to get tax id for '%s'.  Skipping." % (name))

    # One batched lookup for every id seen in the table.
    print("Looking up taxonomy records for each id")
    taxRecords = eutilsWrapper.getTaxa(list(taxIds))

    # Pass 2: re-read the table and emit it with the extra second column.
    # Both handles are context-managed so the output is flushed and closed
    # even when a row fails.
    count = 0
    print("Writing output file")
    with open(clusterTable) as table, open(outputName, "w") as out:
        for line in table:
            pieces = line.split()
            if not pieces:
                continue
            count += 1
            if count == 1:
                division = "Division"  # header row
            else:
                taxId = int(pieces[0].split("_")[0])
                # Deliberately not caught: a missing record is fatal.
                division = taxRecords[taxId].getDivision()
            out.write("%s\t%s\t%s\n" %
                      (pieces[0], division, "\t".join(pieces[1:])))

    print("Wrote %s lines to the output file." % (count))
def addDivisionColumn(clusterTable, outputName):
    """Write a copy of clusterTable with a taxonomic "Division" column added.

    Each data row's first field must start with an integer taxonomy id
    followed by "_".  The ids are looked up in one call to
    eutilsWrapper.getTaxa and the division of each record is written as the
    new second column.  The first non-blank line is treated as the header.
    Blank lines are skipped and do not contribute to the reported count.

    Args:
        clusterTable: path of the whitespace-delimited input table.
        outputName: path of the tab-delimited output file.

    Raises:
        ValueError: a data row's first field does not start with an integer.
        Exception: whatever the record lookup raises for an id that
            eutilsWrapper.getTaxa did not return (presumably KeyError).
    """
    # First pass: gather the unique ids.  Rows whose id cannot be parsed
    # (this includes the header) are only reported here.
    taxIds = set()
    with open(clusterTable) as table:
        for line in table:
            fields = line.split()
            if not fields:
                continue  # ignore blank lines instead of raising IndexError
            name = fields[0]
            try:
                taxIds.add(int(name.split("_")[0]))
            except ValueError:
                print("Unable to get tax id for '%s'.  Skipping." % (name))

    print("Looking up taxonomy records for each id")
    taxRecords = eutilsWrapper.getTaxa(list(taxIds))

    # Second pass: copy the rows, inserting the division after column one.
    # The with-block guarantees the output file is closed (the previous
    # version never closed it).
    count = 0
    print("Writing output file")
    with open(clusterTable) as table, open(outputName, "w") as out:
        for line in table:
            fields = line.split()
            if not fields:
                continue
            count += 1
            rest = "\t".join(fields[1:])
            if count == 1:
                out.write("%s\t%s\t%s\n" % (fields[0], "Division", rest))
                continue

            taxId = int(fields[0].split("_")[0])
            # A missing record propagates to the caller on purpose.
            division = taxRecords[taxId].getDivision()
            out.write("%s\t%s\t%s\n" % (fields[0], division, rest))

    print("Wrote %s lines to the output file." % (count))
# Example #3
# 0
def reportTaxonomyJWC(blastInput,
                      printTargets=False,
                      outputPrefix=None,
                      dotPrefix=None,
                      dotLimit=1.0):
    """Given a blast input file, reports the taxonomy lineage for the hits.  By default the report
    will be printed to standard output, but if an outputPrefix is given then output files, one each
    for species, genus, family and division, will be created.
    The Blast input file should have already been run through a Best Blast filter if you want to only
    count unique hits.  Can have the extra counts column added, as per blastUtils.keepAllBestHits."""
    '''
    fungiLookup - dictionary mapping
        seqID (string) - the reference sequence name used for looking up blast hits
        to
        speciesID (string) - taxon ID number as text
    gis - dictionary mapping                (used as non-repeating list)
        gid (string) - the gid, the reference number used for looking up blast hits (as text)
        to 1
    accs - dictionary mapping                (used as non-repeating list)
        accesion numbers (without.version suffix) (string) - referenceces we are seeking to look up

    taxID_to_lookup - dictionary mapping    (used as non-repeating list)
        taxID (int) - taxon ID to lookup
        to 1
    taxonomy - dictionary mapping
        ###gid (string)
        accession number (without .version suffix) (string)
        to
        taxID (int)
    names
        taxID (int)
        to
        eutilsWrapper.Taxonomy object
    4tuple
        (float, int, int, float) representing (IMSA count, unique count, partial count, partial sum)

    taxaCount = {}
    speciesCount = {}
    genusCount = {}
    familyCount = {}
        map
        taxID (int)
        to
        4tuple

    lookups -- dictionary to store eUtil lookup results
        taxID (int)
        to
          _variable_               _class_
        (newTaxaID,             (int)
        single[newTaxaID])      eutilsWrapper.Taxonomy
    '''

    # STEP 110: build fungiLookup
    print >> sys.stderr, 'STEP 110: build fungiLookup'
    fungiLookup = {}  # lookup table from file
    print >> sys.stderr, '        : using lookup file ', fungiLookupFile
    for line in open(fungiLookupFile, 'r'):
        splits = line.split('\t')
        key = splits[0]
        speciesID = int(splits[1])
        fungiLookup[key] = speciesID

    # STEP 120:
    print >> sys.stderr, 'STEP 120: extract accession number targets, SID_to_lookup from file'

    #contains integer taxID to lookup in NCBI -- will contain FungiDB species and regular database taxID
    taxID_to_lookup = {}

    # dict containing gi's to lookup, requiring NCBI lookup
    #gis = {}   #removed for
    accs = {}

    #extractLookupInfo(gis, taxID_to_lookup, fungiLookup, blastInput)
    # gis is indexed by string numbers
    # taxID_to_lookup is indexed by integer numbers

    extractLookupInfo(accs, taxID_to_lookup, fungiLookup, blastInput)
    # gis is indexed by string numbers
    # taxID_to_lookup is indexed by integer numbers

    # STEP 130: write gis table to file
    print >> sys.stderr, 'STEP 130: write accession numbers table to file'
    outGis = open(outputPrefix + "ACCS.txt", "w")
    for key in accs:
        outGis.write("%s\n" % (key))
    outGis.close()

    # STEP 140: build taxonomy dictionary from gis -- parse local file
    print >> sys.stderr, 'STEP 140: build taxonomy dictionary from accs -- parse local file     LONG STEP'
    print >> sys.stderr, '       : checking for pickle'

    import cPickle as pickle
    if not os.path.exists(blastInput + ".taxonomy.pickle"):
        print >> sys.stderr, '       : no pickle to load'
        taxonomy = {}
        print >> sys.stderr, "       : Getting taxonomy ids from local file for %s accs" % (
            len(accs))

        #short circuit if no need to parse file
        if len(accs) > 0:
            print >> sys.stderr, '       :       : searching %s' % ACC_BLAST_TAX_DB
            for line in open(ACC_BLAST_TAX_DB):
                (acc, accv, taxid, gi) = line.split()
                if accs.has_key(acc):
                    taxonomy[acc] = int(taxid)
            print >> sys.stderr, '       :       : searching %s' % ACC_EXTRA_BLAST_TAX_DB
            for line in open(ACC_EXTRA_BLAST_TAX_DB):
                (acc, accv, taxid, gi) = line.split('\t')
                if accs.has_key(acc):
                    taxonomy[acc] = int(taxid)

        print >> sys.stderr, '       : creating pickle on disk'
        pickle.dump(taxonomy, open(blastInput + ".taxonomy.pickle", "wb"))
    else:
        print >> sys.stderr, '       : loading pickle'
        taxonomy = pickle.load(open(blastInput + ".taxonomy.pickle", "rb"))

    print >> sys.stderr, "       : Done getting taxonomy ids, got %s ids" % (
        len(taxonomy))

    # STEP 145: write Alignments to gi nubmers, which were not found
    print >> sys.stderr, 'STEP 145: write Alignments to accession numbers, which were not found'
    findingUnknownAlignments(taxonomy, blastInput,
                             outputPrefix + "unidentifiedTaxaAlignments.txt",
                             fungiLookup)

    # STEP 150: build taxID_to_lookup from taxonomy
    # could be combined in step 140, but 1 pickle is enough
    print >> sys.stderr, 'STEP 150: increase taxID_to_lookup from taxonomy list'
    for k in taxonomy:
        taxID_to_lookup[taxonomy[k]] = 1  # writing integers as keys

    #gis contains gi keys as strings
    #taxonomy contains taxid (int) coded to gi keys (string)

    # STEP 160: get the scientific name for each taxa from DB
    print >> sys.stderr, 'STEP 160: get the scientific name for each taxa from DB'

    print >> sys.stderr, "       : Accessing NCBI for %d taxonomies" % len(
        taxID_to_lookup)
    #V2: getTaxa for GI alignments and the fungiDB alignments
    names = eutilsWrapper.getTaxa(taxID_to_lookup.keys())  #lists of integers
    print >> sys.stderr, "       : Got %s taxonomies from NCBI" % (len(names))

    # STEP 165: write names to file
    print >> sys.stderr, 'STEP 165: write names to file'
    OPP = open(outputPrefix + "NAMES.txt", "w")
    for name in names:
        OPP.write("\n" + str(name) + "\n" + str(names[name]))
    OPP.close()

    # STEP 168: output taxa reports (LCA binning version)
    print >> sys.stderr, 'STEP 168: import ncbi taxonomy database'
    print >> sys.stderr, '       : build taxaNames'
    taxaNames = buildNames()
    print >> sys.stderr, '       : build taxaNodes'
    (taxaNodes, levels) = buildNodes()

    # STEP 170: build allHits from blastInput: mapping query, to list of species ID that are hits

    print >> sys.stderr, 'STEP 170: build allHits from blastInput: mapping query, to list of species ID that are hits'

    currQuery = ""
    hitList = []
    first = True
    taxaCount = {}
    speciesCount = {}
    genusCount = {}
    familyCount = {}

    missing = open(outputPrefix + "unresolved_acc.txt", 'w')
    uniqLog = open(outputPrefix + "uniqueAlignments.txt", 'w')

    #temporary list to store values looked up via eUtils
    lookups = {}

    for line in open(blastInput, "r"):
        #loop control
        splits = line.split()

        if splits[0] != currQuery:
            #process stored output
            if first:
                first = False
                currQuery = splits[0]
            else:
                #process output stored in hitList
                processHitList(taxaCount, speciesCount, genusCount,
                               familyCount, hitList, currQuery, uniqLog)

                #reset loop variables
                hitList = []
                currQuery = splits[0]

        #analysis of line - do every time, even if find end
        k = splits[1]
        acc = parseAccession(k, fungiLookup)

        if acc == "0" and k in fungiLookup:  #gi not found, cuz new fungiDB reference sequence name
            (taxaID, speciesID, genusID,
             familyID) = lookupFungi(k, fungiLookup, names)
        elif acc in taxonomy:  #2015-07-24
            taxID = taxonomy[acc]
            #taxaID = taxID
            if taxID in names:
                fullTax = names[taxID]
                taxaID = taxID
                (speciesID, genusID, familyID) = fullTax2IDS(fullTax)
            elif taxID in lookups.keys():
                #new condition: if I looked up and got a merged entry, I saved it and this looks it up
                (taxID, fullTax) = lookups[taxID]
                (speciesID, genusID, familyID) = fullTax2IDS(fullTax)
                taxaID = taxID
            else:
                # attempt to lookup missing values
                print >> sys.stderr, "taxID %d was not found in the NCBI lookup; it is being looked up again, possible merged record" % taxID
                #print "taxID %d was not found in the NCBI lookup; it is being looked up again, possible merged record" % taxID
                single = eutilsWrapper.getTaxa([taxID])
                if len(single) == 1:
                    newTaxaID = single.keys()[0]
                    lookups[taxID] = (newTaxaID, single[newTaxaID])
                    print >> sys.stderr, "taxID %d was found in the NCBI lookup as taxID %d" % (
                        taxID, int(newTaxaID))
                    #print "taxID %d was found in the NCBI lookup as taxID %d" % (taxID, int(newTaxaID))
                    fullTax = single[newTaxaID]
                    (speciesID, genusID, familyID) = fullTax2IDS(fullTax)
                else:
                    print >> sys.stderr, "taxID %d was STILL not found in the NCBI lookup; it is being ignored" % taxID
                    missing.write("taxaID\t" + str(taxID) + "\n")
                    taxaID = -1
                    speciesID = -1
                    genusID = -1
                    familyID = -1
        else:
            missing.write("not in taxonomy DB\t" + line)
            taxaID = -1
            speciesID = -1
            genusID = -1
            familyID = -1
        #end if gi == 0:
        hitList.append((taxaID, speciesID, genusID, familyID))
    #end for
    missing.close()

    # STEP 180: output taxa reports (LCA binning version)
    print >> sys.stderr, 'STEP 180: output taxa reports (LCA binning version)'
    print >> sys.stderr, '       : writing reports'

    # 2016-12-08 need to save Kingdom information and pass forward
    savedKingdom = dict()

    filename = outputPrefix + "firstTaxon.IMSA+A_4count.txt"
    outFile = open(filename, "w")
    outFile.write("%s\t%s\t%s\n" % (
        "Taxa ID", "Scientific Name",
        "Clade Level\tTotal\tUnique clade hits\tPartial clade hits\tPartial clade sum"
    ))
    KEYS = taxaCount.keys()
    KEYS.sort()
    numFirstTaxon = 0
    for taxID in KEYS:
        kingdom = 1
        if taxID == -1 or taxID == "-1":
            sname = -1
            count = taxaCount[taxID]
        elif taxID in taxaNames:  #if found in NCBI local database at systemSettings.PATH
            sname = taxaNames[taxID]
            count = taxaCount[taxID]
            kingdom = findKingdom(taxID, taxaNodes)
        else:
            # debug 2016-01-15 added step to prevent unknown/merged record to taxa report
            print "*&*&* Calling eUtils with taxID: ", taxID
            single = eutilsWrapper.getTaxa([taxID])
            print "*&*&* result of eUtils : ", single
            print "*&*&* result of eUtils single[0] : ", single[single.keys()
                                                                [0]]
            if len(single) == 1:
                count = taxaCount[taxID]
                try:
                    ID = int(single[single.keys()[0]].getSpecies().taxId)
                except:
                    print "*&*& could not look up taxaID ", taxID
                    ID = int(taxID)
                taxID = ID
                #print "*&*&* fullTax is ", single[ single.keys()[0] ]
                print "*&*&* taxID is ", taxID
                #sname = taxaNames[ taxID ]     #bug 12/06/2016
                sname = single[single.keys()[0]].getSpecies().sname
                print "*&*&* sname is ", sname
                kingdom = findKingdom(
                    single[single.keys()[0]].getPhylum().taxId, taxaNodes)
                if single[single.keys()[0]].getSpecies():
                    savedKingdom[single[single.keys()
                                        [0]].getSpecies().taxId] = kingdom
                if single[single.keys()[0]].getGenus():
                    savedKingdom[single[single.keys()
                                        [0]].getGenus().taxId] = kingdom
                if single[single.keys()[0]].getFamily():
                    savedKingdom[single[single.keys()
                                        [0]].getFamily().taxId] = kingdom
            else:
                sname = -1
                count = taxaCount[taxID]

        if taxID in taxaNodes:
            level = taxaNodes[taxID][1]
        else:
            level = "unknown"
        #myLine = "%s\t%s\t%s\t%s\n" % (str(taxID), sname, level, catV(count))
        myLine = "%s\t%s\t%s\t%s\t%s\n" % (str(taxID), sname, count[1],
                                           taxaNames[kingdom], catV(count))
        if int(count[1]) > 0:
            numFirstTaxon += 1
        outFile.write(myLine)
    outFile.close()

    filename = outputPrefix + "species.IMSA+A_4count.txt"
    outFile = open(filename, "w")
    filename2 = outputPrefix + "species.IMSA_count.txt"
    outFile2 = open(filename2, "w")
    KEYS = speciesCount.keys()
    KEYS.sort()
    fillOutCountFiles(KEYS, speciesCount, taxaNames, taxaNodes, outFile,
                      outFile2, "Species", savedKingdom)
    outFile.close()
    outFile2.close()

    filename = outputPrefix + "genus.IMSA+A_4count.txt"
    outFile = open(filename, "w")
    filename2 = outputPrefix + "genus.IMSA_count.txt"
    outFile2 = open(filename2, "w")
    KEYS = genusCount.keys()
    KEYS.sort()
    fillOutCountFiles(KEYS, genusCount, taxaNames, taxaNodes, outFile,
                      outFile2, "Genus", savedKingdom)
    outFile.close()
    outFile2.close()

    filename = outputPrefix + "family.IMSA+A_4count.txt"
    outFile = open(filename, "w")
    filename2 = outputPrefix + "family.IMSA_count.txt"
    outFile2 = open(filename2, "w")

    KEYS = familyCount.keys()
    KEYS.sort()

    fillOutCountFiles(KEYS, familyCount, taxaNames, taxaNodes, outFile,
                      outFile2, "Family", savedKingdom)

    outFile.close()
    outFile2.close()

    print >> sys.stderr, 'STEP 190: output human readable report'
    # We can read the files we have written, sort, and spit back out with proper justification
    # make a function ?
    # we can check file size etc etc and look for errors
    numberTaxa = dict()
    outBig = open(outputPrefix + ".IMSA+A.HUMAN_READABLE_REPORT.txt", "w")

    numSpecies = 0
    numGenus = 0
    numFamily = 0
    for key in speciesCount.keys():
        if int(speciesCount[key][1]) > 0:
            numSpecies += 1

    for key in genusCount.keys():
        if int(genusCount[key][1]) > 0:
            numGenus += 1

    for key in familyCount.keys():
        if int(familyCount[key][1]) > 0:
            numFamily += 1

    outBig.write("IMSA+A metataxonomics report\n\nSUMMARY\n")
    outBig.write("{:10}".format((str(numFirstTaxon))) +
                 "Total unique lowest taxa identified\n")
    outBig.write("{:10}".format(str(numSpecies)) +
                 "Total unique Species identified\n")
    outBig.write("{:10}".format(str(numGenus)) +
                 "Total unique Genera identified\n")
    outBig.write("{:10}".format(str(numFamily)) +
                 "Total unique Families identified\n")
    outBig.write("\nCONTENT\n")
    outBig.write(
        "Section A - List of identified lowest taxon (for Virus detection)\n")
    outBig.write("Section B - List of identified species\n")
    outBig.write("Section C - List of identified genera (recommended)\n")
    outBig.write("Section D - List of identified families\n")
    outBig.write(
        "Section E - Errors from analysis requiring human intervention\n")
    outBig.write(
        "Section F - List of output files and their brief description\n")

    outBig.write("\nSection A\n\n")
    outBig.write(
        "Taxon at the lowest clade associated with the reference sequence.\n" +
        "This report is most useful for detecting viruses, which can be omitted in other reports.\n"
    )
    numberTaxa["firstTaxon"] = processOutputFileHumanReadable(
        outBig, outputPrefix + "firstTaxon.IMSA+A_4count.txt")

    outBig.write("\nSection B\n\n")
    outBig.write("List of identified species.\n\n")
    numberTaxa["species"] = processOutputFileHumanReadable(
        outBig, outputPrefix + "species.IMSA+A_4count.txt")

    outBig.write("\nSection C\n\n")
    outBig.write("List of identified genera.\n\n")
    numberTaxa["genus"] = processOutputFileHumanReadable(
        outBig, outputPrefix + "genus.IMSA+A_4count.txt")

    outBig.write("\nSection D\n\n")
    outBig.write(
        "List of identified families.\nThis report can be useful when the sequenced organism(s) in the sample is not in the reference database.\n"
    )
    outBig.write(
        "Reviewing results at higher clade levels may prevent the identification of many closely related organisms.\n\n"
    )
    numberTaxa["family"] = processOutputFileHumanReadable(
        outBig, outputPrefix + "family.IMSA+A_4count.txt")

    # outBig.write("\n\nNumber of taxa found by clade\n")
    # clades = ["firstTaxon","species","genus","family"]
    #
    # for key in clades:
    #     outBig.write("{:15}".format(numberTaxa[key])+ " " + key + "\n")

    outBig.write(
        "\nSection E\n\nErrors from analysis requiring human intervention\n\n")
    f = open(outputPrefix + "unidentifiedTaxaAlignments.txt")
    data = f.read()
    f.close()
    outBig.write("Sequence alignment conversion to taxa:\n")
    if len(data) != 0:
        outBig.write(
            "WARNING!\n" +
            "There were reference sequences which could not be converted to taxa.\n\n"
            + outputPrefix + "unidentifiedTaxaAlignments.txt" +
            " file contains blast alignments to reference sequence names.\n\n"
            +
            "These names must be manually looked up, input into appropriate files, and then the program re-run.\n"
            +
            "Please refer to documentation for detailed instructions to resolve this error, section 'Additional Output file'.\n\n"
        )
    else:
        outBig.write("SUCCESSFUL\n\n")

    f = open(outputPrefix + "unresolved_acc.txt")
    data = f.read()
    f.close()
    outBig.write("Reference sequence names conversion to taxa:\n")
    if len(data) != 0:
        outBig.write(
            "WARNING!\n" +
            "There were reference sequence GI numbers which could not be converted to taxa.\n\n"
            + outputPrefix + "unresolved_acc.txt" +
            " contains a list of Accession numbers.\n\n" +
            "These need to be manually looked up, input, and then process re-run\n"
            +
            "Please refer to documentation for detailed instructions to resolve this error, section 'Additional Output file'.\n"
        )
    else:
        outBig.write("SUCCESSFUL\n")

    outBig.write("\nSection F\n\n")
    outBig.write("\nList of output files and their brief description:\n")
    outBig.write("\t" + outputPrefix +
                 "species.IMSA_count.txt - Original IMSA report" + "\n")
    outBig.write("\t" + outputPrefix +
                 "genus.IMSA_count.txt   - Original IMSA report" + "\n")
    outBig.write("\t" + outputPrefix +
                 "family.IMSA_count.txt  - Original IMSA report" + "\n")
    outBig.write(
        "\t" + outputPrefix +
        "firstTaxon.IMSA+A_4count.txt - IMSA+A detailed counts; use for further analysis"
        + "\n")
    outBig.write(
        "\t" + outputPrefix +
        "species.IMSA+A_4count.txt    - IMSA+A detailed counts; use for further analysis"
        + "\n")
    outBig.write(
        "\t" + outputPrefix +
        "genus.IMSA+A_4count.txt      - IMSA+A detailed counts; use for further analysis"
        + "\n")
    outBig.write(
        "\t" + outputPrefix +
        "family.IMSA+A_4count.txt     - IMSA+A detailed counts; use for further analysis"
        + "\n")
    outBig.write(
        "\t" + outputPrefix +
        "NAMES.txt         - list of taxonomies with names looked up via NCBI"
        + "\n")
    outBig.write(
        "\t" + outputPrefix +
        "ACCS.txt          - list of Accession numbers (or other sequence names) found"
        + "\n")
    outBig.write(
        "\t" + outputPrefix +
        "uniqueAlignments.txt           - query names resulting in unique hits across all clades and corresponding Taxon ID"
        + "\n")
    outBig.write(
        "\t" + outputPrefix +
        "unidentifiedTaxaAlignments.txt - serious errors where reference sequence could not be converted to a taxon ID recorded here"
        + "\n")
    outBig.write(
        "\t" + outputPrefix +
        "unresolved_acc.txt             - serious errors where reference sequence could not be converted to a taxon ID recorded here"
        + "\n")
    outBig.write(
        "\t" + outputPrefix[:-4] +
        "taxonomy.pickle       - intermediate python binary file.  Delete if your blast alignments change."
        + "\n")
    outBig.write(
        "\t" + outputPrefix[:-4] +
        "bestHits.bln          - intermediate blast alignments, which are used for final counts.  Delete if your blast alignments change."
        + "\n")
    outBig.write(
        "\t" + outputPrefix[:-4] +
        "bestHits.bln.pickle   - intermediate python binary file.  Delete if your blast alignments change."
        + "\n")

    outBig.close()
# Example #4
# 0
# old way: filter just based on the score at the end of the line
# new way: look up the tax id using the gi then sum across tax id before filtering

# Collect the distinct target ids hit in the best-hit file.  Column two of
# each line is split on "|" and its second field is kept (as a string); the
# dict is used as a set.
gis = {}
for line in open(bestHitName):
    target = line.split()[1].split("|")[1]
    gis[target] = 1

# Map each collected gi to its integer tax id by scanning the local
# two-column (gi, taxid) file named by config.BLAST_TAX_DB.
taxonomy = {}
print "Getting taxonomy ids from local file for %s gis" % (len(gis))
for line in open(config.BLAST_TAX_DB):
    (gi, taxid) = line.split()
    if gis.has_key(gi):
        taxonomy[gi] = int(taxid)

# Fetch the taxonomy records for every tax id that was found.
# NOTE(review): presumably returns a mapping keyed by tax id -- confirm
# against eutilsWrapper.getTaxa.
fullTax = eutilsWrapper.getTaxa(taxonomy.values())

# Second pass over the best-hit file: lines are accumulated per query
# (column one) in currentResults while the query name stays the same.
print "filtering blast results based on tax id"
countUsed = 0
countSkipped = 0
out = open(outputName, "w")
currentQuery = None
currentResults = []
notFoundTax = {}
countFoundTax = 0
for line in open(bestHitName):
    query = line.split()[0]

    if currentQuery == query:
        currentResults.append(line)
    else:
# Example #5
# 0
def reportTaxonomyJWC(blastInput, printTargets=False, outputPrefix=None, dotPrefix=None, dotLimit=1.0):
    """Given a blast input file, reports the taxonomy lineage for the hits.  By default the report
    will be printed to standard output, but if an outputPrefix is given then output files, one each
    for species, genus, family and division, will be created.
    The Blast input file should have already been run through a Best Blast filter if you want to only
    count unique hits.  Can have the extra counts column added, as per blastUtils.keepAllBestHits."""

    '''
    fungiLookup - dictionary mapping
        seqID (string) - the reference sequence name used for looking up blast hits
        to
        speciesID (string) - taxon ID number as text
    gis - dictionary mapping                (used as non-repeating list)
        gid (string) - the gid, the reference number used for looking up blast hits (as text)
        to 1
    taxID_to_lookup - dictionary mapping    (used as non-repeating list)
        taxID (int) - taxon ID to lookup
        to 1
    taxonomy - dictionary mapping
        gid (string)
        to
        taxID (int)
    names
        taxID (int)
        to
        eutilsWrapper.Taxonomy object
    4tuple
        (float, int, int, float) representing (IMSA count, unique count, partial count, partial sum)

    taxaCount = {}
    speciesCount = {}
    genusCount = {}
    familyCount = {}
        map
        taxID (int)
        to
        4tuple

    lookups -- dictionary to store eUtil lookup results
        taxID (int)
        to
          _variable_               _class_
        (newTaxaID,             (int)
        single[newTaxaID])      eutilsWrapper.Taxonomy
    '''

    # STEP 110: build fungiLookup
    print >> sys.stderr, 'STEP 110: build fungiLookup'
    fungiLookup = {}  # lookup table from file
    print >> sys.stderr, '        : using lookup file ', fungiLookupFile
    for line in open(fungiLookupFile, 'r'):
        splits = line.split('\t')
        key = splits[0]
        speciesID = int(splits[1])
        fungiLookup[key] = speciesID

    # STEP 120:
    print >> sys.stderr, 'STEP 120: extract gi targets, SID_to_lookup from file'

    #contains integer taxID to lookup in NCBI -- will contain FungiDB species and regular database taxID
    taxID_to_lookup = {}

    # dict containing gi's to lookup, requiring NCBI lookup
    gis = {}

    extractLookupInfo(gis, taxID_to_lookup, fungiLookup, blastInput)
        # gis is indexed by string numbers
        # taxID_to_lookup is indexed by integer numbers

    # STEP 130: write gis table to file
    print >> sys.stderr, 'STEP 130: write gis table to file'
    outGis = open(outputPrefix + "GIS.txt", "w")
    for key in gis:
        outGis.write("%s\n" % (key))
    outGis.close()

    # STEP 140: build taxonomy dictionary from gis -- parse local file
    print >> sys.stderr, 'STEP 140: build taxonomy dictionary from gis -- parse local file     LONG STEP'
    print >> sys.stderr, '       : checking for pickle'

    import cPickle as pickle
    if not os.path.exists(blastInput+".taxonomy.pickle"):
        print >> sys.stderr, '       : no pickle to load'
        taxonomy = {}
        print >> sys.stderr, "       : Getting taxonomy ids from local file for %s gis" % (len(gis))

        #short circuit if no need to parse file
        if len(gis)>0:
            print >> sys.stderr, '       :       : searching %s' % BLAST_TAX_DB
            for line in open(BLAST_TAX_DB):
                (gi, taxid) = line.split()
                if gis.has_key(gi):
                    taxonomy[gi] = int(taxid)
            print >> sys.stderr, '       :       : searching %s' % EXTRA_BLAST_TAX_DB
            for line in open(EXTRA_BLAST_TAX_DB):
                (gi, taxid) = line.split()
                if gis.has_key(gi):
                    taxonomy[gi] = int(taxid)

        print >> sys.stderr, '       : creating pickle on disk'
        pickle.dump( taxonomy, open( blastInput+".taxonomy.pickle", "wb" ) )
    else:
        print >> sys.stderr, '       : loading pickle'
        taxonomy = pickle.load(  open( blastInput+".taxonomy.pickle", "rb" )  )

    print >> sys.stderr, "       : Done getting taxonomy ids, got %s ids" % (len(taxonomy))


    # STEP 145: write Alignments to gi nubmers, which were not found
    print >> sys.stderr, 'STEP 145: write Alignments to gi numbers, which were not found'
    findingUnknownAlignments( taxonomy, blastInput, outputPrefix + "unidentifiedTaxaAlignments.txt")


    # STEP 150: build taxID_to_lookup from taxonomy
    # could be combined in step 140, but 1 pickle is enough
    print >> sys.stderr, 'STEP 150: increase taxID_to_lookup from taxonomy list'
    for k in taxonomy:
        taxID_to_lookup[ taxonomy[k] ] = 1  # writing integers as keys

    #gis contains gi keys as strings
    #taxonomy contains taxid (int) coded to gi keys (string)

    # STEP 160: get the scientific name for each taxa from DB
    print >> sys.stderr, 'STEP 160: get the scientific name for each taxa from DB'

    print >> sys.stderr, "       : Accessing NCBI for %d taxonomies" % len(taxID_to_lookup)
    #V2: getTaxa for GI alignments and the fungiDB alignments
    names = eutilsWrapper.getTaxa( taxID_to_lookup.keys() )   #lists of integers
    print >> sys.stderr, "       : Got %s taxonomies from NCBI" % (len(names))

    # STEP 165: write names to file
    print >> sys.stderr, 'STEP 165: write names to file'
    OPP = open(outputPrefix + "NAMES.txt", "w")
    for name in names:
        OPP.write("\n" + str(name) + "\n" + str(names[name]))
    OPP.close()


    # STEP 168: output taxa reports (LCA binning version)
    print >> sys.stderr, 'STEP 168: import ncbi gi database'
    print >> sys.stderr, '       : build taxaNames'
    taxaNames = buildNames()
    print >> sys.stderr, '       : build taxaNodes'
    (taxaNodes, levels) = buildNodes()


    # STEP 170: build allHits from blastInput: mapping query, to list of species ID that are hits

    print >> sys.stderr, 'STEP 170: build allHits from blastInput: mapping query, to list of species ID that are hits'

    currQuery = ""
    hitList = []
    first = True
    taxaCount = {}
    speciesCount = {}
    genusCount = {}
    familyCount = {}

    missing = open(outputPrefix + "unresolved_gi.txt",'w')
    uniqLog = open(outputPrefix + "uniqueAlignments.txt",'w')

    #temporary list to store values looked up via eUtils
    lookups = {}

    for line in open(blastInput, "r"):
        #loop control
        splits = line.split()

        if splits[0] != currQuery:
            #process stored output
            if first:
                first = False
                currQuery = splits[0]
            else:
                #process output stored in hitList
                processHitList(taxaCount, speciesCount, genusCount, familyCount, hitList, currQuery, uniqLog)

                #reset loop variables
                hitList = []
                currQuery = splits[0]

        #analysis of line - do every time, even if find end
        gi = 0
        j = 0
        k = splits[1]
        for i in k.split("|"):
            if i == "gi":
                gi = k.split("|")[j + 1]
                break
            j += 1
        #taxaID=-1
        if gi == 0 and k in fungiLookup:  #gi not found, cuz new fungiDB reference sequence name
            (taxaID, speciesID, genusID, familyID) = lookupFungi( k, fungiLookup, names)
        elif gi in taxonomy:  #2015-07-24
            taxID = taxonomy[gi]
            #taxaID = taxID
            if taxID in names:
                fullTax = names[taxID]
                taxaID = taxID
                (speciesID, genusID, familyID) = fullTax2IDS( fullTax )
            elif taxID in lookups.keys():
                #new condition: if I looked up and got a merged entry, I saved it and this looks it up
                (taxID, fullTax) = lookups[taxID]
                (speciesID, genusID, familyID) = fullTax2IDS( fullTax )
                taxaID = taxID
            else:
                # attempt to lookup missing values
                print >> sys.stderr, "taxID %d was not found in the NCBI lookup; it is being looked up again, possible merged record" % taxID
                #print "taxID %d was not found in the NCBI lookup; it is being looked up again, possible merged record" % taxID
                single =  eutilsWrapper.getTaxa( [taxID] )
                if len(single)==1:
                    newTaxaID = single.keys()[0]
                    lookups[taxID] = (newTaxaID, single[newTaxaID])
                    print >> sys.stderr, "taxID %d was found in the NCBI lookup as taxID %d" % (taxID, int(newTaxaID))
                    #print "taxID %d was found in the NCBI lookup as taxID %d" % (taxID, int(newTaxaID))
                    fullTax = single[newTaxaID]
                    (speciesID, genusID, familyID) = fullTax2IDS( fullTax )
                else:
                    print >> sys.stderr, "taxID %d was STILL not found in the NCBI lookup; it is being ignored" % taxID
                    missing.write("taxaID\t"+str(taxID)+"\n")
                    taxaID = -1
                    speciesID = -1
                    genusID = -1
                    familyID = -1
        else:
            missing.write("not in taxonomy DB\t"+line)
            taxaID = -1
            speciesID = -1
            genusID = -1
            familyID = -1
        #end if gi == 0:
        hitList.append( (taxaID, speciesID, genusID, familyID) )
    #end for
    processHitList(taxaCount, speciesCount, genusCount, familyCount, hitList, currQuery, uniqLog)
    missing.close()

    # STEP 180: output taxa reports (LCA binning version)
    print >> sys.stderr, 'STEP 180: output taxa reports (LCA binning version)'
    print >> sys.stderr, '       : writing reports'

    # 2016-12-08 need to save Kingdom information and pass forward
    savedKingdom = dict()

    filename = outputPrefix + "firstTaxon.IMSA+A_4count.txt"
    outFile = open(filename, "w")
    outFile.write( "%s\t%s\t%s\t%s\t%s\n" %
                ("Taxa ID", "Scientific Name", "IMSA+A count", "Kingdom","Total\tUnique clade hits\tPartial clade hits\tPartial clade sum"))
    KEYS=taxaCount.keys()
    KEYS.sort()
    numFirstTaxon = 0
    for taxID in KEYS:
        kingdom = 1
        if taxID == -1 or taxID == "-1":
            sname = -1
            count = taxaCount[taxID]
        elif taxID in taxaNames:    #if found in NCBI local database at systemSettings.PATH
            sname = taxaNames[taxID]
            count = taxaCount[taxID]
            kingdom = findKingdom(taxID,taxaNodes)
        else:
            # debug 2016-01-15 added step to prevent unknown/merged record to taxa report
            print "*&*&* Calling eUtils with taxID: ", taxID
            single = eutilsWrapper.getTaxa( [taxID] )
            print "*&*&* result of eUtils : ", single
            print "*&*&* result of eUtils single[0] : ", single[ single.keys()[0] ]
            if len(single)==1:
                count = taxaCount[taxID]
                try:
                    ID = int( single[ single.keys()[0] ].getSpecies().taxId )
                except:
                    print "*&*& could not look up taxaID ", taxID
                    ID = int(taxID)
                taxID = ID
                #print "*&*&* fullTax is ", single[ single.keys()[0] ]
                print "*&*&* taxID is ", taxID
                #sname = taxaNames[ taxID ]     #bug 12/06/2016
                sname = single[ single.keys()[0] ].getSpecies().sname
                print "*&*&* sname is ", sname
                if len(single.keys()) > 0:
                    if single[ single.keys()[0] ].getPhylum():
                        kingdom = findKingdom(single[ single.keys()[0] ].getPhylum().taxId, taxaNodes)
                    if single[single.keys()[0]].getSpecies():
                        savedKingdom[single[ single.keys()[0]].getSpecies().taxId ] = kingdom
                    if single[single.keys()[0]].getGenus():
                        savedKingdom[single[single.keys()[0]].getGenus().taxId] = kingdom
                    if single[single.keys()[0]].getFamily():
                        savedKingdom[single[single.keys()[0]].getFamily().taxId] = kingdom
            else:
                sname = -1
                count = taxaCount[taxID]

        if taxID in taxaNodes:
            level = taxaNodes[taxID][1]
        else:
            level = "unknown"
        #myLine = "%s\t%s\t%s\t%s\n" % (str(taxID), sname, level, catV(count))
        myLine = "%s\t%s\t%s\t%s\t%s\n" % (str(taxID), sname, count[1], taxaNames[kingdom], catV(count))
        if int(count[1]) > 0:
            numFirstTaxon += 1
        outFile.write(myLine)
    outFile.close()

    filename = outputPrefix + "species.IMSA+A_4count.txt"
    outFile = open(filename, "w")
    filename2 = outputPrefix + "species.IMSA_count.txt"
    outFile2 = open(filename2, "w")
    KEYS=speciesCount.keys()
    KEYS.sort()
    fillOutCountFiles(KEYS, speciesCount, taxaNames, taxaNodes, outFile, outFile2, "Species", savedKingdom)
    outFile.close()
    outFile2.close()


    filename = outputPrefix + "genus.IMSA+A_4count.txt"
    outFile = open(filename, "w")
    filename2 = outputPrefix + "genus.IMSA_count.txt"
    outFile2 = open(filename2, "w")
    KEYS=genusCount.keys()
    KEYS.sort()
    fillOutCountFiles(KEYS, genusCount, taxaNames, taxaNodes, outFile, outFile2, "Genus", savedKingdom)
    outFile.close()
    outFile2.close()


    filename = outputPrefix + "family.IMSA+A_4count.txt"
    outFile = open(filename, "w")
    filename2 = outputPrefix + "family.IMSA_count.txt"
    outFile2 = open(filename2, "w")

    KEYS=familyCount.keys()
    KEYS.sort()

    fillOutCountFiles(KEYS, familyCount, taxaNames, taxaNodes, outFile, outFile2, "Family", savedKingdom)

    outFile.close()
    outFile2.close()


    print >> sys.stderr, 'STEP 190: output human readable report'
    # We can read the files we have written, sort, and spit back out with proper justification
    # make a function ?
    # we can check file size etc etc and look for errors
    numberTaxa = dict()
    outBig = open(outputPrefix + ".IMSA+A.HUMAN_READABLE_REPORT.txt","w")

    numSpecies = 0
    numGenus = 0
    numFamily = 0
    for key in speciesCount.keys():
        if int(speciesCount[key][1]) > 0:
            numSpecies += 1

    for key in genusCount.keys():
        if int(genusCount[key][1]) > 0:
            numGenus += 1

    for key in familyCount.keys():
        if int(familyCount[key][1]) > 0:
            numFamily += 1

    outBig.write("IMSA+A metataxonomics report\n\nSUMMARY\n")
    outBig.write("{:10}".format((str(numFirstTaxon)))+"Total unique lowest taxa identified\n")
    outBig.write("{:10}".format(str(numSpecies)) + "Total unique Species identified\n")
    outBig.write("{:10}".format(str(numGenus)) + "Total unique Genera identified\n")
    outBig.write("{:10}".format(str(numFamily)) + "Total unique Families identified\n")
    outBig.write("\nCONTENT\n")
    outBig.write("Section A - List of identified lowest taxon (for Virus detection)\n")
    outBig.write("Section B - List of identified species\n")
    outBig.write("Section C - List of identified genera (recommended)\n")
    outBig.write("Section D - List of identified families\n")
    outBig.write("Section E - Errors from analysis requiring human intervention\n")
    outBig.write("Section F - List of output files and their brief description\n")

    outBig.write("\nSection A\n\n")
    outBig.write("Taxon at the lowest clade associated with the reference sequence.\n"+
                 "This report is most useful for detecting viruses, which can be omitted in other reports.\n")
    numberTaxa["firstTaxon"] = processOutputFileHumanReadable( outBig, outputPrefix + "firstTaxon.IMSA+A_4count.txt" )

    outBig.write("\nSection B\n\n")
    outBig.write("List of identified species.\n\n")
    numberTaxa["species"] = processOutputFileHumanReadable( outBig, outputPrefix + "species.IMSA+A_4count.txt" )


    outBig.write("\nSection C\n\n")
    outBig.write("List of identified genera.\n\n")
    numberTaxa["genus"] = processOutputFileHumanReadable( outBig, outputPrefix + "genus.IMSA+A_4count.txt" )

    outBig.write("\nSection D\n\n")
    outBig.write("List of identified families.\nThis report can be useful when the sequenced organism(s) in the sample is not in the reference database.\n")
    outBig.write("Reviewing results at higher clade levels may prevent the identification of many closely related organisms.\n\n")
    numberTaxa["family"] = processOutputFileHumanReadable(outBig, outputPrefix + "family.IMSA+A_4count.txt")

    # outBig.write("\n\nNumber of taxa found by clade\n")
    # clades = ["firstTaxon","species","genus","family"]
    #
    # for key in clades:
    #     outBig.write("{:15}".format(numberTaxa[key])+ " " + key + "\n")


    outBig.write("\nSection E\n\nErrors from analysis requiring human intervention\n\n")
    f = open(outputPrefix + "unidentifiedTaxaAlignments.txt")
    data = f.read()
    f.close()
    outBig.write("Sequence alignment conversion to taxa:\n")
    if len(data) != 0:
        outBig.write("WARNING!\n"+
                     "There were reference sequences which could not be converted to taxa.\n\n"+
                     outputPrefix + "unidentifiedTaxaAlignments.txt" + " file contains blast alignments to reference sequence names.\n\n"+
                     "These names must be manually looked up, input into appropriate files, and then the program re-run.\n"+
                     "Please refer to documentation for detailed instructions to resolve this error, section 'Additional Output file'.\n\n")
    else:
        outBig.write("SUCCESSFUL\n\n")

    outBig.write("Reference sequence names conversion to taxa:\n")
    f = open(outputPrefix + "unresolved_gi.txt")
    data = f.read()
    f.close()

    if len(data) != 0:
        outBig.write("WARNING!\n"+
                     "There were reference sequence GI numbers which could not be converted to taxa.\n\n" +
                     outputPrefix + "unresolved_gi.txt"+ " contains a list of GI numbers.\n\n"+
                     "These need to be manually looked up, input, and then process re-run\n"+
                     "Please refer to documentation for detailed instructions to resolve this error, section 'Additional Output file'.\n")
    else:
        outBig.write("SUCCESSFUL\n")


    outBig.write("\nSection F\n\n")
    outBig.write("\nList of output files and their brief description:\n")
    outBig.write("\t" + outputPrefix +          "species.IMSA_count.txt - Original IMSA report" + "\n")
    outBig.write("\t" + outputPrefix +          "genus.IMSA_count.txt   - Original IMSA report" + "\n")
    outBig.write("\t" + outputPrefix +          "family.IMSA_count.txt  - Original IMSA report"+"\n")
    outBig.write("\t" + outputPrefix +          "firstTaxon.IMSA+A_4count.txt - IMSA+A detailed counts; use for further analysis" + "\n")
    outBig.write("\t" + outputPrefix +          "species.IMSA+A_4count.txt    - IMSA+A detailed counts; use for further analysis" + "\n")
    outBig.write("\t" + outputPrefix +          "genus.IMSA+A_4count.txt      - IMSA+A detailed counts; use for further analysis" + "\n")
    outBig.write("\t" + outputPrefix +          "family.IMSA+A_4count.txt     - IMSA+A detailed counts; use for further analysis" + "\n")
    outBig.write("\t" + outputPrefix +          "NAMES.txt         - list of taxonomies with names looked up via NCBI" + "\n")
    outBig.write("\t" + outputPrefix +          "GIS.txt           - list of GI numbers (or other sequence names) found" + "\n")
    outBig.write("\t" + outputPrefix +          "uniqueAlignments.txt           - query names resulting in unique hits across all clades and corresponding Taxon ID" + "\n")
    outBig.write("\t" + outputPrefix +          "unidentifiedTaxaAlignments.txt - serious errors where reference sequence could not be converted to a taxon ID recorded here" + "\n")
    outBig.write("\t" + outputPrefix +          "unresolved_gi.txt              - serious errors where reference sequence could not be converted to a taxon ID recorded here" + "\n")
    outBig.write("\t" + outputPrefix[:-4] + "taxonomy.pickle       - intermediate python binary file.  Delete if your blast alignments change." + "\n")
    outBig.write("\t" + outputPrefix[:-4] + "bestHits.bln          - intermediate blast alignments, which are used for final counts.  Delete if your blast alignments change." + "\n")
    outBig.write("\t" + outputPrefix[:-4] + "bestHits.bln.pickle   - intermediate python binary file.  Delete if your blast alignments change." + "\n")

    outBig.close()
Пример #6
0
# old way: filter just based on the score at the end of the line
# new way: look up the tax id using the gi then sum across tax id before filtering

# Collect every distinct target identifier that appears in the best-hit file.
# The identifier is the second "|"-separated field of the second whitespace
# column (presumably the GI number of a "gi|<number>|..." style name -- verify
# against the blast output format).  gis stays a dict with dummy values so any
# later code that treats it as a dict keeps working.
gis = {}
with open(bestHitName) as bestHitFile:  # 'with' closes the handle when done
    for line in bestHitFile:
        target = line.split()[1].split("|")[1]
        gis[target] = 1

# Map each collected gi to its integer tax id using the local two-column
# (gi, taxid) table; rows whose gi was not collected above are ignored.
taxonomy = {}
print("Getting taxonomy ids from local file for %s gis" % (len(gis)))
with open(config.BLAST_TAX_DB) as taxDbFile:
    for line in taxDbFile:
        (gi, taxid) = line.split()
        # 'in' replaces the deprecated dict.has_key()
        if gi in gis:
            taxonomy[gi] = int(taxid)

# Fetch the taxonomy records for the tax ids found above.
# NOTE(review): taxonomy.values() may contain the same tax id many times;
# confirm whether eutilsWrapper.getTaxa de-duplicates its input.
fullTax = eutilsWrapper.getTaxa(taxonomy.values())
                                    
print "filtering blast results based on tax id"
countUsed = 0
countSkipped = 0
out = open(outputName, "w")
currentQuery = None
currentResults = []
notFoundTax = {}
countFoundTax = 0
for line in open(bestHitName):
    query = line.split()[0]

    if currentQuery == query:
        currentResults.append(line)
    else: