Example #1
 def __init__(self, taxonomy):
     """ A taxonomy wrapper.
         @param taxonomy: file path to the NCBI taxonomy database in the sqlite3 format
         @type taxonomy: str
     """
     self._taxonomy = tax.TaxonomyNcbi(taxonomy, considerNoRank=True)
     self._existsTaxonIdSet = set()
     self._taxonIdToDirectChildrenSet = {}
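A buffered lookup that such a wrapper typically serves might look like the following hypothetical method sketch (not part of the original class); childrenNcbids is assumed to return the direct child taxon ids, as in the commented-out call in Example #5.

 def getDirectChildren(self, taxonId):
     # fill the buffer on the first request, then serve from memory
     if taxonId not in self._taxonIdToDirectChildrenSet:
         self._taxonIdToDirectChildrenSet[taxonId] = set(map(int, self._taxonomy.childrenNcbids(taxonId)))
     return self._taxonIdToDirectChildrenSet[taxonId]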
Example #2
def ppOut2PPSout():
    inFile = '/Users/ivan/Documents/work/binning/data/HumanGut/PP/TS29_scaff.file.0.5.txt'
    outFile = '/Users/ivan/Documents/work/binning/data/HumanGut/PP/TS29_scaff.file.0.5.PPS.txt'
    dbFile = '/Users/ivan/Documents/work/binning/taxonomy/20120828/ncbitax_sqlite.db'  #DB
    taxonomy = taxonomy_ncbi.TaxonomyNcbi(dbFile)

    out = csv.OutFileBuffer(outFile)

    csv.forEachLine(inFile, PP2PPSoutParser(taxonomy, out))

    out.close()
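The PP2PPSoutParser class is not shown above. A minimal sketch of a parser compatible with csv.forEachLine might look like the following, under the assumption that forEachLine calls the parser's parse(line) method once per input line (the actual PP to PPS conversion is omitted):

class PP2PPSoutParser():
    def __init__(self, taxonomy, out):
        self._taxonomy = taxonomy  # taxonomy lookups used during the conversion
        self._out = out

    def parse(self, line):
        # hypothetical: convert one PhyloPythia output line to the PPS format and write it out
        self._out.writeText(line + '\n')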
Example #3
    def __init__(self, databaseFile):
        self._taxonomy = taxonomy_ncbi.TaxonomyNcbi(databaseFile)
        self._rankToId = {}
        self._ncbidToRankId = {}
        self._predAtRankId = {}  # rankId -> ncbid -> ncbid at given rank
        self._noDefAtRankId = {}  # rankId -> set of ncbids for which the ncbid at given rank is not defined
        self._ncbidToNcbidParent = {}  # ncbid -> parent ncbid

        for rankId, rank in enumerate(taxonomy_ncbi.TAXONOMIC_RANKS):
            self._rankToId[rank] = rankId
            self._predAtRankId[rankId] = {}
            self._noDefAtRankId[rankId] = set()
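The buffers above suggest a lookup of an ancestor at a given rank. A hypothetical method sketch, using getRank and getParentNcbid as they appear in Examples #5 and #7:

    def getAncestorAtRank(self, ncbid, rank):
        rankId = self._rankToId[rank]
        if ncbid in self._predAtRankId[rankId]:  # already buffered
            return self._predAtRankId[rankId][ncbid]
        if ncbid in self._noDefAtRankId[rankId]:  # known to be undefined at this rank
            return None
        current = ncbid
        while (current is not None) and (self._taxonomy.getRank(current) != rank):
            current = self._taxonomy.getParentNcbid(current)  # climb towards the root
        if current is None:
            self._noDefAtRankId[rankId].add(ncbid)
        else:
            self._predAtRankId[rankId][ncbid] = current
        return current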
Example #4
 def __init__(self,
              taxonomyFile,
              allowedRanks=[
                  'root', 'superkingdom', 'phylum', 'class', 'order',
                  'family', 'genus', 'species'
              ]):
     """
         Represents the ncbi taxonomy, buffers entries to efficiently compute the path to the root.
         @param taxonomyFile: database in sqlite3 format
         @param allowedRanks:
     """
     self._taxonomy = tax.TaxonomyNcbi(taxonomyFile, allowedRanks)
     self._allowedRanks = allowedRanks
     self._parentDict = {}  # map: taxonId -> parent taxonId
     self._rankDict = {}  # map: taxonId -> rank
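Given these buffers, the path to the root can be computed with buffered parent lookups. A hypothetical sketch:

 def getParent(self, taxonId):
     # buffer the parent taxon id on the first request
     if taxonId not in self._parentDict:
         self._parentDict[taxonId] = self._taxonomy.getParentNcbid(taxonId)
     return self._parentDict[taxonId]

 def getPathToRoot(self, taxonId):
     # collect taxon ids from taxonId up to the root (taxon id 1)
     path = [taxonId]
     current = taxonId
     while current is not None and int(current) != 1:
         current = self.getParent(current)
         if current is not None:
             path.append(current)
     return path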
Example #5
def genomesToMask():
    rank = 'genus'  #which rank will be masked
    fileName = '/Users/ivan/Documents/work/binning/data/simMC/fromJohannes/contigs_genus_ncbids.txt'
    outFile = '/Users/ivan/Documents/work/binning/data/simMC/fromJohannes/genome_genus_masked.txt'
    outFile2 = '/Users/ivan/Documents/work/binning/data/simMC/fromJohannes/genome_ncbids_genus.txt'
    #outFile = '/Users/ivan/Documents/work/binning/data/V35/genome_species_masked.txt' #output file
    #outFile2 = '/Users/ivan/Documents/work/binning/data/V35/genome_ncbids_species.txt' #output file
    #fileName='/Users/ivan/Documents/work/binning/data/V35/genome_ncbids.txt' #list of all genome ncbids
    dbFile = '/Users/ivan/Documents/work/binning/taxonomy/20120828/ncbitax_sqlite.db'  #DB
    out = csv.OutFileBuffer(outFile)
    out2 = csv.OutFileBuffer(outFile2)

    genomeNcbids = csv.getColumnAsList(fileName,
                                       entryModifyFunction=None,
                                       colNum=0,
                                       sep=None,
                                       comment='#')
    taxonomy = taxonomy_ncbi.TaxonomyNcbi(dbFile)

    maskNcbids = []
    #print len(genomeNcbids), genomeNcbids
    for ncbid in genomeNcbids:
        while taxonomy.getRank(ncbid) != rank:
            ncbid = taxonomy.getParentNcbid(ncbid)
            if int(ncbid) == 1:
                print('root reached!')
                break
        maskNcbids.append(int(ncbid))

    #print len(Set(maskNcbids)), maskNcbids

    maskSet = set(maskNcbids)
    for i in maskSet:
        out2.writeText(str(i) + '\n')

    for ncbid in maskSet:
        children = collectChildren(taxonomy, ncbid)
        for i in children:
            out.writeText(str(i) + '\n')
        print('%s %s' % (ncbid, children))

    #print taxonomy.childrenNcbids(818) #997888,818

    out.close()
    out2.close()
    taxonomy.close()
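The collectChildren helper used above is not shown. A hypothetical sketch that gathers the whole subtree below ncbid, assuming childrenNcbids returns the direct children (see the commented-out call above):

def collectChildren(taxonomy, ncbid):
    # iterative depth-first traversal of the subtree rooted at ncbid
    collected = []
    stack = [ncbid]
    while stack:
        children = taxonomy.childrenNcbids(stack.pop())
        for child in (children if children else []):
            collected.append(int(child))
            stack.append(int(child))
    return collected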
Example #6
 def __init__(self, databaseFile):
     """
         Taxonomy wrapper that buffers the results of operations frequently used by this module.
         @param databaseFile: database in the sqlite3 format
     """
     self._taxonomy = taxonomy_ncbi.TaxonomyNcbi(databaseFile)
     # buffers
     self._rankToRankId = {}
     self._rankIdToRank = {}
     self._taxonIdToParentTaxonId = {}
     self._taxonIdToRankId = {}
     self._taxonIdToScientificName = {}
     # map: rank <-> rankId
     for rankId, rank in enumerate(taxonomy_ncbi.TAXONOMIC_RANKS):
         self._rankToRankId[rank] = rankId
         self._rankIdToRank[rankId] = rank
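The remaining buffers would back lookups such as the following hypothetical method sketches (getParentNcbid and getScientificName appear elsewhere in these examples):

 def getParent(self, taxonId):
     if taxonId not in self._taxonIdToParentTaxonId:  # fill the buffer on first use
         self._taxonIdToParentTaxonId[taxonId] = self._taxonomy.getParentNcbid(taxonId)
     return self._taxonIdToParentTaxonId[taxonId]

 def getScientificName(self, taxonId):
     if taxonId not in self._taxonIdToScientificName:
         self._taxonIdToScientificName[taxonId] = self._taxonomy.getScientificName(taxonId)
     return self._taxonIdToScientificName[taxonId]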
Example #7
def refToClades(refDir, taxonomyFile, rank='species', outFile=None):
    """
        Returns (and optionally stores) a list of all clades at the given rank, sorted by the abundance
        of the individual clades, where abundance is measured as the size of the available reference data.

        @param refDir: directory containing reference data (as needed for PPS)
        @param taxonomyFile: ncbi taxonomy in the sqlite3 format
        @param rank: consider clades at this rank
        @param outFile: tab separated output file; first column taxon id, second column number of bp (can be None)
        @return: list of tuples (clade, bp)
    """
    taxonomy = taxonomy_ncbi.TaxonomyNcbi(taxonomyFile)
    cladeNcbiToBp = {}
    for fileName in os.listdir(refDir):
        size = os.path.getsize(os.path.join(refDir, fileName))
        ncbid = int(fileName.rsplit('.', 2)[0])
        current = ncbid
        while (current is not None) and (taxonomy.getRank(current) != rank):
            current = taxonomy.getParentNcbid(int(current))
        if current is not None:
            if current in cladeNcbiToBp:
                cladeNcbiToBp[current] += size
            else:
                cladeNcbiToBp[current] = size
        else:
            print(
                'There is no ncbi taxon id defined at rank %s for ncbi taxon id %s'
                % (rank, ncbid))
    taxonomy.close()

    tuples = list(cladeNcbiToBp.iteritems())
    tuples.sort(key=lambda x: x[1], reverse=True)

    if outFile is not None:
        out = csv.OutFileBuffer(outFile)
        for t in tuples:
            out.writeText(str(t[0]) + '\t' + str(t[1]) + '\n')
        out.close()

    return tuples
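A hypothetical usage sketch of refToClades (the paths are placeholders):

def printTopClades(refDir, taxonomyFile, n=10):
    # print the n clades with the most reference data at the species rank
    for ncbid, bp in refToClades(refDir, taxonomyFile, rank='species')[:n]:
        print('%s\t%s bp' % (ncbid, bp))

printTopClades('/path/to/refDir', '/path/to/ncbitax_sqlite.db')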
Example #8
def getFirstLabelAtAllowedRank():
    rank = 'species'  # !!!!!!!

    predFile1 = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000Labels.txt'
    predFile2 = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000LabelsSpecies.txt'
    seqIdToLabel = csv.getMapping(predFile1, 0, 1, sep='\t', comment='#')
    outPred = csv.OutFileBuffer(predFile2)

    taxonomy = tax.TaxonomyNcbi(
        '/net/metagenomics/projects/PPSmg/data/nobackup/NCBI20120828/ncbiTax/ncbitax_sqlite.db'
    )

    for seqId in seqIdToLabel:
        ncbid = int(seqIdToLabel[seqId][0])
        while not taxonomy.isRankNcbidAllowed(ncbid):
            ncbid = taxonomy.getParentNcbid(ncbid)
        outPred.writeText(seqId + '\t' + str(ncbid) + '\n')

    taxonomy.close()
    outPred.close()
Example #9
    def __init__(self, refDir, databaseFilePath):
        """
            Provides information about NCBI reference sequences (genomes or draft genomes).

            @param refDir: directory that contains reference sequences,
                each file has format ncbi_taxon_id.[0-9]+.fna(fas), for instance 382638.1.fna or 2110.1.fas
            @param databaseFilePath: ncbi taxonomy file in sqlite3 format
        """
        assert os.path.isdir(refDir)
        assert os.path.isfile(databaseFilePath)
        self._taxonIdSet = set()  # taxonIds in the reference
        self._taxonIdToSize = {}  # taxonId -> cumulative file size
        for fileName in os.listdir(refDir):
            if fileName.endswith(('.fna', '.fas')):
                taxonId = int(fileName[0:fileName.index('.')])
                self._taxonIdSet.add(taxonId)
                fileSize = int(os.path.getsize(os.path.join(refDir, fileName)))
                if taxonId in self._taxonIdToSize:
                    self._taxonIdToSize[taxonId] += fileSize
                else:
                    self._taxonIdToSize[taxonId] = fileSize
        self._taxonomy = taxonomy_ncbi.TaxonomyNcbi(databaseFilePath, considerNoRank=True)
        self._childrenBuffer = {}  # taxonId -> set of children taxon Ids
        self._rankBuffer = {}  # taxonId -> rank
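The last two buffers would back lookups like these hypothetical method sketches:

    def getRank(self, taxonId):
        if taxonId not in self._rankBuffer:  # fill the buffer on first use
            self._rankBuffer[taxonId] = self._taxonomy.getRank(taxonId)
        return self._rankBuffer[taxonId]

    def getChildren(self, taxonId):
        if taxonId not in self._childrenBuffer:
            self._childrenBuffer[taxonId] = set(map(int, self._taxonomy.childrenNcbids(taxonId)))
        return self._childrenBuffer[taxonId]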
Example #10
def computeTrainingAccuracy(workingDir, taWorkingDir, sampleSpecificDir,
                            ppsTrainDataDir, outputDir, ppsInstallDir,
                            ppsScripts, ppsConfigFilePath, predictLogFileName,
                            modelTaxonIdFilePath, databaseFile):
    """
        Computes the training accuracy for the PPS training data.
        This function doesn't consider training data used to train intermediate (misc?) nodes!
        The training data that correspond to the sample-specific data are fragmented (via PPS) and
        included in the training data of the different fragment lengths.

        @param workingDir: working directory of the PPS+ pipeline
        @param taWorkingDir: working directory for the accuracy computation
        @param sampleSpecificDir: directory containing the sample specific data
        @param ppsTrainDataDir: directory 'sampled_fasta' containing PPS training data
        @param outputDir: directory for output files
        @param ppsScripts: directory containing PPS scripts
        @param ppsConfigFilePath: the PPS configuration file
        @param ppsInstallDir: directory where PPS is installed
        @param predictLogFileName: logging file for PPS prediction
        @param modelTaxonIdFilePath: file containing all leaf ncbi taxon ids that are modelled
        @param databaseFile: ncbi taxonomy file in the sqlite3 format
    """
    for d in [
            workingDir, taWorkingDir, sampleSpecificDir, ppsTrainDataDir,
            outputDir, ppsInstallDir, ppsScripts,
            os.path.dirname(predictLogFileName)
    ]:
        assert os.path.isdir(d), "Directory '%s' doesn't exist!" % d
    for f in [ppsConfigFilePath, databaseFile, modelTaxonIdFilePath]:
        assert os.path.isfile(f), "File '%s' doesn't exist!" % f

    # all directories that contain PPS training data
    trainDirList = [sampleSpecificDir]
    for d in os.listdir(ppsTrainDataDir):
        trainDirList.append(os.path.join(ppsTrainDataDir, d))

    # fasta file with all training sequences
    allTrainFastaFile = os.path.join(taWorkingDir, 'all_train_data.fna')
    out = csv.OutFileBuffer(allTrainFastaFile)
    seqIdToTruePred = {}

    # merge all training fasta files to one fasta file
    for d in trainDirList:
        dName = os.path.basename(d)
        for f in os.listdir(d):
            taxonId = int(os.path.basename(f).rsplit('.', 2)[0])
            for seqId, seq in fasta.fastaFileToDict(os.path.join(d, f)).iteritems():
                if d == sampleSpecificDir:
                    newId = str(taxonId) + '|' + dName + '|' + seqId + '|label:' + str(taxonId)
                else:
                    newId = str(taxonId) + '|' + dName + '|' + seqId
                out.writeText('>' + newId + '\n' + seq + '\n')
                seqIdToTruePred[newId] = taxonId
    out.close()

    # predict the merged file using the generated model
    if os.name == 'posix':
        predictCmd = os.path.join(ppsScripts, 'predict.rb') + ' ' + allTrainFastaFile + ' ' + ppsConfigFilePath
        # print(predictCmd)
        logOut = open(predictLogFileName, 'w')
        predictProc = subprocess.Popen(predictCmd,
                                       shell=True,
                                       bufsize=-1,
                                       cwd=ppsInstallDir,
                                       stdout=logOut,
                                       stderr=subprocess.STDOUT)
        predictProc.wait()
        logOut.close()
        if predictProc.returncode != 0:
            raise Exception(
                "PPS 'predict' training data returned with non-zero status: %s, cmd: %s"
                % (predictProc.returncode, predictCmd))
    else:
        print("Can't run PPS on a non-posix system!")
        return

    # read in predicted train data
    seqIdToPred = csv.predToDict(allTrainFastaFile + '.nox.fna.out')

    # read fasta file
    seqIdToBp = fasta.getSequenceToBpDict(allTrainFastaFile)

    # leaf taxonIds that are modelled
    modelLeafTaxonIds = set(map(int,
                                csv.getColumnAsList(modelTaxonIdFilePath)))

    taxonomyS = taxonomy_ncbi.TaxonomyNcbi(databaseFile, considerNoRank=True)
    notLeafTaxonIds = set()
    for taxonId in modelLeafTaxonIds:
        notLeafTaxonIds.update(set(map(int, taxonomyS.getParentsNcbidSet(taxonId))))
    taxonomyS.close()

    # keep only sequences whose true taxonId is a modelled leaf (or below); labels at internal (misc) nodes go to the misc maps
    seqIdToBp2 = {}
    seqIdToPred2 = {}
    seqIdToTruePred2 = {}
    seqIdToBpMisc = {}
    seqIdToPredMisc = {}
    seqIdToTruePredMisc = {}
    for seqId, bp in seqIdToBp.iteritems():
        label = int(seqId.rsplit('|', 1)[1].split(':', 1)[1])
        if label not in notLeafTaxonIds:
            seqIdToBp2[seqId] = bp
            seqIdToPred2[seqId] = seqIdToPred[seqId]
            seqIdToTruePred2[seqId] = seqIdToTruePred[seqId]
        else:
            seqIdToBpMisc[seqId] = bp
            seqIdToPredMisc[seqId] = seqIdToPred[seqId]
            seqIdToTruePredMisc[seqId] = seqIdToTruePred[seqId]
    seqIdToBp = seqIdToBp2
    seqIdToPred = seqIdToPred2
    seqIdToTruePred = seqIdToTruePred2

    # accuracy for all, filter out sample specific data (whole length)
    seqIdToBpNoSampleSpec = {}
    for seqId, bp in seqIdToBp.iteritems():
        if str(seqId).split('|', 2)[1].strip() != os.path.basename(sampleSpecificDir).strip():
            seqIdToBpNoSampleSpec[seqId] = bp

    acc = accuracy.Accuracy(seqIdToBpNoSampleSpec, seqIdToPred,
                            seqIdToTruePred, databaseFile)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_all.txt'))
    out.writeText(
        acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                             minFracClade=None,
                             minFracPred=None,
                             overview=True))
    out.close()
    taxonomyA = acc.getTaxonomy()
    acc.close(closeTaxonomy=False)

    # accuracy for (misc) nodes
    acc = accuracy.Accuracy(seqIdToBpMisc, seqIdToPredMisc,
                            seqIdToTruePredMisc, taxonomyA)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_misc.txt'))
    out.writeText(
        acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                             minFracClade=None,
                             minFracPred=None,
                             overview=True))
    out.close()
    acc.close(closeTaxonomy=False)

    # generate the confusion matrices (for the "for all" scenario)
    cm = confusion_matrix.ConfusionMatrix(seqIdToBp, seqIdToPred,
                                          seqIdToTruePred, databaseFile,
                                          taxonomy_ncbi.TAXONOMIC_RANKS[1:])
    for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
        cm.generateConfusionMatrix(
            rank, os.path.join(outputDir, 'train_accuracy_cmp_all'))
    taxonomyCM = cm.getTaxonomy()
    cm.close(closeTaxonomy=False)

    # accuracy for individual directories (seq lengths)
    # (the sample specific fragments are among PPS sampled fasta)
    for d in trainDirList:
        dName = os.path.basename(d)
        seqIdToBpSub = {}
        seqIdToPredSub = {}
        seqIdToTruePredSub = {}
        for seqId, bp in seqIdToBp.iteritems():
            if str(seqId).split('|', 2)[1].strip() == str(dName).strip():
                seqIdToBpSub[seqId] = seqIdToBp[seqId]
                seqIdToPredSub[seqId] = seqIdToPred[seqId]
                seqIdToTruePredSub[seqId] = seqIdToTruePred[seqId]

        # accuracy
        acc = accuracy.Accuracy(seqIdToBpSub, seqIdToPredSub,
                                seqIdToTruePredSub, taxonomyA)
        out = csv.OutFileBuffer(
            os.path.join(outputDir, 'train_accuracy_' + dName + '.txt'))
        out.writeText(
            acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                                 minFracClade=None,
                                 minFracPred=None,
                                 overview=True))

        # confusion matrices
        cm = confusion_matrix.ConfusionMatrix(
            seqIdToBpSub, seqIdToPredSub, seqIdToTruePredSub, taxonomyCM,
            taxonomy_ncbi.TAXONOMIC_RANKS[1:])
        for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
            cm.generateConfusionMatrix(
                rank, os.path.join(outputDir, 'train_accuracy_cmp_' + dName))
        cm.close(closeTaxonomy=False)

        out.close()
        acc.close(closeTaxonomy=False)
    taxonomyA.close()
    taxonomyCM.close()
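A hypothetical invocation (every path below is a placeholder):

computeTrainingAccuracy(workingDir='/path/to/work',
                        taWorkingDir='/path/to/work/ta',
                        sampleSpecificDir='/path/to/sample_specific',
                        ppsTrainDataDir='/path/to/sampled_fasta',
                        outputDir='/path/to/out',
                        ppsInstallDir='/path/to/pps',
                        ppsScripts='/path/to/pps/scripts',
                        ppsConfigFilePath='/path/to/pps/config.txt',
                        predictLogFileName='/path/to/work/predict.log',
                        modelTaxonIdFilePath='/path/to/model_taxon_ids.txt',
                        databaseFile='/path/to/ncbitax_sqlite.db')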
Example #11
def getProfile(readsFFastaFile, communityFile, contigMFastaFile,
               contigLFastaFile, taxonomyMFile, taxonomyDbFile,
               outProfileFile):
    """
        Gets the profile of the dataset.

        @param readsFFastaFile: fasta file containing the reads
        @param communityFile: community file describing the dataset
        @param contigMFastaFile: fasta file containing the contigs
        @param contigLFastaFile: fasta file containing the contigs with coverage info in the sequence names
        @param taxonomyMFile: file mapping contig ids to taxon ids
        @param taxonomyDbFile: taxonomy in the sqlite3 format
        @param outProfileFile: output file
    """
    # get map: taxonId -> read count
    taxonIdToReadCount = {}
    readTotalCount = 0
    for taxonId in getReadsTaxonIdList(
            readsFFastaFile, communityFile,
            readHeaderToCommunityId=getCommunityId)[1:]:
        if taxonId in taxonIdToReadCount:
            taxonIdToReadCount[taxonId] += 1
        else:
            taxonIdToReadCount[taxonId] = 1
        readTotalCount += 1

    # get map: taxonId -> contig count
    # get map: taxonId -> contig bp
    taxonIdToContigCount = {}
    taxonIdToContigBp = {}
    totalContigCount = 0
    seqIdToTaxonId = csv.predToDict(taxonomyMFile)
    seqIdToBp = fas.getSequenceToBpDict(contigMFastaFile)
    for seqId, bp in seqIdToBp.iteritems():
        totalContigCount += 1
        taxonId = seqIdToTaxonId[seqId]
        if taxonId in taxonIdToContigBp:
            taxonIdToContigBp[taxonId] += bp
        else:
            taxonIdToContigBp[taxonId] = bp
        if taxonId in taxonIdToContigCount:
            taxonIdToContigCount[taxonId] += 1
        else:
            taxonIdToContigCount[taxonId] = 1

    taxonIdToTotalBp = {}
    taxonIdToAvgSumCov = {}
    taxonIdToAvgCov = {}
    totalBp = 0.0
    for taxonId in taxonIdToContigBp:
        taxonIdToTotalBp[taxonId] = 0.0
        taxonIdToAvgSumCov[taxonId] = 0.0
        taxonIdToAvgCov[taxonId] = 0.0

    for seqId in fas.fastaFileToDictWholeNames(contigLFastaFile):
        shortSeqId = getShortContigId(seqId)
        if shortSeqId in seqIdToBp:
            coverage = getCoverage(seqId)
            bp = seqIdToBp[shortSeqId]
            taxonId = seqIdToTaxonId[shortSeqId]
            taxonIdToTotalBp[taxonId] += bp
            taxonIdToAvgSumCov[taxonId] += float(coverage) * float(bp)
            totalBp += bp

    for taxonId, bp in taxonIdToTotalBp.iteritems():
        if bp > 0:
            taxonIdToAvgCov[taxonId] = taxonIdToAvgSumCov[taxonId] / float(bp)

    tupleList = []
    taxonomy = taxonomy_ncbi.TaxonomyNcbi(taxonomyDbFile, considerNoRank=True)
    ranks = taxonomy_ncbi.TAXONOMIC_RANKS[2:]
    avgCoverage = 0.0
    for taxonId, readCount in taxonIdToReadCount.iteritems():
        scName = ScientificNameAtRank(taxonId, taxonomy, ranks)
        tupleList.append((
            taxonId,
            round(100 * (readCount / float(readTotalCount)), 1),
            round(100 * (taxonIdToTotalBp.get(taxonId, 0) / float(totalBp)),
                  1),
            round(taxonIdToAvgCov.get(taxonId, 0), 2),
            round(taxonIdToTotalBp.get(taxonId, 0) / 1000000.0, 2),
            taxonIdToContigCount.get(taxonId, 0),
            taxonomy.getScientificName(taxonId),
            scName.getNameAtRank('phylum'),
            scName.getNameAtRank('class'),
            scName.getNameAtRank('order'),
            scName.getNameAtRank('family'),
            scName.getNameAtRank('genus'),
            scName.getNameAtRank(
                'species')  # this could be done in a nicer way
        ))

        avgCoverage += taxonIdToAvgCov.get(taxonId, 0) * taxonIdToTotalBp.get(
            taxonId, 0)
    avgCoverage /= float(totalBp)
    tupleList.sort(key=lambda x: x[2], reverse=True)

    out = csv.OutFileBuffer(outProfileFile)
    out.writeText(
        '#taxonId, % reads, % contigs, avg coverage, MB contigs, contigs count, strain name, '
        + ",".join(ranks) + '\n')
    for entry in tupleList:
        out.writeText(','.join(map(str, entry)) + '\n')

    out.writeText('#Sum/Avg., -, -, ' + str(round(avgCoverage, 2)) + ', ' +
                  str(round(totalBp / 1000000.0, 2)) + ', ' +
                  str(totalContigCount) + ', -\n')
    out.close()
    taxonomy.close()
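The ScientificNameAtRank helper used above is not shown. A hypothetical sketch that climbs towards the root and records the scientific name at each requested rank:

class ScientificNameAtRank():
    def __init__(self, taxonId, taxonomy, ranks):
        self._rankToName = {}
        current = taxonId
        while current is not None and int(current) != 1:  # taxon id 1 denotes the root
            rank = taxonomy.getRank(current)
            if rank in ranks:
                self._rankToName[rank] = taxonomy.getScientificName(current)
            current = taxonomy.getParentNcbid(current)

    def getNameAtRank(self, rank):
        return self._rankToName.get(rank, None)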