def __init__(self, taxonomy):
    """
    A taxonomy wrapper.

    @param taxonomy: file path to the NCBI taxonomy database in the sqlite3 format
    @type taxonomy: str
    """
    self._taxonomy = tax.TaxonomyNcbi(taxonomy, considerNoRank=True)
    self._existsTaxonIdSet = set()
    self._taxonIdToDirectChildrenSet = {}
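
# A minimal sketch of a buffered existence check built on the sets initialized above.
# The method name is illustrative, and the assumption that getRank() returns None for
# unknown taxon ids is not confirmed by this snippet.
def _existsTaxonIdSketch(self, taxonId):
    if taxonId in self._existsTaxonIdSet:
        return True
    if self._taxonomy.getRank(taxonId) is not None:  # assumed behaviour for unknown ids
        self._existsTaxonIdSet.add(taxonId)
        return True
    return False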
def ppOut2PPSout():
    inFile = '/Users/ivan/Documents/work/binning/data/HumanGut/PP/TS29_scaff.file.0.5.txt'
    outFile = '/Users/ivan/Documents/work/binning/data/HumanGut/PP/TS29_scaff.file.0.5.PPS.txt'
    dbFile = '/Users/ivan/Documents/work/binning/taxonomy/20120828/ncbitax_sqlite.db'  # DB
    taxonomy = taxonomy_ncbi.TaxonomyNcbi(dbFile)
    out = csv.OutFileBuffer(outFile)
    csv.forEachLine(inFile, PP2PPSoutParser(taxonomy, out))
    out.close()
    taxonomy.close()
def __init__(self, databaseFile):
    self._taxonomy = taxonomy_ncbi.TaxonomyNcbi(databaseFile)
    self._rankToId = {}
    self._ncbidToRankId = {}
    self._predAtRankId = {}  # rankId -> ncbid -> ncbid at the given rank
    self._noDefAtRankId = {}  # rankId -> set of ncbids for which the ncbid at the given rank is not defined
    self._ncbidToNcbidParent = {}  # ncbid -> parent ncbid
    for rankId, rank in enumerate(taxonomy_ncbi.TAXONOMIC_RANKS):
        self._rankToId[rank] = rankId
        self._predAtRankId[rankId] = {}
        self._noDefAtRankId[rankId] = set()
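
# A sketch of how the buffers initialized above can serve a "taxon id at a given rank"
# query, caching both hits and misses. The method name is illustrative; getRank() and
# getParentNcbid() are used elsewhere in this module.
def _getPredAtRankSketch(self, ncbid, rank):
    rankId = self._rankToId[rank]
    if ncbid in self._predAtRankId[rankId]:
        return self._predAtRankId[rankId][ncbid]  # buffered hit
    if ncbid in self._noDefAtRankId[rankId]:
        return None  # buffered miss: no ancestor defined at this rank
    current = ncbid
    while (current is not None) and (self._taxonomy.getRank(current) != rank):
        current = self._taxonomy.getParentNcbid(current)
    if current is None:
        self._noDefAtRankId[rankId].add(ncbid)
    else:
        self._predAtRankId[rankId][ncbid] = current
    return current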
def __init__(self, taxonomyFile, allowedRanks=None):
    """
    Represents the NCBI taxonomy; buffers entries to efficiently compute the path to the root.

    @param taxonomyFile: database in the sqlite3 format
    @param allowedRanks: list of the allowed taxonomic ranks (defaults to the main ranks from root to species)
    """
    if allowedRanks is None:  # avoid a mutable default argument
        allowedRanks = ['root', 'superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    self._taxonomy = tax.TaxonomyNcbi(taxonomyFile, allowedRanks)
    self._allowedRanks = allowedRanks
    self._parentDict = {}  # map: taxonId -> parent taxonId
    self._rankDict = {}  # map: taxonId -> rank
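
# A sketch of a buffered path-to-the-root computation based on the _parentDict buffer
# initialized above; the method name is illustrative. Each parent is queried from the
# database at most once, subsequent lookups are served from the buffer.
def _getPathToRootSketch(self, taxonId):
    path = [taxonId]
    current = taxonId
    while current != 1:  # taxon id 1 is the root of the NCBI taxonomy
        if current not in self._parentDict:
            self._parentDict[current] = self._taxonomy.getParentNcbid(current)  # buffer miss
        current = self._parentDict[current]
        if current is None:
            break
        path.append(current)
    return path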
def genomesToMask():
    rank = 'genus'  # the rank that will be masked
    fileName = '/Users/ivan/Documents/work/binning/data/simMC/fromJohannes/contigs_genus_ncbids.txt'
    outFile = '/Users/ivan/Documents/work/binning/data/simMC/fromJohannes/genome_genus_masked.txt'
    outFile2 = '/Users/ivan/Documents/work/binning/data/simMC/fromJohannes/genome_ncbids_genus.txt'
    # outFile = '/Users/ivan/Documents/work/binning/data/V35/genome_species_masked.txt'  # output file
    # outFile2 = '/Users/ivan/Documents/work/binning/data/V35/genome_ncbids_species.txt'  # output file
    # fileName = '/Users/ivan/Documents/work/binning/data/V35/genome_ncbids.txt'  # list of all genome ncbids
    dbFile = '/Users/ivan/Documents/work/binning/taxonomy/20120828/ncbitax_sqlite.db'  # DB
    out = csv.OutFileBuffer(outFile)
    out2 = csv.OutFileBuffer(outFile2)
    genomeNcbids = csv.getColumnAsList(fileName, entryModifyFunction=None, colNum=0, sep=None, comment='#')
    taxonomy = taxonomy_ncbi.TaxonomyNcbi(dbFile)

    # map each genome ncbid onto its ancestor at the masked rank
    maskNcbids = []
    for ncbid in genomeNcbids:
        while taxonomy.getRank(ncbid) != rank:
            ncbid = taxonomy.getParentNcbid(ncbid)
            if int(ncbid) == 1:
                print('root reached!')
                break
        maskNcbids.append(int(ncbid))

    maskSet = set(maskNcbids)
    for i in maskSet:
        out2.writeText(str(i) + '\n')

    # collect all children of each masked clade
    for ncbid in maskSet:
        children = collectChildren(taxonomy, ncbid)
        for i in children:
            out.writeText(str(i) + '\n')
        print('%s %s' % (ncbid, children))
    # print(taxonomy.childrenNcbids(818))  # 997888, 818

    out.close()
    out2.close()
    taxonomy.close()
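
# collectChildren() is called above but not defined in this snippet; a plausible sketch,
# assuming taxonomy.childrenNcbids() returns the direct children of a node (the call
# appears in a debug comment above). Collects the node and all of its descendants.
def collectChildrenSketch(taxonomy, ncbid):
    result = []
    stack = [ncbid]
    while stack:
        current = stack.pop()
        result.append(current)
        children = taxonomy.childrenNcbids(current)
        if children is not None:
            stack.extend(map(int, children))
    return result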
def __init__(self, databaseFile):
    """
    Taxonomy wrapper that buffers frequently used operations for this module.

    @param databaseFile: database in the sqlite3 format
    """
    self._taxonomy = taxonomy_ncbi.TaxonomyNcbi(databaseFile)
    # buffers
    self._rankToRankId = {}
    self._rankIdToRank = {}
    self._taxonIdToParentTaxonId = {}
    self._taxonIdToRankId = {}
    self._taxonIdToScientificName = {}
    # map: rank <-> rankId
    for rankId, rank in enumerate(taxonomy_ncbi.TAXONOMIC_RANKS):
        self._rankToRankId[rank] = rankId
        self._rankIdToRank[rankId] = rank
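
# A sketch of a buffered rank lookup built on the maps initialized above (the method
# name is illustrative): repeated queries for the same taxonId hit the dictionary
# instead of the sqlite3 database.
def _getRankIdSketch(self, taxonId):
    if taxonId not in self._taxonIdToRankId:
        rank = self._taxonomy.getRank(taxonId)  # one database query per unseen taxonId
        self._taxonIdToRankId[taxonId] = self._rankToRankId.get(rank, None)
    return self._taxonIdToRankId[taxonId]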
def refToClades(refDir, taxonomyFile, rank='species', outFile=None):
    """
    Returns (and optionally stores) a list of all clades at the given rank, sorted according to the
    abundance of the individual clades, where abundance is measured as the size of the available
    reference data.

    @param refDir: directory containing the reference data (as needed for PPS)
    @param taxonomyFile: ncbi taxonomy in the sqlite3 format
    @param rank: consider clades at this rank
    @param outFile: tab separated file, first column taxon id, second column number of bp (can be None)
    @return: list of tuples (clade, bp)
    """
    taxonomy = taxonomy_ncbi.TaxonomyNcbi(taxonomyFile)
    cladeNcbiToBp = {}
    for fileName in os.listdir(refDir):
        size = os.path.getsize(os.path.join(refDir, fileName))
        ncbid = int(fileName.rsplit('.', 2)[0])
        # map the taxon id of the file onto its ancestor at the given rank
        current = ncbid
        while (current is not None) and (taxonomy.getRank(current) != rank):
            current = taxonomy.getParentNcbid(int(current))
        if current is not None:
            if current in cladeNcbiToBp:
                cladeNcbiToBp[current] += size
            else:
                cladeNcbiToBp[current] = size
        else:
            print('There is no ncbi taxon id defined at rank %s for ncbi taxon id %s' % (rank, ncbid))
    taxonomy.close()

    # sort the clades by the available reference size (descending)
    tuples = []
    for ncbid, size in cladeNcbiToBp.iteritems():
        tuples.append((ncbid, size))
    tuples.sort(key=lambda x: x[1], reverse=True)

    if outFile is not None:
        out = csv.OutFileBuffer(outFile)
        for t in tuples:
            out.writeText(str(t[0]) + '\t' + str(t[1]) + '\n')
        out.close()
    return tuples
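
# A usage sketch for refToClades; the paths below are placeholders, not from the
# original code.
def _refToCladesExample():
    clades = refToClades('/path/to/reference_NCBI', '/path/to/ncbitax_sqlite.db', rank='genus')
    for taxonId, bp in clades[:10]:
        print('%s %s' % (taxonId, bp))  # the ten genera with the most reference data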
def getFirstLabelAtAllowedRank():
    rank = 'species'  # !!!!!!!
    predFile1 = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000Labels.txt'
    predFile2 = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000LabelsSpecies.txt'
    seqIdToLabel = csv.getMapping(predFile1, 0, 1, sep='\t', comment='#')
    outPred = csv.OutFileBuffer(predFile2)
    taxonomy = tax.TaxonomyNcbi('/net/metagenomics/projects/PPSmg/data/nobackup/NCBI20120828/ncbiTax/ncbitax_sqlite.db')
    for seqId in seqIdToLabel:
        ncbid = int(seqIdToLabel[seqId][0])
        # walk up the taxonomy until a taxon at an allowed rank is reached
        while not taxonomy.isRankNcbidAllowed(ncbid):
            ncbid = taxonomy.getParentNcbid(ncbid)
        outPred.writeText(seqId + '\t' + str(ncbid) + '\n')
    taxonomy.close()
    outPred.close()
def __init__(self, refDir, databaseFilePath):
    """
    Provides information about NCBI reference sequences (genomes or draft genomes).

    @param refDir: directory that contains the reference sequences, where each file name has the format
        ncbi_taxon_id.[0-9]+.fna(fas), for instance 382638.1.fna or 2110.1.fas
    @param databaseFilePath: ncbi taxonomy file in the sqlite3 format
    """
    assert os.path.isdir(refDir)
    assert os.path.isfile(databaseFilePath)
    self._taxonIdSet = set()  # taxonIds in the reference
    self._taxonIdToSize = {}  # taxonId -> cumulative file size
    for fileName in os.listdir(refDir):
        if fileName.endswith(('.fna', '.fas')):
            taxonId = int(fileName[0:fileName.index('.')])
            self._taxonIdSet.add(taxonId)
            fileSize = int(os.path.getsize(os.path.join(refDir, fileName)))
            if taxonId in self._taxonIdToSize:
                self._taxonIdToSize[taxonId] += fileSize
            else:
                self._taxonIdToSize[taxonId] = fileSize
    self._taxonomy = taxonomy_ncbi.TaxonomyNcbi(databaseFilePath, considerNoRank=True)
    self._childrenBuffer = {}  # taxonId -> set of children taxon Ids
    self._rankBuffer = {}  # taxonId -> rank
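
# A usage sketch for the class initialized above; the class name ReferenceSequences
# and the paths are illustrative (the original snippet does not show the class statement).
def _referenceSequencesExample():
    ref = ReferenceSequences('/path/to/reference_NCBI', '/path/to/ncbitax_sqlite.db')
    for taxonId in sorted(ref._taxonIdSet):
        print('%s %s' % (taxonId, ref._taxonIdToSize[taxonId]))  # cumulative file size in bytes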
def computeTrainingAccuracy(workingDir, taWorkingDir, sampleSpecificDir, ppsTrainDataDir, outputDir, ppsInstallDir,
                            ppsScripts, ppsConfigFilePath, predictLogFileName, modelTaxonIdFilePath, databaseFile):
    """
    Computes the training accuracy for the PPS training data.
    This function doesn't consider the training data used to train intermediate (misc?) nodes!
    The training data that correspond to the sample specific data are fragmented (via PPS) and contained
    in the training data of the different lengths.

    @param workingDir: working directory of the PPS+ pipeline
    @param taWorkingDir: working directory for the accuracy computation
    @param sampleSpecificDir: directory containing the sample specific data
    @param ppsTrainDataDir: directory 'sampled_fasta' containing the PPS training data
    @param outputDir: directory for the output files
    @param ppsScripts: directory containing the PPS scripts
    @param ppsConfigFilePath: the PPS configuration file
    @param ppsInstallDir: directory where PPS is installed
    @param predictLogFileName: logging file for the PPS prediction
    @param modelTaxonIdFilePath: file containing all leaf ncbi taxon ids that are modelled
    @param databaseFile: ncbi taxonomy file in the sqlite3 format
    """
    for d in [workingDir, taWorkingDir, sampleSpecificDir, ppsTrainDataDir, outputDir, ppsInstallDir, ppsScripts,
              os.path.dirname(predictLogFileName)]:
        assert os.path.isdir(d), "Directory '%s' doesn't exist!" % d
    for f in [ppsConfigFilePath, databaseFile, modelTaxonIdFilePath]:
        assert os.path.isfile(f), "File '%s' doesn't exist!" % f

    # all directories that contain the PPS training data
    trainDirList = [sampleSpecificDir]
    for d in os.listdir(ppsTrainDataDir):
        trainDirList.append(os.path.join(ppsTrainDataDir, d))

    # merge all training fasta files into one fasta file
    allTrainFastaFile = os.path.join(taWorkingDir, 'all_train_data.fna')
    out = csv.OutFileBuffer(allTrainFastaFile)
    seqIdToTruePred = {}
    for d in trainDirList:
        dName = os.path.basename(d)
        for f in os.listdir(d):
            taxonId = int(os.path.basename(f).rsplit('.', 2)[0])
            for seqId, seq in fasta.fastaFileToDict(os.path.join(d, f)).iteritems():
                if d == sampleSpecificDir:
                    seqUniqueId = str(taxonId) + '|' + dName + '|' + seqId + '|label:' + str(taxonId)
                else:
                    seqUniqueId = str(taxonId) + '|' + dName + '|' + seqId
                out.writeText('>' + seqUniqueId + '\n' + seq + '\n')
                seqIdToTruePred[seqUniqueId] = taxonId
    out.close()

    # predict the merged file using the generated model
    if os.name == 'posix':
        predictCmd = str(os.path.join(ppsScripts, 'predict.rb') + ' ' + allTrainFastaFile + ' ' + ppsConfigFilePath)
        logOut = open(predictLogFileName, 'w')
        predictProc = subprocess.Popen(predictCmd, shell=True, bufsize=-1, cwd=ppsInstallDir, stdout=logOut,
                                       stderr=subprocess.STDOUT)
        predictProc.wait()
        logOut.close()
        if predictProc.returncode != 0:
            raise Exception("PPS 'predict' training data returned with non-zero status: %s, cmd: %s"
                            % (predictProc.returncode, predictCmd))
    else:
        print("Can't run PPS on a non-posix system!")
        return

    # read in the predicted training data
    seqIdToPred = csv.predToDict(allTrainFastaFile + '.nox.fna.out')

    # read the fasta file
    seqIdToBp = fasta.getSequenceToBpDict(allTrainFastaFile)

    # leaf taxonIds that are modelled
    modelLeafTaxonIds = set(map(int, csv.getColumnAsList(modelTaxonIdFilePath)))

    taxonomyS = taxonomy_ncbi.TaxonomyNcbi(databaseFile, considerNoRank=True)
    notLeafTaxonIds = set()
    for taxonId in modelLeafTaxonIds:
        notLeafTaxonIds.update(set(map(int, taxonomyS.getParentsNcbidSet(taxonId))))
    taxonomyS.close()

    # keep only the sequences whose true taxonId is defined at a modelled leaf level (or lower)
    seqIdToBp2 = {}
    seqIdToPred2 = {}
    seqIdToTruePred2 = {}
    seqIdToBpMisc = {}
    seqIdToPredMisc = {}
    seqIdToTruePredMisc = {}
    for seqId, bp in seqIdToBp.iteritems():
        label = int(str(str(seqId).rsplit('|', 1)[1]).split(':', 1)[1])
        if label not in notLeafTaxonIds:
            seqIdToBp2[seqId] = bp
            seqIdToPred2[seqId] = seqIdToPred[seqId]
            seqIdToTruePred2[seqId] = seqIdToTruePred[seqId]
        else:
            seqIdToBpMisc[seqId] = bp
            seqIdToPredMisc[seqId] = seqIdToPred[seqId]
            seqIdToTruePredMisc[seqId] = seqIdToTruePred[seqId]
    seqIdToBp = seqIdToBp2
    seqIdToPred = seqIdToPred2
    seqIdToTruePred = seqIdToTruePred2

    # accuracy for all, filter out the sample specific data (whole length)
    seqIdToBpNoSampleSpec = {}
    for seqId, bp in seqIdToBp.iteritems():
        if str(seqId).split('|', 2)[1].strip() != os.path.basename(sampleSpecificDir).strip():
            seqIdToBpNoSampleSpec[seqId] = bp

    acc = accuracy.Accuracy(seqIdToBpNoSampleSpec, seqIdToPred, seqIdToTruePred, databaseFile)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_all.txt'))
    out.writeText(acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:], minFracClade=None, minFracPred=None,
                                       overview=True))
    out.close()
    taxonomyA = acc.getTaxonomy()
    acc.close(closeTaxonomy=False)

    # accuracy for the (misc) nodes
    acc = accuracy.Accuracy(seqIdToBpMisc, seqIdToPredMisc, seqIdToTruePredMisc, taxonomyA)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_misc.txt'))
    out.writeText(acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:], minFracClade=None, minFracPred=None,
                                       overview=True))
    out.close()
    acc.close(closeTaxonomy=False)

    # generate the confusion matrices (for the "for all" scenario)
    cm = confusion_matrix.ConfusionMatrix(seqIdToBp, seqIdToPred, seqIdToTruePred, databaseFile,
                                          taxonomy_ncbi.TAXONOMIC_RANKS[1:])
    for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
        cm.generateConfusionMatrix(rank, os.path.join(outputDir, 'train_accuracy_cmp_all'))
    taxonomyCM = cm.getTaxonomy()
    cm.close(closeTaxonomy=False)

    # accuracy for the individual directories (seq lengths)
    # (the sample specific fragments are among the PPS sampled fasta)
    for d in trainDirList:
        dName = os.path.basename(d)
        seqIdToBpSub = {}
        seqIdToPredSub = {}
        seqIdToTruePredSub = {}
        for seqId, bp in seqIdToBp.iteritems():
            if str(seqId).split('|', 2)[1].strip() == str(dName).strip():
                seqIdToBpSub[seqId] = seqIdToBp[seqId]
                seqIdToPredSub[seqId] = seqIdToPred[seqId]
                seqIdToTruePredSub[seqId] = seqIdToTruePred[seqId]
        # accuracy
        acc = accuracy.Accuracy(seqIdToBpSub, seqIdToPredSub, seqIdToTruePredSub, taxonomyA)
        out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_' + dName + '.txt'))
        out.writeText(acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:], minFracClade=None, minFracPred=None,
                                           overview=True))
        # confusion matrices
        cm = confusion_matrix.ConfusionMatrix(seqIdToBpSub, seqIdToPredSub, seqIdToTruePredSub, taxonomyCM,
                                              taxonomy_ncbi.TAXONOMIC_RANKS[1:])
        for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
            cm.generateConfusionMatrix(rank, os.path.join(outputDir, 'train_accuracy_cmp_' + dName))
        cm.close(closeTaxonomy=False)
        out.close()
        acc.close(closeTaxonomy=False)

    taxonomyA.close()
    taxonomyCM.close()
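
# The merged fasta entries above encode the true label directly in the sequence id;
# a small sketch of the round trip (the example id is illustrative, the parsing
# expression is the one used in computeTrainingAccuracy).
def _trueLabelFromSeqIdExample():
    seqUniqueId = '382638|sample_specific|contig_7|label:382638'  # hypothetical id
    label = int(str(str(seqUniqueId).rsplit('|', 1)[1]).split(':', 1)[1])
    assert label == 382638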
def getProfile(readsFFastaFile, communityFile, contigMFastaFile, contigLFastaFile, taxonomyMFile, taxonomyDbFile,
               outProfileFile):
    """
    Gets the profile of the dataset.

    @param readsFFastaFile: fasta file containing the reads
    @param communityFile: community file mapping reads to taxon ids
    @param contigMFastaFile: fasta file containing the contigs
    @param contigLFastaFile: fasta file containing the contigs with the coverage encoded in the sequence names
    @param taxonomyMFile: file containing the contig to taxon id mapping
    @param taxonomyDbFile: taxonomy in the sqlite3 format
    @param outProfileFile: output file
    """
    # get map: taxonId -> read count
    taxonIdToReadCount = {}
    readTotalCount = 0
    for taxonId in getReadsTaxonIdList(readsFFastaFile, communityFile,
                                       readHeaderToCommunityId=getCommunityId)[1:]:  # the first entry is skipped
        if taxonId in taxonIdToReadCount:
            taxonIdToReadCount[taxonId] += 1
        else:
            taxonIdToReadCount[taxonId] = 1
        readTotalCount += 1

    # get map: taxonId -> contig count
    # get map: taxonId -> contig bp
    taxonIdToContigCount = {}
    taxonIdToContigBp = {}
    totalContigCount = 0
    seqIdToTaxonId = csv.predToDict(taxonomyMFile)
    seqIdToBp = fas.getSequenceToBpDict(contigMFastaFile)
    for seqId, bp in seqIdToBp.iteritems():
        totalContigCount += 1
        taxonId = seqIdToTaxonId[seqId]
        if taxonId in taxonIdToContigBp:
            taxonIdToContigBp[taxonId] += bp
        else:
            taxonIdToContigBp[taxonId] = bp
        if taxonId in taxonIdToContigCount:
            taxonIdToContigCount[taxonId] += 1
        else:
            taxonIdToContigCount[taxonId] = 1

    # compute the average coverage of the contigs assigned to each taxonId
    taxonIdToTotalBp = {}
    taxonIdToAvgSumCov = {}
    taxonIdToAvgCov = {}
    totalBp = 0.0
    for taxonId in taxonIdToContigBp:
        taxonIdToTotalBp[taxonId] = 0.0
        taxonIdToAvgSumCov[taxonId] = 0.0
        taxonIdToAvgCov[taxonId] = 0.0
    for seqId in fas.fastaFileToDictWholeNames(contigLFastaFile):
        shortSeqId = getShortContigId(seqId)
        if shortSeqId in seqIdToBp:
            coverage = getCoverage(seqId)
            bp = seqIdToBp[shortSeqId]
            taxonId = seqIdToTaxonId[shortSeqId]
            taxonIdToTotalBp[taxonId] += bp
            taxonIdToAvgSumCov[taxonId] += float(coverage) * float(bp)
            totalBp += bp
    for taxonId, bp in taxonIdToTotalBp.iteritems():
        if bp > 0:
            taxonIdToAvgCov[taxonId] = taxonIdToAvgSumCov[taxonId] / float(bp)

    # collect one profile entry per taxonId
    tupleList = []
    taxonomy = taxonomy_ncbi.TaxonomyNcbi(taxonomyDbFile, considerNoRank=True)
    ranks = taxonomy_ncbi.TAXONOMIC_RANKS[2:]
    avgCoverage = 0.0
    for taxonId, readCount in taxonIdToReadCount.iteritems():
        scName = ScientificNameAtRank(taxonId, taxonomy, ranks)
        tupleList.append((
            taxonId,
            round(100 * (readCount / float(readTotalCount)), 1),
            round(100 * (taxonIdToTotalBp.get(taxonId, 0) / float(totalBp)), 1),
            round(taxonIdToAvgCov.get(taxonId, 0), 2),
            round(taxonIdToTotalBp.get(taxonId, 0) / 1000000.0, 2),
            taxonIdToContigCount.get(taxonId, 0),
            taxonomy.getScientificName(taxonId),
            scName.getNameAtRank('phylum'),
            scName.getNameAtRank('class'),
            scName.getNameAtRank('order'),
            scName.getNameAtRank('family'),
            scName.getNameAtRank('genus'),
            scName.getNameAtRank('species')))  # this could be done in a nicer way
        avgCoverage += taxonIdToAvgCov.get(taxonId, 0) * taxonIdToTotalBp.get(taxonId, 0)
    avgCoverage /= float(totalBp)
    tupleList.sort(key=lambda x: x[2], reverse=True)

    # write out the profile
    out = csv.OutFileBuffer(outProfileFile)
    out.writeText('#taxonId, % reads, % contigs, avg coverage, MB contigs, contigs count, strain name, '
                  + ",".join(ranks) + '\n')
    for entry in tupleList:
        out.writeText(','.join(map(str, entry)) + '\n')
    out.writeText('#Sum/Avg., -, -, ' + str(round(avgCoverage, 2)) + ', ' + str(round(totalBp / 1000000.0, 2))
                  + ', ' + str(totalContigCount) + ', -\n')
    out.close()
    taxonomy.close()
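
# ScientificNameAtRank is used above but not defined in this snippet; a plausible
# sketch, assuming it walks up the taxonomy once and buffers the scientific name at
# each requested rank (the interface follows the usage above, the details are
# illustrative).
class ScientificNameAtRankSketch(object):
    def __init__(self, taxonId, taxonomy, ranks):
        self._rankToName = {}
        current = taxonId
        while current is not None:
            rank = taxonomy.getRank(current)
            if rank in ranks:
                self._rankToName[rank] = taxonomy.getScientificName(current)
            if current == 1:  # root of the NCBI taxonomy reached
                break
            current = taxonomy.getParentNcbid(current)

    def getNameAtRank(self, rank):
        return self._rankToName.get(rank, None)  # None if no name is defined at this rank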