def ppsOut2ppOut(inFile, outFile, taxonomicRanks, databaseFile):
    """
        Transforms a PPS output file into a file in the PP format.

        @param inFile: input file in the PPS format (first column: seq name, last column: ncbi taxon id)
        @param outFile: output file in the PP format
        @param taxonomicRanks: taxonomic ranks (starting from superkingdom)
        @param databaseFile: database file in the sqlite3 format
    """
    taxonomy = Taxonomy(databaseFile, taxonomicRanks)
    outBuff = csv.OutFileBuffer(outFile)
    namesList = csv.getColumnAsList(inFile, entryModifyFunction=None, colNum=0, sep='\t', comment='#')
    valCol = 1
    ncbidsList = csv.getColumnAsList(inFile, entryModifyFunction=None, colNum=valCol, sep='\t', comment='#')
    # find the right-most column that is defined for every line, it contains the taxon ids
    while True:  # this is not efficient!
        valCol += 1
        tmpList = csv.getColumnAsList(inFile, entryModifyFunction=None, colNum=valCol, sep='\t', comment='#')
        if len(tmpList) == len(namesList):
            ncbidsList = tmpList
        else:
            break

    header = str('#PPS file transformed to PP format, input file: ' + str(inFile) + '\n#ID' + '\t' + 'root')
    for rank in taxonomicRanks:
        header += str('\t' + rank)
    outBuff.writeText(str(header + '\n'))

    for i in range(len(namesList)):
        name = namesList[i]
        ncbid = ncbidsList[i]
        taxPathDict = taxonomy.getPathToRoot(int(ncbid))
        buff = str(name)
        if taxPathDict is None:
            buff += str('\t')
        else:
            buff += str('\t' + 'root')
        for rank in taxonomicRanks:
            if (taxPathDict is not None) and (rank in taxPathDict) and (not taxPathDict[rank].isCopy()):
                buff += str('\t' + taxPathDict[rank].name)
            else:
                buff += '\t'
        outBuff.writeText(str(buff + '\n'))

    outBuff.close()
    taxonomy.close()

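# A minimal usage sketch (hypothetical paths; the rank list starts from superkingdom
# as required by the docstring above):
#
#   ppsOut2ppOut('contigs.pps.out', 'contigs.pp.out',
#                ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'],
#                'ncbitax_sqlite.db')
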
def removeLines(mg):
    """ Removes lines of a marker gene ".tax" mapping file whose ncbi taxon id is on the remove list. """
    removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_ncbids.txt'
    #removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_accession_silva.txt'
    srcFilePath = str('/net/metagenomics/projects/PPSmg/data/markerGenes/db/' + mg + '_bact+arch_dnaV.tax')
    dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/markerGenes/db/' + mg + '_bact+arch_dnaV.tax')
    #srcFilePath = str('/net/metagenomics/projects/PPSmg/data/silva/' + mg + '_silva106_ncbitax.bacteria+archaea.tax')
    #dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/silva/' + mg + '_silva106_ncbitax.bacteria+archaea.tax')
    pattern = r'.*ncbid:([0-9]+)$'
    #pattern = r'^([^\-]+)\-.*$'

    removeSet = set(csv.getColumnAsList(removeListFilePath, colNum=0, comment='#'))
    col0List = csv.getColumnAsList(srcFilePath, colNum=0, sep='\t', comment='#')
    col1List = csv.getColumnAsList(srcFilePath, colNum=1, sep='\t', comment='#')
    out = csv.OutFileBuffer(dstFilePath)
    removed = 0
    for col0, col1 in zip(col0List, col1List):
        if re.sub(pattern, r'\1', col0) not in removeSet:
            out.writeText(str(col0 + '\t' + col1 + '\n'))
        else:
            removed += 1

    out.close()
    print mg, 'removeLines', removed

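# The pattern above extracts the ncbi taxon id from a sequence id, e.g. (hypothetical id):
#
#   re.sub(r'.*ncbid:([0-9]+)$', r'\1', 'gene1_Escherichia|ncbid:562')  # -> '562'
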
def removeEntries(mg):
    """ Removes sequences from the marker gene files at the level from species, genus, family etc. """
    removeListPath = '/net/metagenomics/projects/PPSmg/data/V35/genome_ncbids_species.txt'
    srcFilePath = str('/net/metagenomics/projects/PPSmg/data/markerGenes2/db/' + mg + '_bact+arch_dnaV.tax')
    dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/mgScenarios/speciesRemoved/db/' + mg + '_bact+arch_dnaV.tax')
    out = csv.OutFileBuffer(dstFilePath)
    removeSet = set(csv.getColumnAsList(removeListPath, colNum=0, comment='#'))
    removeSetInt = set()
    removeSetIds = set()
    removed = 0
    for s in removeSet:
        if s != '':
            removeSetInt.add(int(s))

    col0List = csv.getColumnAsList(srcFilePath, colNum=0, sep='\t', comment='#')
    col1List = csv.getColumnAsList(srcFilePath, colNum=1, sep='\t', comment='#')
    for col0, col1 in zip(col0List, col1List):
        lineSetInt = set()
        for s in col1.split(';'):
            if s != '':
                lineSetInt.add(int(s))
        if len(removeSetInt.intersection(lineSetInt)) > 0:  # the intersection is not empty
            removed += 1
            removeSetIds.add(col0)
        else:
            out.writeText(str(col0 + '\t' + col1 + '\n'))

    out.close()
    print mg, 'removedEntries', removed

    srcFilePath = str('/net/metagenomics/projects/PPSmg/data/markerGenes2/db/' + mg + '_bact+arch_dnaV.noalign.fna')
    dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/mgScenarios/speciesRemoved/db/' + mg + '_bact+arch_dnaV.noalign.fna')
    out = csv.OutFileBuffer(dstFilePath)
    seqIdToSeq = fas.fastaFileToDict(srcFilePath)
    removed = 0
    for seqId in seqIdToSeq:
        if seqId in removeSetIds:
            removed += 1
        else:
            out.writeText(str('>' + str(seqId) + '\n' + str(seqIdToSeq[seqId]) + '\n'))

    out.close()
    print mg, 'removedSeq', removed

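# The ".tax" mapping lines processed above are assumed to look like (hypothetical entry):
#
#   seqId<TAB>2;1224;1236;91347;543;561;562
#
# i.e. the second column holds a ';'-separated taxon id path that is intersected
# with the remove list.
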
def tmpCmp():
    """ Compares the taxon ids of a community file with the taxon ids of an assembly profile. """
    communityList = csv.getColumnAsList('/Users/ivan/Documents/work/binning/data/mercier51Strains/syn-mercier51strains/'
                                        'generation/community_20121116.tax', colNum=1, sep='\t')
    profileList = csv.getColumnAsList('/Users/ivan/Documents/nobackup/assembly/uniform/soap_uniform.contig.profile.csv',
                                      colNum=0, sep=',')
    cSet = set(map(int, communityList))
    pSet = set(map(int, profileList))
    for i in cSet:
        if i not in pSet:
            print("Ncbid %s from community is not in profile" % i)
    for i in pSet:
        if i not in cSet:
            print("Ncbid %s from profile is not in community" % i)

def toContigsLabels(inMapFile, outMapFile):
    """
        Creates the label of contigs from the label of reads.

        @param inMapFile: maps contigId to a list of read taxonIds
        @param outMapFile: maps contigId to weight and the most prevalent taxonId
    """
    out = csv.OutFileBuffer(outMapFile)

    for line in csv.getColumnAsList(inMapFile, sep='\n'):
        contigId, taxonIds = str(line).split('\t')
        taxonIdsList = map(int, str(taxonIds).split(','))
        idToCount = {}
        totalCount = 0.0
        for taxonId in taxonIdsList:
            totalCount += 1
            if taxonId in idToCount:
                idToCount[taxonId] += 1
            else:
                idToCount[taxonId] = 1
        pairList = []
        for taxonId, count in idToCount.iteritems():
            pairList.append((taxonId, count))
        pairList.sort(key=lambda x: x[1], reverse=True)
        weight = round(float(pairList[0][1]) / totalCount, 3)
        out.writeText(str(contigId) + '\t' + str(weight) + '\t' + str(pairList[0][0]) + '\n')

    out.close()

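# Input/output line formats assumed by toContigsLabels (hypothetical values):
#
#   inMapFile:  contig1<TAB>561,562,562,620
#   outMapFile: contig1<TAB>0.5<TAB>562
#
# i.e. taxon id 562 labels 2 of the 4 reads of contig1, hence weight 2/4 = 0.5.
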
def getReadsTaxonIdList(readsFile, communityFile, readHeaderToCommunityId=getCommunityId):
    """
        Gets a list of taxonIds in the same order as they are in the readOnContig file. The first taxonId
        is at index 1; each read contributes two consecutive entries.

        @param readsFile: fasta file with the reads
        @param communityFile: community file mapping community ids to taxon ids
        @param readHeaderToCommunityId: function that extracts the community id from a read header
        @return: list of taxonIds (index 0 is None)
    """
    communityIdToTaxonId = csv.predToDict(communityFile)
    d = [None]
    rowList = csv.getColumnAsList(readsFile, colNum=0, sep='\n')
    for line in rowList:
        if str(line).startswith('>'):
            try:
                taxonId = int(communityIdToTaxonId.get(readHeaderToCommunityId(line)))
            except TypeError as ex:
                # int(None) raises TypeError when the community id is not in the mapping
                print(ex.message)
                print("Line: %s" % line)
                raise ex
            d.append(taxonId)
            d.append(taxonId)
    return d

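# Sketch of the returned structure (hypothetical values): for two reads labelled
# 561 and 562, the function returns [None, 561, 561, 562, 562], i.e. each read
# contributes two consecutive entries, starting at index 1.
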
def removeSequences(mg):
    """ Removes sequences with an ncbi taxon id on the remove list from a marker gene fasta file. """
    removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_ncbids.txt'
    #removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_accession_silva.txt'
    srcFilePath = str('/net/metagenomics/projects/PPSmg/data/markerGenes/db/' + mg + '_bact+arch_dnaV.noalign.fna')
    dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/markerGenes/db/' + mg + '_bact+arch_dnaV.noalign.fna')
    #srcFilePath = str('/net/metagenomics/projects/PPSmg/data/silva/' + mg + '_silva106_ncbitax.bacteria+archaea.fna')
    #dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/silva/' + mg + '_silva106_ncbitax.bacteria+archaea.fna')
    pattern = r'.*ncbid:([0-9]+)$'
    #pattern = r'^([^\-]+)\-.*$'

    removeSet = set(csv.getColumnAsList(removeListFilePath, colNum=0, comment='#'))
    seqIdToSeq = fas.fastaFileToDict(srcFilePath)
    out = csv.OutFileBuffer(dstFilePath)
    removed = 0
    for seqId in seqIdToSeq:
        if re.sub(pattern, r'\1', str(seqId)) not in removeSet:
            out.writeText(str('>' + str(seqId) + '\n' + str(seqIdToSeq[seqId]) + '\n'))
        else:
            removed += 1

    out.close()
    print mg, 'removeSequences', removed

def genomesToMask():
    """ For each genome ncbid, finds its ancestor at the given rank and collects all children of that clade. """
    rank = 'genus'  # which rank will be masked
    fileName = '/Users/ivan/Documents/work/binning/data/simMC/fromJohannes/contigs_genus_ncbids.txt'
    outFile = '/Users/ivan/Documents/work/binning/data/simMC/fromJohannes/genome_genus_masked.txt'
    outFile2 = '/Users/ivan/Documents/work/binning/data/simMC/fromJohannes/genome_ncbids_genus.txt'
    #outFile = '/Users/ivan/Documents/work/binning/data/V35/genome_species_masked.txt'  # output file
    #outFile2 = '/Users/ivan/Documents/work/binning/data/V35/genome_ncbids_species.txt'  # output file
    #fileName = '/Users/ivan/Documents/work/binning/data/V35/genome_ncbids.txt'  # list of all genome ncbids
    dbFile = '/Users/ivan/Documents/work/binning/taxonomy/20120828/ncbitax_sqlite.db'  # DB
    out = csv.OutFileBuffer(outFile)
    out2 = csv.OutFileBuffer(outFile2)
    genomeNcbids = csv.getColumnAsList(fileName, entryModifyFunction=None, colNum=0, sep=None, comment='#')
    taxonomy = taxonomy_ncbi.TaxonomyNcbi(dbFile)

    # for each genome ncbid, climb up the taxonomy until the given rank is reached
    maskNcbids = []
    #print len(genomeNcbids), genomeNcbids
    for ncbid in genomeNcbids:
        while taxonomy.getRank(ncbid) != rank:
            ncbid = taxonomy.getParentNcbid(ncbid)
            if int(ncbid) == 1:
                print 'root reached!'
                break
        maskNcbids.append(int(ncbid))

    #print len(Set(maskNcbids)), maskNcbids
    maskSet = set(maskNcbids)
    for i in maskSet:
        out2.writeText(str(str(i) + '\n'))

    # collect all children of the clades at the masked rank
    for ncbid in maskSet:
        childList = collectChildren(taxonomy, ncbid)
        for i in childList:
            out.writeText(str(str(i) + '\n'))
        print ncbid, childList

    #print taxonomy.childrenNcbids(818)  # 997888,818
    out.close()
    out2.close()
    taxonomy.close()

def sortReads(inReadsFile, outReadsFile, headerToNum=lambda x: int(x.split('_', 2)[1].strip('nr'))):
    """ Sorts reads (a file with headers and sequences on alternating lines) by the number parsed
        from the header; the sequences are kept zlib-compressed while sorting to save memory. """
    i = 0
    seqName = None
    tupleList = []
    for line in csv.getColumnAsList(inReadsFile, sep='\n'):
        if i % 2 == 0:
            seqName = line
        else:
            seq = line
            assert seqName is not None
            tupleList.append((seqName, zlib.compress(seq), headerToNum(seqName)))
            seqName = None
        i += 1

    tupleList.sort(key=lambda x: x[2])
    out = csv.OutFileBuffer(outReadsFile)
    for t in tupleList:
        out.writeText(str(t[0]) + '\n' + str(zlib.decompress(t[1])) + '\n')
    out.close()

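# The default headerToNum parser assumes headers of the form '>prefix_nr<NUM>_suffix', e.g.:
#
#   (lambda x: int(x.split('_', 2)[1].strip('nr')))('>read_nr42_contig7')  # -> 42
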
def computeTrainingAccuracy(workingDir, taWorkingDir, sampleSpecificDir, ppsTrainDataDir, outputDir, ppsInstallDir,
                            ppsScripts, ppsConfigFilePath, predictLogFileName, modelTaxonIdFilePath, databaseFile):
    """
        Computes the training accuracy for the PPS training data.
        This function doesn't consider training data used to train intermediate (misc?) nodes!
        The training data that correspond to the sample specific data is fragmented (via PPS) and contained in
        the training data of different lengths.

        @param workingDir: working directory of the PPS+ pipeline
        @param taWorkingDir: working directory for the accuracy computation
        @param sampleSpecificDir: directory containing the sample specific data
        @param ppsTrainDataDir: directory 'sampled_fasta' containing PPS training data
        @param outputDir: directory for output files
        @param ppsScripts: directory containing PPS scripts
        @param ppsConfigFilePath: the PPS configuration file
        @param ppsInstallDir: directory where PPS is installed
        @param predictLogFileName: logging file for PPS prediction
        @param modelTaxonIdFilePath: file containing all leaf ncbi taxon ids that are modelled
        @param databaseFile: ncbi taxonomy file in the sqlite3 format
    """
    for d in [workingDir, taWorkingDir, sampleSpecificDir, ppsTrainDataDir, outputDir, ppsInstallDir, ppsScripts,
              os.path.dirname(predictLogFileName)]:
        assert os.path.isdir(d), "Directory '%s' doesn't exist!" % d
    for f in [ppsConfigFilePath, databaseFile, modelTaxonIdFilePath]:
        assert os.path.isfile(f), "File '%s' doesn't exist!" % f

    # all directories that contain PPS training data
    trainDirList = [sampleSpecificDir]
    for d in os.listdir(ppsTrainDataDir):
        trainDirList.append(os.path.join(ppsTrainDataDir, d))

    # fasta file with all training sequences
    allTrainFastaFile = os.path.join(taWorkingDir, 'all_train_data.fna')
    out = csv.OutFileBuffer(allTrainFastaFile)
    seqIdToTruePred = {}

    # merge all training fasta files to one fasta file
    for d in trainDirList:
        dName = os.path.basename(d)
        for f in os.listdir(d):
            taxonId = int(os.path.basename(f).rsplit('.', 2)[0])
            for seqId, seq in fasta.fastaFileToDict(os.path.join(d, f)).iteritems():
                if d == sampleSpecificDir:
                    #label = int(str(str(seqId).rsplit('|', 1)[1]).split(':', 1)[1])
                    id = str(taxonId) + '|' + dName + '|' + seqId + '|label:' + str(taxonId)
                else:
                    id = str(taxonId) + '|' + dName + '|' + seqId
                out.writeText('>' + id + '\n' + seq + '\n')
                seqIdToTruePred[id] = taxonId
    out.close()

    # predict the merged file using the generated model
    if os.name == 'posix':
        predictCmd = str(os.path.join(ppsScripts, 'predict.rb') + ' ' + allTrainFastaFile + ' ' + ppsConfigFilePath)
        #print(predictCmd)
        logOut = open(predictLogFileName, 'w')
        predictProc = subprocess.Popen(predictCmd, shell=True, bufsize=-1, cwd=ppsInstallDir, stdout=logOut,
                                       stderr=subprocess.STDOUT)  # stdout=subprocess.STDOUT
        predictProc.wait()
        logOut.close()
        if predictProc.returncode != 0:
            raise Exception("PPS 'predict' training data returned with non-zero status: %s, cmd: %s" %
                            (predictProc.returncode, predictCmd))
    else:
        print("Can't run PPS on a non-posix system!")
        return

    # read in predicted train data
    seqIdToPred = csv.predToDict(allTrainFastaFile + '.nox.fna.out')

    # read fasta file
    seqIdToBp = fasta.getSequenceToBpDict(allTrainFastaFile)

    # leaf taxonIds that are modelled
    modelLeafTaxonIds = set(map(int, csv.getColumnAsList(modelTaxonIdFilePath)))

    taxonomyS = taxonomy_ncbi.TaxonomyNcbi(databaseFile, considerNoRank=True)
    notLeafTaxonIds = set()
    for id in modelLeafTaxonIds:
        notLeafTaxonIds.update(set(map(int, (taxonomyS.getParentsNcbidSet(id)))))
    taxonomyS.close()

    # get only sequences with true taxonId defined at leaf level that is modelled or lower
    seqIdToBp2 = {}
    seqIdToPred2 = {}
    seqIdToTruePred2 = {}
    seqIdToBpMisc = {}
    seqIdToPredMisc = {}
    seqIdToTruePredMisc = {}
    for seqId, bp in seqIdToBp.iteritems():
        label = int(str(str(seqId).rsplit('|', 1)[1]).split(':', 1)[1])
        if label not in notLeafTaxonIds:
            seqIdToBp2[seqId] = bp
            seqIdToPred2[seqId] = seqIdToPred[seqId]
            seqIdToTruePred2[seqId] = seqIdToTruePred[seqId]
        else:
            seqIdToBpMisc[seqId] = bp
            seqIdToPredMisc[seqId] = seqIdToPred[seqId]
            seqIdToTruePredMisc[seqId] = seqIdToTruePred[seqId]
    seqIdToBp = seqIdToBp2
    seqIdToPred = seqIdToPred2
    seqIdToTruePred = seqIdToTruePred2

    # accuracy for all, filter out sample specific data (whole length)
    seqIdToBpNoSampleSpec = {}
    for seqId, bp in seqIdToBp.iteritems():
        if str(seqId).split('|', 2)[1].strip() != os.path.basename(sampleSpecificDir).strip():
            seqIdToBpNoSampleSpec[seqId] = bp

    acc = accuracy.Accuracy(seqIdToBpNoSampleSpec, seqIdToPred, seqIdToTruePred, databaseFile)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_all.txt'))
    out.writeText(acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:], minFracClade=None, minFracPred=None,
                                       overview=True))
    out.close()
    taxonomyA = acc.getTaxonomy()
    acc.close(closeTaxonomy=False)

    # accuracy for (misc) nodes
    acc = accuracy.Accuracy(seqIdToBpMisc, seqIdToPredMisc, seqIdToTruePredMisc, taxonomyA)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_misc.txt'))
    out.writeText(acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:], minFracClade=None, minFracPred=None,
                                       overview=True))
    out.close()
    acc.close(closeTaxonomy=False)

    # generate the confusion matrices (for the "for all" scenario)
    cm = confusion_matrix.ConfusionMatrix(seqIdToBp, seqIdToPred, seqIdToTruePred, databaseFile,
                                          taxonomy_ncbi.TAXONOMIC_RANKS[1:])
    for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
        cm.generateConfusionMatrix(rank, os.path.join(outputDir, 'train_accuracy_cmp_all'))
    taxonomyCM = cm.getTaxonomy()
    cm.close(closeTaxonomy=False)

    # accuracy for individual directories (seq lengths)
    # (the sample specific fragments are among PPS sampled fasta)
    for d in trainDirList:
        dName = os.path.basename(d)
        seqIdToBpSub = {}
        seqIdToPredSub = {}
        seqIdToTruePredSub = {}
        for seqId, bp in seqIdToBp.iteritems():
            if str(seqId).split('|', 2)[1].strip() == str(dName).strip():
                seqIdToBpSub[seqId] = seqIdToBp[seqId]
                seqIdToPredSub[seqId] = seqIdToPred[seqId]
                seqIdToTruePredSub[seqId] = seqIdToTruePred[seqId]

        # accuracy
        acc = accuracy.Accuracy(seqIdToBpSub, seqIdToPredSub, seqIdToTruePredSub, taxonomyA)
        out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_' + dName + '.txt'))
        out.writeText(acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:], minFracClade=None, minFracPred=None,
                                           overview=True))

        # confusion matrices
        cm = confusion_matrix.ConfusionMatrix(seqIdToBpSub, seqIdToPredSub, seqIdToTruePredSub, taxonomyCM,
                                              taxonomy_ncbi.TAXONOMIC_RANKS[1:])
        for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
            cm.generateConfusionMatrix(rank, os.path.join(outputDir, 'train_accuracy_cmp_' + dName))
        cm.close(closeTaxonomy=False)

        out.close()
        acc.close(closeTaxonomy=False)

    taxonomyA.close()
    taxonomyCM.close()

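# A usage sketch (all paths hypothetical; the directories must already exist and a
# trained PPS model must be in place):
#
#   computeTrainingAccuracy('/ppsp/working', '/ppsp/working/ta', '/ppsp/working/sample_specific',
#                           '/ppsp/working/sampled_fasta', '/ppsp/output', '/opt/pps',
#                           '/opt/pps/scripts', '/ppsp/working/pps_config.txt',
#                           '/ppsp/working/predict_train_data.log', '/ppsp/working/model_taxon_ids.txt',
#                           '/ref/ncbitax_sqlite.db')
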
def _main():
    """ See the module description. """
    parser = argparse.ArgumentParser(description=__doc__, epilog="""""")

    parser.add_argument('-i', '--input-data-dir', action='store', nargs=1, required=True,
                        help="""Directory that contains fasta files and corresponding mapping files, for each
                        "*.tax" (or "*.csv") file there must be a "*.fna" file with the same name. All files
                        with suffix "tax" (or "*.csv") will be considered. (Takes only Bacteria and Archaea)""",
                        metavar='input_dir', dest='inDir')

    parser.add_argument('-o', '--output-dir', action='store', nargs=1, required=True,
                        help='Directory that contains the output files.',
                        metavar='out_dir', dest='outDir')

    parser.add_argument('-s', '--source-type', required=True, nargs=1, choices=["s", "a"],
                        help='To determine the source, use "s" for the Silva database and "a" for the Amphora database.',
                        dest='srcType')

    parser.add_argument('-t', '--taxonomy-file', nargs=1, type=file, required=True,
                        help='NCBI taxonomy database file in the sqlite3 format.',
                        metavar='ncbitax_sqlite.db', dest='taxonomy')

    parser.add_argument('-n', '--not-considered-taxonIds', action='store', nargs=1,
                        help='Comma separated leaf level or top level taxonIds (as a string) that will be filtered out. (optional)',
                        metavar='"2759,10239,77133,155900,408172,32644, 408170,433727,749907,556182,702656,410661,652676,410659,797283'
                                ',408171,703336,256318,32630,433724,766747,488339,942017,1076179,717931,455559,527640,904678,552539,'
                                '54395,198431,358574,415540,511564,369433,380357,81726,198834,271928,311313,2759,749906,1077529,'
                                '1077529,361146,511563,361147"',
                        dest='filterOut')

    # parse arguments
    args = parser.parse_args()
    inDir = args.inDir[0]
    outDir = args.outDir[0]
    srcType = args.srcType[0]
    filterOutTaxonIdsSet = set()
    try:
        if args.filterOut:
            filterOutTaxonIdsSet.update(set(map(int, str(args.filterOut[0]).split(','))))
    except:
        print('Taxon ids that are to be filtered out are in a wrong format! Comma separated integers are needed!')
        raise

    taxonomy = _TaxonomyWrap(args.taxonomy[0].name)
    for d in [inDir, outDir]:
        assert os.path.isdir(d), 'Path: "' + d + '" does not exist!'

    # create db for each gene
    mapDict = {}  # map: seqId -> ncbid
    for mapFilePath in glob.glob(os.path.join(os.path.normpath(inDir), r'*.[ct][sa][vx]')):  # *.csv or *.tax

        assert mapFilePath.endswith(('.csv', '.tax')), \
            'The mapping files can either end with .csv or .tax ' + mapFilePath

        base = os.path.basename(mapFilePath).rsplit('.', 1)[0]  # cut out dir path and suffix
        fastaDict = fas.fastaFileToDict(os.path.join(os.path.dirname(mapFilePath), (base + '.fna')))  # map: seqId -> seq
        print("Processing: %s seq count: %s" % (base, str(len(fastaDict))))

        if 'a' in srcType:  # Amphora
            mapDict = {}
            for k in csv.getColumnAsList(mapFilePath, colNum=0, sep='\t'):
                v = int(k.rsplit('|', 1)[1].split(':')[1])  # get ncbid
                assert ((k not in mapDict) or (mapDict[k] == v)), str(
                    'There are at least two different values for key: ' + str(k) + ' in ' + mapFilePath)
                mapDict[k] = v
        elif 's' in srcType:  # Silva
            mapTmp = csv.getMapping(mapFilePath, 0, 2, '\t')
            mapDict = {}
            for k, v in mapTmp.iteritems():
                mapDict[k] = int(v[0])
        else:
            assert False, 'Unsupported source type!'

        # same number of entries in both files (fasta and mapping) ?
        if len(mapDict) != len(fastaDict):
            print(str('%s: The mapping file and the corresponding fasta file have different number of entries: ' +
                      '"%s" "%s" these files will be skipped!') % (base, str(len(mapDict)), str(len(fastaDict))))
            continue

        # are there duplicates in the mapping file ?
        count = len(csv.getColumnAsList(mapFilePath))
        if len(mapDict) != count:
            print('%s: The mapping file contained duplicates! unique: %s non-unique: %s' % (
                base, str(len(mapDict)), str(count)))

        # store data to the output directory
        outDna = csv.OutFileBuffer(os.path.join(outDir, str(base + '.fna')))
        outTax = csv.OutFileBuffer(os.path.join(outDir, str(base + '.tax')))
        count = 0
        filteredLeaf = 0
        filteredSup = 0
        notMapped = 0
        noBacArch = 0
        for seqId, taxonId in mapDict.iteritems():
            if taxonId in filterOutTaxonIdsSet:
                filteredLeaf += 1
                continue
            path = taxonomy.getPathToRoot(taxonId)
            if path is None:
                print('Could not find: %s for seqId: %s record skipped!' % (str(taxonId), seqId))
                notMapped += 1
                continue
            topLevel = int(path.split(';', 1)[0])
            if topLevel in filterOutTaxonIdsSet:
                filteredSup += 1
                continue
            if topLevel not in [2, 2157]:  # Bacteria, Archaea
                noBacArch += 1
                print('NoBactArch: ', topLevel)

            seq = fastaDict[seqId]
            if 'a' in srcType:  # Amphora
                id = seqId
            elif 's' in srcType:  # Silva
                id = str(seqId + '|ncbid:' + str(taxonId))

            outTax.writeText(str(id + '\t' + path + '\n'))
            outDna.writeText(str('>' + id + '\n' + seq + '\n'))
            count += 1

        outDna.close()
        outTax.close()
        print('Stored entries: %s filtered out: %s leaf, %s top level, not mapped: %s' %
              (count, filteredLeaf, filteredSup, notMapped))
        if noBacArch > 0:
            print('WARN: stored %s non Bacterial and non Archaeal sequences' % (noBacArch))

    # Silva:
    # -i /Users/ivan/Documents/work/binning/database/silva111/arbGenerated -s s
    # -t /Users/ivan/Documents/work/binning/taxonomy/20121122/ncbitax_sqlite.db
    # -o /Users/ivan/Documents/work/binning/database/silva111/db -n ...

    # Amphora:
    # -i /Users/ivan/Documents/work/binning/database/markerGenes3/mGenesExtracted -s a
    # -t /Users/ivan/Documents/work/binning/taxonomy/20121122/ncbitax_sqlite.db
    # -o /Users/ivan/Documents/work/binning/database/markerGenes3/db

    taxonomy.close()
    print 'done'

def maskDb(action, inDir, outDir, rank, clades, taxonomyFilePath, verbose=False):
    """
        Main function (function interface), see module description.

        @param action: one action that will be performed [cl, mr, mg] ~ (generate list, mask seq, mask mg)
        @type action: str
        @param inDir: directory containing input files
        @type inDir: str
        @param outDir: directory containing output files
        @type outDir: str
        @param rank: the data will be excluded at this rank
        @type rank: str
        @param clades: a file containing clades that will be masked (one ncbi taxon id at a line),
            or a set of ncbi taxon ids that will be masked
        @type clades: file or set of int
        @param taxonomyFilePath: taxonomy database file in the sqlite3 format
        @type taxonomyFilePath: str
    """
    # check input parameters
    assert action in ['cl', 'mr', 'mg'], str('Given action is not supported: ' + action)
    if action == 'mr':
        assert os.name == 'posix', 'Symbolic links can be created only on posix systems, action "mr" is not valid!'
    for d in [inDir, outDir]:
        assert os.path.isdir(d), str("Directory doesn't exist: " + d)
    assert rank in _RANKS, str('Not supported rank: ' + rank)
    assert os.path.isfile(taxonomyFilePath), str("Taxonomy database file doesn't exist: " + taxonomyFilePath)
    assert isinstance(clades, set) or (isinstance(clades, str) and os.path.isfile(clades)), str(
        "Parameter 'clades' can be either a file or a set of ncbi taxonIds to be excluded.")

    # maps a rank to a lower rank
    toLowerRank = {}
    for i in range(1, len(_RANKS)):
        toLowerRank[_RANKS[i - 1]] = _RANKS[i]

    taxonomy = _TaxonomyWrapMD(taxonomyFilePath)

    # leaf clades to mask
    if isinstance(clades, set):
        inCladesSet = set(map(int, clades))
    else:
        inCladesSet = set(map(int, csv.getColumnAsList(clades)))

    # clades in the reference
    refCladesSet = set()
    if action in ['cl', 'mr']:
        # get the list of all taxon ids that appear in the directory (as PPS reference)
        for fastaFilePath in glob.glob(os.path.join(os.path.normpath(inDir), r'*.f[na][as]')):  # *.fas or *.fna
            refCladesSet.add(_refFilePathToTaxonId(fastaFilePath))  # taxonId.1.fna or taxonId.1.fas
    elif action in ['mg']:
        # get the list of all taxon ids that appear in any file in the input directory as taxonomy ".tax"
        for mapFilePath in glob.glob(os.path.join(os.path.normpath(inDir), r'*.tax')):  # *.tax
            refCladesSet.update(set(map(_mgSeqIdToTaxonId, csv.getColumnAsList(mapFilePath, sep='\t'))))
    else:
        assert False, str('Not supported action: ' + action)

    # checks whether taxonIds are in the taxonomy
    for taxonId in inCladesSet:
        assert taxonomy.exists(taxonId), str(
            'taxonId: %s from clades list is not contained in the taxonomy!' % taxonId)
    for taxonId in refCladesSet:
        assert taxonomy.exists(taxonId), str(
            'taxonId: %s from the reference is not contained in the taxonomy!' % taxonId)

    # checks whether the taxonIds are leafs (doesn't have to be (unless you want to mask at the strain level))
    for taxonId in inCladesSet:
        if not taxonomy.isLeaf(taxonId):
            print('Taxon id %s does not represent a leaf clade in the taxonomy.' % taxonId)

    if verbose:
        print('Initial checks done.')

    # taxonIds that should be excluded
    toExcludeSet = set()
    for taxonId in inCladesSet:
        taxonIdAtRank = taxonomy.getTaxonIdAtRank(taxonId, rank)
        if taxonIdAtRank is None:  # the lineage is not defined at this rank! try a lower rank!
            print('Taxon id: "%s" is not defined at rank: "%s"' % (taxonId, rank))
            currentRank = rank
            # find a lower rank at which it's defined
            while currentRank in toLowerRank:
                currentRank = toLowerRank[currentRank]
                taxonIdAtRank = taxonomy.getTaxonIdAtRank(taxonId, currentRank)
                if taxonIdAtRank is not None:
                    break
            if taxonIdAtRank is None:
                taxonIdAtRank = taxonId
                currentRank = _STRAIN
            print('Taxon id: %s will be masked at rank: %s' % (taxonId, currentRank))

        # all child clades (and itself)
        toExcludeSet.add(int(taxonIdAtRank))
        toExcludeSet.update(set(map(int, taxonomy.getAllChildren(taxonIdAtRank))))

    # all clades that should be excluded (there is at least one sequence for each taxonId in the reference)
    toExcludeSet.intersection_update(refCladesSet)
    if verbose:
        print('Data to mask collected.')
        print('To exclude: ', len(toExcludeSet))

    # exclude data from the reference
    if action == 'cl':
        # generates a list of taxonIds
        out = csv.OutFileBuffer(os.path.join(outDir, 'exclude_list.txt'))
        for taxonId in toExcludeSet:
            out.writeText(str(taxonId) + '\n')
        out.close()
    elif action == 'mr':
        # masked reference sequences (create sym links to files that were not excluded)
        for fastaFilePath in glob.glob(os.path.join(os.path.normpath(inDir), r'*.f[na][as]')):  # *.fas or *.fna
            taxonId = _refFilePathToTaxonId(fastaFilePath)  # taxonId.1.fna or taxonId.1.fas
            if taxonId not in toExcludeSet:
                # assert os.name == 'posix'
                os.symlink(fastaFilePath, os.path.join(outDir, os.path.basename(fastaFilePath)))
    elif action == 'mg':
        # exclude sequences from the marker gene databases
        for mapFilePath in glob.glob(os.path.join(os.path.normpath(inDir), r'*.tax')):
            # get entries that can stay in the mapping and fasta files
            allowedEntriesSet = set(map(_mgSeqIdToTaxonId, csv.getColumnAsList(mapFilePath, sep='\t')))
            allowedEntriesSet.difference_update(toExcludeSet)

            # filter out entries from the mapping file
            csv.filterOutLines(mapFilePath, os.path.join(outDir, os.path.basename(mapFilePath)),
                               allowedEntriesSet, entryModifyFunction=_mgSeqIdToTaxonId, colNum=0, sep='\t')

            # filter out entries from the fasta file
            fastaFilePath = str(mapFilePath.rsplit('.', 1)[0] + '.fna')
            fas.filterOutSequences(fastaFilePath, os.path.join(outDir, os.path.basename(fastaFilePath)),
                                   allowedEntriesSet, seqNameModifyFunction=_mgSeqIdToTaxonId)
    else:
        assert False, 'Not supported action!'

    taxonomy.close()
    if verbose:
        print('Data masking done.')

def _main():
    mergeS = False
    sortS = False
    clusterS = False
    filterOutSeq = True  # this is optional, e.g. to remove plasmids

    # handle broken pipes
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)

    #printStatDbk()
    #checkForPlasmids()

    # mapFilePathList = ['/local/johdro/refdata/static/ncbi-genomes-bacteria_20121122/nobackup/dna_acc.nuc.tax',
    #                    '/local/johdro/refdata/static/ncbi-draftgenomes-bacteria_20121122/nobackup/dna-contigs_acc.nuc.tax',
    #                    '/local/johdro/refdata/static/ncbi-draftgenomes-bacteria_20121122/nobackup/dna-scaffolds_acc.nuc.tax',
    #                    '/local/johdro/refdata/static/ncbi-hmp_20121016/nobackup/dna-contigs_acc.nuc.tax',
    #                    '/local/johdro/refdata/static/ncbi-hmp_20121016/nobackup/dna-scaffolds_acc.nuc.tax',
    #                    '/local/johdro/refdata/static/ncbi-refseq-microbial_56/nobackup/dna_acc.nuc.tax']
    #
    # fastaFilePathList = ['/local/johdro/refdata/static/ncbi-genomes-bacteria_20121122/nobackup/dna_acc.nuc.fna',
    #                      '/local/johdro/refdata/static/ncbi-draftgenomes-bacteria_20121122/nobackup/dna-contigs_acc.nuc.fna',
    #                      '/local/johdro/refdata/static/ncbi-draftgenomes-bacteria_20121122/nobackup/dna-scaffolds_acc.nuc.fna',
    #                      '/local/johdro/refdata/static/ncbi-hmp_20121016/nobackup/dna-contigs_acc.nuc.fna',
    #                      '/local/johdro/refdata/static/ncbi-hmp_20121016/nobackup/dna-scaffolds_acc.nuc.fna',
    #                      '/local/johdro/refdata/static/ncbi-refseq-microbial_56/nobackup/dna_acc.nuc.fna']

    # input files
    mapFilePathList = ['/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-genomes-bacteria_20140428/dna_acc.tax',
                       '/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-draftgenomes-bacteria_20140508/dna-scaffolds_acc.tax',
                       '/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-hmp_20131125/dna-contigs_acc.tax',
                       '/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-hmp_20131125/dna-scaffolds_acc.tax',
                       '/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-refseq-microbial_64/dna_acc.tax']

    fastaFilePathList = ['/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-genomes-bacteria_20140428/dna_acc.fna',
                         '/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-draftgenomes-bacteria_20140508/dna-scaffolds_acc.fna',
                         '/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-hmp_20131125/dna-contigs_acc.fna',
                         '/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-hmp_20131125/dna-scaffolds_acc.fna',
                         '/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-refseq-microbial_64/dna_acc.fna']

    # output dirs
    mergedDir = '/net/refdata/static/nonredundant-microbial_20140513/nobackup/merged'  # '/local/igregor/ref_20121122/nobackup/merged'
    sortedDir = '/net/refdata/static/nonredundant-microbial_20140513/nobackup/sorted'  # '/local/igregor/ref_20121122/nobackup/sorted'
    centroidsDir = '/net/refdata/static/nonredundant-microbial_20140513/nobackup/centroids'  # '/local/igregor/ref_20121122/nobackup/centroids_1_0'
    # clustersDir = '/net/refdata/static/nonredundant-microbial_20140513/nobackup/clusters'  # '/local/igregor/ref_20121122/nobackup/clusters_1_0'

    # tools
    usearch5 = '/net/metagenomics/projects/PPSmg/tools/usearch/usearch5.2.32_i86linux32'
    usearch6 = '/net/metagenomics/projects/PPSmg/tools/usearch/usearch6.0.307_i86linux32'
    # dnaClust = '/net/metagenomics/projects/PPSmg/tools/dnaclust_64bit_parallel/dnaclust'

    # merge sequences from multiple files
    if mergeS:
        mergeSequences(mapFilePathList, fastaFilePathList, mergedDir)

    if sortS:
        sortSeqDesc(usearch5, usearch6, mergedDir, sortedDir, mapFilePathList)

    # sort and cluster sequences
    if clusterS:
        toCentroids(sortedDir, centroidsDir, mapFilePathList)
        move(centroidsDir)

    if filterOutSeq:
        taxonIdSet = getAllTaxonIdSet(mapFilePathList)
        srcDir = '/net/refdata/static/nonredundant-microbial_20140513/nobackup/centroids'  # '/local/igregor/ref_20121122/nobackup/centroids_1_0'
        dstDir = '/net/refdata/static/nonredundant-microbial_20140513/nobackup/centroids_noplasmids'  # '/local/igregor/ref_20121122/nobackup/centroids_1_0_no_plasmids'
        notAllowedSet = set(csv.getColumnAsList('/net/refdata/static/nonredundant-microbial_20140513/nobackup/plasmids_accessions.txt',
                                                colNum=0))  # /local/igregor/ref_20121122/nobackup/plasmid_accessions2.txt
        filterOutSequencesBatch(taxonIdSet, srcDir, dstDir, notAllowedSet)

def getAllTaxonIdSet(mapFilePathList):
    """ Returns the set of all taxon ids (as strings) contained in the second column of the given mapping files. """
    taxonIdSet = set()
    for mapFile in mapFilePathList:
        taxonIdSet = taxonIdSet.union(set(csv.getColumnAsList(mapFile, colNum=1, sep='\t')))
    return taxonIdSet
