def _getLabelsCreateFasta():
    """
        Processes the original mercier dataset with 59 strains. Takes only contigs
        that were mapped to the reference genomes. Outputs a fasta file and a mapping file.

        :rtype : None
    """
    # input fasta file
    fastaFilePath = '/net/metagenomics/projects/PPSmg/data/V35/contigs_1000.txt'
    seqIdToSeq = fas.fastaFileToDict(fastaFilePath)

    # contigs mapped to genome names
    nameLabelsFilePath = '/net/metagenomics/projects/PPSmg/data/V35/contigs_1000_blast_labels.txt'
    seqIdToNameLabels = csv.getMapping(nameLabelsFilePath, 0, 1, sep='\t', comment='#')

    # mapping: genome name -> taxon id
    genomeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_list2.txt'  # genome_list.txt
    nameLabelToNcbid = csv.getMapping(genomeListFilePath, 0, 2, sep=';', comment='#')

    # to store mapped sequences
    outFastaFilePath = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000.fna'
    outFasta = csv.OutFileBuffer(outFastaFilePath)

    # to store the taxonomic mapping of mapped sequences
    outLabelsFilePath = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000Labels.txt'
    outLabels = csv.OutFileBuffer(outLabelsFilePath)

    # write out the sequences of all mapped contigs
    for seqId in seqIdToSeq:
        if seqId in seqIdToNameLabels:
            outFasta.writeText(str('>' + str(seqId) + '\n' + seqIdToSeq[seqId] + '\n'))
    outFasta.close()
    print 'fasta created'

    # write out the taxon id (ncbid) of each mapped contig
    for seqId in seqIdToSeq:
        if seqId in seqIdToNameLabels:
            nameLabel = seqIdToNameLabels[seqId][0]
            ncbid = nameLabelToNcbid[nameLabel][0]
            outLabels.writeText(str(str(seqId) + '\t' + str(ncbid) + '\n'))
    outLabels.close()
    print 'labels created'
def scafToContigOutput(scaffContigMapFile, scaffPPSOutFile, contigPPSOutFile):
    """
        Takes a scaffold-contig mapping and a scaffold placement (.out file),
        outputs a contig placement (.out file).

        @param scaffContigMapFile: tab separated scaffold-contig mapping (scaffoldName \t contigName)
        @param scaffPPSOutFile: scaffold predictions (PPS output file)
        @param contigPPSOutFile: contig predictions (as if it was a PPS output file)
    """
    # init output
    out = csv.OutFileBuffer(contigPPSOutFile)

    # read scaffold predictions
    scaffNameToTaxonId = csv.predToDict(scaffPPSOutFile)

    # read mapping: scaffName -> contigNameList
    scaffNameToContigNameList = csv.getMapping(scaffContigMapFile, 0, 1, sep='\t')

    # store contig predictions (according to the scaffold predictions)
    for scaffName, contigNameList in scaffNameToContigNameList.iteritems():
        taxonId = scaffNameToTaxonId.get(scaffName, None)
        if taxonId is None:
            taxonId = 1  # scaffolds without a prediction default to the root
        for contigName in contigNameList:
            out.writeText(contigName + '\t' + str(taxonId) + '\n')
    out.close()
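# Usage sketch for scafToContigOutput (hypothetical file names; the column
# layouts follow the docstring above). Scaffolds missing from the prediction
# file propagate taxonId 1 (the root) to all of their contigs:
#
#   scafToContigOutput('scaff_contig_map.tsv',  # scaffoldName \t contigName
#                      'scaffolds.pps.out',     # PPS output: scaffold predictions
#                      'contigs.pps.out')       # written: contigName \t taxonId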
def toContigsLabelList(inFastaFileName, readsF, readsR, readOnContig, community, outMappingFileName):
    """
        Gets a mapping from contigIds to the lists of taxonIds of the individual reads of the contigs.

        @param inFastaFileName: fasta file with the contigs
        @param readsF: file with the forward reads
        @param readsR: file with the reverse reads
        @param readOnContig: file with the read -> contig mapping
        @param community: community file used to resolve the taxon ids of the reads
        @param outMappingFileName: output file (contigId \t comma separated taxonIds of its reads)
    """
    # contigIds
    contigIdToBp = fas.getSequenceToBpDict(inFastaFileName)

    # map: contigId -> list of readIds
    contigIdToReadList = csv.getMapping(readOnContig, 1, 0, sep='\t', comment='r')

    # taxonIds as a list for reads
    readFTaxonIdList = getReadsTaxonIdList(readsF, community)
    print 's1'
    readRTaxonIdList = getReadsTaxonIdList(readsR, community)
    print 's2'
    if len(readFTaxonIdList) != len(readRTaxonIdList):
        print('toContigsLabels: different number of reads in the reads files, exit')
        return

    # check that the paired reads carry consistent taxon ids
    for i in range(len(readFTaxonIdList))[1:]:
        if readFTaxonIdList[i] != readRTaxonIdList[i]:
            print('toContigsLabels: at index %s different taxon ids %s and %s'
                  % (i, readFTaxonIdList[i], readRTaxonIdList[i]))
        if readFTaxonIdList[i] is None or readRTaxonIdList[i] is None:
            print('toContigsLabels: at index %s, one is None %s or %s'
                  % (i, readFTaxonIdList[i], readRTaxonIdList[i]))
    print 's3'

    # for each contig, store the taxon ids of all of its reads
    out = csv.OutFileBuffer(outMappingFileName)
    for contigId in contigIdToBp:
        try:
            readList = contigIdToReadList[contigId]
            taxonIdList = []
            for readId in readList:
                taxonIdList.append(readFTaxonIdList[int(readId)])
            out.writeText(str(contigId) + '\t' + ','.join(map(str, taxonIdList)) + '\n')
        except KeyError:
            print("No label for contigId: %s" % contigId)
    out.close()
    print 's4'
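# Usage sketch for toContigsLabelList (hypothetical file names); it assumes
# getReadsTaxonIdList returns a list of taxon ids indexed by the (int) read id:
#
#   toContigsLabelList('contigs.fna', 'reads_1.fq', 'reads_2.fq',
#                      'read_on_contig.tsv', 'community.txt',
#                      'contig_to_read_taxon_ids.tsv')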
def mergeSequences(mapFilePathList, fastaFilePathList, outputDir):
    """
        Reads all sequences. For each taxonId creates a file that contains all sequences
        mapped to this taxonId. If a seqId appears more than once, it is ignored since
        accession numbers are unique.

        @param mapFilePathList: list of files where each contains mapping: seqId -> taxonId
        @param fastaFilePathList: list of fasta files that contain mapping: seqId -> seq
    """
    taxonIdToOutBuffer = {}
    seqIdSet = set()

    totalSeqCount = 0
    totalStoredSeqCount = 0
    totalIdenticalSeqCount = 0

    for mapFilePath, fastaFilePath in zip(mapFilePathList, fastaFilePathList):
        print 'processing', mapFilePath, fastaFilePath
        seqCount = 0
        storedSeqCount = 0

        seqIdToSeq = fasta.fastaFileToDict(fastaFilePath)
        seqIdToNcbidList = csv.getMapping(mapFilePath, 0, 1, sep='\t', comment='#')

        for seqId, seq in seqIdToSeq.iteritems():
            seqCount += 1
            if seqId in seqIdSet:
                totalIdenticalSeqCount += 1
                continue
            else:
                seqIdSet.add(seqId)

            taxonId = seqIdToNcbidList[seqId][0]
            if taxonId not in taxonIdToOutBuffer:
                outBuffer = csv.OutFileBuffer(os.path.join(outputDir, str(str(taxonId) + '.fna')))
                taxonIdToOutBuffer[taxonId] = outBuffer

            taxonIdToOutBuffer[taxonId].writeText(str('>' + seqId + '\n' + seq + '\n'))
            storedSeqCount += 1

            if len(string.replace(common.noNewLine(seq), 'N', '')) == 0:
                print 'zeros', seqId, fastaFilePath, len(common.noNewLine(seq))

        print 'totalSeq, storedSeq', seqCount, storedSeqCount
        totalSeqCount += seqCount
        totalStoredSeqCount += storedSeqCount

    # close the buffers only after all files have been processed, so that a taxonId
    # occurring several times keeps appending to the same (still open) buffer
    # (the original code closed each buffer right after the first write)
    for buff in taxonIdToOutBuffer.values():
        buff.close()

    print 'totalSeqCount, totalStoredSeqCount, totalIdenticalSeqCount', totalSeqCount, totalStoredSeqCount, totalIdenticalSeqCount
    print 'sequences merged'
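# Usage sketch for mergeSequences (hypothetical file names); the i-th mapping
# file must correspond to the i-th fasta file since the two lists are zipped
# pairwise. One "<taxonId>.fna" file is written per taxon id:
#
#   mergeSequences(['refA.tax', 'refB.tax'],
#                  ['refA.fna', 'refB.fna'],
#                  '/tmp/merged')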
def toWellMappedContigs(inFastaFile, inTaxonomyWFile, outFastaFile, outFastaMisAssembledFile,
                        outTaxonomyFile, weightThreshold=0.99):
    """
        Creates the fasta and mapping files that contain well assembled contigs
        (filters out misassembled contigs).

        @param inFastaFile: input fasta file with contigs
        @param inTaxonomyWFile: input file that contains taxonomy with weights (seqId, weight, taxonId)
        @param outFastaFile: fasta file containing well assembled sequences
        @param outFastaMisAssembledFile: fasta file containing misassembled contigs
        @param outTaxonomyFile: resulting taxonomy of the well assembled sequences (seqId, taxonId)
        @param weightThreshold: only contigs with a weight of at least this value will be taken
        @return: statistics
    """
    seqIdToTaxonId = csv.predToDict(inTaxonomyWFile)
    seqIdToWeight = csv.getMapping(inTaxonomyWFile, 0, 1, '\t')

    outFastaOk = csv.OutFileBuffer(outFastaFile)
    outFastaMis = csv.OutFileBuffer(outFastaMisAssembledFile)
    outTaxonomyOk = csv.OutFileBuffer(outTaxonomyFile)

    totalBp = 0.0
    totalCount = 0.0
    okBp = 0.0
    okCount = 0.0
    avgSumBp = 0.0
    for seqId, seq in fas.fastaFileToDictWholeNames(inFastaFile).iteritems():
        bp = len(seq)
        totalBp += bp
        totalCount += 1
        seqIdPrefix = str(seqId).split(' ')[0]
        weight = seqIdToWeight[seqIdPrefix][0]
        fastaEntry = '>' + str(seqIdPrefix) + '\n' + str(seq) + '\n'
        if float(weight) >= weightThreshold:
            outFastaOk.writeText(fastaEntry)
            outTaxonomyOk.writeText(str(seqIdPrefix) + '\t' + str(seqIdToTaxonId[seqIdPrefix]) + '\n')
            okBp += bp
            okCount += 1
            avgSumBp += getCoverage(seqId) * bp
        else:
            outFastaMis.writeText(fastaEntry)

    outFastaOk.close()
    outFastaMis.close()
    outTaxonomyOk.close()
    return 'Taken: %s/%sMB, %s/%sseq, %s%% bp %s%% seq, avg coverage %s' % (
        round(okBp / 1000000, 2), round(totalBp / 1000000, 2), okCount, totalCount,
        round((okBp / totalBp) * 100, 2), round((okCount / totalCount) * 100, 2),
        round(avgSumBp / okBp, 3))
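# Usage sketch for toWellMappedContigs (hypothetical file names); the weight
# (second column of the taxonomy file) decides whether a contig counts as well
# assembled, and the returned string summarizes the split:
#
#   stats = toWellMappedContigs('contigs.fna', 'taxonomy_w.tsv',
#                               'contigs_ok.fna', 'contigs_mis.fna',
#                               'taxonomy_ok.tsv', weightThreshold=0.99)
#   print stats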
def filterSequences():
    """
        Filters out sequences with a specific label.
    """
    inFileName = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000.fna'
    outFileName = '/net/metagenomics/projects/PPSmg/data/V35/nostocRemoved/contigsMappedBlast1000NostocRm.fna'
    mapFileName = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000Labels.txt'
    labelRemove = 103690  # Nostoc sp. PCC 7120

    # label -> list of seq ids
    labelToIdsDict = csv.getMapping(mapFileName, 1, 0, sep='\t', comment='#')

    # collect the ids of all sequences whose label differs from the label to be removed
    allowedNamesSet = set()
    for label in labelToIdsDict:
        if int(label) != int(labelRemove):
            for seqId in labelToIdsDict[label]:
                allowedNamesSet.add(seqId)

    fas.filterOutSequences(inFileName, outFileName, allowedNamesSet)
def samToMap(samFile, accToNcbiFile, outMapFile):
    """
        @param samFile: sam file from an assembler
        @param accToNcbiFile: mapping: accessions -> ncbi taxon ids
        @param outMapFile: output file or directory
    """
    accToNcbi = csv.getMapping(accToNcbiFile, 0, 1, sep='\t')
    contigToAcc = parseSam(samFile)
    out = csv.OutFileBuffer(outMapFile)
    for contigId, acc in contigToAcc.iteritems():
        taxonId = accToNcbi.get(acc, None)
        if taxonId is None:
            print("No mapping for %s %s" % (contigId, acc))
        else:
            out.writeText(contigId + '\t' + taxonId[0] + '\n')
    out.close()
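# Usage sketch for samToMap (hypothetical file names); parseSam is assumed to
# return a dict: contigId -> accession, which is then joined with the
# accession -> taxonId table:
#
#   samToMap('assembly.sam', 'acc_to_taxon_id.tsv', 'contig_to_taxon_id.tsv')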
def getFirstLabelAtAllowedRank():
    rank = 'species'  # !!!!!!!
    predFile1 = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000Labels.txt'
    predFile2 = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000LabelsSpecies.txt'
    seqIdToLabel = csv.getMapping(predFile1, 0, 1, sep='\t', comment='#')
    outPred = csv.OutFileBuffer(predFile2)

    taxonomy = tax.TaxonomyNcbi('/net/metagenomics/projects/PPSmg/data/nobackup/NCBI20120828/ncbiTax/ncbitax_sqlite.db')

    # move each label up the taxonomy until an allowed rank is reached
    for seqId in seqIdToLabel:
        ncbid = int(seqIdToLabel[seqId][0])
        while not taxonomy.isRankNcbidAllowed(ncbid):
            ncbid = taxonomy.getParentNcbid(ncbid)
        outPred.writeText(str(seqId + '\t' + str(ncbid) + '\n'))

    taxonomy.close()
    outPred.close()
def _main():
    # define arguments
    parser = argparse.ArgumentParser(description='Default task: PPS+ evaluation', epilog='')

    parser.add_argument('-b', '--cont-binning-file', nargs=1, type=file, required=True,
                        help='Binning file containing labels assigned to contigs.',
                        metavar='assignments.csv', dest='b')

    parser.add_argument('-t', '--cont-true-binning-file', nargs=1, type=file, required=True,
                        help='Binning file containing true labels for the contigs.',
                        metavar='labels.csv', dest='t')

    parser.add_argument('-f', '--cont-contigs-file-listing', nargs=1, type=file, required=False,
                        help='A list of paths of FASTA contigs files.',
                        metavar='fasta_listing.txt', dest='f')

    parser.add_argument('-m', '--cont-scaffold-contig-mapping', nargs=1, type=file, required=False,
                        help='Scaffold contig mapping, tab separated.',
                        metavar='mapping.csv', dest='m')

    parser.add_argument('-n', '--cont-ncbi-taxonomy', nargs=1, required=False,
                        help='Directory containing the NCBI names.dmp and nodes.dmp files.',
                        metavar='taxonomy_dir', dest='n')

    parser.add_argument('-o', '--cont-output-dir', nargs=1, required=True,
                        help='Output directory.', metavar='output_dir', dest='o')

    parser.add_argument('-j', '--default-job', nargs='+',
                        help='What task/job should be performed (p~precision/recall, s~scaff-contig consistency, '
                             'c~confusion tables, default - if not spec compute all)',
                        metavar='', dest='j')

    args = parser.parse_args()

    # read and check the arguments
    seqIdToBp = None
    scaffToContig = None
    binning = None
    trueBinning = None
    outputDir = None
    job = None

    if args.o and len(args.o) == 1 and os.path.isdir(args.o[0]):
        outputDir = args.o[0]

    if args.b and len(args.b) == 1 and os.path.isfile(args.b[0].name):
        binningFile = args.b[0].name
        binning = cami.readAssignments(binningFile)

    if args.t and len(args.t) == 1 and os.path.isfile(args.t[0].name):
        trueBinningFile = args.t[0].name
        trueBinning = cami.readAssignments(trueBinningFile)

    if args.f and len(args.f) == 1 and os.path.isfile(args.f[0].name):
        seqIdToBp = fasta.getSequenceToBpDict(args.f[0].name)
        # contigsFileListing = args.f[0].name
        # for line in open(contigsFileListing):
        #     if os.path.isfile(line.strip()):
        #         d = fasta.getSequenceToBpDict(line.strip())
        #         if seqIdToBp is None:
        #             seqIdToBp = d
        #         else:
        #             count = len(d) + len(seqIdToBp)
        #             seqIdToBp.update(d)
        #             if count > len(seqIdToBp):
        #                 sys.stderr.write('The fasta files contain duplicate entries!')

    if args.m and len(args.m) == 1 and os.path.isfile(args.m[0].name):
        scaffoldContigMapping = args.m[0].name
        scaffToContig = csv.getMapping(scaffoldContigMapping, 0, 1, '\t')

    taxonomyPath = os.path.join(outputDir, 'taxonomy_ncbi.db')
    if not os.path.isfile(taxonomyPath):
        if args.n and len(args.n) == 1 and os.path.isdir(args.n[0]):
            # build the ncbi taxonomy in the case it doesn't exist
            ncbitax2sqlite.build_database(Args(db=taxonomyPath, dmp=args.n[0]))
        else:
            taxonomyPath = None

    if args.j and len(args.j) > 0 and len(set(args.j).intersection(set(['p', 's', 'c']))) > 0:
        job = set(args.j)

    # compute precision/recall
    if (job is None or 'p' in args.j) and seqIdToBp and binning and trueBinning and taxonomyPath and outputDir:
        print('Computing precision/recall')
        # precision/recall - no correction
        acc = accuracy.Accuracy(seqIdToBp, binning, trueBinning, taxonomyPath)
        out = csv.OutFileBuffer(os.path.join(outputDir, 'precision_recall.csv'))
        out.writeText(acc.getAccuracyPrint(RANKS, MIN_FRAC_CLADE, MIN_FRAC_CLADE))
        out.close()
        acc.close()

        # precision/recall - with correction
        acc = accuracy.Accuracy(seqIdToBp, binning, trueBinning, taxonomyPath, CORRECT_LABEL_THRESHOLD)
        out = csv.OutFileBuffer(os.path.join(outputDir, 'precision_recall_correction.csv'))
        out.writeText(acc.getAccuracyPrint(RANKS, MIN_FRAC_CLADE, MIN_FRAC_CLADE))
        out.close()
        acc.close()

    # compute confusion matrices
    if (job is None or 'c' in args.j) and seqIdToBp and binning and trueBinning and taxonomyPath and outputDir:
        print('Computing confusion matrices')
        confusionMatrix = confusion_matrix.ConfusionMatrix(seqIdToBp, binning, trueBinning, taxonomyPath, RANKS)
        for rank in RANKS:
            confusionMatrix.generateConfusionMatrix(rank, os.path.join(outputDir, 'confusion_matrix'))
        confusionMatrix.close()

    # compute scaffold contig consistency
    if (job is None or 's' in args.j) and seqIdToBp and binning and scaffToContig and taxonomyPath \
            and outputDir:
        print('Computing scaffold-contig consistency')
        cons = consistency.Consistency(seqIdToBp, binning, scaffToContig, taxonomyPath)
        out = csv.OutFileBuffer(os.path.join(outputDir, 'consistency.txt'))
        out.writeText(cons.getGroupedScaffoldsPrint())
        cons.close()
        out.close()

    createEvalMetaFile(outputDir)
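# Example invocation (hypothetical paths; the script name is assumed), using
# the flags defined above. Without "-j", all three evaluation tasks are run:
#
#   python evaluation.py -b assignments.csv -t labels.csv -f contigs.fna \
#       -n /path/to/ncbi_dmp_dir -o /path/to/output_dir -j p c s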
def _init(self, align=True, dm=True, cluster=True):
    """
        Init data, compute: alignment, distance matrix, clusters.
    """
    if self._initDone:
        return
    self._initDone = True

    fastaPathList = []  # fasta files containing regions that correspond to particular marker genes
    self._mgList = []  # list of names of marker genes
    mgToFastaPath = dict([])  # marker gene name -> fasta file path

    # collect regions from Amphora mg
    for fastaFile in glob.glob(os.path.join(os.path.normpath(self._mgWorkingDir), '*.gff')):
        fastaPathList.append(fastaFile)
    for path in fastaPathList:
        name = re.sub('([^\.]+)\..*$', r'\1', os.path.basename(path))
        mg = re.sub(r'([^_]+)_dna', r'\1', name)
        dir = os.path.dirname(path)
        self._mgList.append(mg)
        mgToFastaPath[mg] = path

    # add 16S
    s16List = ['5S_rRNA', '16S_rRNA', '23S_rRNA']
    for mg in s16List:
        mgToFastaPath[mg] = str(self._s16Prefix + '.' + mg + '.fna')
        self._mgList.append(mg)

    # for each marker gene create a filtered fasta file that contains for each mg and sequence at most one region
    mgToFilteredFastaPath = dict([])
    mgToSeqNameToTaxPathDict = dict([])  # mg -> seqName (~region name) -> pred
    for mg in self._mgList:
        mgToSeqNameToTaxPathDict[mg] = dict([])

    for seq in self._sequences.sequences:
        id = str(str(seq.scaffold.id) + '_' + str(seq.id))
        for mg, tag, pred in zip(seq.getCandidateTaxPathSourceList(),
                                 seq.getCandidateTaxPathTagList(),
                                 seq.getCandidateTaxPathDictList()):
            mgToSeqNameToTaxPathDict[mg][str(id + '_' + tag)] = pred

    # for each marker gene: choose only one sequence region for each mg and sequence
    # (all sequences are predicted at least at superkingdom)
    for mg in self._mgList:
        seqNameToPred = mgToSeqNameToTaxPathDict[mg]  # sequence region predictions for this mg
        seqNameToSeq = fastaFileToDict(mgToFastaPath[mg])  # read the fasta file
        outPath = os.path.normpath(os.path.join(self._clustDir, str(mg + '.filter.fna')))
        mgToFilteredFastaPath[mg] = outPath
        out = OutFileBuffer(outPath)

        seqBaseToSeqName = dict([])  # sequence base (scaffId_seqId) -> region name
        for seqName in seqNameToSeq:
            seqBase = re.sub(r'^([0-9]+_[0-9]+)[^0-9].*', r'\1', seqName)
            if seqBase not in seqBaseToSeqName:
                seqBaseToSeqName[seqBase] = []
            seqBaseToSeqName[seqBase].append(seqName)

        for seqBase in seqBaseToSeqName:
            seqId = int(re.sub(r'^[0-9]+_([0-9]+)', r'\1', seqBase))
            seqBaseTaxPathDict = self._sequences.getSequence(seqId).getTaxonomyPath()
            list = seqBaseToSeqName[seqBase]

            candidateSeq = []  # sequence region is predicted at least at rank superkingdom
            for seqName in list:
                if seqName not in seqNameToPred:
                    taxPathDict = None
                else:
                    taxPathDict = seqNameToPred[seqName]
                if taxPathDict != None:
                    candidateSeq.append(seqName)
            if len(candidateSeq) == 0:
                continue

            candidateSeq2 = []  # sequence regions predicted at least at the same rank as the whole sequence
            for seqName in candidateSeq:
                taxPathDict = seqNameToPred[seqName]
                if ((seqBaseTaxPathDict == None)
                        or (len(taxPathDict) >= len(seqBaseTaxPathDict))):  # predicted at least at the same level
                    candidateSeq2.append(seqName)

            if len(candidateSeq2) > 0:  # take the longest sequence
                sMax = candidateSeq2[0]
                for s in candidateSeq2[1:]:
                    if len(seqNameToSeq[s]) > len(seqNameToSeq[sMax]):
                        sMax = s
            else:  # all sequence regions are predicted higher than the sequence
                sMax = candidateSeq[0]  # sequence region with the most specific prediction
                for s in candidateSeq[1:]:
                    taxPathDictMax = seqNameToPred[sMax]
                    taxPathDictS = seqNameToPred[s]
                    if taxPathDictS == None:
                        continue
                    if taxPathDictMax == None:
                        sMax = s
                        continue
                    if len(taxPathDictMax) < len(taxPathDictS):
                        sMax = s

                candidateSeq3 = []  # get all sequence regions with the most specific prediction
                taxPathDictMax = seqNameToPred[sMax]
                for s in candidateSeq:
                    taxPathDictS = seqNameToPred[s]
                    if taxPathDictMax == None:
                        candidateSeq3.append(s)
                    elif len(taxPathDictS) == len(taxPathDictMax):
                        candidateSeq3.append(s)

                sMax = candidateSeq3[0]
                for s in candidateSeq3[1:]:  # take the longest sequence
                    if len(seqNameToSeq[sMax]) < len(seqNameToSeq[s]):
                        sMax = s

            out.writeText(str('>' + str(sMax) + '\n' + str(seqNameToSeq[sMax]) + '\n'))

        out.close()

    mgToAlignPath = dict([])
    for mg in self._mgList:
        mgToAlignPath[mg] = os.path.normpath(os.path.join(self._clustDir, str(mg + '.align.fna')))

    # build alignment
    if align:
        for mg in self._mgList:
            alignCmd = str(self._config.get('aligner') + ' -in ' + mgToFilteredFastaPath[mg]
                           + ' -out ' + mgToAlignPath[mg] + ' -quiet')
            assert os.name == 'posix'
            predictProc = subprocess.Popen(alignCmd, cwd=self._mgWorkingDir, shell=True, bufsize=-1)
            # stdout=subprocess.STDOUT, stderr=subprocess.STDOUT)
            predictProc.wait()
            print 'Muscle return code for', mg, ':', predictProc.returncode
            if predictProc.returncode != 0:
                sys.stderr.write(str(alignCmd + ' \n'))

    # compute DM
    if dm:
        for mg in self._mgList:
            mothur = os.path.join(os.path.normpath(self._configRRNA16S.get('mothurInstallDir')), 'mothur')
            mothurCmd = str('time ' + mothur + ' "#dist.seqs(fasta=' + mgToAlignPath[mg]
                            + ', processors=2, countends=F, calc=nogaps, cutoff=0.3, output=lt)"')
            assert os.name == 'posix'
            mothurProc = subprocess.Popen(mothurCmd, shell=True, bufsize=-1, cwd=self._mgWorkingDir)
            mothurProc.wait()
            print 'Mothur return code dist:', mg, mothurProc.returncode
            # distFilePath = os.path.join(os.path.dirname(mgToAlignPath[mg]), str(mg + '.align.phylip.dist'))
            # self._mgToDM[mg] = forEachLine(distFilePath, DM())
            # self._mgToDM[mg].printDM()

    # cluster
    if cluster:
        for mg in self._mgList:
            distFilePath = os.path.join(os.path.dirname(mgToAlignPath[mg]), str(mg + '.align.phylip.dist'))
            mothur = os.path.join(os.path.normpath(self._configRRNA16S.get('mothurInstallDir')), 'mothur')
            mothurCmd = str('time ' + mothur + ' "#cluster(phylip=' + distFilePath
                            + ', method=furthest, hard=t, precision=1000)"')
            assert os.name == 'posix'
            mothurProc = subprocess.Popen(mothurCmd, shell=True, bufsize=-1, cwd=self._mgWorkingDir)
            mothurProc.wait()
            print 'Mothur return code cluster:', mg, mothurProc.returncode

    # read DM and clusters

    # sequence predictions
    self._seqIdToTaxPathDict = dict([])
    self._seqIdToWeight = dict([])
    for seq in self._sequences.sequences:
        id = int(seq.id)
        self._seqIdToTaxPathDict[id] = seq.getTaxonomyPath()
        self._seqIdToWeight[id] = seq.getTaxonomyPathWeight()

    # similarity thresholds
    thresholds = self._configMG.get('mgSimilarityThresholds')
    self._mgToMaxThreshold = dict([])
    tmpDict = getMapping(self._configMG.get('mgSimilarityThresholds'), 0, 1, sep='\t', comment='#')
    for k in tmpDict:
        self._mgToMaxThreshold[k] = float(tmpDict[k][0])

    self._mgToDM = dict([])
    self._mgToCluster = dict([])
    for mg in self._mgList:
        file = os.path.join(os.path.dirname(mgToAlignPath[mg]), str(mg + '.align.phylip.dist'))
        self._mgToDM[mg] = forEachLine(file, DM())
        file = os.path.join(os.path.dirname(mgToAlignPath[mg]), str(mg + '.align.phylip.fn.list'))
        self._mgToCluster[mg] = forEachLine(file, MCluster(self._seqIdToTaxPathDict, self._mgToMaxThreshold[mg]))
def _main(): """ See the module description.""" parser = argparse.ArgumentParser(description=__doc__, epilog="""""") parser.add_argument('-i', '--input-data-dir', action='store', nargs=1, required=True, help="""Directory that contains fasta files and corresponding mapping files, for each "*.tax" (or "*.csv") file there must be a "*.fna" file with the same name. All files with suffix "tax" (or "*.csv") will be considered. (Takes only Bacteria and Archaea)""", metavar='input_dir', dest='inDir') parser.add_argument('-o', '--output-dir', action='store', nargs=1, required=True, help='Directory that contains the output files.', metavar='out_dir', dest='outDir') parser.add_argument('-s', '--source-type', required=True, nargs=1, choices=["s","a"], help='To determine the source, use "s" for the Silva database and "a" for the Amphora database.', dest='srcType') parser.add_argument('-t', '--taxonomy-file', nargs=1, type=file, required=True, help='NCBI taxonomy database file in the sqlite3 format.', metavar='ncbitax_sqlite.db', dest='taxonomy') parser.add_argument('-n', '--not-considered-taxonIds', action='store', nargs=1, help='Comma separated leaf level or top level taxonIds (as a string) what fill be filtered out. (optional)', metavar='"2759,10239,77133,155900,408172,32644, 408170,433727,749907,556182,702656,410661,652676,410659,797283'\ ',408171,703336,256318,32630,433724,766747,488339,942017,1076179,717931,455559,527640,904678,552539,'\ '54395,198431,358574,415540,511564,369433,380357,81726,198834,271928,311313,2759,749906,1077529,'\ '1077529,361146,511563,361147"', dest='filterOut') # parse arguments args = parser.parse_args() inDir = args.inDir[0] outDir = args.outDir[0] srcType = args.srcType[0] filterOutTaxonIdsSet = set() try: if args.filterOut: filterOutTaxonIdsSet.update(set(map(int, str(args.filterOut[0]).split(',')))) except: print('Taxon ids that are to be filtered out are in a wrong format! Comma separated integers are needed!') raise taxonomy = TaxonomyWrap(args.taxonomy[0].name) for dir in [inDir, outDir]: assert os.path.isdir(dir), 'Path: "' + dir + '" does not exists!' # create db for each gene mapDict = {} # map: seqId -> ncbid for mapFilePath in glob.glob(os.path.join(os.path.normpath(inDir), r'*.[ct][sa][vx]')): # *.csv or *.tax assert mapFilePath.endswith(('.csv', '.tax')), \ 'The mapping files can either end with .csv or .tax ' + mapFilePath base = os.path.basename(mapFilePath).rsplit('.', 1)[0] # cut out dir path and suffix fastaDict = fas.fastaFileToDict(os.path.join(os.path.dirname(mapFilePath), (base + '.fna'))) # map: seqId -> seq print("Processing: %s seq count: %s" % (base, str(len(fastaDict)))) if 'a' in srcType: # Amphora mapDict = {} for k in csv.getColumnAsList(mapFilePath, colNum=0, sep='\t'): v = int(k.rsplit('|', 1)[1].split(':')[1]) # get ncbid assert ((k not in mapDict) or (mapDict[k] == v)), str( 'There are at least two different values for key: ' + str(k) + ' in ' + mapFilePath) mapDict[k] = v elif 's' in srcType: # Silva mapTmp = csv.getMapping(mapFilePath, 0, 2, '\t') mapDict = {} for k, v in mapTmp.iteritems(): mapDict[k] = int(v[0]) else: assert False, 'Unsupported source type!' # same number of entries in both files (fasta and mapping) ? if len(mapDict) != len(fastaDict): print(str('%s: The mapping file and the corresponding fasta file have different number of entries: ' + '"%s" "%s" these files will be skipped!') % (base, str(len(mapDict)), str(len(fastaDict)))) continue # are duplicates in the mapping file ? 
count = len(csv.getColumnAsList(mapFilePath)) if len(mapDict) != count: print('%s: The mapping file contained duplicates! unique: %s non-unique: %s' % ( base, str(len(mapDict)), str(count))) # store data to the output directory outDna = csv.OutFileBuffer(os.path.join(outDir, str(base + '.fna'))) outTax = csv.OutFileBuffer(os.path.join(outDir, str(base + '.tax'))) count = 0 filteredLeaf = 0 filteredSup = 0 notMapped = 0 noBacArch = 0 for seqId, taxonId in mapDict.iteritems(): if taxonId in filterOutTaxonIdsSet: filteredLeaf += 1 continue path = taxonomy.getPathToRoot(taxonId) if path is None: print('Could not find: %s for seqId: %s record skipped!' % (str(taxonId), seqId)) notMapped += 1 continue topLevel = int(path.split(';', 1)[0]) if topLevel in filterOutTaxonIdsSet: filteredSup += 1 continue if topLevel not in [2, 2157]: # Bacteria, Archaea noBacArch += 1 print('NoBactArch: ', topLevel) seq = fastaDict[seqId] if 'a' in srcType: # Amphora id = seqId elif 's' in srcType: # Silva id = str(seqId + '|ncbid:' + str(taxonId)) outTax.writeText(str(id + '\t' + path + '\n')) outDna.writeText(str('>' + id + '\n' + seq + '\n')) count += 1 outDna.close() outTax.close() print('Stored entries: %s filtered out: %s leaf, %s top level, not mapped: %s' % (count, filteredLeaf, filteredSup, notMapped)) if noBacArch > 0: print('WARN: stored %s of non Bacterial and non Archaeal sequences: ' % (noBacArch)) # Silva: #-i /Users/ivan/Documents/work/binning/database/silva111/arbGenerated -s s -t /Users/ivan/Documents/work/binning/taxonomy/20121122/ncbitax_sqlite.db # -o /Users/ivan/Documents/work/binning/database/silva111/db -n ... # Amphora # -i /Users/ivan/Documents/work/binning/database/markerGenes3/mGenesExtracted -s a -t /Users/ivan/Documents/work/binning/taxonomy/20121122/ncbitax_sqlite.db # -o /Users/ivan/Documents/work/binning/database/markerGenes3/db taxonomy.close() print 'done'
def main(): """ Wraps pIRS read simulator to simulate Illumina paired end reads. Sample config: /Users/ivan/Documents/work/binning/data/V35/simMetagenome/configMetagenome01.cfg """ if os.name != 'posix': print 'runs only on posix systems' return #parse arguments parser = argparse.ArgumentParser( description= '''A simple Metagenome Illumina read simulator that wraps pIRS''', epilog='''''') parser.add_argument('-c', '--config', nargs=1, type=file, required=True, help='configuration file of the simulator', metavar='configMetagenome.cfg', dest='config') parser.add_argument( '-p', '--pIRS-param', action='store', nargs='+', help='parameters of the pIRS simulator, e.g. "-Q 64 -E 1"', dest='p') args = parser.parse_args() config = Config(args.config[0], 'Sim') pirsParam = '' if args.p: pirsParam = args.p[0] #reads configuration workingDir = config.get('workingDir') referenceSeq = config.get('referenceSeq') frequenciesInfo = config.get('frequenciesInfo') coverageFrequencyMultiplier = float( config.get('coverageFrequencyMultiplier')) pirsInstallDir = config.get('pirsInstallDir') insertSizeMean = int(config.get('insertSizeMean')) insertSizeSd = int(config.get('insertSizeSd')) readLength = int(config.get('readLength')) #check whether the pIRS optional parameters doesn`t contain those predefined elsewhere (e.g. in the config) if (string.count(pirsParam, '-m') != 0 or string.count(pirsParam, '-v') != 0 or string.count(pirsParam, '-l') != 0 or string.count(pirsParam, '-x') != 0 or string.count(pirsParam, '-i') != 0 or string.count(pirsParam, '-o') != 0): print 'pIRS parameters -m -v -l (-x) must be set in the configuration file, parameters -i -o cannot be set ' return #check working directory, create temporary directory tmpDir = os.path.join(workingDir, 'tmp') if not os.path.isdir(workingDir): print str('The working directory does not exists, create it! 
(' + str(workingDir) + ')') return if not os.path.isdir(tmpDir): os.mkdir(tmpDir) seqNameToSeq = fastaFileToDict(referenceSeq) seqNameToFreq = getMapping(frequenciesInfo, 0, 1, sep='\t', comment='#') outReads1Merged = OutFileBuffer(os.path.join(workingDir, 'reads_1.fq')) outReads2Merged = OutFileBuffer(os.path.join(workingDir, 'reads_2.fq')) for seqName in seqNameToFreq: seq = seqNameToSeq[seqName] coverage = float( seqNameToFreq[seqName][0]) * coverageFrequencyMultiplier fastaFile = os.path.join(tmpDir, str(seqName + '.fna')) outBuffer = OutFileBuffer(fastaFile) outBuffer.writeText(str('>' + seqName + '\n' + seq + '\n')) outBuffer.close() cmd = str( os.path.join(pirsInstallDir, 'pirs') + ' simulate -i ' + fastaFile + ' -x ' + str(coverage) + ' -m ' + str(insertSizeMean) + ' -v ' + str(insertSizeSd) + ' -l ' + str(readLength) + ' -o ' + seqName + ' ' + pirsParam) #print cmd proc = subprocess.Popen( cmd, shell=True, bufsize=-1, cwd=tmpDir) # stdout=subprocess.STDOUT, stderr=subprocess.STDOUT) proc.wait() if proc.returncode != 0: sys.stderr.write(str('command failed: ' + cmd)) #append generated reads to the merged files reads1 = gzip.open( os.path.join( tmpDir, str(seqName + '_' + str(readLength) + '_' + str(insertSizeMean) + '_1.fq.gz')), 'rb') file1Content = reads1.read() outReads1Merged.writeText( str( file1Content.replace('@read_', str('@read_' + seqName + '_')) + '\n')) reads1.close() reads2 = gzip.open( os.path.join( tmpDir, str(seqName + '_' + str(readLength) + '_' + str(insertSizeMean) + '_2.fq.gz')), 'rb') file2Content = reads2.read() outReads2Merged.writeText( str( file2Content.replace('@read_', str('@read_' + seqName + '_')) + '\n')) reads2.close() outReads1Merged.close() outReads2Merged.close()
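# A minimal sketch of the simulator configuration read above (the section name
# and keys are taken from the code; the INI-style layout and all values are
# assumptions):
#
#   [Sim]
#   workingDir=/tmp/sim
#   referenceSeq=/tmp/sim/reference.fna
#   frequenciesInfo=/tmp/sim/frequencies.tsv
#   coverageFrequencyMultiplier=100.0
#   pirsInstallDir=/opt/pirs
#   insertSizeMean=300
#   insertSizeSd=30
#   readLength=100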
def __init__(self, contigNameToBp, contigNameToNcbid, scaffToContigList, taxonomy,
             minScaffContigCount=None, minScaffBpLen=None, cladesSet=None,
             considerContigWithNoScaff=True, ignoreScaffPredToRoot=True):
    """
        Initializes the main Consistency class.

        @param contigNameToBp: dictionary that maps contig names to bp (int);
            or a fasta file that contains the contigs
        @param contigNameToNcbid: dictionary that maps contig names to ncbids (int);
            or a prediction file - first column contig name, last column ncbid
        @param scaffToContigList: dictionary that maps scaffold names to lists of contig names;
            or a file - first column scaffold name, second column contig name
        @param minScaffContigCount: consider only scaffolds that contain at least this number of contigs
        @param minScaffBpLen: consider only scaffolds with at least this collective length (in bp)
        @param cladesSet: consider only scaffolds that contain at least one contig from this set
        @param considerContigWithNoScaff: consider also contigs that are not assigned to scaffolds
            (as artificial scaffolds)
        @param ignoreScaffPredToRoot: ignore scaffolds that are assigned based on the root (uninformative)
    """
    # check input options
    assert minScaffContigCount is None or isinstance(minScaffContigCount, int)
    assert minScaffBpLen is None or isinstance(minScaffBpLen, int)
    assert cladesSet is None or isinstance(cladesSet, set)
    assert isinstance(considerContigWithNoScaff, bool)
    assert isinstance(ignoreScaffPredToRoot, bool)

    if isinstance(contigNameToBp, dict):
        self._contigNameToBp = contigNameToBp
    elif isinstance(contigNameToBp, str) and os.path.isfile(contigNameToBp):
        self._contigNameToBp = getSequenceToBpDict(contigNameToBp)
    else:
        print("Can't get contig info from: ", contigNameToBp)
        return
    if isinstance(contigNameToNcbid, dict):
        self._contigToPred = contigNameToNcbid
    elif isinstance(contigNameToNcbid, str) and os.path.isfile(contigNameToNcbid):
        self._contigToPred = cami.readAssignments(contigNameToNcbid)
    else:
        print("Can't get prediction info from: ", contigNameToNcbid)
        return
    if isinstance(scaffToContigList, dict):
        self._scaffToContigsList = scaffToContigList
    elif isinstance(scaffToContigList, str) and os.path.isfile(scaffToContigList):
        self._scaffToContigsList = getMapping(scaffToContigList, 0, 1, '\t')
    else:
        print("Can't get scaffold contig mapping from: ", scaffToContigList)
        return

    if isinstance(taxonomy, _TaxonomyWrapper) and (not taxonomy.isClosed()):
        self._taxonomy = taxonomy
    elif isinstance(taxonomy, str) and os.path.isfile(taxonomy):
        self._taxonomy = _TaxonomyWrapper(taxonomy)
    else:
        print("Can't use taxonomy:", taxonomy)
        return

    # check the consistency of the data!
    # if a contig that is defined in the mapping doesn't exist (in the fasta file), remove it
    for scaff, contigsList in self._scaffToContigsList.iteritems():
        removeList = []
        for contig in contigsList:
            if contig not in self._contigNameToBp:
                removeList.append(contig)
        for contig in removeList:
            contigsList.remove(contig)

    # if a contig was predicted but there is no scaffold assigned to it, then this
    # contig is assigned to an "artificial scaffold"
    if considerContigWithNoScaff:
        scaffContigSet = set()
        for s, l in self._scaffToContigsList.iteritems():
            for c in l:
                scaffContigSet.add(c)
        aloneContigSet = set()
        for c in self._contigToPred:
            if c not in scaffContigSet:
                aloneContigSet.add(c)
        for c in aloneContigSet:
            scaffName = str('scaffold_' + c)  # make up a scaffold name
            assert scaffName not in self._scaffToContigsList, 'The names of contigs are ambiguous!'
            self._scaffToContigsList[scaffName] = [c]

    # filter out scaffolds according to the input constraints
    self._scaffolds = dict()
    for scaffName, contigsList in self._scaffToContigsList.iteritems():
        if minScaffContigCount is not None:
            if len(contigsList) < minScaffContigCount:
                continue
        if minScaffBpLen is not None:
            sum = 0
            for contig in contigsList:
                sum += self._contigNameToBp[contig]
            if sum < minScaffBpLen:
                continue
        if cladesSet is not None:
            passScaff = False
            for contig in contigsList:
                if (contig in self._contigToPred) and (self._contigToPred[contig] in cladesSet):
                    passScaff = True
                    break
            if not passScaff:
                continue

        # process the scaffold, but if everything in the scaffold was assigned to the root, ignore it!
        s = self._processScaffold(scaffName)
        if not ((s.getNcbid() == 1) and ignoreScaffPredToRoot):
            self._scaffolds[scaffName] = s
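# Usage sketch (hypothetical file names): each of the four data arguments may
# be passed either as an in-memory structure or as a file path, as described
# in the docstring above:
#
#   cons = Consistency('contigs.fna',           # or dict: contigName -> bp
#                      'assignments.csv',       # or dict: contigName -> ncbid
#                      'scaff_contig_map.tsv',  # or dict: scaffName -> [contigName]
#                      'ncbitax_sqlite.db',     # or an open _TaxonomyWrapper
#                      minScaffContigCount=2, minScaffBpLen=10000)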
def mergeSequences(mapFilePathList, fastaFilePathList, outputDir): """ Reads all sequences. For each taxonId creates a file that contain all sequences mapped to this taxonId. If a seqId appears more than one it is ignored since acession numbers are unique. @param mapFilePathList: list of files where each contain mapping: seqId -> taxonId @param fastaFilePathList: list of fasta files that contain mapping: seqId -> seq """ taxonIdToOutBuffer = {} seqIdSet = set() totalSeqCount = 0 totalStoredSeqCount = 0 totalIdenticalSeqCount = 0 for mapFilePath, fastaFilePath in zip(mapFilePathList, fastaFilePathList): print 'processing', mapFilePath, fastaFilePath seqCount = 0 storedSeqCount = 0 seqIdToSeq = fasta.fastaFileToDict(fastaFilePath) seqIdToNcbidList = csv.getMapping(mapFilePath, 0, 1, sep='\t', comment='#') for seqId, seq in seqIdToSeq.iteritems(): seqCount += 1 if seqId in seqIdSet: totalIdenticalSeqCount += 1 continue else: seqIdSet.add(seqId) taxonId = seqIdToNcbidList[seqId][0] if taxonId not in taxonIdToOutBuffer: outBuffer = csv.OutFileBuffer( os.path.join(outputDir, str(str(taxonId) + '.fna'))) taxonIdToOutBuffer[taxonId] = outBuffer taxonIdToOutBuffer[taxonId].writeText( str('>' + seqId + '\n' + seq + '\n')) taxonIdToOutBuffer[taxonId].close() storedSeqCount += 1 if len(string.replace(common.noNewLine(seq), 'N', '')) == 0: print 'zeros', seqId, fastaFilePath, len(common.noNewLine(seq)) # for buff in taxonIdToOutBuffer.values(): # buff.close() print 'totalSeq, storedSeq', seqCount, storedSeqCount totalSeqCount += seqCount totalStoredSeqCount += storedSeqCount print 'totalSeqCount, totalStoredSeqCount, totalIdenticalSeqCount', totalSeqCount, totalStoredSeqCount, totalIdenticalSeqCount print 'sequences merged'
def _main():
    """ See the module description."""
    parser = argparse.ArgumentParser(description=__doc__, epilog="""""")

    parser.add_argument('-i', '--input-data-dir', action='store', nargs=1, required=True,
        help="""Directory that contains fasta files and corresponding mapping files; for each
            "*.tax" (or "*.csv") file there must be a "*.fna" file with the same name. All files
            with suffix "tax" (or "*.csv") will be considered. (Takes only Bacteria and Archaea)""",
        metavar='input_dir', dest='inDir')

    parser.add_argument('-o', '--output-dir', action='store', nargs=1, required=True,
        help='Directory that contains the output files.',
        metavar='out_dir', dest='outDir')

    parser.add_argument('-s', '--source-type', required=True, nargs=1, choices=["s", "a"],
        help='To determine the source, use "s" for the Silva database and "a" for the Amphora database.',
        dest='srcType')

    parser.add_argument('-t', '--taxonomy-file', nargs=1, type=file, required=True,
        help='NCBI taxonomy database file in the sqlite3 format.',
        metavar='ncbitax_sqlite.db', dest='taxonomy')

    parser.add_argument('-n', '--not-considered-taxonIds', action='store', nargs=1,
        help='Comma separated leaf level or top level taxonIds (as a string) that will be filtered out. (optional)',
        metavar='"2759,10239,77133,155900,408172,32644, 408170,433727,749907,556182,702656,410661,652676,410659,797283'
                ',408171,703336,256318,32630,433724,766747,488339,942017,1076179,717931,455559,527640,904678,552539,'
                '54395,198431,358574,415540,511564,369433,380357,81726,198834,271928,311313,2759,749906,1077529,'
                '1077529,361146,511563,361147"',
        dest='filterOut')

    # parse arguments
    args = parser.parse_args()
    inDir = args.inDir[0]
    outDir = args.outDir[0]
    srcType = args.srcType[0]
    filterOutTaxonIdsSet = set()
    try:
        if args.filterOut:
            filterOutTaxonIdsSet.update(set(map(int, str(args.filterOut[0]).split(','))))
    except ValueError:
        print('Taxon ids to be filtered out are in a wrong format! Comma separated integers are needed!')
        raise

    taxonomy = _TaxonomyWrap(args.taxonomy[0].name)
    for dir in [inDir, outDir]:
        assert os.path.isdir(dir), 'Path: "' + dir + '" does not exist!'

    # create db for each gene
    mapDict = {}  # map: seqId -> ncbid
    for mapFilePath in glob.glob(os.path.join(os.path.normpath(inDir), r'*.[ct][sa][vx]')):  # *.csv or *.tax

        assert mapFilePath.endswith(('.csv', '.tax')), \
            'The mapping files can either end with .csv or .tax ' + mapFilePath

        base = os.path.basename(mapFilePath).rsplit('.', 1)[0]  # cut out dir path and suffix
        fastaDict = fas.fastaFileToDict(os.path.join(os.path.dirname(mapFilePath), (base + '.fna')))  # map: seqId -> seq
        print("Processing: %s seq count: %s" % (base, str(len(fastaDict))))

        if 'a' in srcType:  # Amphora
            mapDict = {}
            for k in csv.getColumnAsList(mapFilePath, colNum=0, sep='\t'):
                v = int(k.rsplit('|', 1)[1].split(':')[1])  # get ncbid
                assert ((k not in mapDict) or (mapDict[k] == v)), str(
                    'There are at least two different values for key: ' + str(k) + ' in ' + mapFilePath)
                mapDict[k] = v
        elif 's' in srcType:  # Silva
            mapTmp = csv.getMapping(mapFilePath, 0, 2, '\t')
            mapDict = {}
            for k, v in mapTmp.iteritems():
                mapDict[k] = int(v[0])
        else:
            assert False, 'Unsupported source type!'

        # same number of entries in both files (fasta and mapping)?
        if len(mapDict) != len(fastaDict):
            print(str('%s: The mapping file and the corresponding fasta file have a different number of entries: ' +
                      '"%s" "%s", these files will be skipped!') % (base, str(len(mapDict)), str(len(fastaDict))))
            continue

        # are there duplicates in the mapping file?
        count = len(csv.getColumnAsList(mapFilePath))
        if len(mapDict) != count:
            print('%s: The mapping file contained duplicates! unique: %s non-unique: %s' % (
                base, str(len(mapDict)), str(count)))

        # store data to the output directory
        outDna = csv.OutFileBuffer(os.path.join(outDir, str(base + '.fna')))
        outTax = csv.OutFileBuffer(os.path.join(outDir, str(base + '.tax')))
        count = 0
        filteredLeaf = 0
        filteredSup = 0
        notMapped = 0
        noBacArch = 0
        for seqId, taxonId in mapDict.iteritems():

            # filter out sequences whose leaf-level taxonId is not considered
            if taxonId in filterOutTaxonIdsSet:
                filteredLeaf += 1
                continue

            path = taxonomy.getPathToRoot(taxonId)
            if path is None:
                print('Could not find: %s for seqId: %s, record skipped!' % (str(taxonId), seqId))
                notMapped += 1
                continue

            # filter out sequences whose top-level taxonId is not considered
            topLevel = int(path.split(';', 1)[0])
            if topLevel in filterOutTaxonIdsSet:
                filteredSup += 1
                continue
            if topLevel not in [2, 2157]:  # Bacteria, Archaea (the sequence is stored anyway, just reported)
                noBacArch += 1
                print('NoBactArch: ', topLevel)

            seq = fastaDict[seqId]
            if 'a' in srcType:  # Amphora: the taxonId is already encoded in the seqId
                id = seqId
            elif 's' in srcType:  # Silva: append the taxonId to the seqId
                id = str(seqId + '|ncbid:' + str(taxonId))

            outTax.writeText(str(id + '\t' + path + '\n'))
            outDna.writeText(str('>' + id + '\n' + seq + '\n'))
            count += 1

        outDna.close()
        outTax.close()
        print('Stored entries: %s, filtered out: %s leaf, %s top level, not mapped: %s' % (
            count, filteredLeaf, filteredSup, notMapped))
        if noBacArch > 0:
            print('WARN: stored %s non-Bacterial and non-Archaeal sequences' % (noBacArch))

    # Silva:
    # -i /Users/ivan/Documents/work/binning/database/silva111/arbGenerated -s s
    # -t /Users/ivan/Documents/work/binning/taxonomy/20121122/ncbitax_sqlite.db
    # -o /Users/ivan/Documents/work/binning/database/silva111/db -n ...

    # Amphora:
    # -i /Users/ivan/Documents/work/binning/database/markerGenes3/mGenesExtracted -s a
    # -t /Users/ivan/Documents/work/binning/taxonomy/20121122/ncbitax_sqlite.db
    # -o /Users/ivan/Documents/work/binning/database/markerGenes3/db

    taxonomy.close()
    print 'done'
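# Illustrative mapping-file lines (made-up identifiers) showing what the two
# parsers in _main expect:
#
#   Amphora ('a'): the ncbid is encoded in the first column itself, after the
#   last '|', e.g.:
#       someSeq|ncbid:562<TAB>...
#   (parsed as int(k.rsplit('|', 1)[1].split(':')[1]) -> 562)
#
#   Silva ('s'): the third tab-separated column holds the taxonId, e.g.:
#       AB001234.1.1500<TAB>description<TAB>562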