def makeAlignmentsOfMultipleHits(self,
                                 outDir,
                                 markerFile,
                                 hmmTableFile,
                                 binIdToModels,
                                 binIdToBinMarkerSets,
                                 bIgnoreThresholds,
                                 evalueThreshold,
                                 lengthThreshold,
                                 alignOutputDir):
    """Align markers with multiple hits within a bin.

    HMM hits are parsed once in this process; the bin ids are then handed
    to `self.totalThreads` worker processes running `self.__createMSA`,
    while one extra process runs `self.__reportBinProgress`.

    Args:
        outDir: passed to `ResultsParser.parseBinHits` and to the workers.
        markerFile: passed unchanged to the workers.
        hmmTableFile: passed to `ResultsParser.parseBinHits`.
        binIdToModels: mapping whose keys are the bin ids to process; also
            used to construct the `ResultsParser`.
        binIdToBinMarkerSets: passed unchanged to the workers.
        bIgnoreThresholds, evalueThreshold, lengthThreshold: forwarded to
            `ResultsParser.parseBinHits`.
        alignOutputDir: directory ensured via `makeSurePathExists` and
            passed to the workers.

    Raises:
        Any exception (including KeyboardInterrupt) raised while the
        processes are being started or joined is re-raised after all live
        child processes have been terminated.
    """
    makeSurePathExists(alignOutputDir)

    # parse HMM information
    resultsParser = ResultsParser(binIdToModels)

    # get HMM hits to each bin
    resultsParser.parseBinHits(outDir,
                               hmmTableFile,
                               False,
                               bIgnoreThresholds,
                               evalueThreshold,
                               lengthThreshold)

    # align any markers with multiple hits in a bin
    self.logger.info(' Aligning marker genes with multiple hits in a single bin:')

    # process each bin in parallel
    workerQueue = mp.Queue()
    writerQueue = mp.Queue()

    for binId in binIdToModels:
        workerQueue.put(binId)

    # one None per worker; presumably consumed by __createMSA as a stop
    # signal -- confirm against that method
    for _ in range(self.totalThreads):
        workerQueue.put(None)

    # Build the process objects outside the try block so the cleanup code
    # below can always refer to them.
    calcProc = [mp.Process(target=self.__createMSA,
                           args=(resultsParser,
                                 binIdToBinMarkerSets,
                                 markerFile,
                                 outDir,
                                 alignOutputDir,
                                 workerQueue,
                                 writerQueue))
                for _ in range(self.totalThreads)]
    writeProc = mp.Process(target=self.__reportBinProgress,
                           args=(len(binIdToModels), writerQueue))

    try:
        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        # tell the progress reporter that no more results are coming
        writerQueue.put(None)
        writeProc.join()
    except BaseException:
        # make sure all processes are terminated; only live processes are
        # touched because terminate() is not valid on a process that was
        # never started
        for p in calcProc + [writeProc]:
            if p.is_alive():
                p.terminate()

        # propagate the failure instead of pretending the alignment succeeded
        raise
def makeAlignmentToPhyloMarkers(self,
                                outDir,
                                hmmModelFile,
                                hmmTableFile,
                                binIdToModels,
                                bIgnoreThresholds,
                                evalueThreshold,
                                lengthThreshold,
                                bReportHitStats,
                                alignOutputDir,
                                bKeepUnmaskedAlign=False):
    """Align hits to a set of common marker genes.

    Args:
        outDir: passed to `ResultsParser.parseBinHits` and to
            `self.__extractMarkerSeqsUnique`.
        hmmModelFile: passed to `self.__makeAlignmentModels`.
        hmmTableFile: passed to `ResultsParser.parseBinHits`.
        binIdToModels: mapping from bin id to its models; must be non-empty
            (an IndexError is raised otherwise).
        bIgnoreThresholds, evalueThreshold, lengthThreshold: forwarded to
            `ResultsParser.parseBinHits`.
        bReportHitStats: forwarded to `self.__alignMarkerGenes`.
        alignOutputDir: directory ensured via `makeSurePathExists` and
            passed to `self.__alignMarkerGenes`.
        bKeepUnmaskedAlign: forwarded to `self.__alignMarkerGenes`.

    Returns:
        The `ResultsParser` instance on which `parseBinHits` was called.
    """
    self.logger.info("Extracting marker genes to align.")

    # parse HMM information
    resultsParser = ResultsParser(binIdToModels)

    # get HMM hits to each bin
    resultsParser.parseBinHits(outDir,
                               hmmTableFile,
                               False,
                               bIgnoreThresholds,
                               evalueThreshold,
                               lengthThreshold)

    # extract the ORFs to align
    markerSeqs, markerStats = self.__extractMarkerSeqsUnique(outDir, resultsParser)

    # generate individual HMMs required to create multiple sequence alignments;
    # only the models of the first bin are used (presumably every bin shares
    # the same models -- confirm against callers)
    binId = list(binIdToModels)[0]
    hmmModelFiles = {}
    try:
        self.__makeAlignmentModels(hmmModelFile, binIdToModels[binId], hmmModelFiles)

        # align each of the marker genes
        makeSurePathExists(alignOutputDir)
        self.__alignMarkerGenes(markerSeqs,
                                markerStats,
                                bReportHitStats,
                                hmmModelFiles,
                                alignOutputDir,
                                bKeepUnmaskedAlign)
    finally:
        # remove the temporary HMM files even when model creation or
        # alignment fails, so an error does not leave them behind
        for modelPath in hmmModelFiles.values():
            if os.path.exists(modelPath):
                os.remove(modelPath)

    return resultsParser
def makeAlignmentToPhyloMarkers(self,
                                outDir,
                                hmmModelFile,
                                hmmTableFile,
                                binIdToModels,
                                bIgnoreThresholds,
                                evalueThreshold,
                                lengthThreshold,
                                bReportHitStats,
                                alignOutputDir,
                                bKeepUnmaskedAlign=False):
    """Align hits to a set of common marker genes.

    Args:
        outDir: passed to `ResultsParser.parseBinHits` and to
            `self.__extractMarkerSeqsUnique`.
        hmmModelFile: passed to `self.__makeAlignmentModels`.
        hmmTableFile: passed to `ResultsParser.parseBinHits`.
        binIdToModels: mapping from bin id to its models; must be non-empty
            (an IndexError is raised otherwise).
        bIgnoreThresholds, evalueThreshold, lengthThreshold: forwarded to
            `ResultsParser.parseBinHits`.
        bReportHitStats: forwarded to `self.__alignMarkerGenes`.
        alignOutputDir: directory ensured via `makeSurePathExists` and
            passed to `self.__alignMarkerGenes`.
        bKeepUnmaskedAlign: forwarded to `self.__alignMarkerGenes`.

    Returns:
        The `ResultsParser` instance on which `parseBinHits` was called.
    """
    self.logger.info(" Extracting marker genes to align.")

    # parse HMM information
    resultsParser = ResultsParser(binIdToModels)

    # get HMM hits to each bin
    resultsParser.parseBinHits(outDir,
                               hmmTableFile,
                               False,
                               bIgnoreThresholds,
                               evalueThreshold,
                               lengthThreshold)

    # extract the ORFs to align
    markerSeqs, markerStats = self.__extractMarkerSeqsUnique(outDir, resultsParser)

    # generate individual HMMs required to create multiple sequence alignments.
    # list(d)[0] works on both Python 2 and 3, whereas d.keys()[0] fails on
    # Python 3 because dict views cannot be indexed. Only the models of the
    # first bin are used (presumably every bin shares the same models --
    # confirm against callers).
    binId = list(binIdToModels)[0]
    hmmModelFiles = {}
    try:
        self.__makeAlignmentModels(hmmModelFile, binIdToModels[binId], hmmModelFiles)

        # align each of the marker genes
        makeSurePathExists(alignOutputDir)
        self.__alignMarkerGenes(markerSeqs,
                                markerStats,
                                bReportHitStats,
                                hmmModelFiles,
                                alignOutputDir,
                                bKeepUnmaskedAlign)
    finally:
        # remove the temporary HMM files even when model creation or
        # alignment fails, so an error does not leave them behind
        for modelPath in hmmModelFiles.values():
            if os.path.exists(modelPath):
                os.remove(modelPath)

    return resultsParser
def run(self,
        binFiles,
        outDir,
        hmmTableFile,
        binIdToModels,
        binIdToBinMarkerSets,
        minDeltaComp,
        maxDeltaCont,
        minMergedComp,
        maxMergedCont):
    """Write a table of bin pairs whose merger passes the given thresholds.

    For every unordered pair of bins the marker hits of both bins are
    combined, completeness and contamination are taken from elements 6 and
    7 of the `geneCounts` result, and a row is written when all four
    thresholds are satisfied.

    Args:
        binFiles: not used by this method.
        outDir: directory checked with `checkDirExists`; `merger.tsv` is
            written here and it is also passed to `parseBinHits`.
        hmmTableFile: passed to `ResultsParser.parseBinHits`.
        binIdToModels: used to construct the `ResultsParser`.
        binIdToBinMarkerSets: mapping from bin id to an object providing
            `mostSpecificMarkerSet()`; must be non-empty.
        minDeltaComp: minimum (inclusive) gain in completeness over the
            better of the two bins.
        maxDeltaCont: exclusive upper bound on the gain in contamination
            over the worse of the two bins.
        minMergedComp: minimum (inclusive) completeness of the merged bin.
        maxMergedCont: exclusive upper bound on the contamination of the
            merged bin.

    Returns:
        Path of the written `merger.tsv` file.
    """
    checkDirExists(outDir)

    self.logger.info(' Comparing marker sets between all pairs of bins.')

    # ensure all bins are using the same marker set
    # (list(d)[0] instead of d.keys()[0]: dict views are not indexable on Python 3)
    firstBinId = list(binIdToBinMarkerSets)[0]
    markerGenesI = binIdToBinMarkerSets[firstBinId].mostSpecificMarkerSet().getMarkerGenes()
    for binIdJ in binIdToBinMarkerSets:
        if markerGenesI != binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet().getMarkerGenes():
            self.logger.error(' [Error] All bins must use the same marker set to assess potential mergers.')
            # NOTE(review): exit status 0 on an error path looks unintended;
            # kept unchanged here in case callers rely on it.
            sys.exit(0)

    # parse HMM information
    resultsParser = ResultsParser(binIdToModels)

    # get HMM hits to each bin
    resultsParser.parseBinHits(outDir, hmmTableFile)

    # determine union and intersection of marker sets for each pair of bins
    outputFile = os.path.join(outDir, "merger.tsv")

    # the context manager closes the file even if an exception is raised
    with open(outputFile, 'w') as fout:
        fout.write('Bin Id 1\tBin Id 2')
        fout.write('\tBin 1 completeness\tBin 1 contamination')
        fout.write('\tBin 2 completeness\tBin 2 contamination')
        fout.write('\tDelta completeness\tDelta contamination\tMerger delta')
        fout.write('\tMerged completeness\tMerged contamination\n')

        binMarkerHits = resultsParser.results
        binIds = sorted(binMarkerHits.keys())
        for i, binIdI in enumerate(binIds):
            geneCountsI = binMarkerHits[binIdI].geneCounts(binIdToBinMarkerSets[binIdI].mostSpecificMarkerSet(),
                                                           binMarkerHits[binIdI].markerHits,
                                                           True)
            completenessI, contaminationI = geneCountsI[6:8]

            for binIdJ in binIds[i + 1:]:
                geneCountsJ = binMarkerHits[binIdJ].geneCounts(binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet(),
                                                               binMarkerHits[binIdJ].markerHits,
                                                               True)
                completenessJ, contaminationJ = geneCountsJ[6:8]

                # merge together hits from both bins and calculate completeness and contamination;
                # lists are copied so the per-bin hit lists are never shared with the merged dict
                mergedHits = {}
                for markerId, hits in binMarkerHits[binIdI].markerHits.items():
                    mergedHits[markerId] = list(hits)

                for markerId, hits in binMarkerHits[binIdJ].markerHits.items():
                    if markerId in mergedHits:
                        mergedHits[markerId].extend(hits)
                    else:
                        mergedHits[markerId] = list(hits)

                geneCountsMerged = binMarkerHits[binIdI].geneCounts(binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet(),
                                                                    mergedHits,
                                                                    True)
                completenessMerged, contaminationMerged = geneCountsMerged[6:8]

                if not (completenessMerged >= minMergedComp and contaminationMerged < maxMergedCont):
                    continue

                # calculate merged statistics
                deltaComp = completenessMerged - max(completenessI, completenessJ)
                deltaCont = contaminationMerged - max(contaminationI, contaminationJ)
                delta = deltaComp - deltaCont

                if deltaComp >= minDeltaComp and deltaCont < maxDeltaCont:
                    fout.write('%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n' %
                               (binIdI, binIdJ,
                                completenessI, contaminationI,
                                completenessJ, contaminationJ,
                                deltaComp, deltaCont, delta,
                                completenessMerged, contaminationMerged))

    return outputFile
def run(self,
        binFiles,
        outDir,
        hmmTableFile,
        binIdToModels,
        binIdToBinMarkerSets,
        minDeltaComp,
        maxDeltaCont,
        minMergedComp,
        maxMergedCont):
    """Write a table of bin pairs whose merger passes the given thresholds.

    For every unordered pair of bins the marker hits of both bins are
    combined, completeness and contamination are taken from elements 6 and
    7 of the `geneCounts` result, and a row is written when all four
    thresholds are satisfied.

    Args:
        binFiles: not used by this method.
        outDir: directory checked with `checkDirExists`; `merger.tsv` is
            written here and it is also passed to `parseBinHits`.
        hmmTableFile: passed to `ResultsParser.parseBinHits`.
        binIdToModels: used to construct the `ResultsParser`.
        binIdToBinMarkerSets: mapping from bin id to an object providing
            `mostSpecificMarkerSet()`; must be non-empty.
        minDeltaComp: minimum (inclusive) gain in completeness over the
            better of the two bins.
        maxDeltaCont: exclusive upper bound on the gain in contamination
            over the worse of the two bins.
        minMergedComp: minimum (inclusive) completeness of the merged bin.
        maxMergedCont: exclusive upper bound on the contamination of the
            merged bin.

    Returns:
        Path of the written `merger.tsv` file.
    """
    checkDirExists(outDir)

    self.logger.info(' Comparing marker sets between all pairs of bins.')

    # ensure all bins are using the same marker set
    # (list(d)[0] instead of d.keys()[0]: dict views are not indexable on Python 3)
    firstBinId = list(binIdToBinMarkerSets)[0]
    markerGenesI = binIdToBinMarkerSets[firstBinId].mostSpecificMarkerSet().getMarkerGenes()
    for binIdJ in binIdToBinMarkerSets:
        if markerGenesI != binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet().getMarkerGenes():
            self.logger.error(' [Error] All bins must use the same marker set to assess potential mergers.')
            # NOTE(review): exit status 0 on an error path looks unintended;
            # kept unchanged here in case callers rely on it.
            sys.exit(0)

    # parse HMM information
    resultsParser = ResultsParser(binIdToModels)

    # get HMM hits to each bin
    resultsParser.parseBinHits(outDir, hmmTableFile)

    # determine union and intersection of marker sets for each pair of bins
    outputFile = os.path.join(outDir, "merger.tsv")

    # the context manager closes the file even if an exception is raised
    with open(outputFile, 'w') as fout:
        fout.write('Bin Id 1\tBin Id 2')
        fout.write('\tBin 1 completeness\tBin 1 contamination')
        fout.write('\tBin 2 completeness\tBin 2 contamination')
        fout.write('\tDelta completeness\tDelta contamination\tMerger delta')
        fout.write('\tMerged completeness\tMerged contamination\n')

        binMarkerHits = resultsParser.results
        binIds = sorted(binMarkerHits.keys())

        # enumerate/slicing replaces the xrange index loops, which do not
        # exist on Python 3
        for i, binIdI in enumerate(binIds):
            geneCountsI = binMarkerHits[binIdI].geneCounts(binIdToBinMarkerSets[binIdI].mostSpecificMarkerSet(),
                                                           binMarkerHits[binIdI].markerHits,
                                                           True)
            completenessI, contaminationI = geneCountsI[6:8]

            for binIdJ in binIds[i + 1:]:
                geneCountsJ = binMarkerHits[binIdJ].geneCounts(binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet(),
                                                               binMarkerHits[binIdJ].markerHits,
                                                               True)
                completenessJ, contaminationJ = geneCountsJ[6:8]

                # merge together hits from both bins and calculate completeness and contamination;
                # lists are copied so the per-bin hit lists are never shared with the merged dict
                mergedHits = {}
                for markerId, hits in binMarkerHits[binIdI].markerHits.items():
                    mergedHits[markerId] = list(hits)

                for markerId, hits in binMarkerHits[binIdJ].markerHits.items():
                    if markerId in mergedHits:
                        mergedHits[markerId].extend(hits)
                    else:
                        mergedHits[markerId] = list(hits)

                geneCountsMerged = binMarkerHits[binIdI].geneCounts(binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet(),
                                                                    mergedHits,
                                                                    True)
                completenessMerged, contaminationMerged = geneCountsMerged[6:8]

                if not (completenessMerged >= minMergedComp and contaminationMerged < maxMergedCont):
                    continue

                # calculate merged statistics
                deltaComp = completenessMerged - max(completenessI, completenessJ)
                deltaCont = contaminationMerged - max(contaminationI, contaminationJ)
                delta = deltaComp - deltaCont

                if deltaComp >= minDeltaComp and deltaCont < maxDeltaCont:
                    fout.write('%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n' %
                               (binIdI, binIdJ,
                                completenessI, contaminationI,
                                completenessJ, contaminationJ,
                                deltaComp, deltaCont, delta,
                                completenessMerged, contaminationMerged))

    return outputFile