def extractMappedRead(self, aln, windowStart):
    """
    Bundle a clipped alignment and its features into a ``cc.MappedRead``.

    The template start/end handed to ``cc.MappedRead`` are the alignment's
    reference coordinates shifted by ``windowStart``, so the window itself
    begins at 0.  CmpH5 alignments are rejected through ``die``.
    """
    if isinstance(aln, CmpH5Alignment):
        die("Arrow does not support CmpH5 files!")

    assert aln.referenceSpan > 0

    def featureAsUint8(featureName):
        # A feature the reader does not provide is replaced by a zero
        # vector of length aln.readLength.
        if not aln.reader.hasBaseFeature(featureName):
            return np.zeros((aln.readLength,), dtype=np.uint8)
        unaligned = aln.baseFeature(featureName, aligned=False,
                                    orientation="native")
        # Clip to [0, 255] before narrowing to uint8.
        return unaligned.clip(0, 255).astype(np.uint8)

    name = aln.readName
    chemistry = aln.sequencingChemistry
    if aln.isReverseStrand:
        strand = cc.StrandType_REVERSE
    else:
        strand = cc.StrandType_FORWARD

    sequence = aln.read(aligned=False, orientation="native")
    ipd = cc.Uint8Vector(featureAsUint8("Ipd").tolist())
    pulseWidth = cc.Uint8Vector(featureAsUint8("PulseWidth").tolist())
    snr = cc.SNR(aln.hqRegionSnr)
    read = cc.Read(name, sequence, ipd, pulseWidth, snr, chemistry)

    templateStart = int(aln.referenceStart - windowStart)
    templateEnd = int(aln.referenceEnd - windowStart)
    return cc.MappedRead(read, strand, templateStart, templateEnd)
def extractMappedRead(self, aln, windowStart):
    """
    Bundle a clipped alignment and its features into a ``cc.MappedRead``.

    The template start/end handed to ``cc.MappedRead`` are the alignment's
    reference coordinates shifted by ``windowStart``, so the window itself
    begins at 0.  CmpH5 alignments are rejected through ``die``.
    """
    if isinstance(aln, CmpH5Alignment):
        die("Arrow does not support CmpH5 files!")

    assert aln.referenceSpan > 0

    def featureAsUint8(featureName):
        # A feature the reader does not provide is replaced by a zero
        # vector of length aln.qLen.
        if not aln.reader.hasBaseFeature(featureName):
            return np.zeros((aln.qLen,), dtype=np.uint8)
        unaligned = aln.baseFeature(featureName, aligned=False,
                                    orientation="native")
        # Clip to [0, 255] before narrowing to uint8.
        return unaligned.clip(0, 255).astype(np.uint8)

    name = aln.readName
    chemistry = aln.sequencingChemistry
    if aln.isReverseStrand:
        strand = cc.StrandType_REVERSE
    else:
        strand = cc.StrandType_FORWARD

    sequence = aln.read(aligned=False, orientation="native")
    ipd = cc.Uint8Vector(featureAsUint8("Ipd").tolist())
    pulseWidth = cc.Uint8Vector(featureAsUint8("PulseWidth").tolist())
    snr = cc.SNR(aln.hqRegionSnr)
    read = cc.Read(name, sequence, ipd, pulseWidth, snr, chemistry)

    templateStart = int(aln.referenceStart - windowStart)
    templateEnd = int(aln.referenceEnd - windowStart)
    return cc.MappedRead(read, strand, templateStart, templateEnd)
def _algorithmByName(self, name, peekFile):
    """
    Resolve an algorithm name to the corresponding algorithm object.

    Recognized names are "plurality", "quiver", "arrow", "poa" and "best".
    "best" asks ``algorithmSelection.bestAlgorithm`` for a concrete name
    based on ``peekFile.sequencingChemistry`` and resolves that instead.
    The algorithm packages are imported lazily, only for the chosen name.

    An unrecognized name, or an algorithm whose ``availability`` pair
    reports it as not OK, is reported through ``die``.
    """
    if name == "best":
        logging.info("Identifying best algorithm based on input data")
        from GenomicConsensus import algorithmSelection
        algoName = algorithmSelection.bestAlgorithm(peekFile.sequencingChemistry)
        return self._algorithmByName(algoName, peekFile)

    if name == "plurality":
        from GenomicConsensus.plurality import plurality as algo
    elif name == "quiver":
        from GenomicConsensus.quiver import quiver as algo
    elif name == "arrow":
        from GenomicConsensus.arrow import arrow as algo
    elif name == "poa":
        from GenomicConsensus.poa import poa as algo
    else:
        die("Failure: unrecognized algorithm %s" % name)

    # availability is an (isOK, message) pair.
    available, reason = algo.availability
    if not available:
        die("Failure: %s" % reason)
    logging.info("Will use {a} algorithm".format(a=name))
    return algo
def _buildParameterSet(parameterSetName, nameValuePairs): chem, modelName = parameterSetName.split(".")[:2] if modelName == "AllQVsModel": model = AllQVsModel elif modelName == "NoMergeQVModel": model = NoMergeQVModel elif modelName == "NoQVsModel": model = NoQVsModel elif modelName == "AllQVsMergingByChannelModel": model = AllQVsMergingByChannelModel elif modelName == "NoQVsMergingByChannelModel": model = NoQVsMergingByChannelModel else: logging.error("Found parameter set for unrecognized model: %s" % modelName) return None if map(fst, nameValuePairs) != model.parameterNames: die("Malformed parameter set file") qvModelParams = cc.QvModelParams(chem, modelName, *[ float(snd(pair)) for pair in nameValuePairs ]) # # Dirty hack for --diploid support, diploid model is scaled # differently. Needs further work. # if parameterSetName == "unknown.NoQVsModel": bandingOptions = cc.BandingOptions(4, 24) fastScoreThreshold = -50 else: bandingOptions = cc.BandingOptions(4, 6) fastScoreThreshold = -12.5 quiverConfig = cc.QuiverConfig(qvModelParams, cc.ALL_MOVES, bandingOptions, fastScoreThreshold) return ParameterSet(parameterSetName, model, chem, quiverConfig)
def _buildParameterSet(parameterSetName, nameValuePairs): chem, modelName = parameterSetName.split(".")[:2] if modelName == "AllQVsModel": model = AllQVsModel elif modelName == "NoMergeQVModel": model = NoMergeQVModel elif modelName == "NoQVsModel": model = NoQVsModel elif modelName == "AllQVsMergingByChannelModel": model = AllQVsMergingByChannelModel elif modelName == "NoQVsMergingByChannelModel": model = NoQVsMergingByChannelModel else: logging.error("Found parameter set for unrecognized model: %s" % modelName) return None if map(fst, nameValuePairs) != model.parameterNames: die("Malformed parameter set file") qvModelParams = cc.QvModelParams( chem, modelName, *[float(snd(pair)) for pair in nameValuePairs]) # # Dirty hack for --diploid support, diploid model is scaled # differently. Needs further work. # if parameterSetName == "unknown.NoQVsModel": bandingOptions = cc.BandingOptions(4, 24) fastScoreThreshold = -50 else: bandingOptions = cc.BandingOptions(4, 6) fastScoreThreshold = -12.5 quiverConfig = cc.QuiverConfig(qvModelParams, cc.ALL_MOVES, bandingOptions, fastScoreThreshold) return ParameterSet(parameterSetName, model, chem, quiverConfig)
def _configureAlgorithm(self, options, alnFile): assert self._algorithm != None try: self._algorithmConfiguration = self._algorithm.configure( options, alnFile) except IncompatibleDataException as e: die("Failure: %s" % e.message)
def extractFeatures(aln):
    """
    Return the read of an alignment record as an unaligned (gapless),
    native-orientation sequence.

    CmpH5 alignment records are not supported and are reported via ``die``.
    """
    if not isinstance(aln, CmpH5Alignment):
        return aln.read(aligned=False, orientation="native")
    die("Arrow does not support CmpH5 files!")
def _algorithmByName(self, name):
    """
    Resolve an algorithm name ("plurality" or "quiver") to its algorithm
    object.

    An unrecognized name, or an algorithm whose ``availability`` pair
    reports it as not OK, is reported through ``die``.
    """
    if name == "quiver":
        algo = quiver
    elif name == "plurality":
        algo = plurality
    else:
        die("Failure: unrecognized algorithm %s" % name)

    # availability is an (isOK, message) pair.
    available, reason = algo.availability
    if available:
        return algo
    die("Failure: %s" % reason)
    return algo
def _algorithmByName(self, name):
    """
    Resolve an algorithm name ("plurality" or "quiver") to its algorithm
    object.

    An unrecognized name, or an algorithm whose ``availability`` pair
    reports it as not OK, is reported through ``die``.
    """
    if name == "quiver":
        algo = quiver
    elif name == "plurality":
        algo = plurality
    else:
        die("Failure: unrecognized algorithm %s" % name)

    # availability is an (isOK, message) pair.
    available, reason = algo.availability
    if available:
        return algo
    die("Failure: %s" % reason)
    return algo
def loadParameterSets(parametersFile=None, spec=None, cmpH5=None):
    """
    spec is either:
      - chemName.modelName  (complete spec),
      - chemName
      - None
    If the spec is incomplete, cmpH5 is required to determine the best
    available option.

    Returned value is a dict.  With a complete or chemistry-only spec it
    has the single key "*"; with no spec it has one key per distinct
    chemistry name in ``cmpH5.sequencingChemistry``.  The values are the
    parameter sets selected from the parameters file.
    """
    if spec is None:
        chemistryName, modelName = None, None
    elif "." in spec:
        chemistryName, modelName = spec.split(".")
    else:
        chemistryName, modelName = spec, None
    # Test identity, not truthiness: a file object may evaluate as false
    # (for example when it defines a zero length) and still be usable.
    assert (cmpH5 is not None) or (chemistryName and modelName)

    parametersFile = _findParametersFile(parametersFile)
    logging.info("Using Quiver parameters file %s" % parametersFile)
    sets = _loadParameterSets(parametersFile)

    if chemistryName and modelName:
        try:
            p = sets[spec]
        except KeyError:
            die("Quiver: no available parameter set named %s" % spec)
        else:
            params = {"*": p}
    elif chemistryName:
        qvsAvailable = cmpH5.pulseFeaturesAvailable()
        p = _bestParameterSet(sets, chemistryName, qvsAvailable)
        if p.chemistry != chemistryName:
            die("Quiver: no parameter set available compatible with this " +
                "cmp.h5 for chemistry \"%s\" " % chemistryName)
        params = {"*": p}
    else:
        chemistryNames = list(set(cmpH5.sequencingChemistry))  # uniquify
        if not _isChemistryMixSupported(chemistryNames):
            # logging.warn is a deprecated alias of logging.warning.
            logging.warning(
                "Unsupported chemistry mix, results will be undefined: %s" %
                ", ".join(chemistryNames))
        qvsAvailable = cmpH5.pulseFeaturesAvailable()
        bestParams = [_bestParameterSet(sets, chemName, qvsAvailable)
                      for chemName in chemistryNames]
        params = dict(zip(chemistryNames, bestParams))

    return params
def loadParameterSets(parametersFile=None, spec=None, cmpH5=None):
    """
    spec is either:
      - chemName.modelName  (complete spec),
      - chemName
      - None
    If the spec is incomplete, cmpH5 is required to determine the best
    available option.

    Returned value is a dict.  With a complete or chemistry-only spec it
    has the single key "*"; with no spec it has one key per distinct
    chemistry name in ``cmpH5.sequencingChemistry``.  The values are the
    parameter sets selected from the parameters file.
    """
    if spec is None:
        chemistryName, modelName = None, None
    elif "." in spec:
        chemistryName, modelName = spec.split(".")
    else:
        chemistryName, modelName = spec, None
    # Test identity, not truthiness: a file object may evaluate as false
    # (for example when it defines a zero length) and still be usable.
    assert (cmpH5 is not None) or (chemistryName and modelName)

    parametersFile = _findParametersFile(parametersFile)
    logging.info("Using Quiver parameters file %s" % parametersFile)
    sets = _loadParameterSets(parametersFile)

    if chemistryName and modelName:
        try:
            p = sets[spec]
        except KeyError:
            die("Quiver: no available parameter set named %s" % spec)
        else:
            params = {"*": p}
    elif chemistryName:
        qvsAvailable = cmpH5.pulseFeaturesAvailable()
        p = _bestParameterSet(sets, chemistryName, qvsAvailable)
        if p.chemistry != chemistryName:
            die("Quiver: no parameter set available compatible with this " +
                "cmp.h5 for chemistry \"%s\" " % chemistryName)
        params = {"*": p}
    else:
        chemistryNames = list(set(cmpH5.sequencingChemistry))  # uniquify
        if not _isChemistryMixSupported(chemistryNames):
            # logging.warn is a deprecated alias of logging.warning.
            logging.warning(
                "Unsupported chemistry mix, results will be undefined: %s" %
                ", ".join(chemistryNames))
        qvsAvailable = cmpH5.pulseFeaturesAvailable()
        bestParams = [_bestParameterSet(sets, chemName, qvsAvailable)
                      for chemName in chemistryNames]
        params = dict(zip(chemistryNames, bestParams))

    return params
def _algorithmByName(self, name):
    """
    Resolve an algorithm name ("plurality", "quiver" or "arrow") to its
    algorithm object, importing the matching package lazily.

    An unrecognized name, or an algorithm whose ``availability`` pair
    reports it as not OK, is reported through ``die``.
    """
    if name == "arrow":
        from GenomicConsensus.arrow import arrow as algo
    elif name == "quiver":
        from GenomicConsensus.quiver import quiver as algo
    elif name == "plurality":
        from GenomicConsensus.plurality import plurality as algo
    else:
        die("Failure: unrecognized algorithm %s" % name)

    # availability is an (isOK, message) pair.
    available, reason = algo.availability
    if not available:
        die("Failure: %s" % reason)
    return algo
def configure(options, alnFile):
    """
    Validate the input against the Arrow model and build an ArrowConfig.

    Raises U.IncompatibleDataException when ``alnFile.readType`` is not
    "standard".  Model loading/override failures, unsupported chemistries
    and missing Ipd/PulseWidth base features are reported through ``die``.

    Returns an ``M.ArrowConfig`` populated from ``options``.
    """
    if alnFile.readType != "standard":
        raise U.IncompatibleDataException(
            "The Arrow algorithm requires a BAM file containing standard (non-CCS) reads."
        )

    if options.diploid:
        # logging.warn is a deprecated alias of logging.warning.
        logging.warning("Diploid analysis not yet supported under Arrow model.")

    # load parameters from file
    if options.parametersFile:
        logging.info("Loading model parameters from: ({0})".format(
            options.parametersFile))
        if not cc.LoadModels(options.parametersFile):
            die("Arrow: unable to load parameters from: ({0})".format(
                options.parametersFile))

    # test available chemistries
    supp = set(cc.SupportedChemistries())
    logging.info("Found consensus models for: ({0})".format(", ".join(
        sorted(supp))))

    used = set([])
    if options.parametersSpec != "auto":
        logging.info("Overriding model selection with: ({0})".format(
            options.parametersSpec))
        if not cc.OverrideModel(options.parametersSpec):
            die("Arrow: unable to override model with: ({0})".format(
                options.parametersSpec))
        used.add(options.parametersSpec)
    else:
        used.update(alnFile.sequencingChemistry)
        unsupp = used - supp
        if unsupp:
            die("Arrow: unsupported chemistries found: ({0})".format(", ".join(
                sorted(unsupp))))

    # All arrow models require PW except P6 and the first S/P1-C1
    for readGroup in alnFile.readGroupTable:
        if set([readGroup["SequencingChemistry"]]) - set(
                ["P6-C4", "S/P1-C1/beta"]):
            if ("Ipd" not in readGroup["BaseFeatures"] or
                    "PulseWidth" not in readGroup["BaseFeatures"]):
                die("Arrow model requires missing base feature: IPD or PulseWidth"
                    )

    logging.info("Using consensus models for: ({0})".format(", ".join(
        sorted(used))))

    return M.ArrowConfig(minMapQV=options.minMapQV,
                         noEvidenceConsensus=options.noEvidenceConsensusCall,
                         computeConfidence=(not options.fastMode),
                         minReadScore=options.minReadScore,
                         minHqRegionSnr=options.minHqRegionSnr,
                         minZScore=options.minZScore,
                         minAccuracy=options.minAccuracy)
def loadParameterSets(parametersFile=None, spec=None, cmpH5=None):
    """
    spec is either:
      - chemName.modelName  (complete spec),
      - chemName
      - None
    If the spec is incomplete, cmpH5 is required to determine the best
    available option.

    Returned value is a dict.  With a complete or chemistry-only spec it
    has the single key "*"; with no spec it has one key per distinct
    chemistry name in ``cmpH5.sequencingChemistry``.  The values are the
    parameter sets selected from the parameters file.

    An "unknown" chemistry or an unsupported chemistry mix is reported
    through ``die``.
    """
    if spec is None:
        chemistryName, modelName = None, None
    elif "." in spec:
        chemistryName, modelName = spec.split(".")
    else:
        chemistryName, modelName = spec, None
    assert (cmpH5 is not None) or (chemistryName and modelName)

    parametersFile = _findParametersFile(parametersFile)
    logging.info("Using Quiver parameters file %s" % parametersFile)
    sets = _loadParameterSets(parametersFile)

    if chemistryName and modelName:
        # Only a missing key means "no such parameter set"; anything else
        # should surface as the error it is.
        try:
            p = sets[spec]
        except KeyError:
            die("Quiver: no available parameter set named %s" % spec)
        else:
            params = {"*": p}
    elif chemistryName:
        qvsAvailable = cmpH5.baseFeaturesAvailable()
        p = _bestParameterSet(sets, chemistryName, qvsAvailable)
        if p.chemistry != chemistryName:
            die("Quiver: no parameter set available compatible with this " +
                "cmp.h5 for chemistry \"%s\" " % chemistryName)
        params = {"*": p}
    else:
        chemistryNames = list(set(cmpH5.sequencingChemistry))  # uniquify
        if "unknown" in chemistryNames:
            die("\"unknown\" chemistry in alignment file: either an unsupported chemistry " +
                "has been used, the alignment file has been improperly constructed, or " +
                "this version of SMRTanalysis is too old to recognize a new chemistry.")
        if not _isChemistryMixSupported(chemistryNames):
            die("Unsupported chemistry mix, cannot proceed.")
        qvsAvailable = cmpH5.baseFeaturesAvailable()
        bestParams = [_bestParameterSet(sets, chemName, qvsAvailable)
                      for chemName in chemistryNames]
        params = dict(zip(chemistryNames, bestParams))

    return params
def configure(options, alnFile):
    """
    Validate the input against the Arrow model and build an ArrowConfig.

    Raises U.IncompatibleDataException when ``alnFile.readType`` is not
    "standard".  Model loading/override failures, unsupported chemistries
    and missing Ipd/PulseWidth base features are reported through ``die``.

    Returns an ``M.ArrowConfig`` populated from ``options``.
    """
    if alnFile.readType != "standard":
        raise U.IncompatibleDataException(
            "The Arrow algorithm requires a BAM file containing standard (non-CCS) reads."
        )

    if options.diploid:
        logging.info(
            "Diploid polishing in the Arrow model is in *BETA* mode.\n"
            "Any multi-base string that appears in annotation files\n"
            "is not phased!")

    # Optionally load model parameters from a user-supplied file.
    if options.parametersFile:
        logging.info("Loading model parameters from: ({0})".format(options.parametersFile))
        loaded = cc.LoadModels(options.parametersFile)
        if not loaded:
            die("Arrow: unable to load parameters from: ({0})".format(options.parametersFile))

    # Chemistries for which a consensus model is available.
    supported = set(cc.SupportedChemistries())
    logging.info("Found consensus models for: ({0})".format(", ".join(sorted(supported))))

    inUse = set()
    if options.parametersSpec == "auto":
        inUse.update(alnFile.sequencingChemistry)
        unsupported = inUse - supported
        if unsupported:
            die("Arrow: unsupported chemistries found: ({0})".format(", ".join(sorted(unsupported))))
    else:
        logging.info("Overriding model selection with: ({0})".format(options.parametersSpec))
        overridden = cc.OverrideModel(options.parametersSpec)
        if not overridden:
            die("Arrow: unable to override model with: ({0})".format(options.parametersSpec))
        inUse.add(options.parametersSpec)

    # All arrow models require PW except P6 and the first S/P1-C1
    pwOptional = set(["P6-C4", "S/P1-C1/beta"])
    for readGroup in alnFile.readGroupTable:
        if readGroup["SequencingChemistry"] in pwOptional:
            continue
        if not ("Ipd" in readGroup["BaseFeatures"] and
                "PulseWidth" in readGroup["BaseFeatures"]):
            die("Arrow model requires missing base feature: IPD or PulseWidth")

    logging.info("Using consensus models for: ({0})".format(", ".join(sorted(inUse))))

    configArgs = dict(
        minMapQV=options.minMapQV,
        noEvidenceConsensus=options.noEvidenceConsensusCall,
        computeConfidence=(not options.fastMode),
        minReadScore=options.minReadScore,
        minHqRegionSnr=options.minHqRegionSnr,
        minZScore=options.minZScore,
        minAccuracy=options.minAccuracy,
        maskRadius=options.maskRadius,
        maskErrorRate=options.maskErrorRate,
        polishDiploid=options.diploid)
    return M.ArrowConfig(**configArgs)
def _loadReference(self, cmpH5):
    """
    Load the reference and resolve the requested reference windows.

    Loads ``options.referenceFilename`` through ``reference.loadFromFile``
    (reporting a failure through ``die``) and then sets
    ``options.referenceWindows`` from the comma-separated
    ``options.referenceWindowsAsString``:
      - no window string: an empty tuple;
      - ``options.skipUnrecognizedContigs`` set: a list holding only the
        windows that ``reference.stringToWindow`` could parse;
      - otherwise: a list with one parsed window per entry.
    """
    logging.info("Loading reference")
    err = reference.loadFromFile(options.referenceFilename, cmpH5)
    if err:
        die("Error loading reference")

    # Grok the referenceWindow spec, if any.
    if options.referenceWindowsAsString is None:
        options.referenceWindows = ()
    elif options.skipUnrecognizedContigs:
        # This is a workaround for smrtpipe scatter/gather.
        options.referenceWindows = []
        for s in options.referenceWindowsAsString.split(","):
            # Deliberately best-effort: windows that cannot be parsed are
            # skipped.  NOTE(review): the catch-all is kept as-is because
            # the failure mode of stringToWindow is not visible here.
            try:
                win = reference.stringToWindow(s)
                options.referenceWindows.append(win)
            except:
                pass
    else:
        # Materialize the result: on Python 3 map() returns a one-shot
        # iterator, which would be exhausted after a single pass.
        options.referenceWindows = list(
            map(reference.stringToWindow,
                options.referenceWindowsAsString.split(",")))
def loadParameterSets(parametersFile=None, spec=None, cmpH5=None):
    """
    spec is either:
      - chemName.modelName  (complete spec),
      - chemName
      - None
    If the spec is incomplete, cmpH5 is required to determine the best
    available option.

    Returned value is a dict.  With a complete or chemistry-only spec it
    has the single key "*"; with no spec it has one key per distinct
    chemistry name in ``cmpH5.sequencingChemistry``.  The values are the
    parameter sets selected from the parameters file.

    An "unknown" chemistry or an unsupported chemistry mix is reported
    through ``die``.
    """
    if spec is None:
        chemistryName, modelName = None, None
    elif "." in spec:
        chemistryName, modelName = spec.split(".")
    else:
        chemistryName, modelName = spec, None
    # Test identity, not truthiness: a file object may evaluate as false
    # (for example when it defines a zero length) and still be usable.
    assert (cmpH5 is not None) or (chemistryName and modelName)

    parametersFile = _findParametersFile(parametersFile)
    logging.info("Using Quiver parameters file %s" % parametersFile)
    sets = _loadParameterSets(parametersFile)

    if chemistryName and modelName:
        try:
            p = sets[spec]
        except KeyError:
            die("Quiver: no available parameter set named %s" % spec)
        else:
            params = {"*": p}
    elif chemistryName:
        qvsAvailable = cmpH5.pulseFeaturesAvailable()
        p = _bestParameterSet(sets, chemistryName, qvsAvailable)
        if p.chemistry != chemistryName:
            die("Quiver: no parameter set available compatible with this " +
                "cmp.h5 for chemistry \"%s\" " % chemistryName)
        params = {"*": p}
    else:
        chemistryNames = list(set(cmpH5.sequencingChemistry))  # uniquify
        if "unknown" in chemistryNames:
            die("\"unknown\" chemistry in alignment file: either an unsupported chemistry " +
                "has been used, the alignment file has been improperly constructed, or " +
                "this version of SMRTanalysis is too old to recognize a new chemistry.")
        if not _isChemistryMixSupported(chemistryNames):
            die("Unsupported chemistry mix, cannot proceed.")
        qvsAvailable = cmpH5.pulseFeaturesAvailable()
        bestParams = [_bestParameterSet(sets, chemName, qvsAvailable)
                      for chemName in chemistryNames]
        params = dict(zip(chemistryNames, bestParams))

    return params
def _loadReference(self, cmpH5):
    """
    Load the reference and resolve the requested reference windows.

    Loads ``options.referenceFilename`` through ``reference.loadFromFile``
    (reporting a failure through ``die``) and then sets
    ``options.referenceWindows`` from the comma-separated
    ``options.referenceWindowsAsString``:
      - no window string: an empty tuple;
      - ``options.skipUnrecognizedContigs`` set: a list holding only the
        windows that ``reference.stringToWindow`` could parse;
      - otherwise: a list with one parsed window per entry.
    """
    logging.info("Loading reference")
    err = reference.loadFromFile(options.referenceFilename, cmpH5)
    if err:
        die("Error loading reference")

    # Grok the referenceWindow spec, if any.
    if options.referenceWindowsAsString is None:
        options.referenceWindows = ()
    elif options.skipUnrecognizedContigs:
        # This is a workaround for smrtpipe scatter/gather.
        options.referenceWindows = []
        for s in options.referenceWindowsAsString.split(","):
            # Deliberately best-effort: windows that cannot be parsed are
            # skipped.  NOTE(review): the catch-all is kept as-is because
            # the failure mode of stringToWindow is not visible here.
            try:
                win = reference.stringToWindow(s)
                options.referenceWindows.append(win)
            except:
                pass
    else:
        # Materialize the result: on Python 3 map() returns a one-shot
        # iterator, which would be exhausted after a single pass.
        options.referenceWindows = list(
            map(reference.stringToWindow,
                options.referenceWindowsAsString.split(",")))
def _algorithmByName(self, name, peekFile):
    """
    Resolve an algorithm name to the corresponding algorithm object.

    Recognized names are "plurality", "quiver", "arrow", "poa" and "best".
    "best" asks ``algorithmSelection.bestAlgorithm`` for a concrete name
    based on ``peekFile.sequencingChemistry`` and resolves that instead.
    For "arrow", every read group in ``peekFile.readGroupTable`` whose
    chemistry is not "P6-C4" or "S/P1-C1/beta" must list both the "Ipd"
    and "PulseWidth" base features.

    Failures (unknown name, missing features, unavailable algorithm) are
    reported through ``die``.
    """
    if name == "best":
        logging.info("Identifying best algorithm based on input data")
        from GenomicConsensus import algorithmSelection
        algoName = algorithmSelection.bestAlgorithm(
            peekFile.sequencingChemistry)
        return self._algorithmByName(algoName, peekFile)

    if name == "plurality":
        from GenomicConsensus.plurality import plurality as algo
    elif name == "quiver":
        from GenomicConsensus.quiver import quiver as algo
    elif name == "arrow":
        from GenomicConsensus.arrow import arrow as algo
        # All arrow models require PW except P6 and the first S/P1-C1
        pwOptional = set(["P6-C4", "S/P1-C1/beta"])
        for readGroup in peekFile.readGroupTable:
            if readGroup["SequencingChemistry"] in pwOptional:
                continue
            if not ("Ipd" in readGroup["BaseFeatures"] and
                    "PulseWidth" in readGroup["BaseFeatures"]):
                die("Model requires missing base feature: IPD or PulseWidth"
                    )
    elif name == "poa":
        from GenomicConsensus.poa import poa as algo
    else:
        die("Failure: unrecognized algorithm %s" % name)

    # availability is an (isOK, message) pair.
    available, reason = algo.availability
    if not available:
        die("Failure: %s" % reason)
    logging.info("Will use {a} algorithm".format(a=name))
    return algo
def _algorithmByName(self, name, peekFile):
    """
    Resolve an algorithm name to the corresponding algorithm object.

    Recognized names are "plurality", "quiver", "arrow", "poa" and "best".
    "best" asks ``algorithmSelection.bestAlgorithm`` for a concrete name
    based on ``peekFile.sequencingChemistry`` and resolves that instead.
    For "arrow", if the file contains any chemistry other than "P6-C4" or
    "S/P1-C1/beta", it must provide both the "Ipd" and "PulseWidth" base
    features.

    Failures (unknown name, missing features, unavailable algorithm) are
    reported through ``die``.
    """
    if name == "best":
        logging.info("Identifying best algorithm based on input data")
        from GenomicConsensus import algorithmSelection
        algoName = algorithmSelection.bestAlgorithm(peekFile.sequencingChemistry)
        return self._algorithmByName(algoName, peekFile)

    if name == "plurality":
        from GenomicConsensus.plurality import plurality as algo
    elif name == "quiver":
        from GenomicConsensus.quiver import quiver as algo
    elif name == "arrow":
        from GenomicConsensus.arrow import arrow as algo
        # All arrow models require PW except P6 and the first S/P1-C1
        needsPulseWidth = set(peekFile.sequencingChemistry) - set(["P6-C4", "S/P1-C1/beta"])
        if needsPulseWidth:
            if not (peekFile.hasBaseFeature("Ipd") and
                    peekFile.hasBaseFeature("PulseWidth")):
                die("Model requires missing base feature: IPD or PulseWidth")
    elif name == "poa":
        from GenomicConsensus.poa import poa as algo
    else:
        die("Failure: unrecognized algorithm %s" % name)

    # availability is an (isOK, message) pair.
    available, reason = algo.availability
    if not available:
        die("Failure: %s" % reason)
    logging.info("Will use {a} algorithm".format(a=name))
    return algo
def _checkFileCompatibility(self, cmpH5): if not cmpH5.isSorted: die("Input CmpH5 file must be sorted.") if cmpH5.isEmpty: die("Input CmpH5 file must be nonempty.")
def _configureAlgorithm(self, options, cmpH5): assert self._algorithm != None try: self._algorithmConfiguration = self._algorithm.configure(options, cmpH5) except IncompatibleDataException as e: die("Failure: %s" % e.message)
def main(self):
    """
    Run the whole consensus job.

    Parses options, selects the algorithm, peeks at the input file to
    resolve options / load the reference / configure the algorithm,
    launches the slaves and runs the main loop (optionally under cProfile
    or ipdb).  Any exception escaping the main loop aborts the work.

    Returns 0 on success and -1 when the run was aborted.
    """
    # This looks scary but it's not.  Python uses reference
    # counting and has a secondary, optional garbage collector for
    # collecting garbage cycles.  Unfortunately when a cyclic GC
    # happens when a thread is calling cPickle.dumps, the
    # interpreter crashes sometimes.  See Bug 19704.  Since we
    # don't leak garbage cycles, disabling the cyclic GC is
    # essentially harmless.
    gc.disable()

    parseOptions()
    self._algorithm = self._algorithmByName(options.algorithm)
    self._setupLogging()
    random.seed(42)

    logging.info("h5py version: %s" % h5py.version.version)
    logging.info("hdf5 version: %s" % h5py.version.hdf5_version)
    logging.info("ConsensusCore version: %s" %
                 (consensusCoreVersion() or "ConsensusCore unavailable"))
    logging.info("Starting.")

    atexit.register(self._cleanup)
    if options.doProfiling:
        self._makeTemporaryDirectory()

    with AlignmentSet(options.inputFilename) as peekFile:
        if not peekFile.isCmpH5 and not peekFile.hasPbi:
            # logging.warn is a deprecated alias of logging.warning.
            logging.warning("'fancyChunking' not yet available for BAM "
                            "files without accompanying .pbi files, "
                            "disabling")
            options.fancyChunking = False
        logging.info("Peeking at file %s" % options.inputFilename)
        logging.info("Input data: numAlnHits=%d" % len(peekFile))
        resolveOptions(peekFile)
        self._loadReference(peekFile)
        self._checkFileCompatibility(peekFile)
        self._configureAlgorithm(options, peekFile)
        options.disableHdf5ChunkCache = True
        #options.disableHdf5ChunkCache = self._shouldDisableChunkCache(peekFile)
        #if options.disableHdf5ChunkCache:
        #    logging.info("Will disable HDF5 chunk cache (large number of datasets)")
        #logging.debug("After peek, # hdf5 objects open: %d" % h5py.h5f.get_obj_count())

    if options.dumpEvidence:
        self._setupEvidenceDumpDirectory(options.evidenceDirectory)

    self._launchSlaves()
    self._readCmpH5Input()

    monitoringThread = threading.Thread(target=monitorSlaves, args=(self,))
    monitoringThread.start()

    try:
        if options.doProfiling:
            cProfile.runctx("self._mainLoop()",
                            globals=globals(),
                            locals=locals(),
                            filename=os.path.join(options.temporaryDirectory,
                                                  "profile-main.out"))
        elif options.doDebugging:
            if not options.threaded:
                die("Debugging only works with -T (threaded) mode")
            logging.info("PID: %d", os.getpid())
            import ipdb
            with ipdb.launch_ipdb_on_exception():
                self._mainLoop()
        else:
            self._mainLoop()
    except:
        # Catch-all on purpose: whatever went wrong, hand the traceback
        # to abortWork so the run is flagged as aborted.
        why = traceback.format_exc()
        self.abortWork(why)

    monitoringThread.join()

    if self._aborting:
        logging.error("Aborting")
        return -1
    else:
        logging.info("Finished.")

    if options.doProfiling:
        self._printProfiles()

    # close h5 file.
    self._inCmpH5.close()
    return 0
def main(self):
    """
    Run the whole consensus job.

    Optionally sets up ipdb debugging, peeks at the input file to resolve
    options, load the reference, select and configure the algorithm, then
    launches the slaves and runs the main loop (optionally under cProfile
    or ipdb).  Any exception escaping the main loop is logged together
    with the options and aborts the work.

    Returns 0 on success and -1 when the run was aborted.
    """
    # This looks scary but it's not.  Python uses reference
    # counting and has a secondary, optional garbage collector for
    # collecting garbage cycles.  Unfortunately when a cyclic GC
    # happens when a thread is calling cPickle.dumps, the
    # interpreter crashes sometimes.  See Bug 19704.  Since we
    # don't leak garbage cycles, disabling the cyclic GC is
    # essentially harmless.
    gc.disable()

    random.seed(42)

    # Both debugging options need ipdb and threaded mode.
    if options.pdb or options.pdbAtStartup:
        print("Process ID: %d" % os.getpid(), file=sys.stderr)
        try:
            import ipdb
        except ImportError:
            die("Debugging options require 'ipdb' package installed.")

        if not options.threaded:
            die("Debugging only works with -T (threaded) mode")

    if options.pdbAtStartup:
        ipdb.set_trace()

    logging.info("ConsensusCore version: %s" %
                 (consensusCoreVersion() or "ConsensusCore unavailable"))
    logging.info("ConsensusCore2 version: %s" %
                 (consensusCore2Version() or "ConsensusCore2 unavailable"))
    logging.info("Starting.")

    atexit.register(self._cleanup)
    if options.doProfiling:
        self._makeTemporaryDirectory()

    # Open the input once up front to validate it and configure the run.
    with AlignmentSet(options.inputFilename) as peekFile:
        if options.algorithm == "arrow" and peekFile.isCmpH5:
            die("Arrow does not support CmpH5 files")
        if not peekFile.isCmpH5 and not peekFile.hasPbi:
            die("Genomic Consensus only works with cmp.h5 files and BAM "
                "files with accompanying .pbi files")

        logging.info("Peeking at file %s" % options.inputFilename)
        logging.info("Input data: numAlnHits=%d" % len(peekFile))
        resolveOptions(peekFile)
        self._loadReference(peekFile)
        self._checkFileCompatibility(peekFile)
        self._algorithm = self._algorithmByName(options.algorithm, peekFile)
        self._configureAlgorithm(options, peekFile)
        options.disableHdf5ChunkCache = True
        #options.disableHdf5ChunkCache = self._shouldDisableChunkCache(peekFile)
        #if options.disableHdf5ChunkCache:
        #    logging.info("Will disable HDF5 chunk cache (large number of datasets)")

    self._launchSlaves()
    self._readAlignmentInput()

    monitoringThread = threading.Thread(target=monitorSlaves, args=(self,))
    monitoringThread.start()

    try:
        if options.doProfiling:
            cProfile.runctx("self._mainLoop()",
                            globals=globals(),
                            locals=locals(),
                            filename=os.path.join(options.temporaryDirectory,
                                                  "profile-main.out"))
        elif options.pdb:
            # ipdb was imported in the debugging setup above.
            with ipdb.launch_ipdb_on_exception():
                self._mainLoop()
        else:
            self._mainLoop()
    except BaseException as exc:
        # Log the traceback along with the full option set, then abort.
        msg = 'options={}'.format(pprint.pformat(vars(options)))
        logging.exception(msg)
        self.abortWork(repr(exc))

    monitoringThread.join()

    if self._aborting:
        logging.error("Aborting")
        return -1
    else:
        logging.info("Finished.")

    if options.doProfiling:
        self._printProfiles()

    # close h5 file.
    self._inAlnFile.close()
    return 0
def _checkFileCompatibility(self, alnFile): if not alnFile.isSorted: die("Input Alignment file must be sorted.") if alnFile.isCmpH5 and alnFile.isEmpty: die("Input Alignment file must be nonempty.")
def _checkFileCompatibility(self, alnFile): if not alnFile.isSorted: die("Input Alignment file must be sorted.") if alnFile.isEmpty: die("Input Alignment file must be nonempty.")
def main(self):
    """
    Run the whole consensus job.

    Selects the algorithm, peeks at the input file to resolve options,
    load the reference and configure the algorithm, then launches the
    slaves and runs the main loop (optionally under cProfile or ipdb).
    Any exception escaping the main loop aborts the work.

    Returns 0 on success and -1 when the run was aborted.
    """
    # This looks scary but it's not.  Python uses reference
    # counting and has a secondary, optional garbage collector for
    # collecting garbage cycles.  Unfortunately when a cyclic GC
    # happens when a thread is calling cPickle.dumps, the
    # interpreter crashes sometimes.  See Bug 19704.  Since we
    # don't leak garbage cycles, disabling the cyclic GC is
    # essentially harmless.
    gc.disable()

    self._algorithm = self._algorithmByName(options.algorithm)
    self._setupLogging()
    random.seed(42)

    logging.info("h5py version: %s" % h5py.version.version)
    logging.info("hdf5 version: %s" % h5py.version.hdf5_version)
    logging.info("ConsensusCore version: %s" %
                 (consensusCoreVersion() or "ConsensusCore unavailable"))
    logging.info("ConsensusCore2 version: %s" %
                 (consensusCore2Version() or "ConsensusCore2 unavailable"))
    logging.info("Starting.")

    atexit.register(self._cleanup)
    if options.doProfiling:
        self._makeTemporaryDirectory()

    # Open the input once up front to validate it and configure the run.
    with AlignmentSet(options.inputFilename) as peekFile:
        if options.algorithm == "arrow" and peekFile.isCmpH5:
            die("Arrow does not support CmpH5 files")
        if not peekFile.isCmpH5 and not peekFile.hasPbi:
            die("Genomic Consensus only works with cmp.h5 files and BAM "
                "files with accompanying .pbi files")

        logging.info("Peeking at file %s" % options.inputFilename)
        logging.info("Input data: numAlnHits=%d" % len(peekFile))
        resolveOptions(peekFile)
        self._loadReference(peekFile)
        self._checkFileCompatibility(peekFile)
        self._configureAlgorithm(options, peekFile)
        options.disableHdf5ChunkCache = True
        #options.disableHdf5ChunkCache = self._shouldDisableChunkCache(peekFile)
        #if options.disableHdf5ChunkCache:
        #    logging.info("Will disable HDF5 chunk cache (large number of datasets)")
        #logging.debug("After peek, # hdf5 objects open: %d" % h5py.h5f.get_obj_count())

    if options.dumpEvidence:
        self._setupEvidenceDumpDirectory(options.evidenceDirectory)

    self._launchSlaves()
    self._readAlignmentInput()

    monitoringThread = threading.Thread(target=monitorSlaves, args=(self, ))
    monitoringThread.start()

    try:
        if options.doProfiling:
            cProfile.runctx("self._mainLoop()",
                            globals=globals(),
                            locals=locals(),
                            filename=os.path.join(
                                options.temporaryDirectory,
                                "profile-main.out"))
        elif options.debug:
            if not options.threaded:
                die("Debugging only works with -T (threaded) mode")
            logging.info("PID: %d", os.getpid())
            import ipdb
            with ipdb.launch_ipdb_on_exception():
                self._mainLoop()
        else:
            self._mainLoop()
    except:
        # Catch-all: whatever went wrong, hand the traceback to abortWork.
        why = traceback.format_exc()
        self.abortWork(why)

    monitoringThread.join()

    if self._aborting:
        logging.error("Aborting")
        return -1
    else:
        logging.info("Finished.")

    if options.doProfiling:
        self._printProfiles()

    # close h5 file.
    self._inAlnFile.close()
    return 0