Пример #1
0
    def _launchSlaveProcesses(self):
        """
        Launch the workers, the result collector and the crash monitor.

        Creates the work and results queues through self._initQueues(),
        starts self.options.numWorkers workers (kept in self._workers),
        starts the KineticsWriter result collector
        (self._resultCollectorProcess) and finally starts
        self.monitoringThread, which runs monitorChildProcesses over all of
        those children.

        Side effect: self.options.numWorkers is rewritten to the number of
        workers actually launched (the CPU count when fewer than 1 was
        requested, exactly 1 in threaded mode).
        """
        availableCpus = multiprocessing.cpu_count()
        logging.info("Available CPUs: %d", availableCpus)
        logging.info("Requested worker processes: %d",
                     self.options.numWorkers)

        # Use all CPUs if numWorkers < 1
        if self.options.numWorkers < 1:
            self.options.numWorkers = availableCpus

        # Threaded mode always runs a single worker, so settle the final
        # worker count before comparing it against the CPU count; otherwise
        # the warning below could fire for a run that launches one worker.
        if self.options.threaded:
            self.options.numWorkers = 1
            WorkerType = KineticWorkerThread
        else:
            WorkerType = KineticWorkerProcess

        # Warn if more workers will be launched than there are CPUs
        if self.options.numWorkers > availableCpus:
            logging.warning(
                "More worker processes requested (%d) than CPUs available (%d);"
                " may result in suboptimal performance.",
                self.options.numWorkers, availableCpus)

        self._initQueues()

        # Launch the workers
        self._workers = []
        for _ in range(self.options.numWorkers):
            worker = WorkerType(self.options,
                                self._workQueue,
                                self._resultsQueue,
                                self.ipdModel,
                                sharedAlignmentSet=self.alignments)
            self._workers.append(worker)
            worker.start()
        logging.info("Launched worker processes.")

        # Launch result collector
        self._resultCollectorProcess = KineticsWriter(self.options,
                                                      self._resultsQueue,
                                                      self.refInfo,
                                                      self.ipdModel)
        self._resultCollectorProcess.start()
        logging.info("Launched result collector process.")

        # Spawn a thread that monitors the workers and the collector for
        # crashes
        self.monitoringThread = threading.Thread(
            target=monitorChildProcesses,
            args=(self._workers + [self._resultCollectorProcess], ))
        self.monitoringThread.start()
    def _launchSlaveProcesses(self):
        """
        Launch the workers, the result collector and the crash monitor.

        Creates the work and results queues through self._initQueues(),
        starts self.options.numWorkers workers (kept in self._workers),
        starts the KineticsWriter result collector
        (self._resultCollectorProcess) and finally starts
        self.monitoringThread, which runs monitorChildProcesses over all of
        those children.

        Side effect: self.options.numWorkers is rewritten to the number of
        workers actually launched (the CPU count when fewer than 1 was
        requested, exactly 1 in threaded mode).
        """
        availableCpus = multiprocessing.cpu_count()
        logging.info("Available CPUs: %d", availableCpus)
        logging.info("Requested worker processes: %d", self.options.numWorkers)

        # Use all CPUs if numWorkers < 1
        if self.options.numWorkers < 1:
            self.options.numWorkers = availableCpus

        # Threaded mode always runs a single worker, so settle the final
        # worker count before comparing it against the CPU count; otherwise
        # the warning below could fire for a run that launches one worker.
        if self.options.threaded:
            self.options.numWorkers = 1
            WorkerType = KineticWorkerThread
        else:
            WorkerType = KineticWorkerProcess

        # Warn if more workers will be launched than there are CPUs
        if self.options.numWorkers > availableCpus:
            logging.warning(
                "More worker processes requested (%d) than CPUs available (%d);"
                " may result in suboptimal performance.",
                self.options.numWorkers,
                availableCpus,
            )

        self._initQueues()

        # Launch the workers
        self._workers = []
        for _ in range(self.options.numWorkers):
            worker = WorkerType(self.options, self._workQueue, self._resultsQueue, self.ipdModel)
            self._workers.append(worker)
            worker.start()
        logging.info("Launched worker processes.")

        # Launch result collector
        self._resultCollectorProcess = KineticsWriter(self.options, self._resultsQueue, self.refInfo, self.ipdModel)
        self._resultCollectorProcess.start()
        logging.info("Launched result collector process.")

        # Spawn a thread that monitors the workers and the collector for
        # crashes
        self.monitoringThread = threading.Thread(
            target=monitorChildProcesses, args=(self._workers + [self._resultCollectorProcess],)
        )
        self.monitoringThread.start()
Пример #3
0
class KineticsToolsRunner(object):
    """
    Drives a kinetics summary run.

    start() validates the parsed command-line arguments and then calls
    run(), which loads the alignments, reference and IPD model, launches the
    worker and writer children, queues the reference chunks and waits for
    the children to finish.
    """

    def __init__(self, args):
        # Parsed command-line namespace; run() also exposes it as
        # self.options (same object).
        self.args = args
        # Both are populated by _mainLoop(); they are initialised here so the
        # precondition check in loadReferenceAndModel() fails with its
        # assertion rather than an AttributeError.
        self.alignments = None
        self.referenceWindows = None

    def start(self):
        """Validate the arguments, then execute the run and return its result."""
        self.validateArgs()
        return self.run()

    def getVersion(self):
        """Return the module-level __version__."""
        return __version__

    def validateArgs(self):
        """
        Report missing input or inconsistent options through parser.error().
        """
        # NOTE(review): presumably an argparse parser, whose error() prints
        # usage and exits -- confirm against get_parser().
        parser = get_parser()
        if not os.path.exists(self.args.alignment_set):
            parser.error('Input AlignmentSet file provided does not exist')

        if self.args.identify and self.args.control:
            parser.error('--control and --identify are mutally exclusive. Please choose one or the other')

        if self.args.useLDA and self.args.m5Cclassifier is None:
            parser.error('Please specify a folder containing forward.csv and reverse.csv classifiers in --m5Cclassifier.')

        if self.args.m5Cgff and not self.args.useLDA:
            parser.error('m5Cgff file can only be generated in --useLDA mode.')

        # if self.args.methylFraction and not self.args.identify:
        #    parser.error('Currently, --methylFraction only works when the --identify option is specified.')

    def run(self):
        """
        Translate --identify into modification codes, seed the RNG and run
        the main loop (optionally under cProfile, writing "profile.out").

        Returns whatever _mainLoop() returns.  Worker processes that are
        still alive when the main loop ends or raises are terminated.
        """
        # Figure out what modifications to identify.  The codes are emitted
        # in this fixed order regardless of the order on the command line.
        modCodes = (('m6A', 'H'), ('m4C', 'J'), ('m5C_TET', 'K'))
        mods = self.args.identify
        if mods:
            # Tolerate whitespace around the comma-separated names
            requested = set(name.strip() for name in mods.split(","))
            self.args.modsToCall = [code for name, code in modCodes
                                    if name in requested]
            self.args.identify = True

        self.options = self.args
        self.options.cmdLine = " ".join(sys.argv)
        self._workers = []

        # set random seed
        # XXX note that this is *not* guaranteed to yield reproducible results
        # independently of the number of processing cores used!
        if self.options.randomSeed is not None:
            np.random.seed(self.options.randomSeed)

        try:
            if not self.args.doProfiling:
                return self._mainLoop()

            # Profile the main loop while still handing its result back to
            # the caller; the stats are written even if the loop raises.
            profiler = cProfile.Profile()
            try:
                return profiler.runcall(self._mainLoop)
            finally:
                profiler.dump_stats("profile.out")
        finally:
            # Be sure to shutdown child processes if we get an exception on
            # the main thread (worker threads cannot be terminated)
            if not self.args.threaded:
                for w in self._workers:
                    if w.is_alive():
                        w.terminate()

    def _initQueues(self):
        """Create self._workQueue and self._resultsQueue, each bounded by maxQueueSize."""
        maxSize = self.options.maxQueueSize

        # Work chunks are created by the main thread and put on this queue.
        # They are consumed by the workers stored in self._workers; an
        # in-process queue is enough when the worker is a thread.
        if self.options.threaded:
            self._workQueue = Queue.Queue(maxSize)
        else:
            self._workQueue = multiprocessing.JoinableQueue(maxSize)

        # Completed chunks are put on this queue by the workers and consumed
        # by the KineticsWriter process, so it is always a multiprocessing
        # queue.
        self._resultsQueue = multiprocessing.JoinableQueue(maxSize)

    def _launchSlaveProcesses(self):
        """
        Launch the workers, the result collector and the crash monitor.

        Creates the work and results queues through self._initQueues(),
        starts self.options.numWorkers workers (kept in self._workers),
        starts the KineticsWriter result collector
        (self._resultCollectorProcess) and finally starts
        self.monitoringThread, which runs monitorChildProcesses over all of
        those children.

        Side effect: self.options.numWorkers is rewritten to the number of
        workers actually launched (the CPU count when fewer than 1 was
        requested, exactly 1 in threaded mode).
        """
        availableCpus = multiprocessing.cpu_count()
        logging.info("Available CPUs: %d", availableCpus)
        logging.info("Requested worker processes: %d", self.options.numWorkers)

        # Use all CPUs if numWorkers < 1
        if self.options.numWorkers < 1:
            self.options.numWorkers = availableCpus

        # Threaded mode always runs a single worker, so settle the final
        # worker count before comparing it against the CPU count.
        if self.options.threaded:
            self.options.numWorkers = 1
            WorkerType = KineticWorkerThread
        else:
            WorkerType = KineticWorkerProcess

        # Warn if more workers will be launched than there are CPUs
        if self.options.numWorkers > availableCpus:
            logging.warning("More worker processes requested (%d) than CPUs available (%d);"
                            " may result in suboptimal performance.",
                            self.options.numWorkers, availableCpus)

        self._initQueues()

        # Launch the workers
        self._workers = []
        for _ in range(self.options.numWorkers):
            worker = WorkerType(self.options, self._workQueue, self._resultsQueue,
                                self.ipdModel,
                                sharedAlignmentSet=self.alignments)
            self._workers.append(worker)
            worker.start()
        logging.info("Launched worker processes.")

        # Launch result collector
        self._resultCollectorProcess = KineticsWriter(self.options, self._resultsQueue, self.refInfo, self.ipdModel)
        self._resultCollectorProcess.start()
        logging.info("Launched result collector process.")

        # Spawn a thread that monitors the workers and the collector for
        # crashes
        self.monitoringThread = threading.Thread(
            target=monitorChildProcesses,
            args=(self._workers + [self._resultCollectorProcess],))
        self.monitoringThread.start()

    def _queueChunksForWindow(self, refWindow):
        """
        Compute the chunk extents and queue up the work for a single reference

        Placeholder: the window fields are read but nothing is queued; the
        chunking actually happens inline in _mainLoop().
        """
        winId = refWindow.refId
        winStart = refWindow.start
        winEnd = refWindow.end
        pass

    def loadReferenceAndModel(self, referencePath):
        """
        Load the reference contigs and build self.ipdModel.

        The model file is chosen, in order of precedence, from:
        1. the explicit path passed to --ipdModel;
        2. the parameter bundle in --paramsPath, selected by the majority
           sequencing chemistry of the alignments;
        3. the built-in model (ipdModel left as None).

        Logs an error and exits the process with status 1 when a requested
        path does not exist or the chemistry cannot be identified.
        """
        assert self.alignments is not None and self.referenceWindows is not None
        # Load the reference contigs - annotated with their refID from the cmp.h5
        logging.info("Loading reference contigs %s", referencePath)
        contigs = ReferenceUtils.loadReferenceContigs(
            referencePath,
            alignmentSet=self.alignments,
            windows=self.referenceWindows)

        # By default, use built-in model
        ipdModel = None

        if self.args.ipdModel:
            ipdModel = self.args.ipdModel
            logging.info("Using passed in ipd model: %s", ipdModel)
            if not os.path.exists(ipdModel):
                logging.error("Couldn't find model file: %s", ipdModel)
                sys.exit(1)
        elif self.args.paramsPath:
            if not os.path.exists(self.args.paramsPath):
                logging.error("Params path doesn't exist: %s", self.args.paramsPath)
                sys.exit(1)

            majorityChem = ReferenceUtils.loadAlignmentChemistry(self.alignments)

            # Temporary solution for Sequel chemistries: we do not
            # have trained kinetics models in hand yet for Sequel
            # chemistries.  However we have observed that the P5-C3
            # training seems to yield fairly good results on Sequel
            # chemistries to date.  So for the moment, we will use
            # that model for Sequel data.
            if majorityChem.startswith("S/"):
                logging.info("No trained model available yet for Sequel chemistries; modeling as P5-C3")
                majorityChem = "P5-C3"

            if majorityChem == 'unknown':
                logging.error("Chemistry cannot be identified---cannot perform kinetic analysis")
                sys.exit(1)

            ipdModel = os.path.join(self.args.paramsPath, majorityChem + ".h5")
            if not os.path.exists(ipdModel):
                logging.error("Aborting, no kinetics model available for this chemistry: %s", ipdModel)
                sys.exit(1)
            logging.info("Using Chemistry matched IPD model: %s", ipdModel)

        self.ipdModel = IpdModel(contigs, ipdModel, self.args.modelIters)

    def loadSharedAlignmentSet(self, cmpH5Filename):
        """
        Read the input AlignmentSet so the indices can be shared with the
        slaves.  This is also used to pass to ReferenceUtils for setting up
        the ipdModel object.

        Sets self.alignments and self.refInfo.
        """
        logging.info("Reading AlignmentSet: %s", cmpH5Filename)
        logging.info("           reference: %s", self.args.reference)
        self.alignments = AlignmentSet(cmpH5Filename,
                                       referenceFastaFname=self.args.reference)
        # XXX this should ensure that the file(s) get opened, including any
        # .pbi indices - but need to confirm this
        self.refInfo = self.alignments.referenceInfoTable

    def _mainLoop(self):
        """
        Main loop.

        1. Load the alignments and resolve the reference windows to visit.
        2. Load the reference and IPD model, then launch the worker and
           writer children.
        3. Chunk up every window and submit the chunk descriptions to the
           work queue.
        4. Send one None sentinel per worker, then wait for the workers, the
           monitoring thread, the results queue and the writer to finish.

        Returns 0.
        """

        # This looks scary but it's not.  Python uses reference
        # counting and has a secondary, optional garbage collector for
        # collecting garbage cycles.  Unfortunately when a cyclic GC
        # happens when a thread is calling cPickle.dumps, the
        # interpreter crashes sometimes.  See Bug 19704.  Since we
        # don't leak garbage cycles, disabling the cyclic GC is
        # essentially harmless.
        #gc.disable()

        self.loadSharedAlignmentSet(self.args.alignment_set)

        # Resolve the windows that will be visited.
        if self.args.referenceWindowsAsString is not None:
            self.referenceWindows = []
            for s in self.args.referenceWindowsAsString.split(","):
                try:
                    win = ReferenceUtils.parseReferenceWindow(s, self.alignments.referenceInfo)
                except Exception:
                    # Only ordinary errors are treated as "unrecognized";
                    # KeyboardInterrupt / SystemExit propagate untouched.
                    if self.args.skipUnrecognizedContigs:
                        logging.info("Skipping unrecognized reference window: %s", s)
                        continue
                    raise Exception("Unrecognized contig!")
                self.referenceWindows.append(win)
        elif self.args.referenceWindowsFromAlignment:
            self.referenceWindows = ReferenceUtils.referenceWindowsFromAlignment(self.alignments, self.alignments.referenceInfo)
            refNames = set(rw.refName for rw in self.referenceWindows)
            # limit output to contigs that overlap with reference windows
            self.refInfo = [r for r in self.refInfo if r.Name in refNames]
        else:
            self.referenceWindows = ReferenceUtils.createReferenceWindows(
                self.refInfo)

        # Load reference and IpdModel
        self.loadReferenceAndModel(self.args.reference)

        # Spawn workers
        self._launchSlaveProcesses()

        logging.info('Generating kinetics summary for [%s]', self.args.alignment_set)

        #self.referenceMap = self.alignments['/RefGroup'].asDict('RefInfoID', 'ID')
        #self.alnInfo = self.alignments['/AlnInfo'].asRecArray()

        # The workers are already running: chunk up each reference window
        # and queue the chunks, tagging each with a running counter.
        self.workChunkCounter = 0

        # Iterate over references
        for window in self.referenceWindows:
            logging.info('Processing window/contig: %s', window)
            for chunk in ReferenceUtils.enumerateChunks(self.args.referenceStride, window):
                self._workQueue.put((self.workChunkCounter, chunk))
                self.workChunkCounter += 1

        # Shutdown the workers with None sentinels, one per launched worker
        for _ in self._workers:
            self._workQueue.put(None)

        for w in self._workers:
            w.join()

        # Join on the result queue and the resultsCollector process.
        # This ensures all the results are written before shutdown.
        self.monitoringThread.join()
        self._resultsQueue.join()
        self._resultCollectorProcess.join()
        logging.info("ipdSummary.py finished. Exiting.")
        self.alignments.close()
        return 0
Пример #4
0
class KineticsToolsRunner(object):
    """
    Drives a kinetics summary run.

    start() validates the parsed command-line arguments and then calls
    run(), which loads the alignments, reference and IPD model, launches the
    worker and writer processes, queues the reference chunks and waits for
    the children to finish.
    """

    def __init__(self, args):
        # Parsed command-line namespace; run() also exposes it as
        # self.options (same object).
        self.args = args
        # Both are populated by _mainLoop(); they are initialised here so the
        # precondition check in loadReferenceAndModel() fails with its
        # assertion rather than an AttributeError.
        self.alignments = None
        self.referenceWindows = None

    def start(self):
        """Validate the arguments, then execute the run and return its result."""
        self.validateArgs()
        return self.run()

    def getVersion(self):
        """Return the module-level __version__."""
        return __version__

    def validateArgs(self):
        """
        Report missing input or inconsistent options through parser.error(),
        and clear --identify when --control was given.
        """
        # NOTE(review): presumably an argparse parser, whose error() prints
        # usage and exits -- confirm against get_parser().
        parser = get_parser()
        if not os.path.exists(self.args.alignment_set):
            parser.error('Input AlignmentSet file provided does not exist')

        # Over-ride --identify if --control was specified
        if self.args.control:
            self.args.identify = ""

        if self.args.useLDA and self.args.m5Cclassifier is None:
            parser.error(
                'Please specify a folder containing forward.csv and reverse.csv classifiers in --m5Cclassifier.'
            )

        if self.args.m5Cgff and not self.args.useLDA:
            parser.error(
                'm5Cgff file can only be generated in --useLDA mode.')

        # if self.args.methylFraction and not self.args.identify:
        #    parser.error('Currently, --methylFraction only works when the --identify option is specified.')

    def run(self):
        """
        Translate --identify into modification codes, seed the RNG and run
        the main loop (optionally under cProfile, writing "profile.out").

        Returns whatever _mainLoop() returns.  Worker processes that are
        still alive when the main loop ends or raises are terminated.
        """
        # Figure out what modifications to identify.  The codes are emitted
        # in this fixed order regardless of the order on the command line.
        modCodes = (('m6A', 'H'), ('m4C', 'J'), ('m5C_TET', 'K'))
        mods = self.args.identify
        if mods:
            # Tolerate whitespace around the comma-separated names
            requested = set(name.strip() for name in mods.split(","))
            self.args.modsToCall = [code for name, code in modCodes
                                    if name in requested]
            self.args.identify = True

        self.options = self.args
        self.options.cmdLine = " ".join(sys.argv)
        self._workers = []

        # set random seed
        # XXX note that this is *not* guaranteed to yield reproducible results
        # independently of the number of processing cores used!
        if self.options.randomSeed is not None:
            np.random.seed(self.options.randomSeed)

        try:
            if not self.args.doProfiling:
                return self._mainLoop()

            # Profile the main loop while still handing its result back to
            # the caller; the stats are written even if the loop raises.
            profiler = cProfile.Profile()
            try:
                return profiler.runcall(self._mainLoop)
            finally:
                profiler.dump_stats("profile.out")
        finally:
            # Be sure to shutdown child processes if we get an exception on
            # the main thread
            for w in self._workers:
                if w.is_alive():
                    w.terminate()

    def _initQueues(self):
        """Create self._workQueue and self._resultsQueue, each bounded by maxQueueSize."""
        maxSize = self.options.maxQueueSize

        # Work chunks are created by the main thread and put on this queue.
        # They are consumed by the workers stored in self._workers.
        self._workQueue = multiprocessing.JoinableQueue(maxSize)

        # Completed chunks are put on this queue by the workers and consumed
        # by the KineticsWriter process.
        self._resultsQueue = multiprocessing.JoinableQueue(maxSize)

    def _launchSlaveProcesses(self):
        """
        Launch the worker processes, the result collector and the crash
        monitor.

        Creates the work and results queues through self._initQueues(),
        starts self.options.numWorkers KineticWorkerProcess workers (kept in
        self._workers), starts the KineticsWriter result collector
        (self._resultCollectorProcess) and finally starts
        self.monitoringThread, which runs monitorChildProcesses over all of
        those children.

        Side effect: self.options.numWorkers is replaced by the CPU count
        when fewer than 1 worker was requested.
        """
        availableCpus = multiprocessing.cpu_count()
        logging.info("Available CPUs: %d", availableCpus)
        logging.info("Requested worker processes: %d",
                     self.options.numWorkers)

        # Use all CPUs if numWorkers < 1
        if self.options.numWorkers < 1:
            self.options.numWorkers = availableCpus

        # Warn if more workers will be launched than there are CPUs
        if self.options.numWorkers > availableCpus:
            logging.warning(
                "More worker processes requested (%d) than CPUs available (%d);"
                " may result in suboptimal performance.",
                self.options.numWorkers, availableCpus)

        self._initQueues()

        # Launch the worker processes
        self._workers = []
        for _ in range(self.options.numWorkers):
            worker = KineticWorkerProcess(self.options,
                                          self._workQueue,
                                          self._resultsQueue,
                                          self.ipdModel,
                                          sharedAlignmentSet=self.alignments)
            self._workers.append(worker)
            worker.start()
        logging.info("Launched worker processes.")

        # Launch result collector
        self._resultCollectorProcess = KineticsWriter(self.options,
                                                      self._resultsQueue,
                                                      self.refInfo,
                                                      self.ipdModel)
        self._resultCollectorProcess.start()
        logging.info("Launched result collector process.")

        # Spawn a thread that monitors the workers and the collector for
        # crashes
        self.monitoringThread = threading.Thread(
            target=monitorChildProcesses,
            args=(self._workers + [self._resultCollectorProcess], ))
        self.monitoringThread.start()

    def _queueChunksForWindow(self, refWindow):
        """
        Compute the chunk extents and queue up the work for a single reference

        Placeholder: the window fields are read but nothing is queued; the
        chunking actually happens inline in _mainLoop().
        """
        winId = refWindow.refId
        winStart = refWindow.start
        winEnd = refWindow.end
        pass

    def loadReferenceAndModel(self, referencePath, ipdModelFilename):
        """
        Load the reference contigs for the selected windows and build
        self.ipdModel from them and the given model file.

        Requires self.alignments and self.referenceWindows to be set.
        """
        assert self.alignments is not None and self.referenceWindows is not None
        # Load the reference contigs - annotated with their refID from the
        # alignments
        logging.info("Loading reference contigs {!r}".format(referencePath))
        contigs = ReferenceUtils.loadReferenceContigs(
            referencePath,
            alignmentSet=self.alignments,
            windows=self.referenceWindows)
        self.ipdModel = IpdModel(contigs, ipdModelFilename,
                                 self.args.modelIters)

    def loadSharedAlignmentSet(self, alignmentFilename):
        """
        Read the input AlignmentSet so the indices can be shared with the
        slaves.  This is also used to pass to ReferenceUtils for setting up
        the ipdModel object.

        Sets self.alignments and self.refInfo.
        """
        logging.info("Reading AlignmentSet: %s", alignmentFilename)
        logging.info("           reference: %s", self.args.reference)
        self.alignments = AlignmentSet(alignmentFilename,
                                       referenceFastaFname=self.args.reference)
        # XXX this should ensure that the file(s) get opened, including any
        # .pbi indices - but need to confirm this
        self.refInfo = self.alignments.referenceInfoTable

    def _mainLoop(self):
        """
        Main loop.

        1. Load the alignments and resolve the reference windows to visit.
        2. Pick the IPD model file, load the reference and model, then launch
           the worker and writer processes.
        3. Chunk up every window and submit the chunk descriptions to the
           work queue.
        4. Send one None sentinel per worker, then wait for the workers, the
           monitoring thread, the results queue and the writer to finish.

        Returns 0.
        """

        # This looks scary but it's not.  Python uses reference
        # counting and has a secondary, optional garbage collector for
        # collecting garbage cycles.  Unfortunately when a cyclic GC
        # happens when a thread is calling cPickle.dumps, the
        # interpreter crashes sometimes.  See Bug 19704.  Since we
        # don't leak garbage cycles, disabling the cyclic GC is
        # essentially harmless.
        # gc.disable()

        self.loadSharedAlignmentSet(self.args.alignment_set)

        # Resolve the windows that will be visited.
        if self.args.referenceWindowsAsString is not None:
            self.referenceWindows = []
            for s in self.args.referenceWindowsAsString.split(","):
                try:
                    win = ReferenceUtils.parseReferenceWindow(
                        s, self.alignments.referenceInfo)
                except Exception:
                    # Only ordinary errors are treated as "unrecognized";
                    # KeyboardInterrupt / SystemExit propagate untouched.
                    if self.args.skipUnrecognizedContigs:
                        logging.info(
                            "Skipping unrecognized reference window: %s", s)
                        continue
                    raise Exception("Unrecognized contig!")
                self.referenceWindows.append(win)
        elif self.args.referenceWindowsFromAlignment:
            self.referenceWindows = ReferenceUtils.referenceWindowsFromAlignment(
                self.alignments, self.alignments.referenceInfo)
            refNames = set(rw.refName for rw in self.referenceWindows)
            # limit output to contigs that overlap with reference windows
            self.refInfo = [r for r in self.refInfo if r.Name in refNames]
        else:
            self.referenceWindows = ReferenceUtils.createReferenceWindows(
                self.refInfo)

        # Load reference and IpdModel; an explicit --useChemistry wins over
        # the chemistry detected from the alignments.
        chemName = ReferenceUtils.loadAlignmentChemistry(self.alignments)
        if self.args.useChemistry is not None:
            chemName = self.args.useChemistry
        ipdModelFilename = loader.getIpdModelFilename(
            ipdModel=self.args.ipdModel,
            majorityChem=chemName,
            paramsPath=self.args.paramsPath)
        self.loadReferenceAndModel(self.args.reference, ipdModelFilename)

        # Spawn workers
        self._launchSlaveProcesses()

        logging.info('Generating kinetics summary for [%s]',
                     self.args.alignment_set)

        #self.referenceMap = self.alignments['/RefGroup'].asDict('RefInfoID', 'ID')
        #self.alnInfo = self.alignments['/AlnInfo'].asRecArray()

        # The workers are already running: chunk up each reference window
        # and queue the chunks, tagging each with a running counter.
        self.workChunkCounter = 0

        # Iterate over references
        for window in self.referenceWindows:
            logging.info('Processing window/contig: %s', window)
            for chunk in ReferenceUtils.enumerateChunks(
                    self.args.referenceStride, window):
                self._workQueue.put((self.workChunkCounter, chunk))
                self.workChunkCounter += 1

        # Shutdown the workers with None sentinels, one per launched worker
        for _ in self._workers:
            self._workQueue.put(None)

        for w in self._workers:
            w.join()

        # Join on the result queue and the resultsCollector process.
        # This ensures all the results are written before shutdown.
        self.monitoringThread.join()
        self._resultsQueue.join()
        self._resultCollectorProcess.join()
        logging.info("ipdSummary.py finished. Exiting.")
        self.alignments.close()
        return 0
class ReprocessMotifSites(PBToolRunner):
    def __init__(self):
        """
        Build the command-line interface for reprocessing motif sites.

        Passes the tool description to PBToolRunner.__init__ and then
        registers this tool's options on self.parser (presumably created by
        the base class -- it is not defined in this class).

        The ipdSummary.py options --control, --outfile, --identify, --csv,
        --pickle, --summary_h5, --pvalue and --methylFraction are
        deliberately not exposed here; run() assigns fixed values for them.
        """
        desc = [
            'For all sites in motifs.gff, reports estimated methylated fraction and 95% confidence interval',
            'Notes: For all command-line arguments, default values are listed in [].'
        ]
        super(ReprocessMotifSites, self).__init__('\n'.join(desc))

        self.parser.add_argument(
            '--numWorkers',
            dest='numWorkers',
            default=-1,  # Defaults to using all logical CPUs
            type=int,
            help='Number of thread to use (-1 uses all logical cpus)')

        self.parser.add_argument('infile',
                                 metavar='input.cmp.h5',
                                 help='Input cmp.h5 filename')

        self.parser.add_argument('--gff',
                                 dest='gff',
                                 default=None,
                                 help='Name of output GFF file [%(default)s]')

        self.parser.add_argument('--reference',
                                 dest='reference',
                                 required=True,
                                 help='Path to reference FASTA file')

        # argparse applies `type` only to string defaults, so the default
        # must already be an int to match values given on the command line.
        self.parser.add_argument(
            "--maxLength",
            default=int(3e12),
            type=int,
            help="Maximum number of bases to process per contig")

        self.parser.add_argument(
            '--minCoverage',
            dest='minCoverage',
            default=3,
            type=int,
            help='Minimum coverage required to call a modified base')

        self.parser.add_argument('--maxQueueSize',
                                 dest='maxQueueSize',
                                 default=1000,
                                 type=int,
                                 help='Max Queue Size')

        self.parser.add_argument('--maxCoverage',
                                 dest='maxCoverage',
                                 type=int,
                                 default=None,
                                 help='Maximum coverage to use at each site')

        self.parser.add_argument('--mapQvThreshold',
                                 dest='mapQvThreshold',
                                 type=float,
                                 default=-1.0)

        # Any value other than the literal string 'False' enables this.
        self.parser.add_argument('--subread_norm',
                                 dest='subread_norm',
                                 default=True,
                                 type=lambda x: x != 'False',
                                 help='Normalized subread ipds')

        self.parser.add_argument(
            '--ipdModel',
            dest='ipdModel',
            default=None,
            help='Alternate synthetic IPD model HDF5 file')

        self.parser.add_argument('--cap_percentile',
                                 dest='cap_percentile',
                                 type=float,
                                 default=99.0,
                                 help='Global IPD percentile to cap IPDs at')

        self.parser.add_argument(
            "--threaded",
            "-T",
            action="store_true",
            dest="threaded",
            default=False,
            help=
            "Run threads instead of processes (for debugging purposes only)")

        self.parser.add_argument(
            "--profile",
            action="store_true",
            dest="doProfiling",
            default=False,
            help="Enable Python-level profiling (using cProfile).")

        # The following are in addition to ipdSummary.py's inputs:
        self.parser.add_argument('--motifs',
                                 dest="motifs",
                                 required=True,
                                 help='Name of motifs GFF file [%(default)s]')

        self.parser.add_argument('--motif_summary',
                                 dest="motif_summary",
                                 required=True,
                                 help='Name of motif summary CSV file')

        self.parser.add_argument(
            '--undetected',
            action="store_true",
            dest="undetected",
            default=False,
            help=
            "Setting this flag yields output with only undetected motif sites."
        )

        self.parser.add_argument(
            '--modifications',
            dest="modifications",
            default=None,
            help='Name of modifications GFF file [%(default)s]')

        self.parser.add_argument(
            '--oldData',
            action="store_true",
            dest="oldData",
            default=False,
            help=
            "For datasets prior to 1.3.3 (use this option to increase testing possibilities)"
        )

        # A new addition to ipdSummary.py

        self.parser.add_argument(
            '--paramsPath',
            dest='paramsPath',
            default=None,
            help=
            'Directory containing in-silico trained model for each chemistry')

        self.parser.add_argument(
            '--modelIters',
            dest='modelIters',
            type=int,
            default=-1,
            help='[Internal] Number of GBM model iteration to use')

    def getVersion(self):
        """Return the module-level ``__version__`` value."""
        return __version__

    def validateArgs(self):
        if not os.path.exists(self.args.infile):
            self.parser.error('input.cmp.h5 file provided does not exist')

        # Add checks corresponding to new required inputs:
        if not os.path.exists(self.args.motifs):
            self.parser.error('input motifs gff file provided does not exist')

        if not os.path.exists(self.args.motif_summary):
            self.parser.error(
                'input motif_summary csv file provided does not exist')

        if not self.args.undetected and not os.path.exists(
                self.args.modifications):
            self.parser.error(
                'either the --undetected flag must be set, or a valid modifications.gff must be provided'
            )

    def run(self):

        # The following arguments are set in order to use ResultWriter.py as is:
        self.args.methylFraction = True
        self.args.outfile = None
        self.args.csv = None
        self.args.control = None
        self.args.summary_h5 = None
        self.args.pickle = None
        self.args.identify = False
        self.args.pvalue = 1.0

        self.options = self.args
        self.options.cmdLine = " ".join(sys.argv)
        self._workers = []

        # Log generously
        logFormat = '%(asctime)s [%(levelname)s] %(message)s'
        logging.basicConfig(level=logging.INFO, format=logFormat)
        stdOutHandler = logging.StreamHandler(sys.stdout)
        # logging.Logger.root.addHandler(stdOutHandler)
        # logging.info("t1")

        if self.args.doProfiling:
            cProfile.runctx("self._mainLoop()",
                            globals=globals(),
                            locals=locals(),
                            filename="profile-main4.out")

        else:
            try:
                ret = self._mainLoop()
            finally:
                # Be sure to shutdown child processes if we get an exception on the main thread
                if not self.args.threaded:
                    for w in self._workers:
                        if w.is_alive():
                            w.terminate()

            return ret

    def _initQueues(self):
        if self.options.threaded:
            # Work chunks are created by the main thread and put on this queue
            # They will be consumed by KineticWorker threads, stored in self._workers
            self._workQueue = Queue.Queue(self.options.maxQueueSize)

            # Completed chunks are put on this queue by KineticWorker threads
            # They are consumed by the KineticsWriter process
            self._resultsQueue = multiprocessing.JoinableQueue(
                self.options.maxQueueSize)
        else:
            # Work chunks are created by the main thread and put on this queue
            # They will be consumed by KineticWorker threads, stored in self._workers
            self._workQueue = multiprocessing.JoinableQueue(
                self.options.maxQueueSize)

            # Completed chunks are put on this queue by KineticWorker threads
            # They are consumed by the KineticsWriter process
            self._resultsQueue = multiprocessing.JoinableQueue(
                self.options.maxQueueSize)

    def _launchSlaveProcesses(self):
        """
        Launch a group of worker processes (self._workers), the queue
        (self._workQueue) that will be used to send them chunks of
        work, and the queue that will be used to receive back the
        results (self._resultsQueue).

        Additionally, launch the result collector process
        (self._resultCollectorProcess) and a monitoring thread
        (self.monitoringThread) that watches all of them.

        Side effect: options.numWorkers is rewritten to the number of
        workers actually started (all CPUs when < 1, exactly 1 in threaded
        mode).
        """
        availableCpus = multiprocessing.cpu_count()
        logging.info("Available CPUs: %d", availableCpus)
        logging.info("Requested worker processes: %d",
                     self.options.numWorkers)

        # Use all CPUs if numWorkers < 1
        if self.options.numWorkers < 1:
            self.options.numWorkers = availableCpus

        # Warn if a bad numWorkers argument is used
        if self.options.numWorkers > availableCpus:
            # logging.warning is the documented spelling; logging.warn is a
            # deprecated alias.
            logging.warning(
                "More worker processes requested (%d) than CPUs available (%d);"
                " may result in suboptimal performance.",
                self.options.numWorkers, availableCpus)

        self._initQueues()

        if self.options.threaded:
            # Threaded (debug) mode always runs a single worker
            self.options.numWorkers = 1
            WorkerType = KineticWorkerThread
        else:
            WorkerType = KineticWorkerProcess

        # Launch the worker processes
        self._workers = []
        for _ in range(self.options.numWorkers):
            p = WorkerType(self.options, self._workQueue, self._resultsQueue,
                           self.ipdModel)
            self._workers.append(p)
            p.start()
        logging.info("Launched worker processes.")

        # Launch result collector
        self._resultCollectorProcess = KineticsWriter(self.options,
                                                      self._resultsQueue,
                                                      self.refInfo,
                                                      self.ipdModel)
        self._resultCollectorProcess.start()
        logging.info("Launched result collector process.")

        # Spawn a thread that monitors worker threads for crashes
        self.monitoringThread = threading.Thread(
            target=monitorChildProcesses,
            args=(self._workers + [self._resultCollectorProcess], ))
        self.monitoringThread.start()

    def _queueChunksForReference(self):
        """
        Split the motif sites into blocks and put one work chunk per block
        on self._workQueue.

        Reads the motif summary CSV (args.motif_summary) into a dict keyed
        by the first column, reads the motif GFF (args.motifs) into a list
        of per-site dicts, and queues ``(self.workChunkCounter, chunk)``
        items where ``chunk`` is::

            (sites, refInfo, motifInfo, modifications, undetected,
             oldData, blockStart, blockEnd)

        self.workChunkCounter is incremented once per queued chunk.  With
        --undetected, options.modifications is reset to None.  Nothing is
        queued when there are no sites to process.
        """
        undetected = self.options.undetected

        # Read in motif_summary.csv; the value column differs for old data.
        col = 1 if self.options.oldData else 2
        motifInfo = {}
        with open(self.args.motif_summary, 'r') as summaryFile:
            reader = csv.reader(summaryFile, delimiter=',')
            next(reader, None)  # skip the header row (tolerates empty file)
            for row in reader:
                motifInfo[row[0]] = row[col]

        # Load the motif sites; with --undetected keep only records whose
        # type column is '.'.
        motifDicts = [{
            "seqID": x.seqid,
            "type": x.type,
            "score": x.score,
            "pos": x.start,
            "strand": x.strand,
            "attributes": x.attributes
        } for x in GffReader(self.args.motifs)
                      if not undetected or x.type == '.']

        if undetected:
            self.options.modifications = None

        # The "reference length" here is the number of motif sites.
        refLength = len(motifDicts)
        nBases = int(min(refLength, self.args.maxLength))
        if nBases < 1:
            # A zero block size is not a usable np.arange step, so bail out
            # instead of failing while building the block layout.
            logging.warning("No motif sites to process; "
                            "no work chunks were queued.")
            return

        # Maximum number of hits per chunk
        MAX_HITS = 500
        nBlocks = max(1, self.options.numWorkers * 4, nBases // MAX_HITS)

        # Block layout
        blockSize = min(nBases, max(nBases // nBlocks + 1, 100))
        blockStarts = np.arange(0, nBases, step=blockSize)
        blockEnds = blockStarts + blockSize

        # Queue up work blocks.  The last block end may exceed nBases;
        # slicing clamps it.
        for start, end in zip(blockStarts, blockEnds):
            chunk = (motifDicts[start:end], self.refInfo, motifInfo,
                     self.options.modifications, undetected,
                     self.options.oldData, start, end)
            self._workQueue.put((self.workChunkCounter, chunk))
            self.workChunkCounter += 1

    def loadReference(self):
        """
        Load the reference named by --reference and build the IpdModel.

        Sets self.referenceEntry, self.refInfo (the entry's contigs),
        self.lutPath (the --ipdModel path, or None when not given) and
        self.ipdModel.  Raises Exception when --ipdModel names a path that
        does not exist.
        """
        # FIXME - support a bare fasta file as well?
        entry = ReferenceEntry(self.args.reference)
        self.referenceEntry = entry
        self.refInfo = entry.contigs

        # An unset/empty --ipdModel means "no explicit model path"
        self.lutPath = self.args.ipdModel or None

        if self.lutPath is not None and not os.path.exists(self.lutPath):
            message = "Couldn't find model file: %s" % self.lutPath
            logging.info(message)
            raise Exception(message)

        self.ipdModel = IpdModel(self.referenceEntry, self.lutPath)

    def loadReferenceAndModel(self, referencePath, cmpH5Path):
        """
        Load the reference contigs and select/build the IPD model.

        referencePath -- path handed to ReferenceUtils.loadReferenceContigs
        cmpH5Path     -- cmp.h5 path used for contig annotation, the
                         reference info table and chemistry lookup

        Sets self.refInfo and self.ipdModel.  Exits the process with
        status 1 when an explicitly requested --ipdModel file or the
        --paramsPath directory does not exist.
        """
        # Load the reference contigs - annotated with their refID from the cmp.h5
        contigs = ReferenceUtils.loadReferenceContigs(referencePath, cmpH5Path)

        # Read reference info table from cmp.h5
        (refInfoTable, _) = ReferenceUtils.loadCmpH5Tables(cmpH5Path)
        self.refInfo = refInfoTable

        # There are three different ways the ipdModel can be loaded.
        # In order of precedence they are:
        # 1. Explicit path passed to --ipdModel
        # 2. Path to parameter bundle, model selected using the cmp.h5's chemistry info
        # 3. Fall back to built-in model.

        # By default, use built-in model
        ipdModel = None

        if self.args.ipdModel:
            ipdModel = self.args.ipdModel
            logging.info("Using passed in ipd model: %s" % self.args.ipdModel)
            if not os.path.exists(self.args.ipdModel):
                logging.error("Couldn't find model file: %s" %
                              self.args.ipdModel)
                # Stop here, like the --paramsPath branch does, rather than
                # handing a nonexistent path to IpdModel.
                sys.exit(1)

        elif self.args.paramsPath:
            if not os.path.exists(self.args.paramsPath):
                logging.error("Params path doesn't exist: %s" %
                              self.args.paramsPath)
                sys.exit(1)

            majorityChem = ReferenceUtils.loadCmpH5Chemistry(cmpH5Path)
            ipdModel = os.path.join(self.args.paramsPath, majorityChem + ".h5")

            if majorityChem == 'unknown':
                logging.warning(
                    "Chemistry is unknown. Falling back to built-in model")
                ipdModel = None
            elif not os.path.exists(ipdModel):
                logging.warning("Model not found: %s" % ipdModel)
                logging.warning("Falling back to built-in model")
                ipdModel = None
            else:
                logging.info("Using Chemistry matched IPD model: %s" %
                             ipdModel)

        self.ipdModel = IpdModel(contigs, ipdModel, self.args.modelIters)

    def _mainLoop(self):
        """
        Drive one complete reprocessing run and return 0.

        Loads the reference and IPD model, launches the workers, queues
        all work chunks, sends one None sentinel per worker, and then
        waits for the workers, the monitoring thread, the results queue
        and the result collector to finish.
        """
        # See comments in ipdSummary.py
        gc.disable()

        infile = self.args.infile

        # Load reference and IpdModel, then spawn workers
        self.loadReferenceAndModel(self.args.reference, infile)
        self._launchSlaveProcesses()

        # cmp.h5 we're using -- use this to orchestrate the work
        self.cmph5 = CmpH5Reader(infile)
        logging.info('Generating kinetics summary for [%s]' % infile)

        self.workChunkCounter = 0
        self._queueChunksForReference()

        # Shutdown worker threads with None sentinels (one per worker)
        for _ in range(self.args.numWorkers):
            self._workQueue.put(None)

        for worker in self._workers:
            worker.join()

        # Join on the result queue and the resultsCollector process.
        # This ensures all the results are written before shutdown.
        self.monitoringThread.join()
        self._resultsQueue.join()
        self._resultCollectorProcess.join()
        logging.info("reprocessMotifSites.py finished. Exiting.")
        del self.cmph5
        return 0
Пример #6
0
class KineticsToolsRunner(object):
    def __init__(self):
        """
        Build the argparse parser (self.parser) for the kinetics tool.

        Registers the positional input, output options, calculation and
        parameter options, computation-management options and debugging
        flags.  The validators (validateFile, validateNoneOrFile,
        validateNoneOrDir) and _getResourcePath are defined elsewhere in
        this module.
        """
        desc = [
            'Tool for detecting DNA base-modifications from kinetic signatures',
            'Notes: For all command-line arguments, default values are listed in [].'
        ]
        description = '\n'.join(desc)

        self.parser = argparse.ArgumentParser(
            formatter_class=argparse.ArgumentDefaultsHelpFormatter,
            description=description)

        # Positional arguments:

        self.parser.add_argument('infile',
                                 metavar='input.cmp.h5',
                                 type=validateFile,
                                 help='Input cmp.h5 filename')

        # Optional arguments:

        # Output options:

        # Note: reference is actually not optional:

        self.parser.add_argument('--reference',
                                 type=validateFile,
                                 help='Path to reference FASTA file')

        self.parser.add_argument(
            '--outfile',
            dest='outfile',
            default=None,
            help=
            'Use this option to generate all possible output files. Argument here is the root filename of the output files.'
        )

        self.parser.add_argument('--gff',
                                 dest='gff',
                                 default=None,
                                 help='Name of output GFF file')

        # FIXME: Need to add an extra check for this; it can only be used if --useLDA flag is set.
        self.parser.add_argument(
            '--m5Cgff',
            dest='m5Cgff',
            default=None,
            help='Name of output GFF file containing m5C scores')

        # FIXME: Make sure that this is specified if --useLDA flag is set.
        # NOTE: the flag spelling ('classifer') is part of the existing
        # command-line interface and is kept as is.
        self.parser.add_argument(
            '--m5Cclassifer',
            dest='m5Cclassifier',
            default=None,
            help='Specify csv file containing a 127 x 2 matrix')

        self.parser.add_argument('--csv',
                                 dest='csv',
                                 default=None,
                                 help='Name of output CSV file.')

        self.parser.add_argument(
            '--csv_h5',
            dest='csv_h5',
            default=None,
            help='Name of csv output to be written in hdf5 format.')

        self.parser.add_argument('--pickle',
                                 dest='pickle',
                                 default=None,
                                 help='Name of output pickle file.')

        self.parser.add_argument('--summary_h5',
                                 dest='summary_h5',
                                 default=None,
                                 help='Name of output summary h5 file.')

        self.parser.add_argument('--ms_csv',
                                 dest='ms_csv',
                                 default=None,
                                 help='Multisite detection CSV file.')

        # Calculation options:

        self.parser.add_argument(
            '--control',
            dest='control',
            default=None,
            type=validateNoneOrFile,
            help=
            'cmph.h5 file containing a control sample. Tool will perform a case-control analysis'
        )

        self.parser.add_argument(
            '--identify',
            dest='identify',
            default=False,
            help=
            'Identify modification types. Comma-separated list of know modification types. Current options are: m6A, m4C, m5C_TET. Cannot be used with --control'
        )

        self.parser.add_argument(
            "--methylFraction",
            action="store_true",
            dest="methylFraction",
            default=False,
            help=
            "In the --identify mode, add --methylFraction to command line to estimate the methylated fraction, along with 95%% confidence interval bounds."
        )

        # Temporary addition to test LDA for Ca5C detection:
        self.parser.add_argument(
            '--useLDA',
            action="store_true",
            dest='useLDA',
            default=False,
            help='Set this flag to debug LDA for m5C/Ca5C detection')

        # Parameter options:

        self.parser.add_argument(
            '--paramsPath',
            dest='paramsPath',
            default=_getResourcePath(),
            type=validateNoneOrDir,
            help=
            'Directory containing in-silico trained model for each chemistry')

        # argparse applies `type` only to string defaults, so the default
        # must already be an int to match values given on the command line.
        self.parser.add_argument(
            "--maxLength",
            default=int(3e12),
            type=int,
            help="Maximum number of bases to process per contig")

        self.parser.add_argument(
            '--minCoverage',
            dest='minCoverage',
            default=3,
            type=int,
            help='Minimum coverage required to call a modified base')

        self.parser.add_argument('--maxQueueSize',
                                 dest='maxQueueSize',
                                 default=20,
                                 type=int,
                                 help='Max Queue Size')

        self.parser.add_argument('--maxCoverage',
                                 dest='maxCoverage',
                                 type=int,
                                 default=-1,
                                 help='Maximum coverage to use at each site')

        self.parser.add_argument('--mapQvThreshold',
                                 dest='mapQvThreshold',
                                 type=float,
                                 default=-1.0)

        self.parser.add_argument(
            '--pvalue',
            dest='pvalue',
            default=0.01,
            type=float,
            help='p-value required to call a modified base')

        self.parser.add_argument(
            '--ipdModel',
            dest='ipdModel',
            default=None,
            help='Alternate synthetic IPD model HDF5 file')

        self.parser.add_argument(
            '--modelIters',
            dest='modelIters',
            type=int,
            default=-1,
            help='[Internal] Number of GBM model iteration to use')

        self.parser.add_argument('--cap_percentile',
                                 dest='cap_percentile',
                                 type=float,
                                 default=99.0,
                                 help='Global IPD percentile to cap IPDs at')

        self.parser.add_argument(
            "--methylMinCov",
            type=int,
            dest='methylMinCov',
            default=10,
            help=
            "Do not try to estimate methylFraction unless coverage is at least this."
        )

        self.parser.add_argument(
            "--identifyMinCov",
            type=int,
            dest='identifyMinCov',
            default=5,
            help=
            "Do not try to identify the modification type unless coverage is at least this."
        )

        # Computation management options:

        self.parser.add_argument(
            "--refContigs",
            "-w",
            type=str,
            dest='refContigs',
            default=None,
            help=("Specify one reference contig name, or multiple comma-separated "
                  "contig names, to be processed.  By default, processes all "
                  "contigs with mapped coverage."))

        def slurpWindowFile(fname):
            # Join the stripped lines of the file with commas, closing the
            # file handle as soon as it has been read.
            with open(fname) as handle:
                return ",".join(line.strip() for line in handle)

        self.parser.add_argument(
            "--refContigIndex",
            type=int,
            dest='refContigIndex',
            default=-1,
            help=
            "For debugging purposes only - rather than enter a reference contig name, simply enter an index"
        )

        # Shares dest='refContigs' with --refContigs/-w above.
        self.parser.add_argument(
            "--refContigsFile",
            "-W",
            type=slurpWindowFile,
            dest='refContigs',
            default=None,
            help="A file containing contig names, one per line")

        self.parser.add_argument(
            '--numWorkers',
            dest='numWorkers',
            default=-1,  # Defaults to using all logical CPUs
            type=int,
            help='Number of thread to use (-1 uses all logical cpus)')

        # Debugging help options:

        self.parser.add_argument(
            "--threaded",
            "-T",
            action="store_true",
            dest="threaded",
            default=False,
            help=
            "Run threads instead of processes (for debugging purposes only)")

        self.parser.add_argument(
            "--profile",
            action="store_true",
            dest="doProfiling",
            default=False,
            help="Enable Python-level profiling (using cProfile).")

        self.parser.add_argument(
            '--pdb',
            action='store_true',
            dest="usePdb",
            default=False,
            help=
            "Enable dropping down into pdb debugger if an Exception is raised."
        )

        # Verbosity
        self.parser.add_argument("--verbose",
                                 "-v",
                                 action="store_true",
                                 default=False)

        # Version
        class PrintVersionAction(argparse.Action):
            def __call__(self, parser, namespace, values, option_string=None):
                # Parenthesised form is valid as both the Python 2 print
                # statement and the Python 3 print function.
                print(__version__)
                sys.exit(0)

        self.parser.add_argument("--version",
                                 nargs=0,
                                 action=PrintVersionAction)

    def parseArgs(self):
        self.args = self.parser.parse_args()

    def start(self):
        self.parseArgs()
        self.validateArgs()
        return self.run()

    def getVersion(self):
        """Return the module-level ``__version__`` value."""
        return __version__

    def validateArgs(self):
        if not os.path.exists(self.args.infile):
            self.parser.error('input.cmp.h5 file provided does not exist')

        if self.args.identify and self.args.control:
            self.parser.error(
                '--control and --identify are mutally exclusive. Please choose one or the other'
            )

        if self.args.useLDA:
            if self.args.m5Cclassifier is None:
                self.parser.error(
                    'Please specify a folder containing forward.csv and reverse.csv classifiers in --m5Cclassifier.'
                )

        if self.args.m5Cgff:
            if not self.args.useLDA:
                self.parser.error(
                    'm5Cgff file can only be generated in --useLDA mode.')

        # if self.args.methylFraction and not self.args.identify:
        #    self.parser.error('Currently, --methylFraction only works when the --identify option is specified.')

    def run(self):

        # Figure out what modifications to identify
        mods = self.args.identify
        modsToCall = []
        if mods:
            items = mods.split(",")

            if 'm6A' in items:
                modsToCall.append('H')

            if 'm4C' in items:
                modsToCall.append('J')

            if 'm5C_TET' in items:
                modsToCall.append('K')

            self.args.identify = True
            self.args.modsToCall = modsToCall

        self.options = self.args
        self.options.cmdLine = " ".join(sys.argv)
        self._workers = []

        # Log generously
        stdOutHandler = logging.StreamHandler(sys.stdout)
        logFormat = '%(asctime)s [%(levelname)s] %(message)s'
        if self.args.verbose:
            logging.basicConfig(level=logging.INFO, format=logFormat)
        else:
            logging.basicConfig(level=logging.WARN, format=logFormat)

        if self.args.doProfiling:
            cProfile.runctx("self._mainLoop()",
                            globals=globals(),
                            locals=locals(),
                            filename="profile.out")

        else:
            try:
                ret = self._mainLoop()
            finally:
                # Be sure to shutdown child processes if we get an exception on the main thread
                if not self.args.threaded:
                    for w in self._workers:
                        if w.is_alive():
                            w.terminate()

            return ret

    def _initQueues(self):
        """
        Create the work queue and the results queue.

        Both queues are bounded by ``self.options.maxQueueSize``.  In
        threaded mode the work queue is an in-process ``Queue.Queue``;
        otherwise it is a ``multiprocessing.JoinableQueue``.  The results
        queue is a ``multiprocessing.JoinableQueue`` in either mode.
        """
        # Work chunks: produced by the main thread, consumed by the workers
        # stored in self._workers.
        if self.options.threaded:
            workQueue = Queue.Queue(self.options.maxQueueSize)
        else:
            workQueue = multiprocessing.JoinableQueue(
                self.options.maxQueueSize)
        self._workQueue = workQueue

        # Completed chunks: produced by the workers, consumed by the
        # KineticsWriter process.
        self._resultsQueue = multiprocessing.JoinableQueue(
            self.options.maxQueueSize)

    def _launchSlaveProcesses(self):
        """
        Start the workers, the result collector and the crash monitor.

        Creates the work and results queues (via ``self._initQueues``),
        starts ``self.options.numWorkers`` workers that read from
        ``self._workQueue`` and write to ``self._resultsQueue``, starts a
        ``KineticsWriter`` on the results queue, and finally starts a thread
        running ``monitorChildProcesses`` over all of them.

        Side effects on ``self.options.numWorkers``: a value below 1 is
        replaced by the CPU count, and threaded mode forces it to 1.
        """
        availableCpus = multiprocessing.cpu_count()
        logging.info("Available CPUs: %d", availableCpus)
        logging.info("Requested worker processes: %d",
                     self.options.numWorkers)

        # Use all CPUs if numWorkers < 1
        if self.options.numWorkers < 1:
            self.options.numWorkers = availableCpus

        # Warn when more workers are requested than there are CPUs.
        # logging.warning is used because logging.warn is a deprecated alias.
        if self.options.numWorkers > availableCpus:
            logging.warning(
                "More worker processes requested (%d) than CPUs available (%d);"
                " may result in suboptimal performance.",
                self.options.numWorkers, availableCpus)

        self._initQueues()

        # Threaded mode runs a single worker thread; this override happens
        # after the over-subscription warning above has been evaluated.
        if self.options.threaded:
            self.options.numWorkers = 1
            WorkerType = KineticWorkerThread
        else:
            WorkerType = KineticWorkerProcess

        # Launch the workers
        self._workers = []
        for _ in range(self.options.numWorkers):
            worker = WorkerType(self.options, self._workQueue,
                                self._resultsQueue, self.ipdModel)
            self._workers.append(worker)
            worker.start()
        logging.info("Launched worker processes.")

        # Launch result collector
        self._resultCollectorProcess = KineticsWriter(self.options,
                                                      self._resultsQueue,
                                                      self.refInfo,
                                                      self.ipdModel)
        self._resultCollectorProcess.start()
        logging.info("Launched result collector process.")

        # Spawn a thread that monitors the workers and the collector
        self.monitoringThread = threading.Thread(
            target=monitorChildProcesses,
            args=(self._workers + [self._resultCollectorProcess], ))
        self.monitoringThread.start()

    def _queueChunksForReference(self, refInfo):
        """
        Compute the chunk extents for one reference and queue the work.

        Every queued item has the form
        ``(chunkIndex, (refId, refStartBase, refEndBase))`` where
        ``chunkIndex`` is the running ``self.workChunkCounter``, which is
        incremented once per queued chunk.  A reference without any hit in
        ``self.cmph5`` queues nothing.

        refInfo: record exposing ``ID`` and ``Length``.
        """
        # Number of hits on current reference
        refGroupId = refInfo.ID
        numHits = (self.cmph5.RefGroupID == refGroupId).sum()

        # Don't process reference groups with 0 hits.  They may not exist?
        if numHits == 0:
            return

        # Block-size bound (in bases) used to derive the minimum block count
        MAX_BLOCK_SIZE = 25000

        # Maximum number of hits per chunk
        MAX_HITS = 5000
        nBases = min(refInfo.Length, self.args.maxLength)

        # All divisions below are floor divisions, spelled `//` so that block
        # sizes and chunk coordinates stay integral under true division too.

        # Adjust numHits if we are only doing part of the contig
        numHits = (numHits * nBases) // refInfo.Length

        # The nBases term keeps the block count growing with contig length:
        # E. coli genome: this should be ~ 10.
        # Human genome: ought to be largest & is meant to ensure that
        # blockSize < MAX_BLOCK_SIZE.
        nBlocks = max(numHits // MAX_HITS,
                      nBases // (MAX_BLOCK_SIZE - 1) + 1)

        # Block layout
        blockSize = min(nBases, max(nBases // nBlocks + 1, 1000))
        blockStarts = np.arange(0, nBases, step=blockSize)
        # NOTE(review): ends are not clamped, so the last block may end past
        # nBases -- presumably the consumers clip; confirm.
        blockEnds = blockStarts + blockSize

        logging.info(
            "Queueing chunks for ref: %d.  NumReads: %d, Block Size: %d ",
            refGroupId, numHits, blockSize)

        # Queue up work blocks
        for (blockStart, blockEnd) in zip(blockStarts, blockEnds):
            # NOTE! The format of a work chunk is
            # (refId <int>, refStartBase <int>, refEndBase <int>)
            chunk = (refInfo.ID, blockStart, blockEnd)
            self._workQueue.put((self.workChunkCounter, chunk))
            self.workChunkCounter += 1

            if self.workChunkCounter % 10 == 0:
                logging.info("Queued chunk: %d.  Chunks in queue: %d",
                             self.workChunkCounter, self._workQueue.qsize())

    def loadReferenceAndModel(self, referencePath, cmpH5Path):
        """
        Load the reference contigs, select the contigs to process and build
        the IPD model.

        Sets ``self.refInfo`` (the reference info table, optionally
        restricted to the contigs named by ``--refContigs`` /
        ``--refContigIndex``) and ``self.ipdModel``.

        Logs an error and exits with status 1 when an explicitly requested
        model file or parameter directory does not exist, when the chemistry
        is 'unknown', or when no model file exists for the chemistry.
        """
        # Reference contigs, annotated with their refID from the cmp.h5
        contigs = ReferenceUtils.loadReferenceContigs(referencePath, cmpH5Path)

        # Reference info table from the cmp.h5
        (refInfoTable, _) = ReferenceUtils.loadCmpH5Tables(cmpH5Path)

        wantedNames = self.options.refContigs
        wantedIndex = self.options.refContigIndex

        if wantedNames is not None or wantedIndex != -1:
            # Collect every requested identifier: contig names (strings)
            # and/or the single debugging index (int).
            requestedIds = set()
            if wantedNames is not None:
                requestedIds.update(wantedNames.split(','))
            if wantedIndex != -1:
                requestedIds.add(wantedIndex)

            relevantContigs = []
            for (i, rec) in enumerate(refInfoTable):
                if (rec.FullName in requestedIds or rec.Name in requestedIds
                        or rec.RefInfoID in requestedIds):
                    relevantContigs.append(i)
            self.refInfo = refInfoTable[relevantContigs]
        else:
            self.refInfo = refInfoTable

        # The ipdModel can come from three places, in order of precedence:
        # 1. Explicit path passed to --ipdModel
        # 2. Parameter bundle directory, model chosen from the cmp.h5's
        #    sequencing chemistry
        # 3. The built-in model (ipdModel stays None)
        ipdModel = None

        explicitModel = self.args.ipdModel
        paramsDir = self.args.paramsPath
        if explicitModel:
            ipdModel = explicitModel
            logging.info("Using passed in ipd model: %s" % explicitModel)
            if not os.path.exists(explicitModel):
                logging.error("Couldn't find model file: %s" % explicitModel)
                sys.exit(1)
        elif paramsDir:
            if not os.path.exists(paramsDir):
                logging.error("Params path doesn't exist: %s" % paramsDir)
                sys.exit(1)

            majorityChem = ReferenceUtils.loadCmpH5Chemistry(cmpH5Path)
            ipdModel = os.path.join(paramsDir, majorityChem + ".h5")
            if majorityChem == 'unknown':
                logging.error(
                    "Chemistry cannot be identified---cannot perform kinetic analysis"
                )
                sys.exit(1)
            elif not os.path.exists(ipdModel):
                logging.error(
                    "Aborting, no kinetics model available for this chemistry: %s"
                    % ipdModel)
                sys.exit(1)
            else:
                logging.info("Using Chemistry matched IPD model: %s" %
                             ipdModel)

        self.ipdModel = IpdModel(contigs, ipdModel, self.args.modelIters)

    def _mainLoop(self):
        """
        Drive one complete analysis run and return 0.

        Steps, in order:
        1. Disable the cyclic garbage collector.
        2. Load the reference and the IPD model.
        3. Launch the worker and writer processes.
        4. Open the cmp.h5 file.
        5. For every entry in self.refInfo, chunk up the contig and submit
           the chunk descriptions to the work queue.
        6. Queue one None sentinel per worker, then wait for the workers,
           the monitoring thread, the results queue and the writer process.
        """

        # This looks scary but it's not.  Python uses reference
        # counting and has a secondary, optional garbage collector for
        # collecting garbage cycles.  Unfortunately when a cyclic GC
        # happens when a thread is calling cPickle.dumps, the
        # interpreter crashes sometimes.  See Bug 19704.  Since we
        # don't leak garbage cycles, disabling the cyclic GC is
        # essentially harmless.
        gc.disable()

        # Load reference and IpdModel
        self.loadReferenceAndModel(self.args.reference, self.args.infile)

        # Spawn workers
        self._launchSlaveProcesses()

        # WARNING -- cmp.h5 file must be opened AFTER worker processes have been spawned
        # cmp.h5 we're using -- use this to orchestrate the work
        self.cmph5 = CmpH5Reader(self.args.infile)
        logging.info('Generating kinetics summary for [%s]' % self.args.infile)

        # Running index of the next work chunk; incremented by
        # _queueChunksForReference for every chunk it queues.
        self.workChunkCounter = 0

        # Iterate over references
        for ref in self.refInfo:
            logging.info('Processing reference entry: [%s]' % ref.ID)
            self._queueChunksForReference(ref)

        # Shutdown workers with None sentinels, one per self.args.numWorkers
        for i in xrange(self.args.numWorkers):
            self._workQueue.put(None)

        for w in self._workers:
            w.join()

        # Join on the result queue and the resultsCollector process.
        # This ensures all the results are written before shutdown.
        self.monitoringThread.join()
        self._resultsQueue.join()
        self._resultCollectorProcess.join()
        logging.info("ipdSummary.py finished. Exiting.")
        del self.cmph5
        return 0
Пример #7
0
class KineticsToolsRunner(object):

    def __init__(self):
        """
        Build the command-line parser and store it on ``self.parser``.

        The parser uses ``argparse.ArgumentDefaultsHelpFormatter``, so the
        default of every option that has a help text is shown in ``--help``.
        Nothing is parsed here; see ``parseArgs``.
        """
        desc = ['Tool for detecting DNA base-modifications from kinetic signatures',
                'Notes: For all command-line arguments, default values are listed in [].']
        description = '\n'.join(desc)

        self.parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                              description=description,
                                              version=__version__)
        add = self.parser.add_argument

        # Positional arguments:

        add('infile',
            metavar='input.cmp.h5',
            type=validateFile,
            help='Input cmp.h5 filename')

        # Optional arguments:

        # Output options:

        # Note: reference is actually not optional:
        add('--reference',
            type=validateFile,
            help='Path to reference FASTA file')

        add('--outfile',
            dest='outfile',
            default=None,
            help='Use this option to generate all possible output files. Argument here is the root filename of the output files.')

        add('--gff',
            dest='gff',
            default=None,
            help='Name of output GFF file')

        # FIXME: Need to add an extra check for this; it can only be used if --useLDA flag is set.
        add('--m5Cgff',
            dest='m5Cgff',
            default=None,
            help='Name of output GFF file containing m5C scores')

        # FIXME: Make sure that this is specified if --useLDA flag is set.
        # FIXME: the flag is spelled '--m5Cclassifer' (missing 'i') while its
        # dest is 'm5Cclassifier'; kept as is so existing command lines work.
        add('--m5Cclassifer',
            dest='m5Cclassifier',
            default=None,
            help='Specify csv file containing a 127 x 2 matrix')

        add('--csv',
            dest='csv',
            default=None,
            help='Name of output CSV file.')

        add('--csv_h5',
            dest='csv_h5',
            default=None,
            help='Name of csv output to be written in hdf5 format.')

        add('--pickle',
            dest='pickle',
            default=None,
            help='Name of output pickle file.')

        add('--summary_h5',
            dest='summary_h5',
            default=None,
            help='Name of output summary h5 file.')

        add('--ms_csv',
            dest='ms_csv',
            default=None,
            help='Multisite detection CSV file.')

        # Calculation options:

        add('--control',
            dest='control',
            default=None,
            type=validateNoneOrFile,
            help='cmph.h5 file containing a control sample. Tool will perform a case-control analysis')

        add('--identify',
            dest='identify',
            default=False,
            help='Identify modification types. Comma-separated list of know modification types. Current options are: m6A, m4C, m5C_TET. Cannot be used with --control')

        add("--methylFraction",
            action="store_true",
            dest="methylFraction",
            default=False,
            help="In the --identify mode, add --methylFraction to command line to estimate the methylated fraction, along with 95%% confidence interval bounds.")

        # Temporary addition to test LDA for Ca5C detection:
        add('--useLDA',
            action="store_true",
            dest='useLDA',
            default=False,
            help='Set this flag to debug LDA for m5C/Ca5C detection')

        # Parameter options:

        add('--paramsPath',
            dest='paramsPath',
            default=None,
            type=validateNoneOrDir,
            help='Directory containing in-silico trained model for each chemistry')

        # argparse applies `type` only to string defaults, so the default is
        # given as an int to keep maxLength an int whether or not the option
        # is passed.
        add("--maxLength",
            default=int(3e12),
            type=int,
            help="Maximum number of bases to process per contig")

        add('--minCoverage',
            dest='minCoverage',
            default=3,
            type=int,
            help='Minimum coverage required to call a modified base')

        add('--maxQueueSize',
            dest='maxQueueSize',
            default=20,
            type=int,
            help='Max Queue Size')

        add('--maxCoverage',
            dest='maxCoverage',
            type=int, default=-1,
            help='Maximum coverage to use at each site')

        add('--mapQvThreshold',
            dest='mapQvThreshold',
            type=float,
            default=-1.0)

        add('--pvalue',
            dest='pvalue',
            default=0.01,
            type=float,
            help='p-value required to call a modified base')

        add('--ipdModel',
            dest='ipdModel',
            default=None,
            help='Alternate synthetic IPD model HDF5 file')

        add('--modelIters',
            dest='modelIters',
            type=int,
            default=-1,
            help='[Internal] Number of GBM model iteration to use')

        add('--cap_percentile',
            dest='cap_percentile',
            type=float,
            default=99.0,
            help='Global IPD percentile to cap IPDs at')

        add("--methylMinCov",
            type=int,
            dest='methylMinCov',
            default=10,
            help="Do not try to estimate methylFraction unless coverage is at least this.")

        add("--identifyMinCov",
            type=int,
            dest='identifyMinCov',
            default=5,
            help="Do not try to identify the modification type unless coverage is at least this.")

        # Computation management options:

        add("--refContigs", "-w",
            type=str,
            dest='refContigs',
            default=None,
            help="Specify one reference contig name, or multiple comma-separated "
                 "contig names, to be processed.  By default, processes all "
                 "contigs with mapped coverage.")

        def slurpWindowFile(fname):
            # Turn a file with one contig name per line into the
            # comma-separated form that --refContigs takes.  The file is
            # closed as soon as it has been read.
            with open(fname) as handle:
                return ",".join(line.strip() for line in handle)

        add("--refContigIndex", type=int, dest='refContigIndex', default=-1,
            help="For debugging purposes only - rather than enter a reference contig name, simply enter an index")

        # Shares its dest with --refContigs
        add("--refContigsFile", "-W",
            type=slurpWindowFile,
            dest='refContigs',
            default=None,
            help="A file containing contig names, one per line")

        add('--numWorkers',
            dest='numWorkers',
            default=-1,  # Defaults to using all logical CPUs
            type=int,
            help='Number of thread to use (-1 uses all logical cpus)')

        # Debugging help options:

        add("--threaded", "-T",
            action="store_true",
            dest="threaded",
            default=False,
            help="Run threads instead of processes (for debugging purposes only)")

        add("--profile",
            action="store_true",
            dest="doProfiling",
            default=False,
            help="Enable Python-level profiling (using cProfile).")

        add('--pdb',
            action='store_true',
            dest="usePdb",
            default=False,
            help="Enable dropping down into pdb debugger if an Exception is raised.")




    def parseArgs(self):
        self.args = self.parser.parse_args()

    def start(self):
        self.parseArgs()
        self.validateArgs()
        return self.run()

    def getVersion(self):
        """Return the module-level ``__version__`` value."""
        version = __version__
        return version

    def validateArgs(self):
        if not os.path.exists(self.args.infile):
            self.parser.error('input.cmp.h5 file provided does not exist')

        if self.args.identify and self.args.control:
            self.parser.error('--control and --identify are mutally exclusive. Please choose one or the other')

        if self.args.useLDA:
            if self.args.m5Cclassifier is None:
                self.parser.error('Please specify a folder containing forward.csv and reverse.csv classifiers in --m5Cclassifier.')

        if self.args.m5Cgff:
            if not self.args.useLDA:
                self.parser.error('m5Cgff file can only be generated in --useLDA mode.')

        # if self.args.methylFraction and not self.args.identify:
        #    self.parser.error('Currently, --methylFraction only works when the --identify option is specified.')

    def run(self):

        # Figure out what modifications to identify
        mods = self.args.identify
        modsToCall = []
        if mods:
            items = mods.split(",")

            if 'm6A' in items:
                modsToCall.append('H')

            if 'm4C' in items:
                modsToCall.append('J')

            if 'm5C_TET' in items:
                modsToCall.append('K')

            self.args.identify = True
            self.args.modsToCall = modsToCall

        self.options = self.args
        self.options.cmdLine = " ".join(sys.argv)
        self._workers = []

        # Log generously
        stdOutHandler = logging.StreamHandler(sys.stdout)
        logFormat = '%(asctime)s [%(levelname)s] %(message)s'
        logging.basicConfig(level=logging.INFO, format=logFormat)

        if self.args.doProfiling:
            cProfile.runctx("self._mainLoop()",
                            globals=globals(),
                            locals=locals(),
                            filename="profile.out")

        else:
            try:
                ret = self._mainLoop()
            finally:
                # Be sure to shutdown child processes if we get an exception on the main thread
                if not self.args.threaded:
                    for w in self._workers:
                        if w.is_alive():
                            w.terminate()

            return ret

    def _initQueues(self):
        if self.options.threaded:
            # Work chunks are created by the main thread and put on this queue
            # They will be consumed by KineticWorker threads, stored in self._workers
            self._workQueue = Queue.Queue(self.options.maxQueueSize)

            # Completed chunks are put on this queue by KineticWorker threads
            # They are consumed by the KineticsWriter process
            self._resultsQueue = multiprocessing.JoinableQueue(self.options.maxQueueSize)
        else:
            # Work chunks are created by the main thread and put on this queue
            # They will be consumed by KineticWorker threads, stored in self._workers
            self._workQueue = multiprocessing.JoinableQueue(self.options.maxQueueSize)

            # Completed chunks are put on this queue by KineticWorker threads
            # They are consumed by the KineticsWriter process
            self._resultsQueue = multiprocessing.JoinableQueue(self.options.maxQueueSize)

    def _launchSlaveProcesses(self):
        """
        Start the workers, the result collector and the crash monitor.

        Creates the work and results queues (via ``self._initQueues``),
        starts ``self.options.numWorkers`` workers that read from
        ``self._workQueue`` and write to ``self._resultsQueue``, starts a
        ``KineticsWriter`` on the results queue, and finally starts a thread
        running ``monitorChildProcesses`` over all of them.

        Side effects on ``self.options.numWorkers``: a value below 1 is
        replaced by the CPU count, and threaded mode forces it to 1.
        """
        availableCpus = multiprocessing.cpu_count()
        logging.info("Available CPUs: %d", availableCpus)
        logging.info("Requested worker processes: %d", self.options.numWorkers)

        # Use all CPUs if numWorkers < 1
        if self.options.numWorkers < 1:
            self.options.numWorkers = availableCpus

        # Warn when more workers are requested than there are CPUs.
        # logging.warning is used because logging.warn is a deprecated alias.
        if self.options.numWorkers > availableCpus:
            logging.warning("More worker processes requested (%d) than CPUs available (%d);"
                            " may result in suboptimal performance.",
                            self.options.numWorkers, availableCpus)

        self._initQueues()

        # Threaded mode runs a single worker thread; this override happens
        # after the over-subscription warning above has been evaluated.
        if self.options.threaded:
            self.options.numWorkers = 1
            WorkerType = KineticWorkerThread
        else:
            WorkerType = KineticWorkerProcess

        # Launch the workers
        self._workers = []
        for _ in range(self.options.numWorkers):
            worker = WorkerType(self.options, self._workQueue, self._resultsQueue, self.ipdModel)
            self._workers.append(worker)
            worker.start()
        logging.info("Launched worker processes.")

        # Launch result collector
        self._resultCollectorProcess = KineticsWriter(self.options, self._resultsQueue, self.refInfo, self.ipdModel)
        self._resultCollectorProcess.start()
        logging.info("Launched result collector process.")

        # Spawn a thread that monitors the workers and the collector
        monitored = self._workers + [self._resultCollectorProcess]
        self.monitoringThread = threading.Thread(target=monitorChildProcesses, args=(monitored,))
        self.monitoringThread.start()

    def _queueChunksForReference(self, refInfo):
        """
        Compute the chunk extents and queue up the work for a single reference
        """

        # Number of hits on current reference
        refGroupId = refInfo.ID
        numHits = (self.cmph5.RefGroupID == refGroupId).sum()

        # Don't process reference groups with 0 hits.  They may not exist?
        if numHits == 0:
            return

        # Maximum chunk size (set no larger than 1Mb for now)
        MAX_BLOCK_SIZE = 25000

        # Maximum number of hits per chunk
        MAX_HITS = 5000
        nBases = min(refInfo.Length, self.args.maxLength)

        # Adjust numHits if we are only doing part of the contig
        numHits = (numHits * nBases) / refInfo.Length

        nBlocks = max([numHits / MAX_HITS, nBases / (MAX_BLOCK_SIZE - 1) + 1])

        # Including nBases / (MAX_BLOCK_SIZE - 1) + 1 in nBlocks calculation:
        # E. coli genome: this should be ~ 10.
        # Human genome: ought to be largest & is meant to ensure that blockSize < MAX_BLOCK_SIZE.

        # Block layout
        blockSize = min(nBases, max(nBases / nBlocks + 1, 1000))
        blockStarts = np.arange(0, nBases, step=blockSize)
        blockEnds = blockStarts + blockSize
        blocks = zip(blockStarts, blockEnds)

        logging.info("Queueing chunks for ref: %d.  NumReads: %d, Block Size: %d " % (refGroupId, numHits, blockSize))

        # Queue up work blocks
        for block in blocks:
            # NOTE! The format of a work chunk is (refId <int>, refStartBase <int>, refEndBase <int>)
            chunk = (refInfo.ID, block[0], block[1])
            self._workQueue.put((self.workChunkCounter, chunk))
            self.workChunkCounter += 1

            if self.workChunkCounter % 10 == 0:
                logging.info("Queued chunk: %d.  Chunks in queue: %d" % (self.workChunkCounter, self._workQueue.qsize()))

    def loadReferenceAndModel(self, referencePath, cmpH5Path):
        """
        Load the reference contigs, select the contigs to process and build
        the IPD model.

        Sets ``self.refInfo`` (the reference info table, optionally
        restricted to the contigs named by ``--refContigs`` /
        ``--refContigIndex``) and ``self.ipdModel``.

        Logs an error and exits with status 1 when an explicitly requested
        model file or parameter directory does not exist.  An 'unknown'
        chemistry, or a chemistry without a model file, only logs a warning
        and falls back to the built-in model.
        """
        # Reference contigs, annotated with their refID from the cmp.h5
        contigs = ReferenceUtils.loadReferenceContigs(referencePath, cmpH5Path)

        # Reference info table from the cmp.h5
        (refInfoTable, _) = ReferenceUtils.loadCmpH5Tables(cmpH5Path)

        wantedNames = self.options.refContigs
        wantedIndex = self.options.refContigIndex

        if wantedNames is not None or wantedIndex != -1:
            # Collect every requested identifier: contig names (strings)
            # and/or the single debugging index (int).
            requestedIds = set()
            if wantedNames is not None:
                requestedIds.update(wantedNames.split(','))
            if wantedIndex != -1:
                requestedIds.add(wantedIndex)

            relevantContigs = []
            for (i, rec) in enumerate(refInfoTable):
                if (rec.FullName in requestedIds or
                        rec.Name in requestedIds or
                        rec.RefInfoID in requestedIds):
                    relevantContigs.append(i)
            self.refInfo = refInfoTable[relevantContigs]
        else:
            self.refInfo = refInfoTable

        # The ipdModel can come from three places, in order of precedence:
        # 1. Explicit path passed to --ipdModel
        # 2. Parameter bundle directory, model chosen from the cmp.h5's sequencing chemistry
        # 3. The built-in model (ipdModel stays None)
        ipdModel = None

        explicitModel = self.args.ipdModel
        paramsDir = self.args.paramsPath
        if explicitModel:
            ipdModel = explicitModel
            logging.info("Using passed in ipd model: %s" % explicitModel)
            if not os.path.exists(explicitModel):
                logging.error("Couldn't find model file: %s" % explicitModel)
                sys.exit(1)
        elif paramsDir:
            if not os.path.exists(paramsDir):
                logging.error("Params path doesn't exist: %s" % paramsDir)
                sys.exit(1)

            majorityChem = ReferenceUtils.loadCmpH5Chemistry(cmpH5Path)
            ipdModel = os.path.join(paramsDir, majorityChem + ".h5")
            if majorityChem == 'unknown':
                logging.warning("Chemistry is unknown. Falling back to built-in model")
                ipdModel = None
            elif not os.path.exists(ipdModel):
                logging.warning("Model not found: %s" % ipdModel)
                logging.warning("Falling back to built-in model")
                ipdModel = None
            else:
                logging.info("Using Chemistry matched IPD model: %s" % ipdModel)

        self.ipdModel = IpdModel(contigs, ipdModel, self.args.modelIters)

    def _mainLoop(self):
        """
        Drive one complete analysis run and return 0.

        Steps, in order:
        1. Disable the cyclic garbage collector.
        2. Load the reference and the IPD model.
        3. Launch the worker and writer processes.
        4. Open the cmp.h5 file.
        5. For every entry in self.refInfo, chunk up the contig and submit
           the chunk descriptions to the work queue.
        6. Queue one None sentinel per worker, then wait for the workers,
           the monitoring thread, the results queue and the writer process.
        """

        # This looks scary but it's not.  Python uses reference
        # counting and has a secondary, optional garbage collector for
        # collecting garbage cycles.  Unfortunately when a cyclic GC
        # happens when a thread is calling cPickle.dumps, the
        # interpreter crashes sometimes.  See Bug 19704.  Since we
        # don't leak garbage cycles, disabling the cyclic GC is
        # essentially harmless.
        gc.disable()

        # Load reference and IpdModel
        self.loadReferenceAndModel(self.args.reference, self.args.infile)

        # Spawn workers
        self._launchSlaveProcesses()

        # WARNING -- cmp.h5 file must be opened AFTER worker processes have been spawned
        # cmp.h5 we're using -- use this to orchestrate the work
        self.cmph5 = CmpH5Reader(self.args.infile)
        logging.info('Generating kinetics summary for [%s]' % self.args.infile)

        # Running index of the next work chunk; incremented by
        # _queueChunksForReference for every chunk it queues.
        self.workChunkCounter = 0

        # Iterate over references
        for ref in self.refInfo:
            logging.info('Processing reference entry: [%s]' % ref.ID)
            self._queueChunksForReference(ref)

        # Shutdown workers with None sentinels, one per self.args.numWorkers
        for i in xrange(self.args.numWorkers):
            self._workQueue.put(None)

        for w in self._workers:
            w.join()

        # Join on the result queue and the resultsCollector process.
        # This ensures all the results are written before shutdown.
        self.monitoringThread.join()
        self._resultsQueue.join()
        self._resultCollectorProcess.join()
        logging.info("ipdSummary.py finished. Exiting.")
        del self.cmph5
        return 0
class ReprocessMotifSites(PBToolRunner):
    def __init__(self):
        """
        Build the command-line interface for reprocessMotifSites.py.

        Registers the ipdSummary.py-style options this tool accepts,
        followed by the motif-specific inputs (--motifs,
        --motif_summary, --undetected, --modifications, --oldData) and
        the model-selection options (--paramsPath, --modelIters).

        The ipdSummary.py output/identification options (control,
        outfile, identify, csv, pickle, summary_h5, pvalue,
        methylFraction) are intentionally not exposed on the command
        line; run() assigns them fixed values instead.
        """
        desc = [
            "For all sites in motifs.gff, reports estimated methylated fraction and 95% confidence interval",
            "Notes: For all command-line arguments, default values are listed in [].",
        ]
        super(ReprocessMotifSites, self).__init__("\n".join(desc))

        add = self.parser.add_argument

        add(
            "--numWorkers",
            dest="numWorkers",
            default=-1,  # Defaults to using all logical CPUs
            type=int,
            help="Number of thread to use (-1 uses all logical cpus)",
        )

        add("infile", metavar="input.cmp.h5", help="Input cmp.h5 filename")

        add("--gff", dest="gff", default=None, help="Name of output GFF file [%(default)s]")

        add("--reference", dest="reference", required=True, help="Path to reference FASTA file")

        # argparse only runs `type` on string defaults, so the default
        # must already be an int; a float literal such as 3e12 would be
        # stored unchanged.
        add(
            "--maxLength",
            default=3 * 10 ** 12,
            type=int,
            help="Maximum number of bases to process per contig",
        )

        add(
            "--minCoverage",
            dest="minCoverage",
            default=3,
            type=int,
            help="Minimum coverage required to call a modified base",
        )

        add("--maxQueueSize", dest="maxQueueSize", default=1000, type=int, help="Max Queue Size")

        add("--maxCoverage", dest="maxCoverage", type=int, default=None, help="Maximum coverage to use at each site")

        add("--mapQvThreshold", dest="mapQvThreshold", type=float, default=-1.0)

        # Any value other than the literal string "False" enables
        # subread normalisation.
        add(
            "--subread_norm",
            dest="subread_norm",
            default=True,
            type=lambda x: x != "False",
            help="Normalized subread ipds",
        )

        add("--ipdModel", dest="ipdModel", default=None, help="Alternate synthetic IPD model HDF5 file")

        add(
            "--cap_percentile",
            dest="cap_percentile",
            type=float,
            default=99.0,
            help="Global IPD percentile to cap IPDs at",
        )

        add(
            "--threaded",
            "-T",
            action="store_true",
            dest="threaded",
            default=False,
            help="Run threads instead of processes (for debugging purposes only)",
        )

        add(
            "--profile",
            action="store_true",
            dest="doProfiling",
            default=False,
            help="Enable Python-level profiling (using cProfile).",
        )

        # The following are in addition to ipdSummary.py's inputs:
        add("--motifs", dest="motifs", required=True, help="Name of motifs GFF file [%(default)s]")

        add("--motif_summary", dest="motif_summary", required=True, help="Name of motif summary CSV file")

        add(
            "--undetected",
            action="store_true",
            dest="undetected",
            default=False,
            help="Setting this flag yields output with only undetected motif sites.",
        )

        add("--modifications", dest="modifications", default=None, help="Name of modifications GFF file [%(default)s]")

        add(
            "--oldData",
            action="store_true",
            dest="oldData",
            default=False,
            help="For datasets prior to 1.3.3 (use this option to increase testing possibilities)",
        )

        # A new addition to ipdSummary.py
        add(
            "--paramsPath",
            dest="paramsPath",
            default=None,
            help="Directory containing in-silico trained model for each chemistry",
        )

        add(
            "--modelIters",
            dest="modelIters",
            type=int,
            default=-1,
            help="[Internal] Number of GBM model iteration to use",
        )

    def getVersion(self):
        """
        Return the module-level __version__ value for this tool.
        """
        version = __version__
        return version

    def validateArgs(self):
        if not os.path.exists(self.args.infile):
            self.parser.error("input.cmp.h5 file provided does not exist")

        # Add checks corresponding to new required inputs:
        if not os.path.exists(self.args.motifs):
            self.parser.error("input motifs gff file provided does not exist")

        if not os.path.exists(self.args.motif_summary):
            self.parser.error("input motif_summary csv file provided does not exist")

        if not self.args.undetected and not os.path.exists(self.args.modifications):
            self.parser.error("either the --undetected flag must be set, or a valid modifications.gff must be provided")

    def run(self):

        # The following arguments are set in order to use ResultWriter.py as is:
        self.args.methylFraction = True
        self.args.outfile = None
        self.args.csv = None
        self.args.control = None
        self.args.summary_h5 = None
        self.args.pickle = None
        self.args.identify = False
        self.args.pvalue = 1.0

        self.options = self.args
        self.options.cmdLine = " ".join(sys.argv)
        self._workers = []

        # Log generously
        logFormat = "%(asctime)s [%(levelname)s] %(message)s"
        logging.basicConfig(level=logging.INFO, format=logFormat)
        stdOutHandler = logging.StreamHandler(sys.stdout)
        # logging.Logger.root.addHandler(stdOutHandler)
        # logging.info("t1")

        if self.args.doProfiling:
            cProfile.runctx("self._mainLoop()", globals=globals(), locals=locals(), filename="profile-main4.out")

        else:
            try:
                ret = self._mainLoop()
            finally:
                # Be sure to shutdown child processes if we get an exception on the main thread
                if not self.args.threaded:
                    for w in self._workers:
                        if w.is_alive():
                            w.terminate()

            return ret

    def _initQueues(self):
        if self.options.threaded:
            # Work chunks are created by the main thread and put on this queue
            # They will be consumed by KineticWorker threads, stored in self._workers
            self._workQueue = Queue.Queue(self.options.maxQueueSize)

            # Completed chunks are put on this queue by KineticWorker threads
            # They are consumed by the KineticsWriter process
            self._resultsQueue = multiprocessing.JoinableQueue(self.options.maxQueueSize)
        else:
            # Work chunks are created by the main thread and put on this queue
            # They will be consumed by KineticWorker threads, stored in self._workers
            self._workQueue = multiprocessing.JoinableQueue(self.options.maxQueueSize)

            # Completed chunks are put on this queue by KineticWorker threads
            # They are consumed by the KineticsWriter process
            self._resultsQueue = multiprocessing.JoinableQueue(self.options.maxQueueSize)

    def _launchSlaveProcesses(self):
        """
        Start the workers, the result collector and the crash monitor.

        Normalises self.options.numWorkers (a value below 1 means "use
        every CPU"; threaded mode always ends up with one worker),
        creates the queues through self._initQueues(), and then starts:

        * self._workers -- KineticWorkerThread instances in threaded
          mode, KineticWorkerProcess instances otherwise, each built
          with the options, both queues and self.ipdModel;
        * self._resultCollectorProcess -- a KineticsWriter built with
          self._resultsQueue, self.refInfo and self.ipdModel;
        * self.monitoringThread -- a thread running
          monitorChildProcesses over the workers plus the collector.
        """
        availableCpus = multiprocessing.cpu_count()
        logging.info("Available CPUs: %d" % (availableCpus,))
        logging.info("Requested worker processes: %d" % (self.options.numWorkers,))

        # Use all CPUs if numWorkers < 1
        if self.options.numWorkers < 1:
            self.options.numWorkers = availableCpus

        # Warn if more workers were requested than there are CPUs
        if self.options.numWorkers > availableCpus:
            logging.warning(
                "More worker processes requested (%d) than CPUs available (%d);"
                " may result in suboptimal performance." % (self.options.numWorkers, availableCpus)
            )

        self._initQueues()

        if self.options.threaded:
            self.options.numWorkers = 1
            WorkerType = KineticWorkerThread
        else:
            WorkerType = KineticWorkerProcess

        # Launch the workers; each one is started as soon as it has
        # been created and recorded.
        self._workers = []
        for _ in range(self.options.numWorkers):
            worker = WorkerType(self.options, self._workQueue, self._resultsQueue, self.ipdModel)
            self._workers.append(worker)
            worker.start()
        logging.info("Launched worker processes.")

        # Launch result collector
        self._resultCollectorProcess = KineticsWriter(self.options, self._resultsQueue, self.refInfo, self.ipdModel)
        self._resultCollectorProcess.start()
        logging.info("Launched result collector process.")

        # Spawn a thread that monitors the workers and the collector for crashes
        monitored = self._workers + [self._resultCollectorProcess]
        self.monitoringThread = threading.Thread(target=monitorChildProcesses, args=(monitored,))
        self.monitoringThread.start()

    def _queueChunksForReference(self):
        """
        Split the motif sites into blocks and put them on the work queue.

        Steps:

        1. Read self.args.motif_summary (CSV) into a dict keyed by the
           first column; the value comes from column 1 when --oldData
           is set and from column 2 otherwise.  The first row is
           discarded.
        2. Read self.args.motifs (GFF) into a list of plain dicts; with
           --undetected only records whose type is "." are kept.
        3. Cut that list into consecutive blocks and queue one work
           item per block.

        Each queued item is (self.workChunkCounter, chunk), where chunk
        is the tuple

            (motif dicts of the block, self.refInfo, motifInfo,
             self.options.modifications, self.options.undetected,
             self.options.oldData, blockStart, blockEnd)

        blockStart/blockEnd are indices into the motif list (not
        reference coordinates); blockEnd of the last block may lie
        beyond the end of the list.

        Side effects: advances self.workChunkCounter once per queued
        block and, when --undetected is set, resets
        self.options.modifications to None.  Nothing is queued when
        there are no motif sites to process.
        """
        # Read in motif_summary.csv
        valueColumn = 1 if self.options.oldData else 2
        with open(self.args.motif_summary, "r") as summaryFile:
            reader = csv.reader(summaryFile, delimiter=",")
            next(reader)  # discard the first row (presumably the header)
            motifInfo = {row[0]: row[valueColumn] for row in reader}

        # Load the motif sites; in --undetected mode keep only records
        # whose type is "."
        onlyUndetected = self.options.undetected
        motifDicts = [
            {
                "seqID": x.seqid,
                "type": x.type,
                "score": x.score,
                "pos": x.start,
                "strand": x.strand,
                "attributes": x.attributes,
            }
            for x in GffReader(self.args.motifs)
            if not onlyUndetected or x.type == "."
        ]

        if onlyUndetected:
            self.options.modifications = None

        refLength = len(motifDicts)

        # Number of motif sites to process.  int() keeps the block
        # arithmetic integral even if maxLength arrives as a float.
        nBases = int(min(refLength, self.args.maxLength))
        if nBases <= 0:
            # Nothing to queue; a block size of zero would also not be
            # a valid step for np.arange below.
            return

        # Maximum number of hits per chunk
        MAX_HITS = 500
        nBlocks = max(self.options.numWorkers * 4, nBases // MAX_HITS)

        # Block layout
        blockSize = min(nBases, max(nBases // nBlocks + 1, 100))
        blockStarts = np.arange(0, nBases, step=blockSize)
        blockEnds = blockStarts + blockSize

        # Queue up work blocks
        for start, end in zip(blockStarts, blockEnds):
            chunk = (
                motifDicts[start:end],
                self.refInfo,
                motifInfo,
                self.options.modifications,
                self.options.undetected,
                self.options.oldData,
                start,
                end,
            )
            self._workQueue.put((self.workChunkCounter, chunk))
            self.workChunkCounter += 1

    def loadReference(self):
        """
        Load the reference and build the IPD model from it.

        Sets self.referenceEntry (a ReferenceEntry built from
        self.args.reference), self.refInfo (its contigs), self.lutPath
        (the --ipdModel value, or None when none was given) and
        self.ipdModel.

        Raises Exception when an --ipdModel path was given but does
        not exist.
        """
        # FIXME - support a bare fasta file as well?
        entry = ReferenceEntry(self.args.reference)
        self.referenceEntry = entry
        self.refInfo = entry.contigs

        # An empty/absent --ipdModel means no explicit model file.
        self.lutPath = self.args.ipdModel or None

        if self.lutPath is not None and not os.path.exists(self.lutPath):
            message = "Couldn't find model file: %s" % self.lutPath
            logging.info(message)
            raise Exception(message)

        self.ipdModel = IpdModel(self.referenceEntry, self.lutPath)

    def loadReferenceAndModel(self, referencePath, cmpH5Path):
        """
        Load the reference contigs and construct self.ipdModel.

        referencePath -- passed to ReferenceUtils.loadReferenceContigs
        cmpH5Path     -- cmp.h5 file supplying the reference info table
                         and, when --paramsPath is used, the chemistry

        Sets self.refInfo to the reference info table read from the
        cmp.h5 and self.ipdModel to an IpdModel built from the contigs,
        the selected model path (None selects the built-in model) and
        self.args.modelIters.

        Model selection, in order of precedence:
        1. Explicit path passed to --ipdModel
        2. <paramsPath>/<chemistry>.h5, using the cmp.h5's chemistry
        3. Fall back to the built-in model

        Calls sys.exit(1) when an explicitly requested --ipdModel file
        or --paramsPath directory does not exist.
        """
        # Load the reference contigs - annotated with their refID from the cmp.h5
        contigs = ReferenceUtils.loadReferenceContigs(referencePath, cmpH5Path)

        # Read reference info table from cmp.h5
        (refInfoTable, _) = ReferenceUtils.loadCmpH5Tables(cmpH5Path)
        self.refInfo = refInfoTable

        # By default, use built-in model
        ipdModel = None

        if self.args.ipdModel:
            ipdModel = self.args.ipdModel
            logging.info("Using passed in ipd model: %s" % self.args.ipdModel)
            if not os.path.exists(self.args.ipdModel):
                # An explicitly requested model that is missing is
                # fatal, exactly like a missing --paramsPath below;
                # carrying on would hand IpdModel a nonexistent file.
                logging.error("Couldn't find model file: %s" % self.args.ipdModel)
                sys.exit(1)

        elif self.args.paramsPath:
            if not os.path.exists(self.args.paramsPath):
                logging.error("Params path doesn't exist: %s" % self.args.paramsPath)
                sys.exit(1)

            majorityChem = ReferenceUtils.loadCmpH5Chemistry(cmpH5Path)
            ipdModel = os.path.join(self.args.paramsPath, majorityChem + ".h5")

            if majorityChem == "unknown":
                logging.warning("Chemistry is unknown. Falling back to built-in model")
                ipdModel = None
            elif not os.path.exists(ipdModel):
                logging.warning("Model not found: %s" % ipdModel)
                logging.warning("Falling back to built-in model")
                ipdModel = None
            else:
                logging.info("Using Chemistry matched IPD model: %s" % ipdModel)

        self.ipdModel = IpdModel(contigs, ipdModel, self.args.modelIters)

    def _mainLoop(self):
        """
        Run one complete reprocessing pass and return 0.

        Order of operations: disable the cyclic garbage collector, load
        the reference and IPD model, start the workers, open the cmp.h5,
        queue every work chunk, send one None sentinel per worker, then
        wait for the workers, the monitoring thread, the results queue
        and the result collector before dropping the cmp.h5 handle.
        """
        # See comments in ipdSummary.py
        gc.disable()

        inputPath = self.args.infile

        # Load reference and IpdModel
        self.loadReferenceAndModel(self.args.reference, inputPath)

        # Spawn workers
        self._launchSlaveProcesses()

        # cmp.h5 we're using -- opened only once the workers are running
        self.cmph5 = CmpH5Reader(inputPath)
        logging.info("Generating kinetics summary for [%s]" % inputPath)

        # Queue all of the work, numbering the chunks from zero
        self.workChunkCounter = 0
        self._queueChunksForReference()

        # One None sentinel per worker tells it to shut down
        for _ in xrange(self.args.numWorkers):
            self._workQueue.put(None)

        for worker in self._workers:
            worker.join()

        # Join on the monitoring thread, the result queue and the
        # result collector process.  This ensures all the results are
        # written before shutdown.
        self.monitoringThread.join()
        self._resultsQueue.join()
        self._resultCollectorProcess.join()
        logging.info("reprocessMotifSites.py finished. Exiting.")
        del self.cmph5
        return 0